Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3893, column 8
Branch condition evaluates to a garbage value

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2025-03-17-111301-58955-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12
13// MARK - Level-3 API
14
15typedef struct {
16 int flags;
17 int type;
18 int pin_mem; // This memory need to be pinned.
19 int ref; // Reference to another tensor block. Start with 1.
20 int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21 int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22 int companion_ref; // Reference to another block that they two share the same memory region. Start with 1. the current crude implementation requires the two mutually be companion. Because there are two, we took the one that companion_ref <= i as the primary and companion_ref > i is the secondary. For allocation algorithm, we use the primary throughout.
23 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24 ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25 uint64_t size; // The size of the tensor expected.
26 int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28 ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29 ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34enum {
35 UNASSIGNED = 0x1,
36 ALIAS = 0x2,
37 READ_ONLY = 0x4,
38 WRITE_ONLY = 0x8,
39 READ_WRITE = 0xc,
40 ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
41 UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42 UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43};
44
45#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
46#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
47#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
48#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
50#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
51#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
52#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
53#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
54#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
56#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
58#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
60
61#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
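/* Editor's note: the sketch below is not part of the original file; it only
 * illustrates how the flag macros above pack state into ccv_nnc_tensor_block_t.flags.
 * The low two bits carry the assignment state (ordinary / UNASSIGNED / ALIAS),
 * bits 2-3 carry the read/write mode, and the higher bits are independent markers.
 * The helper name is hypothetical. */
static inline void _ccv_nnc_tensor_block_flag_sketch(void)
{
  ccv_nnc_tensor_block_t blk = {0};
  TENSOR_EXPECT_SET_UNASSIGNED(blk);     // blk.flags == 0x1
  TENSOR_SET_READ_WRITE(blk, READ_ONLY); // blk.flags == 0x5
  TENSOR_SET_UNFOLDABLE_AS_INPUT(blk);   // blk.flags == 0x25
  assert(TENSOR_EXPECT_UNASSIGNED(blk));
  assert(TENSOR_READ_WRITE(blk) == READ_ONLY);
  assert(TENSOR_IS_UNFOLDABLE_AS_INPUT(blk));
}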
62
63// Holds additional information about the exe nodes.
64typedef struct {
65 int flags;
66} ccv_nnc_graph_exec_flag_t;
67
68enum {
69 CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
70};
71
72typedef struct {
73 int index;
74 int oc;
75 int type;
76 uint64_t size;
77} ccv_nnc_tensor_opt_t;
78
79// We first sort the same type together (because they won't be reused at all).
80// And then we sort by size, after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83#undef more_than
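/* Editor's note: a minimal usage sketch, not in the original file. The
 * macro-generated sorter above orders candidates by size (descending) and, for
 * equal sizes, by oc (overlap count, descending), so the largest and most
 * contended tensor is considered first. The helper name is hypothetical. */
static inline void _ccv_nnc_tensor_opt_sort_sketch(void)
{
  ccv_nnc_tensor_opt_t opts[3] = {
    { .index = 0, .oc = 1, .size = 16 },
    { .index = 1, .oc = 3, .size = 64 },
    { .index = 2, .oc = 5, .size = 64 },
  };
  _ccv_nnc_tensor_opt_sort_by_size_and_oc(opts, 3, 0 /* aux is unused */);
  // Expected order: index 2 (size 64, oc 5), index 1 (size 64, oc 3), index 0 (size 16).
  assert(opts[0].index == 2 && opts[1].index == 1 && opts[2].index == 0);
}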
84typedef struct {
85 int idx;
86 int hop;
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90#undef less_than
91
92// If b has items that overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95 assert(a);
96 assert(b);
97 int x, y;
98 for (x = 0; x < b->rnum; x++)
99 {
100 const int p = *(int*)ccv_array_get(b, x);
101 int flag = 0;
102 // In extreme cases where a is a superset of b, then a is still after b, we are good.
103 for (y = 0; !flag && y < a->rnum; y++)
104 {
105 const int q = *(int*)ccv_array_get(a, y);
106 flag = (p == q);
107 }
108 if (!flag)
109 for (y = 0; y < a->rnum; y++)
110 {
111 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112 if (!cell.i32 || cell.i32[0] == 0)
113 return 0;
114 }
115 }
116 // If b->rnum == 0, a is after b for sure.
117 // Otherwise, if a->rnum == 0, we don't check any, but if b->rnum > 0, then we cannot say a is after b.
118 // If both a->rnum > 0 and b->rnum > 0, the logic above should have checked all of them.
119 return (a->rnum > 0 || b->rnum == 0);
120}
121
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124 assert(a);
125 assert(b);
126 int x, y, max_hop = 0;
127 for (x = 0; x < a->rnum; x++)
128 for (y = 0; y < b->rnum; y++)
129 {
130 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
131 if (!cell.i32 || cell.i32[0] == 0)
132 return 0;
133 max_hop = ccv_max(cell.i32[0], max_hop);
134 }
135 // We've entered this nested-for loop, therefore, a must be verifiably and deterministically after b now.
136 // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
137 return max_hop;
138}
139
140// If every a's head is deterministically after b's tail
141static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
142{
143 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
144}
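/* Editor's note: an illustrative sketch, not in the original file. exec_dep is
 * a CCV_32S sparse matrix in which a non-zero cell (i, j) means exec node i is
 * deterministically after exec node j, and the stored value is the hop count.
 * _ccv_nnc_tensor_block_head_after_tail() therefore returns the maximum hop if
 * every head node of a follows every tail node of b, and 0 otherwise. The
 * helper name and the 3-node chain are hypothetical. */
static inline void _ccv_nnc_head_after_tail_sketch(void)
{
  // A chain 0 -> 1 -> 2: node 2 is 1 hop after node 1 and 2 hops after node 0.
  ccv_sparse_matrix_t* const exec_dep = ccv_sparse_matrix_new(3, 3, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  const int one = 1, two = 2, zero = 0;
  ccv_set_sparse_matrix_cell(exec_dep, 1, 0, &one);
  ccv_set_sparse_matrix_cell(exec_dep, 2, 1, &one);
  ccv_set_sparse_matrix_cell(exec_dep, 2, 0, &two);
  ccv_nnc_tensor_block_t a = {0}, b = {0};
  a.head = ccv_array_new(sizeof(int), 1, 0);
  b.tail = ccv_array_new(sizeof(int), 1, 0);
  ccv_array_push(a.head, &two);  // Block a first comes alive at node 2.
  ccv_array_push(b.tail, &zero); // Block b is last used at node 0.
  // a's head is 2 hops after b's tail, so the two blocks do not interfere.
  assert(_ccv_nnc_tensor_block_head_after_tail(exec_dep, a, b) == 2);
  ccv_array_free(a.head);
  ccv_array_free(b.tail);
  ccv_matrix_free(exec_dep);
}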
145
146typedef struct {
147 ccv_array_t** alloc_dep;
148 int vt_block_size;
149 int buffer_size;
150 int block_size;
151 int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
152 struct {
153 int type; // The type from tensor blocks.
154 int pin_mem; // Whether this is pinned memory.
155 int flags; // The flags (currently for READ_ONLY or not).
156 uint64_t size; // The size of the buffer allocated.
157 int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
158 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
159 }* buffers;
160 struct {
161 int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
162 int block_ref; // A reference to which block in the given tensor_block to use.
163 uint64_t offset; // The offset of this block.
164 }* blocks;
165} ccv_nnc_tensor_alloc_prep_t;
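/* Editor's note: a hedged sketch, not in the original file, of how a tensor
 * block index is presumably resolved through this prep: vt_blocks maps the
 * block index to a compact block entry (or -1), and that entry names a buffer
 * plus a byte offset inside it. The helper name is hypothetical. */
static inline uint64_t _ccv_nnc_alloc_prep_offset_sketch(const ccv_nnc_tensor_alloc_prep_t* const alloc_prep, const int tensor_block_idx)
{
  const int block_ref = alloc_prep->vt_blocks[tensor_block_idx];
  assert(block_ref >= 0); // -1 would mean no backing block (e.g. alias or unassigned).
  const int buffer_ref = alloc_prep->blocks[block_ref].buffer_ref;
  assert(buffer_ref >= 0 && buffer_ref < alloc_prep->buffer_size);
  (void)buffer_ref; // Only range-checked here; the tensor arena holds the actual base pointers.
  // The tensor's memory starts at this byte offset inside buffers[buffer_ref].
  return alloc_prep->blocks[block_ref].offset;
}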
166
167typedef struct ccv_nnc_symbolic_graph_prep_s {
168 int flags;
169 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
170 int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
171 int exec_idx;
172 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
173 int tensor_symbol_info_size;
174 int exec_symbol_info_size;
175 int tensor_block_size;
176 int sub_prep_size;
177 ccv_nnc_tensor_block_t* tensor_blocks;
178 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
179 ccv_nnc_graph_exec_flag_t* exec_flags;
180 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
181 int* dup_tensor_block_ref;
182 ccv_nnc_graph_visit_t* visit;
183 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
184 struct ccv_nnc_symbolic_graph_prep_s* p;
185 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
186 // Structures that don't need to be freed after deallocation.
187 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
188 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
189 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
190 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
191} ccv_nnc_symbolic_graph_prep_t;
192
193typedef struct {
194 int oc;
195 ccv_array_t* itf;
196} ccv_nnc_tensor_block_adjacent_t;
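/* Editor's note: a condensed sketch, not in the original file, of the
 * interference test used by the adjacency pass below: two computable blocks of
 * the same memory type interfere (and thus cannot share memory) when neither
 * one's head is deterministically after the other's tail. The helper name is
 * hypothetical. */
static inline int _ccv_nnc_tensor_block_interfere_sketch(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
{
  return TENSOR_EXPECT_COMPUTABLE(a) && TENSOR_EXPECT_COMPUTABLE(b) && a.type == b.type &&
    !_ccv_nnc_tensor_block_head_after_tail(exec_dep, a, b) &&
    !_ccv_nnc_tensor_block_head_after_tail(exec_dep, b, a);
}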
197
198static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
199{
200 // Compute how many dis-continuous buffers are needed.
201 // We prefer to have several dis-continuous buffers instead of one big buffer because
202 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
203 // to fully utilize memory.
204 int i, j, k;
205 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
206 int allocable_tensor_size = 0, available_tensor_size = 0;
207 for (i = 0; i < tensor_block_size; i++)
208 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
209 {
210 // Tensors that we need the header info.
211 ++available_tensor_size;
212 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
213 // Tensors that we actually need to allocate (exclude the alias).
214 ++allocable_tensor_size;
215 }
216 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
217 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
218 ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
219 // Overlap count.
220 for (i = 0; i < tensor_block_size; i++)
221 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
222 for (j = i + 1; j < tensor_block_size; j++)
223 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
224 {
225 // Check to see if they interfere (default to yes).
226 // If any of the i's head is deterministically later than j's tail
227 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
228 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
229 if (i_hop_j > 0)
230 {
231 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
232 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
233 }
234 const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
235 if (j_hop_i > 0)
236 {
237 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
238 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
239 }
240 // It cannot be that both i can hop to j and j can hop to i.
241 assert(!(i_hop_j > 0 && j_hop_i > 0));
242 if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
243 {
244 if (!adj[i].itf)
245 adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
246 ccv_array_push(adj[i].itf, &j);
247 ++adj[i].oc;
248 if (!adj[j].itf)
249 adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
250 ccv_array_push(adj[j].itf, &i);
251 ++adj[j].oc;
252 }
253 }
254 const int exec_dep_rows = exec_dep->rows;
255 ccv_matrix_free(exec_dep);
256 ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
257 int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
258 uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
259 uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
260 uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
261 int num_assigned = 0;
262 // I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
263 // Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
264 // The first channel denotes the bytes available for allocation,
265 // the second channel denotes the offset available for the allocation,
266 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
267 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
268 for (j = 0; j < allocable_tensor_size;)
269 {
270 // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
271 uint64_t max_size = 0;
272 ccv_array_clear(opt);
273 int current_type = 0; // Deal with one type at a time.
274 for (i = 0; i < tensor_block_size; i++)
275 if (tensor_blocks[i].size >= max_size &&
276 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
277 IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
278 (!current_type || tensor_blocks[i].type == current_type))
279 {
280 ccv_nnc_tensor_opt_t a = {
281 .size = tensor_blocks[i].size,
282 .index = i,
283 .oc = adj[i].oc,
284 .type = tensor_blocks[i].type,
285 };
286 assert(a.type);
287 current_type = a.type; // Now we know the primary type we should deal with.
288 if (tensor_blocks[i].companion_ref)
289 {
290 const int companion_ref = tensor_blocks[i].companion_ref - 1;
291 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
292 a.oc += adj[companion_ref].oc;
293 }
294 // In case we have a tie, take them all in the array.
295 if (a.size > max_size)
296 ccv_array_clear(opt), max_size = a.size;
297 ccv_array_push(opt, &a);
298 }
299 assert(opt->rnum > 0);
300 // Order opt array by the oc because type and size should be equal at this point.
301 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
302 // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
303 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
304 uint64_t min_val[2] = {
305 0, 0
306 };
307 if (j > 0)
308 {
309 for (i = 0; i < opt->rnum; i++)
310 {
311 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
312 if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
313 continue;
314 // Now, determine the order between a and c. After this, we can always check whether y
315 // can hop to the earliest one and if the latest one can hop to x.
316 // The earliest one will be called p and the latest one will be called q.
317 int p = a.index;
318 int q = a.index;
319 if (tensor_blocks[a.index].companion_ref)
320 {
321 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
322 if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
323 continue;
324 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
325 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
326 p = companion_ref;
327 else {
328 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
329 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
330 q = companion_ref;
331 else { // Otherwise, b is in between p and q.
332 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
333 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
334 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
335 }
336 }
337 }
338 assert(tensor_blocks[q].type == tensor_blocks[p].type);
339 const int type = tensor_blocks[p].type;
340 // y is always earlier than x, but this is hard to assert now.
341 // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
342 // Thus, the hop between y and x (through a) should be smallest ones.
343 // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
344 // out of q. For these nodes, we try to verify whether they form a connection (by checking against
345 // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
346 int y_size = 0;
347 ccv_nnc_tensor_hop_t* const y_buf = buf;
348#define for_block(y, val) do { \
349 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
350 y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
351 .idx = y + 1, .hop = ((int*)val)[0] \
352 }; \
353 } while(0)
354 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
355 if (y_vector)
356 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
357#undef for_block
358 assert(y_size <= tensor_block_size);
359 int x_size = 0;
360 ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
361#define for_block(x, val) do { \
362 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
363 x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
364 .idx = x + 1, .hop = ((int*)val)[0] \
365 }; \
366 } while(0)
367 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
368 if (x_vector)
369 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
370#undef for_block
371 assert(y_size + x_size <= tensor_block_size);
372 int x, y;
373 _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
374 for (y = 0; y < y_size; y++)
375 {
376 const int hop = exec_dep_rows + y_buf[y].hop;
377 if (hop >= min_hop)
378 break;
379 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
380 if (val.u64 && val.u64[0] >= a.size)
381 {
382 min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
383 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
384 break;
385 }
386 }
387 _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
388 for (x = 0; x < x_size; x++)
389 {
390 const int hop = exec_dep_rows + x_buf[x].hop;
391 if (hop >= min_hop)
392 break;
393 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
394 if (val.u64 && val.u64[0] >= a.size)
395 {
396 min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
397 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
398 break;
399 }
400 }
401 if (x_size > 0)
402 {
403 const int x_min_hop = x_buf[0].hop;
404 for (y = 0; y < y_size; y++)
405 {
406 const int y_hop_p_v = y_buf[y].hop;
407 if (y_hop_p_v + x_min_hop >= min_hop)
408 break;
409 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
410 if (y_vector)
411 {
412 for (x = 0; x < x_size; x++)
413 {
414 const int q_hop_x_v = x_buf[x].hop;
415 const int hop = y_hop_p_v + q_hop_x_v;
416 if (hop >= min_hop)
417 break;
418 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
419 if (val.u64 && val.u64[0] >= a.size)
420 {
421 min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
422 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
423 break;
424 }
425 }
426 }
427 }
428 }
429 // If I found a place, stop, and exit.
430 if (min_y > 0 || min_x < tensor_block_size + 1)
431 {
432 min_i = i;
433 break;
434 }
435 // There is no space to insert this block, mark it as such.
436 tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
437 if (tensor_blocks[a.index].companion_ref)
438 {
439 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
440 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
441 }
442 }
443 }
444 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
445 // and default to largest size available.
446 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
447 if (min_i == -1)
448 {
449 allocated_size[num_assigned] = a.size;
450 ++num_assigned;
451 }
452 int assign_group = num_assigned;
453 if (min_y > 0)
454 {
455 assign_group = assigned[min_y - 1];
456 // The y and x should belong to the same assigned group.
457 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
458 } else if (min_x < tensor_block_size + 1)
459 assign_group = assigned[min_x - 1];
460 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
461 if (min_y != 0 || min_x != tensor_block_size + 1)
462 {
463 uint64_t val[2] = {
464 min_val[0], min_val[1]
465 };
466 assert(val[0] >= a.size);
467 val[0] -= a.size;
468 val[1] = val[1] + a.size; // Move the offset to the next one.
469 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
470 }
471 int strings[3];
472 strings[0] = a.index + 1;
473 int string_size = 1;
474 // Assign out the designated companion if it exists.
475 if (tensor_blocks[a.index].companion_ref)
476 {
477 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
478 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
479 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
480 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
481 {
482 for (i = 0; i < string_size; i++)
483 strings[i + 1] = strings[i];
484 strings[0] = companion_ref + 1;
485 } else {
486 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
487 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
488 strings[string_size] = companion_ref + 1;
489 else {
490 // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must in between p and q. Therefore, I must have 2 allocations.
491 assert(string_size == 2);
492 strings[2] = strings[1];
493 strings[1] = companion_ref + 1;
494 }
495 }
496 ++string_size;
497 }
498 // Assign out and update oc.
499 for (i = 0; i < string_size; i++)
500 {
501 const int index = strings[i] - 1;
502 // Assign out the selected one.
503 assigned[index] = assign_group;
504 // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
505 allocated_offset[index] = min_val[1];
506 if (adj[index].itf)
507 for (k = 0; k < adj[index].itf->rnum; k++)
508 {
509 const int d = *(int*)ccv_array_get(adj[index].itf, k);
510 if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
511 --adj[d].oc;
512 }
513 }
514 uint64_t val[2] = {
515 a.size, min_val[1]
516 };
517 uint64_t consumed_size = 0;
518 // Go over from min_y to string_size (excluding min_x).
519 for (i = 0; i < string_size; i++)
520 {
521 const uint64_t size = tensor_blocks[strings[i] - 1].size;
522 assert(size <= a.size);
523 // Update consumed size if it is bigger than "size".
524 if (size > consumed_size)
525 {
526 val[0] = size - consumed_size;
527 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
528 consumed_size = size;
529 val[1] = min_val[1] + consumed_size;
530 }
531 // If it consumed all the flow, break out.
532 if (consumed_size == a.size)
533 break;
534 }
535 for (i = 0; i < string_size; i++)
536 {
537 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
538 uint64_t val[2] = {
539 i_size, min_val[1]
540 };
541 uint64_t consumed_size = 0;
542 for (k = i + 1; k < string_size; k++)
543 {
544 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
545 // Update consumed size if it is bigger than "size".
546 if (size > consumed_size)
547 {
548 val[0] = size - consumed_size;
549 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
550 consumed_size = size;
551 val[1] = min_val[1] + consumed_size;
552 }
553 // If it consumed all the flow, break out.
554 if (consumed_size == i_size)
555 break;
556 }
557 val[0] = i_size - consumed_size;
558 // Still have residual, flow it to min_x.
559 if (val[0] > 0)
560 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
561 }
562 if (min_i == -1)
563 {
564 // If we decide to insert a new edge, simply mark anyone that does not interfere with it to be redone.
565 const int p = strings[0] - 1;
566 const int q = strings[string_size - 1] - 1;
567 const int type = tensor_blocks[p].type;
568#define for_block(y, val) do { \
569 if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
570 { \
571 tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
572 if (tensor_blocks[y].companion_ref) \
573 { \
574 const int companion_ref = tensor_blocks[y].companion_ref - 1; \
575 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
576 } \
577 } \
578 } while(0)
579 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
580 if (y_vector)
581 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
582#undef for_block
583#define for_block(x, val) do { \
584 if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
585 { \
586 tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
587 if (tensor_blocks[x].companion_ref) \
588 { \
589 const int companion_ref = tensor_blocks[x].companion_ref - 1; \
590 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
591 } \
592 } \
593 } while(0)
594 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
595 if (x_vector)
596 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
597#undef for_block
598 }
599 j += string_size;
600 }
601 ccfreefree(tensor_block_cannot_insert);
602 ccfreefree(buf);
603 ccv_array_free(opt);
604 ccv_matrix_free(tensor_df);
605 ccv_matrix_free(tensor_dt);
606#define for_block(y, x, val) do { \
607 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
608 { \
609 if (!alloc_dep[x - 1]) \
610 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
611 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
612 } \
613 } while (0)
614 CCV_SPARSE_FOREACH(alloc, for_block);
615#undef for_block
616 ccv_matrix_free(alloc);
617 for (i = 0; i < tensor_block_size; i++)
618 if (adj[i].itf)
619 ccv_array_free(adj[i].itf);
620 ccfreefree(adj);
621 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
622 alloc_prep->alloc_dep = alloc_dep;
623 alloc_prep->vt_block_size = tensor_block_size;
624 alloc_prep->buffer_size = num_assigned;
625 alloc_prep->block_size = available_tensor_size;
626 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
627 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
628 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
629 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
630 for (i = 0; i < num_assigned; i++)
631 alloc_prep->buffers[i].size = allocated_size[i];
632 if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
633 {
634 size_t total_size = 0;
635 for (i = 0; i < num_assigned; i++)
636 total_size += allocated_size[i];
637 PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size);
638 }
639 ccfreefree(allocated_size);
640 j = 0;
641 // Assigning out the tensors (in case of sharing tensors / in-place ops).
642 for (i = 0; i < tensor_block_size; i++)
643 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
644 {
645 alloc_prep->blocks[j].block_ref = i;
646 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
647 {
648 alloc_prep->vt_blocks[i] = j;
649 // Also, set its allocations.
650 assert(assigned[i] > 0);
651 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
652 alloc_prep->blocks[j].offset = allocated_offset[i];
653 if (!alloc_prep->buffers[buffer_ref].type)
654 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
655 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
656 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
657 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
658 } else {
659 alloc_prep->vt_blocks[i] = -1;
660 alloc_prep->blocks[j].buffer_ref = -1;
661 alloc_prep->blocks[j].offset = 0;
662 }
663 ++j;
664 } else
665 alloc_prep->vt_blocks[i] = -1;
666 ccfreefree(allocated_offset);
667 ccfreefree(assigned);
668 return alloc_prep;
669}
670
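/* Editorial sketch (not part of the original source): the single-allocation layout used
 * above for ccv_nnc_tensor_alloc_prep_t, where one malloc is carved into the struct plus
 * its blocks, buffers and vt_blocks arrays, ordered from larger to smaller element types
 * so alignment works out. The sketch_* types below are hypothetical stand-ins, not the
 * library's definitions, and the whole layout is released with a single free(). */
#include <stdlib.h>
#include <string.h>

typedef struct { int block_ref, buffer_ref; unsigned long long offset; } sketch_block_t;
typedef struct { unsigned long long size; int type; } sketch_buffer_t;
typedef struct {
	sketch_block_t* blocks;
	sketch_buffer_t* buffers;
	int* vt_blocks;
} sketch_prep_t;

sketch_prep_t* sketch_prep_new(const int block_size, const int buffer_size, const int vt_block_size)
{
	sketch_prep_t* const prep = (sketch_prep_t*)malloc(sizeof(sketch_prep_t) +
		sizeof(sketch_block_t) * block_size +
		sizeof(sketch_buffer_t) * buffer_size +
		sizeof(int) * vt_block_size);
	prep->blocks = (sketch_block_t*)(prep + 1); // largest element type first
	prep->buffers = (sketch_buffer_t*)(prep->blocks + block_size);
	prep->vt_blocks = (int*)(prep->buffers + buffer_size);
	memset(prep->buffers, 0, sizeof(sketch_buffer_t) * buffer_size);
	return prep; // freed later with a single free(prep)
}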
671static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
672{
673 int i;
674 for (i = 0; i < alloc_prep->vt_block_size; i++)
675 if (alloc_prep->alloc_dep[i])
676 ccv_array_free(alloc_prep->alloc_dep[i]);
677 for (i = 0; i < alloc_prep->buffer_size; i++)
678 if (alloc_prep->buffers[i].dup_p_refs)
679 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
680 ccfreefree(alloc_prep->alloc_dep);
681 ccfreefree(alloc_prep);
682}
683
684// Simple allocator from ccv_array_t.
685static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
686{
687 int pos = tensor_metadata->rnum;
688 int rsize = (size + 15) / 16;
689 ccv_array_resize(tensor_metadata, pos + rsize);
690 return (pos << 1) + 1;
691}
692
693static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
694{
695 assert((pos >> 1) < tensor_metadata->rnum);
696 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
697}
698
699#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
700
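/* Editorial sketch (not part of the original source): a minimal, self-contained
 * illustration of the low-bit tagging used by the metadata allocator above. A metadata
 * "pos" is (index << 1) | 1, so its low bit is 1 and it can never be mistaken for a real
 * (aligned) tensor pointer, whose low bit is 0; shifting right recovers the array index.
 * The sketch_* names are hypothetical. */
#include <assert.h>
#include <stdint.h>

static inline intptr_t sketch_pos_encode(const int index) { return ((intptr_t)index << 1) | 1; }
static inline int sketch_is_pos(const void* const ptr) { return (uintptr_t)ptr & 1; }
static inline int sketch_pos_index(const intptr_t pos) { return (int)(pos >> 1); }

void sketch_pos_demo(void)
{
	int object = 42;
	void* const real_ptr = &object; // aligned data: low bit is 0
	void* const tagged = (void*)sketch_pos_encode(7); // metadata slot 7: low bit is 1
	assert(!sketch_is_pos(real_ptr));
	assert(sketch_is_pos(tagged));
	assert(sketch_pos_index((intptr_t)tagged) == 7);
}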
701static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
702{
703 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
704 if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
705 return vt_tensor;
706 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
707 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
708 {
709 const int alias_ref = tensor->alias_ref;
710 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
711 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
712 }
713 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
714 {
715 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
716 int i;
717 const int count = mv->kind + mv->repeat;
718 for (i = 0; i < count; i++)
719 {
720 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
721 {
722 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
723 CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
724 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
725 }
726 }
727 // No need to recursively do parent pointer, otherwise we are in deep rewire.
728 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
729 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
730 if (mv->sp)
731 for (i = 0; i < mv->sp->rnum; i++)
732 {
733 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
734 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
735 {
736 const int pos = (int)(intptr_t)*tensor;
737 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
738 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
739 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
740 }
741 }
742 }
743 return tensor;
744}
745
746typedef struct {
747 const uint8_t* ptr;
748 int pos;
749} ccv_nnc_tensor_block_pos_t;
750
751static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
752{
753 int i;
754 int unref_block_ref = block_ref;
755 while (prep->tensor_blocks[unref_block_ref].ref)
756 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
757 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
758 assert(vt_ref >= 0);
759 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
760 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
761 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
762 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
763 for (i = idx - 1; i >= 0; i--)
764 {
765 assert(p_ref >= 0);
766 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
767 const int unroll_count = graph_prep->unroll_count;
768 if (ch[i]) // Prefer the dup side of things.
769 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
770 int unref_p_ref = p_ref;
771 while (graph_prep->tensor_blocks[unref_p_ref].ref)
772 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
773 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
774 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
775 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
776 // If the buffer already exists, prefer that.
777 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
778 if (ptr)
779 {
780 // If I have any remaining path that is not covered from 0, I cannot possibly
781 // have any pointer from buffer (that can only happen if it is not dup).
782 for (--i; i >= 0; i--)
783 if (ch[i] != 0)
784 return 0;
785 // Try to find the created tensor block pos in the array, just linear scan.
786 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
787 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
788 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
789 ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
790 return tv_pos;
791 }
792 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
793 }
794 return 0;
795}
796
797// Descend from root to the prep level, and compose the multiview from there.
798static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
799{
800 assert(pos_ref);
801 int i;
802 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
803 const int unroll_count = prep->unroll_count;
804 if (prep == graph_prep)
805 {
806 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
807 if (!data_pos)
808 return -1;
809 // Based on ch, go all the way back to find the exact pointer to compose.
810 if (// !assign_update && // If I plan to receive the assign update, we don't need multiple receivers. Just one tensor to receive the update is enough.
811 prep->dup_tensor_block_ref &&
812 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
813 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
814 {
815 int pos[unroll_count + 1];
816 pos[0] = data_pos;
817 for (i = 0; i < unroll_count; i++)
818 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
819 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
820 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
821 ccv_nnc_tensor_t* data[unroll_count + 1];
822 for (i = 0; i < unroll_count + 1; i++)
823 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
824 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
825 for (i = 0; i < unroll_count + 1; i++)
826 CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
827 *pos_ref = mv_pos;
828 } else {
829 *pos_ref = data_pos;
830 }
831 if (preserve)
832 {
833 // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv.
834 // At any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
835 // a mv of K11; when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
836 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
837 // arena is allocated).
838 // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
839 // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
840 // it in a K01 structure.
841 // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing
842 // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
843 // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
844 int prev_mv_pos = *pos_ref;
845 if (prev_mv_pos == -1)
846 {
847 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
848 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
849 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
850 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
851 tv,
852 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
853 CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
854 }
855 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
856 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
857 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
858 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
859 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
860 (ccv_nnc_tensor_t*)prev_mv,
861 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
862 prev_mv->p = (void*)(intptr_t)mv_pos;
863 CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
864 CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
865 *pos_ref = mv_pos;
866 }
867 return 0;
868 }
869 ch[idx] = 0;
870 int pos[unroll_count + 1];
871 pos[0] = 0;
872 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
873 assert(retval == 0);
874 for (i = 0; i < unroll_count; i++)
875 {
876 ch[idx] = i + 1;
877 pos[i + 1] = 0;
878 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
879 if (dup_retval < 0)
880 {
881 assert(i == 0);
882 break;
883 }
884 }
885 // If current prep has no dup.
886 if (i == 0)
887 {
888 *pos_ref = pos[0];
889 return 0;
890 }
891 ccv_nnc_tensor_t* data[unroll_count + 1];
892 // Compose to a new multiview.
893 for (i = 0; i < unroll_count + 1; i++)
894 { assert(pos[i] > 0); }
895 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
896 for (i = 0; i < unroll_count + 1; i++)
897 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
898 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
899 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
900 for (i = 0; i < unroll_count + 1; i++)
901 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
902 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
903 for (i = 0; i < unroll_count + 1; i++)
904 CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
905 *pos_ref = mv_pos;
906 return 0;
907}
908
909static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
910{
911 int i;
912 int is_input = 0;
913 assert(node);
914 for (i = 0; i < node->input_size && !is_input; i++)
915 if (p_ref == node->inputs[i])
916 is_input = 1;
917 int is_output = 0;
918 for (i = 0; i < node->output_size && !is_output; i++)
919 if (p_ref == node->outputs[i])
920 is_output = 1;
921 // Prefer it is an output if it is both the input and the output.
922 if (is_output)
923 return 1;
924 if (is_input)
925 return -1;
926 return 0;
927}
928
929static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
930{
931 // No need to check whether to preserve if this is not a while loop.
932 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
933 return 0;
934 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
935 // If it is unassigned, no need to preserve.
936 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
937 return 0;
938 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
939 // If p is not input, no need to preserve at all.
940 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
941 return 0;
942 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
943 assert(vt_ref >= 0);
944 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
945 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
946 // If the buffer is a truly read-only one, no need to preserve.
947 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
948 return 0;
949 /* This needs detailed explanation, what does preserve mean?
950 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
951 * also used outside of the while loop, we cannot reuse the memory region of x for
952 * the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming
953 * y uses the same memory region as x). The way to work around this is to use a different
954 * memory region for y = x + 1, while for the first iteration having x point to the
955 * original. During the allocation process, the way to identify whether x should preserve
956 * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
957 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
958 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
959 * it is the input tensor whenever that is possible. A tensor block can point to two parent
960 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
961 * tensor whenever that is possible. */
962 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
963 return 0;
964 // Otherwise, return 1 because we now need to preserve.
965 return 1;
966}
967
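/* Editorial sketch (not part of the original source): a tiny, self-contained illustration
 * of why "preserve" matters for a parameterized loop such as while { y = x + 1 } (y => x)
 * when x is also used outside the loop. If y reuses x's memory region, the very first
 * iteration clobbers the external value of x; giving the first iteration its own region
 * preserves it. sketch_* names are hypothetical. */
#include <assert.h>

void sketch_preserve_demo(void)
{
	int x = 3; // x is also read outside of the loop after it finishes
	// Separate region for y on the first iteration: x keeps its value.
	const int y = x + 1;
	assert(x == 3 && y == 4);
	// In-place reuse of x's region: the original value of x is destroyed.
	x = x + 1;
	assert(x == 4);
}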
968static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
969{
970 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
971 // If it is unassigned, no need to preserve.
972 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
973 return 0;
974 // Only tape var need to force broadcast, otherwise we already share the same memory region.
975 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
976 return 0;
977 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
978 // If p is not output, no need to broadcast at all.
979 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
980 return 0;
981 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
982 assert(vt_ref >= 0);
983 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
984 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
985 // If the buffer is a truly read-only one, no need to broadcast.
986 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
987 return 0;
988 // Otherwise, return 1 because we now need to force broadcast for this tape var.
989 return 1;
990}
991
992static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
993{
994 assert(CCV_IS_TENSOR_MULTIVIEW(mv));
995 int i;
996 for (i = 0; i < mv->kind + mv->repeat; i++)
997 if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
998 CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
999 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1000 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
1001}
1002
1003static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1004{
1005 assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1006 int i;
1007 if (mv->sp)
1008 for (i = 0; i < mv->sp->rnum; i++)
1009 {
1010 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
1011 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1012 {
1013 const int pos = (int)(intptr_t)*tensor;
1014 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1015 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
1016 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1017 }
1018 }
1019 for (i = 0; i < mv->kind + mv->repeat; i++)
1020 {
1021 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1022 CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1023 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
1024 CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
1025 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1026 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1027 }
1028}
1029
1030static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1031{
1032 // Go to the root of the graph.
1033 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1034 int i;
1035 for (i = 1; prep->p; i++)
1036 prep = prep->p;
1037 // Root graph should have no dup tensor blocks.
1038 assert(!prep->dup_tensor_block_ref);
1039 const int c = i;
1040 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1041 prep = graph_prep;
1042 preps[c - 1] = prep;
1043 for (i = 0; prep->p; i++)
1044 preps[c - 2 - i] = prep = prep->p;
1045 int ch[c]; // Use dynamic allocation for the array. This array records our selections as we recurse from top to bottom.
1046 memset(ch, 0, sizeof(int) * c);
1047 int pos = 0;
1048 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1049 assert(ch[c - 1] == 0); // This should never be modified.
1050 assert(pos > 0);
1051 return pos;
1052}
1053
1054static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1055{
1056 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1057 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1058 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1059 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1060 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1061 tv,
1062 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1063 CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1064 CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
1065 return mv_pos;
1066}
1067
1068static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1069{
1070 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1071 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1072 if (!is_multiview)
1073 return pos;
1074 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1075 {
1076 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1077 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1078 }
1079 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1080 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1081 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1082 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1083 new_tensor->dataof = tensor.dataof;
1084 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1085 new_tensor->alias_ref = (uintptr_t)pos;
1086 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1087 return new_pos;
1088}
1089
1090static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1091{
1092 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1093 // The tensor it references is not an alias.
1094 assert(vt_tensors[alias_ref]);
1095 const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1096 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1097 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1098 // Will use that to determine whether insert reference or not.
1099 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1100 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1101 {
1102 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1103 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1104 }
1105 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1106 // If there is no ofs, and the stride is packed (matches dim), we take a shortcut and just init as a normal tensor.
1107 int pos;
1108 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1109 ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1110 {
1111 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1112 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1113 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1114 tensor->dataof = alias_tensor.dataof;
1115 } else {
1116 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1117 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1118 // Otherwise initialize a tensor view
1119 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1120 tensor_view->alias_ref = (uintptr_t)alias_pos;
1121 }
1122 vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1123 if (is_multiview)
1124 {
1125 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1126 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1127 }
1128}
1129
1130static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1131{
1132 // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1133 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1134 {
1135 const int ref = tensor_blocks[block_ref].alias_ref - 1;
1136 if (!vt_tensors[ref])
1137 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1138 vt_tensors[block_ref] = vt_tensors[ref];
1139 return;
1140 }
1141 assert(tensor_symbol_info[block_ref].alias_ref);
1142 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1143 // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1144 if (!vt_tensors[alias_ref])
1145 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1146 _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1147}
1148
1149// Turn a linear pointer to an object storage (such as MTLBuffer).
1150#ifdef HAVE_MPS
1151static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1152{
1153 mpobjfree(0, ptr);
1154}
1155#endif
1156
1157typedef struct {
1158 size_t size;
1159 void* obj;
1160} tensor_arena_obj_track_t;
1161
1162typedef struct {
1163 void* ptr;
1164 off_t offset;
1165 size_t size;
1166} obj_ptr_key_t;
1167
1168static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1169{
1170 return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1171}
1172
1173static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1174{
1175 return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1176}
1177
1178KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)
1179
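The KHASH_INIT expansion above instantiates an open-addressing hash map keyed by (ptr, offset, size) triples. A minimal usage sketch, assuming khash.h, the surrounding headers, and the obj_ptr_key_t fields used by _kh_obj_ptr_hash_func above; the helper name and the make() callback are hypothetical:

// Minimal usage sketch (not part of the analyzed file).
static void* obj_ptr_map_lookup_or_insert(kh_obj_ptr_t* const map, void* const base, const off_t offset, const size_t size, void* (*make)(void*, off_t, size_t))
{
	const obj_ptr_key_t key = {
		.ptr = base,
		.offset = offset,
		.size = size,
	};
	int ret;
	const khiter_t k = kh_put(obj_ptr, map, key, &ret);
	if (ret != 0) // A non-zero ret means the key was absent and a fresh bucket was claimed.
		kh_val(map, k) = make(base, offset, size);
	return kh_val(map, k); // Either the freshly created or the previously cached object.
}

This is the same put-then-fill pattern _ccv_nnc_tensor_arena_obj_create below uses so that each GPU object is created at most once per (pointer, offset, size) region.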
1180static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1181{
1182 if (params.dim[0] == 0)
1183 return 0;
1184#ifdef HAVE_MPS
1185 if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1186 {
1187 int ret;
1188 const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
12]
* ccv_nnc_tensor_count(params);
1189 const obj_ptr_key_t key = {
1190 .ptr = ptr,
1191 .offset = offset,
1192 .size = size,
1193 };
1194 khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1195 if (ret != 0)
1196 {
1197 void* obj = mpobjcreate(ptr, offset, size);
1198 if (!tensor_arena->disposers)
1199 tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1200 ccv_nnc_arena_disposer_t disposer = {
1201 .ptr = obj,
1202 .userdata = 0,
1203 .dispose = _ccv_nnc_tensor_arena_obj_dispose
1204 };
1205 ccv_array_push(tensor_arena->disposers, &disposer);
1206 kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1207 return obj;
1208 } else
1209 return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1210 }
1211#endif
1212 return ptr + offset;
1213}
1214
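On non-MPS builds the function above reduces to plain pointer arithmetic: the returned "object" is the buffer base plus the block's byte offset, and only the MPS path deduplicates GPU objects through the obj_ptr map and registers a disposer for each one it creates. A hedged sketch of that fallback path, using a hypothetical helper name and the types from the surrounding headers:

// Sketch only: the non-MPS shape of _ccv_nnc_tensor_arena_obj_create above.
// The cast makes the pointer arithmetic portable (the original relies on GNU void* arithmetic).
static void* arena_obj_for_block(void* const buffer_base, const off_t offset, const ccv_nnc_tensor_param_t params)
{
	if (params.dim[0] == 0) // An empty tensor has no backing object.
		return 0;
	return (uint8_t*)buffer_base + offset;
}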
1215static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1216{
1217 // All tensors are assigned out; num_assigned is now the number of discontinuous buffers.
1218 // Each tensor has its designation in the assigned array and its offset in allocated_offset.
1219 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1220 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1221 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1222 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1223 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1224 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1225 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1226 const int unroll_count = graph_prep->unroll_count;
1227 int i, j;
1228 for (i = 0; i < tensor_symbol_info_size; i++)
1229 for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1230 {
1231 const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1232 if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1233 TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
)
;
1234 }
1235 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1236 graph_prep->tensor_arena = tensor_arena;
1237 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1238 tensor_arena->buffers = (void*)(tensor_arena + 1);
1239 tensor_arena->buffer_size = alloc_prep->buffer_size;
1240 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1241 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1242 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1243 tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1244 tensor_arena->pb_vt_tensors = 0;
1245 tensor_arena->vt_alias_r_refs_p = 0;
1246 tensor_arena->vt_alias_r_refs = 0;
1247 tensor_arena->vt_sizes = 0;
1248 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1249 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1250 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1251 tensor_arena->allocator.context.free = allocator.context.free;
1252 tensor_arena->allocator.isa = allocator.isa;
1253 tensor_arena->disposers = 0;
1254 // Copy alias_ref info back to the tensor arena.
1255 for (i = 0; i < tensor_symbol_info_size; i++)
1256 tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1257 // Do the buffer copies.
1258 for (i = 0; i < alloc_prep->buffer_size; i++)
1259 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1260 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1261 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1262 if (graph_prep->while_count_tensor)
1263 {
1264 // If we need a while count tensor, allocate it first and set its pointer to point to the while_count variable.
1265 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1266 assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1266, __extension__ __PRETTY_FUNCTION__
); }))
; // pos must be 0 position.
1267 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1268 *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1269 }
1270 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
&& !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
&& p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1270, __extension__ __PRETTY_FUNCTION__
); }))
;
1271 if (p_arena && p_graph_prep)
1272 {
1273 // No need to allocate the actual buffer; just use the pointer from above.
1274 PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer assignment for sub arena %p (parent %p)\n",
tensor_arena, p_arena); fflush(stdout); } } while (0)
;
1275 for (i = 0; i < tensor_arena->buffer_size; i++)
1276 {
1277 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1278 int unref_p_ref = p_ref;
1279 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1280 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1281 assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1281, __extension__ __PRETTY_FUNCTION__
); }))
;
1282 const int p_unroll_count = p_graph_prep->unroll_count;
1283 if (p_graph_prep->dup_tensor_block_ref &&
1284 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1285 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1286 {
1287 // This condition means that in the parent graph we point to multiple tensor blocks for the same
1288 // buffer; therefore, we cannot have one single pointer assigned in this case.
1289 // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1290 tensor_arena->buffers[i].ptr = 0;
1291 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1292 continue;
1293 }
1294 // Otherwise, find the actual buffer pointer.
1295 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1296 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1296, __extension__ __PRETTY_FUNCTION__); }))
;
1297 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1298 if (!p_arena->buffers[buffer_ref].ptr)
1299 {
1300 // Pass it down as 0 ptr.
1301 tensor_arena->buffers[i].ptr = 0;
1302 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1303 continue;
1304 }
1305 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1306 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1307 PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
(0)
;
1308 }
1309 } else {
1310 // Now, allocate actual buffers.
1311 PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0)
;
1312 for (i = 0; i < tensor_arena->buffer_size; i++)
1313 {
1314 const int buffer_type = tensor_arena->buffers[i].type;
1315 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1316#ifdef HAVE_CUDA1
1317 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1318 {
1319 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1320 if (allocator.isa && allocator.isa->alloc)
1321 tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1322 else
1323 tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1324 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1325 } else {
1326 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1326, __extension__ __PRETTY_FUNCTION__
); }))
;
1327 if (tensor_arena->buffers[i].pin_mem)
1328 tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1329 else
1330 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1331 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1332 }
1333#elif defined(HAVE_MPS)
1334 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1335 {
1336 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1337 // if (allocator.isa && allocator.isa->alloc)
1338 // tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1339 // else
1340 tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1341 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1342 } else {
1343 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1343, __extension__ __PRETTY_FUNCTION__
); }))
;
1344 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1345 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1346 }
1347#else
1348 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1348, __extension__ __PRETTY_FUNCTION__
); }))
;
1349 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1350 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1351#endif
1352 assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1352, __extension__ __PRETTY_FUNCTION__); }))
;
1353 }
1354 }
1355 // Go over sub_preps and allocate arenas for them. Do this early because
1356 // we may reference tensors from sub arenas; the reason we need to reference
1357 // tensors from sub arenas is that, for output tensors, the sub arena's tensor
1358 // will have automatic reference updates.
1359 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1360 if (graph_prep->sub_preps[i])
1361 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1362 else
1363 tensor_arena->sub_arenas[i] = 0;
1364 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1365 // Now that sub-arenas are all assigned, go over their outputs to assign out tensors from those outputs directly.
1366 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1367#ifdef HAVE_MPS
1368 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1369#else
1370 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1371#endif
1372 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1373 if (tensor_arena->sub_arenas[i])
1374 {
1375 assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1375, __extension__ __PRETTY_FUNCTION__
); }))
;
1376 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1377 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1378 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1379 for (j = 0; j < node->output_size; j++)
1380 {
1381 const int idx = node->outputs[j];
1382 const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1;
1383 assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
(s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1383, __extension__ __PRETTY_FUNCTION__); }))
;
1384 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1385 assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1385, __extension__ __PRETTY_FUNCTION__); }))
;
1386 ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1387 // Only assign if it is a multiview tensor.
1388 if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1389 (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1390 sub_arena_out_tensors[idx] = sub_tensor;
1391 }
1392 }
1393 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1394 for (i = 0; i < tensor_symbol_info_size; i++)
1395 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
1396 {
1397 const int vt_ref = alloc_prep->vt_blocks[i];
1398 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1399 // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1400 // previous layer; therefore, we cannot really find the buffer ptr.
1401 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1402 ((graph_prep->dup_tensor_block_ref &&
1403 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1404 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1405 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1406 {
1407 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1407, __extension__ __PRETTY_FUNCTION__
); }))
; // This must be in a sub-graph.
1408 // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1409 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1410 continue;
1411 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1412 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1413 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1414 } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1415 // When we want to allocate, we don't really need to if it needs a force broadcast, because we will handle that later.
1416 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1417 // If already created, use the same tensor, and continue.
1418 // Having ptr.
1419 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1420 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1421 // Also, set its allocations.
1422 // Since a tensor view is bit-compatible with a tensor, we can just cast.
1423 void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1424 *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1425 assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1425, __extension__ __PRETTY_FUNCTION__
); }))
;
1426 // If we need to force broadcast, we need to wrap it in a multiview.
1427 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1428 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1429 {
1430 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1431 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1432 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1433 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1434 tv,
1435 }, 0, 1, graph_prep->graph, mv);
1436 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1437 pos = mv_pos;
1438 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1439 }
1440 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1441 }
1442 }
1443#ifdef HAVE_MPS
1444 kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1445#endif
1446 // Handle bound tensors. First handle the cases without aliases.
1447 for (i = 0; i < tensor_bind_size; i++)
1448 {
1449 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1449, __extension__ __PRETTY_FUNCTION__
); }))
;
1450 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1451 if (resolved_symbol.d >= 0)
1452 {
1453 int d = resolved_symbol.d;
1454 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1455 continue;
1456 // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1457 // It has nothing to do with aliases.
1458 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1459 d = tensor_blocks[d].ref - 1;
1460 // For bound tensors, it shouldn't be assigned yet.
1461 // If it is assigned, the pointer should match the one from the bound tensor.
1462 // This can only happen if an enforced in-place tensor is bound twice. If that
1463 // happens, we need to make sure it is bound to the same location.
1464 assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1464, __extension__ __PRETTY_FUNCTION__
); }))
;
1465 // See above assertion.
1466 if (tensor_arena->vt_tensors[d])
1467 continue;
1468 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1469 {
1470 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1471 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1472 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1473 if (otv->off > 0) // If there is an offset, this has to be the same dimensionality, or smaller at each dimension.
1474 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1475 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1475, __extension__ __PRETTY_FUNCTION__
); }))
; }
1476 // It is OK for it, as a whole, to be smaller than or equal to the bound one.
1477 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1477, __extension__ __PRETTY_FUNCTION__
); }))
;
1478 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1479 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1480 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1481 } else {
1482 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1483 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1484 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1485 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1486 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1487 tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1488 tv->dataof = tensor_binds[i].tensor->dataof;
1489 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1490 }
1491 }
1492 }
1493 // Handle bound tensors. We handle aliases here so they can reference bound tensors.
1494 for (i = 0; i < tensor_bind_size; i++)
1495 {
1496 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1496, __extension__ __PRETTY_FUNCTION__
); }))
;
1497 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1498 if (resolved_symbol.d >= 0)
1499 {
1500 int d = resolved_symbol.d;
1501 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1502 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1503 // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1504 // It has nothing to do with aliases.
1505 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1506 d = tensor_blocks[d].ref - 1;
1507 if (tensor_arena->vt_tensors[d])
1508 continue;
1509 // Assert that the original alias has no ofs; otherwise our binding will be problematic.
1510 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1511 { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
== 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1511, __extension__ __PRETTY_FUNCTION__
); }))
; }
1512 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1513 {
1514 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1515 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1516 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1517 if (otv->off > 0) // If there is an offset, this has to be the same dimensionality, or smaller at each dimension.
1518 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1519 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1519, __extension__ __PRETTY_FUNCTION__
); }))
; }
1520 // It is OK for it, as a whole, to be smaller than or equal to the bound one.
1521 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1521, __extension__ __PRETTY_FUNCTION__
); }))
;
1522 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1523 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1524 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1525 } else {
1526 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1527 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1528 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1529 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1530 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1531 tv->data = tensor_binds[i].tensor->data;
1532 tv->dataof = tensor_binds[i].tensor->dataof;
1533 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1534 }
1535 }
1536 }
1537 // Assign out refs; refs are the simple ones, so we handle them first (because they point to exactly the same metadata and the same region).
1538 // Avoid refs that are actually aliases.
1539 for (i = 0; i < tensor_symbol_info_size; i++)
1540 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1541 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1542 {
1543 int ref = tensor_blocks[i].ref - 1;
1544 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1545 ref = tensor_blocks[ref].ref - 1;
1546 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1546, __extension__ __PRETTY_FUNCTION__); }))
;
1547 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1548 }
1549 // Now that refs are assigned out, handle the case where I need to preserve because I am a sub graph of a while loop.
1550 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1551 {
1552 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1552, __extension__ __PRETTY_FUNCTION__
); }))
;
1553 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1554 const int p_idx = graph_prep->p_idx - 1;
1555 for (i = 0; i < node->input_size; i++)
1556 {
1557 const int idx = node->inputs[i];
1558 int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx)))
- 1;
1559 assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1559, __extension__ __PRETTY_FUNCTION__); }))
;
1560 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1561 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1562 continue;
1563 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1563, __extension__ __PRETTY_FUNCTION__); }))
;
1564 const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1565 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1565, __extension__ __PRETTY_FUNCTION__); }))
;
1566 assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1566, __extension__ __PRETTY_FUNCTION__
); }))
;
1567 // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1568 // previous layer; therefore, we cannot really find the buffer ptr.
1569 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1570 ((graph_prep->dup_tensor_block_ref &&
1571 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1572 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1573 !tensor_arena->buffers[buffer_ref].ptr))
1574 {
1575 // We haven't allocated anything for this yet.
1576 assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1576, __extension__ __PRETTY_FUNCTION__
); }))
;
1577 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1578 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1579 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1580 } else {
1581 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1582 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1583 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1584 }
1585 }
1586 }
1587 // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1588 // This creates the multi-view tensor to achieve that.
1589 for (i = 0; i < tensor_symbol_info_size; i++)
1590 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1591 {
1592 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1593 // Create phi multi-view.
1594 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1595 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1596 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1597 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1598 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1599 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1600 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1601 intv,
1602 outv,
1603 }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1604 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1605 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1606 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1607 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1608 }
1609 // Now it is time to handle alias.
1610 for (i = 0; i < alloc_prep->block_size; i++)
1611 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1612 {
1613 const int block_ref = alloc_prep->blocks[i].block_ref;
1614 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1615 {
1616 // Assigning out the tensor aliases.
1617 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1617, __extension__ __PRETTY_FUNCTION__
); }))
;
1618 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1619 }
1620 }
1621 // Now assign out the rest of the alias refs.
1622 for (i = 0; i < tensor_symbol_info_size; i++)
1623 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1624 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1625 {
1626 int ref = tensor_blocks[i].alias_ref - 1;
1627 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1627, __extension__ __PRETTY_FUNCTION__); }))
;
1628 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1629 }
1630 // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1631 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1632 if (tensor_arena->sub_arenas[i])
1633 {
1634 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1635 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1636 for (j = 0; j < node->input_size; j++)
1637 {
1638 const int idx = node->inputs[j];
1639 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1640 if (s_idx < 0)
1641 continue;
1642 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1643 // Only do the replacement if it is a multi-view tensor.
1644 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1645 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1646 {
1647 // It cannot be a bound tensor.
1648 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1648, __extension__ __PRETTY_FUNCTION__
); }))
;
1649 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1650 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1651 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1652 // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1653 // to this tensor.
1654 if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1655 {
1656 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1657 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1658 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1659 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1660 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1661 ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
: (multiview)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]);
1662 while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1663 tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]);
1664 *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1665 ref_tensor->data = tv->data;
1666 ref_tensor->dataof = tv->dataof;
1667 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1668 } else
1669 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1670 }
1671 }
1672 }
1673 // After the aliases are created, for the case..of statement we now revert back to a flat tensor rather than a multi-view.
1674 // No worries though: this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1675 // when initializing the case..of node, which will take the phi multi-view again.
1676 for (i = 0; i < tensor_symbol_info_size; i++)
1677 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1678 {
1679 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
& 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1679, __extension__ __PRETTY_FUNCTION__
); }))
;
1680 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1681 assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1681, __extension__ __PRETTY_FUNCTION__); }))
;
1682 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1683 }
1684 // Rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1685 for (i = 0; i < tensor_symbol_info_size; i++)
1686 if (tensor_arena->vt_tensors[i])
1687 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1688 // Associate multiview tensors from the sub arenas to the parent.
1689 if (sub_arena_out_tensors)
1690 {
1691 for (i = 0; i < alloc_prep->block_size; i++)
1692 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1693 {
1694 const int block_ref = alloc_prep->blocks[i].block_ref;
1695 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1696 continue;
1697 int sub_arena_ref = block_ref;
1698 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1699 {
1700 // Assigning out the tensor aliases.
1701 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1701, __extension__ __PRETTY_FUNCTION__
); }))
;
1702 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1703 // What it references is not an alias.
1704 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1704, __extension__ __PRETTY_FUNCTION__
); }))
;
1705 sub_arena_ref = alias_ref;
1706 if (!sub_arena_out_tensors[sub_arena_ref])
1707 continue;
1708 }
1709 if (!sub_arena_out_tensors[sub_arena_ref])
1710 continue;
1711 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1712 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1712, __extension__ __PRETTY_FUNCTION__); }))
;
1713 // This is only possible if the vt_tensors entry is a phi node.
1714 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1715 {
1716 // For a phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, set that to be the receiver of the broadcast.
1717 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1718 assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1718, __extension__ __PRETTY_FUNCTION__); }))
;
1719 assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1719, __extension__ __PRETTY_FUNCTION__
); }))
;
1720 CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]->alias_ref = (uintptr_t)mv;
1721 ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]);
1722 } else {
1723 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1724 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1725 }
1726 }
1727 }
1728 // Go over all the tensors that have an assign_ref. If the tensor it is assigned from is:
1729 // 1). From sub_arena_out_tensors: it may now be pointing to an area this arena doesn't know about.
1730 // 2). From a phi multi-view: in this case this arena won't know beforehand which memory it is going to use.
1731 // Therefore, for the above two scenarios, a tensor with assign_ref, even if it is a multiview tensor, needs to subscribe
1732 // to the output of the assign_ref tensor.
1733 for (i = 0; i < tensor_symbol_info_size; i++)
1734 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1735 {
1736 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1737 ccv_nnc_tensor_t* assign_tensor;
1738 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1739 assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1740 else
1741 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1742 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1743 }
1744 // After everything is handled, assert again to make sure the tensors and tensor binds point to the right location. This is really just for assertion.
1745 for (i = 0; i < tensor_bind_size; i++)
1746 {
1747 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1747, __extension__ __PRETTY_FUNCTION__
); }))
;
1748 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1749 if (resolved_symbol.d >= 0)
1750 {
1751 int d = resolved_symbol.d;
1752 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1753 // It has nothing to do with alias.
1754 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1755 d = tensor_blocks[d].ref - 1;
1756 // Note we don't trace back on alias. This is intentional.
1757 assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1757, __extension__ __PRETTY_FUNCTION__
); }))
;
1758 }
1759 }
1760 if (sub_arena_out_tensors)
1761 ccfreefree(sub_arena_out_tensors);
1762 // Rewire sub arena's tensor references.
1763 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1764 if (tensor_arena->sub_arenas[i])
1765 {
1766 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1767 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1768 for (j = 0; j < node->input_size; j++)
1769 {
1770 const int idx = node->inputs[j];
1771 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1772 if (s_idx < 0)
1773 continue;
1774 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1775 // Only do the replacement if it is a multi-view tensor.
1776 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1777 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1778 {
1779 // This is a bound tensor; bind it now.
1780 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1781 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1782 else
1783 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1784 }
1785 }
1786 }
1787 return tensor_arena;
1788}
1789
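The function above (lines 1235-1243) packs the arena header and all of its per-graph arrays into one allocation and then carves the arrays out by pointer arithmetic. A minimal sketch of that layout, assuming <stdlib.h>, the arena types from this file, and placeholder names buffer_size, sub_prep_size and symbol_count for alloc_prep->buffer_size, graph_prep->sub_prep_size and tensor_symbol_info_size:

// Sketch only: mirrors the single-malloc layout used by _ccv_nnc_tensor_arena_new.
static ccv_nnc_tensor_arena_t* arena_alloc_sketch(const int buffer_size, const int sub_prep_size, const int symbol_count)
{
	// One allocation carries the header plus all trailing arrays.
	ccv_nnc_tensor_arena_t* const arena = (ccv_nnc_tensor_arena_t*)malloc(
		sizeof(ccv_nnc_tensor_arena_t)                     // the header itself
		+ sizeof(arena->buffers[0]) * buffer_size          // per-buffer records
		+ sizeof(ccv_nnc_tensor_arena_t*) * sub_prep_size  // one slot per sub arena
		+ sizeof(ccv_nnc_tensor_t*) * symbol_count         // vt_tensors
		+ sizeof(int) * symbol_count);                     // vt_alias_refs
	arena->buffers = (void*)(arena + 1);                   // arrays start right after the header
	arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(arena->buffers + buffer_size);
	arena->vt_tensors = (ccv_nnc_tensor_t**)(arena->sub_arenas + sub_prep_size);
	arena->vt_alias_refs = (int*)(arena->vt_tensors + symbol_count);
	return arena;
}

Keeping the header and its arrays contiguous means tearing the arena down is a single free() plus whatever the GPU/CPU buffers themselves require.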
1790static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1791{
1792 assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1792, __extension__ __PRETTY_FUNCTION__); }))
;
1793 if ((intptr_t)graph == tensor_arena->graph_ref)
1794 {
1795 assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
0 && pair_ref < tensor_arena->vt_tensor_size) ;
else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1795, __extension__ __PRETTY_FUNCTION__
); }))
;
1796 return tensor_arena->vt_tensors[pair_ref];
1797 }
1798 int i;
1799 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1800 if (tensor_arena->sub_arenas[i])
1801 {
1802 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1803 if (tensor)
1804 return tensor;
1805 }
1806 return 0;
1807}
1808
1809static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1810{
1811 if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1812 tensor->type |= CCV_TAPE_ALLOC;
1813 else {
1814 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1815 mv->type |= CCV_TAPE_ALLOC;
1816 int i;
1817 for (i = 0; i < mv->repeat + mv->kind; i++)
1818 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1819 }
1820}
1821
1822static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1823{
1824 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1824, __extension__ __PRETTY_FUNCTION__
); }))
;
1825 int i;
1826 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1827 {
1828 if (graph_prep->tensor_symbol_info[i].pair_ref)
1829 {
1830 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1831 // No need to continue checking this if it is from its pair.
1832 continue;
1833 }
1834 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1835 {
1836 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1837 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1838 {
1839 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1840 if (vt_ref >= 0 &&
1841 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1842 continue;
1843 }
1844 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1845 }
1846 }
1847 for (i = 0; i < graph_prep->sub_prep_size; i++)
1848 if (graph_prep->sub_preps[i])
1849 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1850}
1851
1852static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1853{
1854 int i, found = 0;
1855 // Try to insert head.
1856 ccv_array_t* head = tensor_blocks.head;
1857 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1857, __extension__ __PRETTY_FUNCTION__); }))
;
1858 for (i = 0; i < head->rnum;)
1859 {
1860 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1861 if (head_idx == idx)
1862 {
1863 found = 1;
1864 break;
1865 }
1866 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1867 if (cell.i32 && cell.i32[0] > 0)
1868 {
1869 /* If the current node is the parent of the head node, check if we found it or not. */
1870 /* If not found, replace the current one. */
1871 if (!found)
1872 {
1873 found = 1;
1874 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1875 } else {
1876 /* Remove the current one, change the rnum. */
1877 if (i < head->rnum - 1)
1878 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1879 --head->rnum;
1880 continue;
1881 }
1882 } else {
1883 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1884 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1885 if (cell.i32 && cell.i32[0] > 0)
1886 {
1887 found = 1;
1888 break;
1889 }
1890 }
1891 /* Advancing i. */
1892 ++i;
1893 }
1894 /* If not found, push this idx to the end of the array. */
1895 if (!found)
1896 ccv_array_push(head, &idx);
1897 // Try to insert tail.
1898 found = 0;
1899 ccv_array_t* tail = tensor_blocks.tail;
1900 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1900, __extension__ __PRETTY_FUNCTION__); }))
;
1901 for (i = 0; i < tail->rnum;)
1902 {
1903 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1904 if (tail_idx == idx)
1905 {
1906 found = 1;
1907 break;
1908 }
1909 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1910 if (cell.i32 && cell.i32[0] > 0)
1911 {
1912 /* If the current node is the child of the tail node, check if we found it or not. */
1913 /* If not found, replace the current one. */
1914 if (!found)
1915 {
1916 found = 1;
1917 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1918 } else {
1919 /* Remove the current one, change the rnum. */
1920 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1921 --tail->rnum;
1922 continue;
1923 }
1924 } else {
1925 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1926 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1927 if (cell.i32 && cell.i32[0] > 0)
1928 {
1929 found = 1;
1930 break;
1931 }
1932 }
1933 /* Advancing i. */
1934 ++i;
1935 }
1936 /* If not found, push this idx to the end of the array. */
1937 if (!found)
1938 ccv_array_push(tail, &idx);
1939}
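_ccv_nnc_tensor_block_add_exec keeps each tensor block's head and tail arrays as minimal frontiers under the partial order recorded in exec_dep: a new node replaces any existing head that runs after it, is dropped if an existing head already precedes it, and is appended otherwise (and symmetrically for tails). The sketch below is a simplified version of the head-insertion half only, under the assumption that reachability is available as a predicate rather than a sparse matrix; toy_insert_head and after_f are hypothetical names for illustration.

// after(a, b) != 0 means node a executes after node b in the DAG.
typedef int (*after_f)(int a, int b);

// Insert idx into the head frontier (head must have room for one more entry).
// Returns the new number of heads.
static int toy_insert_head(int* const head, int count, const int idx, const after_f after)
{
	int i = 0, found = 0;
	while (i < count)
	{
		if (head[i] == idx)
		{
			found = 1;
			break;
		}
		if (after(head[i], idx)) // idx is earlier: it replaces (or removes) this head
		{
			if (!found)
				head[i] = idx, found = 1;
			else
			{
				head[i] = head[--count]; // swap-remove the now-redundant head
				continue;
			}
		} else if (after(idx, head[i])) { // an existing head already precedes idx
			found = 1;
			break;
		}
		++i;
	}
	if (!found)
		head[count++] = idx;
	return count;
}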
1940
1941ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1942{
1943 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1944 {
1945 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1945, __extension__ __PRETTY_FUNCTION__
); }))
;
1946 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1947 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1948 {
1949 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1950 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1951 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1952 return (ccv_nnc_tensor_t*)mv;
1953 }
1954 return tensor;
1955 }
1956 int i;
1957 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1958 if (tensor_arena->sub_arenas[i])
1959 {
1960 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1961 if (tensor)
1962 return tensor;
1963 }
1964 return 0;
1965}
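When the tensor behind a symbol is a multi-view (for example, inside a while loop), ccv_nnc_tensor_from_symbol unwraps it to the concrete tensor that currently backs it by following mv->it when a view is selected, or falling back to the first sub-view otherwise. Below is a minimal sketch of that unwrapping loop, using a hypothetical toy_view_t in place of the real ccv_nnc_tensor_multiview_t.

typedef struct toy_view_s {
	int is_multiview;
	struct toy_view_s* it;     // the view selected for the current iteration, may be NULL
	struct toy_view_s* child0; // first sub-view, used when `it` is not set
} toy_view_t;

static toy_view_t* toy_unwrap(toy_view_t* view)
{
	while (view && view->is_multiview)
		view = view->it ? view->it : view->child0;
	return view; // the concrete tensor backing the symbol right now
}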
1966
1967ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1968{
1969 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1970 {
1971 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1971, __extension__ __PRETTY_FUNCTION__
); }))
;
1972 return graph_exec_arena->graph_execs[symbol.d];
1973 }
1974 int i;
1975 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1976 if (graph_exec_arena->sub_arenas[i])
1977 {
1978 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1979 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
1980 return exec;
1981 }
1982 return (ccv_nnc_graph_exec_t){}; // 0.
1983}
1984
1985ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1986{
1987 return graph_exec_arena->source;
1988}
1989
1990ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1991{
1992 return graph_exec_arena->destination;
1993}
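The four public functions above are the query surface of a compiled result: given the tensor arena and graph exec arena produced by compilation (their construction is not shown here), callers map symbols back to concrete tensors and execs. A hedged usage sketch follows; lookup_compiled is a hypothetical helper, and the arenas and symbols are assumed to come from an earlier compile step.

#include "ccv_nnc.h"

// Map a tensor symbol and an exec symbol back to their concrete counterparts.
// Returns 0 on success, -1 if either symbol does not belong to this arena tree.
static int lookup_compiled(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_tensor_symbol_t x, const ccv_nnc_graph_exec_symbol_t e, ccv_nnc_tensor_t** const out_tensor, ccv_nnc_graph_exec_t* const out_exec)
{
	*out_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
	*out_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, e);
	// A NULL tensor or a zero .graph (CCV_NO_GRAPH_EXEC) means the symbol was not found.
	return (*out_tensor && out_exec->graph) ? 0 : -1;
}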
1994
1995// Check whether the head is the beginning of this block.
1996static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1997{
1998 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 1998, __extension__ __PRETTY_FUNCTION__
); }))
;
1999 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
2000}
2001
2002// Check whether the tail is the end of this block.
2003static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2004{
2005 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2005, __extension__ __PRETTY_FUNCTION__
); }))
;
2006 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
2007}
2008
2009// Make two tensor blocks one. Return 1 if that happened.
2010static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2011{
2012 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2013 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2014 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2015 tensor_blocks[p_ref_0].tail->rnum == 1 &&
2016 tensor_blocks[p_ref_1].head->rnum == 1 &&
2017 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2018 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
2019 {
2020 // If the two parent refs match (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2021 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2021, __extension__ __PRETTY_FUNCTION__); }))
;
2022 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2022, __extension__ __PRETTY_FUNCTION__); }))
;
2023 ccv_array_free(tensor_blocks[p_ref_0].tail);
2024 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2025 if (tensor_blocks[p_ref_1].p_refs[0])
2026 {
2027 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2027, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_refs, otherwise we cannot merge.
2028 if (!tensor_blocks[p_ref_0].p_refs[0])
2029 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2030 else
2031 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2032 }
2033 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2034 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
2035 ccv_array_free(tensor_blocks[p_ref_1].head);
2036 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2037 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
2038 // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2039 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
2040 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2041 if (!tensor_blocks[p_ref_0].r_refs)
2042 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2043 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2044 tensor_blocks[p_ref_1].size = 0;
2045 tensor_blocks[p_ref_1].head = 0;
2046 tensor_blocks[p_ref_1].tail = 0;
2047 return 1;
2048 }
2049 return 0;
2050}
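_ccv_nnc_tensor_blocks_try_fold merges two blocks when the producer's single tail node coincides with the consumer's single head node, so the consumer can reuse the producer's storage; the survivor inherits the longer live range plus the merged flags, p_refs and r_refs, while the folded block is marked unassigned with a ref back to the survivor. A minimal sketch of the core interval concatenation, using a hypothetical toy_block_t with single head/tail indices instead of the head/tail arrays:

typedef struct {
	int head; // single defining node, -1 if none
	int tail; // single last-use node, -1 if none
	int ref;  // 1-based reference to the block this one was folded into
} toy_block_t;

// Fold block b into block a when a's live range ends exactly where b's begins.
static int toy_blocks_try_fold(toy_block_t* const blocks, const int a, const int b)
{
	if (blocks[a].tail >= 0 && blocks[a].tail == blocks[b].head)
	{
		blocks[a].tail = blocks[b].tail; // a now lives until b's last use
		blocks[b].ref = a + 1;           // b reuses a's storage from now on
		blocks[b].head = blocks[b].tail = -1;
		return 1;
	}
	return 0;
}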
2051
2052static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2053{
2054 int i, j, k;
2055 // Generate exec dependencies (or, in other words, partial ordering of executions).
2056 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2057 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2058 int buf_size;
2059 if (p_node_info)
2060 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2060, __extension__ __PRETTY_FUNCTION__
); }))
; }
2061#define for_block(x, val) \
2062 do { \
2063 if (((int32_t*)val)[0] > 0) \
2064 { \
2065 buf[buf_size * 2] = x; \
2066 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2067 ++buf_size; \
2068 } \
2069 } while (0)
2070 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
2071 buf_size = 0; /* save all its parent deps to this buffer */
2072 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2073 if (vector)
2074 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
2075 if (!node->outgoings)
2076 continue;
2077 for (i = 0; i < node->outgoings->rnum; i++)
2078 {
2079 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2080 const int32_t one = 1;
2081 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2082 /* If not found, set, if the current node is the destination node, no need
2083 * set itself as parent of subsequent nodes because its terminal nature. */
2084 if (!cell.i32 || cell.i32[0] == 0)
2085 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2086 if (buf_size > 0)
2087 {
2088 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2089 assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2089, __extension__ __PRETTY_FUNCTION__); }))
;
2090 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2091 {
2092 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2093 /* If not found, set */
2094 if (!cell.i32 || cell.i32[0] == 0)
2095 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2096 else {
2097 /* Otherwise, set to the longest one */
2098 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
2099 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2100 }
2101 }
2102 }
2103 }
2104 } ccv_nnc_graph_visit_endfor} }
2105#undef for_block
2106 ccfreefree(buf);
2107 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2108 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2109 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2110 // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
2111 // happens that I have to loop through all relevant nodes to find out if one is used or not.
2112 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2113 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2114 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2115 for (i = 0; i < node->input_size; i++)
2116 if (node->inputs[i] >= 0)
2117 {
2118 tensor_blocks[node->inputs[i]].flags = 0;
2119 // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned mem.
2120 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2121 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2122 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2123 tensor_blocks[node->inputs[i]].pin_mem = 1;
2124 }
2125 for (i = 0; i < node->output_size; i++)
2126 if (node->outputs[i] >= 0)
2127 {
2128 tensor_blocks[node->outputs[i]].flags = 0;
2129 // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned mem.
2130 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2131 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2132 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2133 tensor_blocks[node->outputs[i]].pin_mem = 1;
2134 }
2135 } ccv_nnc_graph_visit_endfor} }
2136 if (p_node_info)
2137 {
2138 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2138, __extension__ __PRETTY_FUNCTION__
); }))
;
2139 // Mark it as used if it is used in either input or output.
2140 for (i = 0; i < p_node_info->input_size; i++)
2141 if (p_node_info->inputs[i] >= 0)
2142 {
2143 const int d = p_node_info->inputs[i];
2144 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2145 {
2146 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2147 if (dd >= 0) // If this exists in this sub-graph, great.
2148 tensor_blocks[dd].flags = 0;
2149 }
2150 }
2151 for (i = 0; i < p_node_info->output_size; i++)
2152 if (p_node_info->outputs[i] >= 0)
2153 {
2154 const int d = p_node_info->outputs[i];
2155 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2156 {
2157 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2158 if (dd >= 0) // If this exists in this sub-graph, great.
2159 tensor_blocks[dd].flags = 0;
2160 }
2161 }
2162 }
2163 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2164 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2165 {
2166 // Check no tensor info is auto now.
2167 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2167, __extension__ __PRETTY_FUNCTION__
); }))
;
2168 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter,
2169 // therefore, its life-cycle almost certainly won't concatenate properly with the tensor to
2170 // fold to).
2171 if (tensor_symbol_info[i].assign_ref)
2172 {
2173 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2174 // It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2175 // it kept its own representation, which is not the case for output).
2176 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2177 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2178 // But for where it comes from, it cannot be folded as input, because it cannot be overwritten at any time.
2179 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2180 // It also cannot be folded as output (except i), because we need to keep its own representation.
2181 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2182 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2182, __extension__ __PRETTY_FUNCTION__
); }))
;
2183 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2184 for (j = 0; j < unroll_count; j++)
2185 {
2186 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
2187 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2188 }
2189 if (tensor_blocks[assign_ref].bypass_ref)
2190 {
2191 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2192 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2193 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2194 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2195 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2196 // On the other hand, it can be folded into the except_ref for the bypass_ref.
2197 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2198 if (dup_tensor_from_ref)
2199 {
2200 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2201 if (bypass_from_ref >= 0)
2202 {
2203 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
2204 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
2205 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2205, __extension__ __PRETTY_FUNCTION__
); }))
;
2206 for (j = 0; j < unroll_count - 1; j++)
2207 {
2208 // Mark every incarnation as unfold-able.
2209 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
2210 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
2211 }
2212 }
2213 }
2214 }
2215 }
2216 }
2217 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2218 {
2219 // If it has a pair reference, we don't need to allocate this tensor at all,
2220 // set it to be unassigned.
2221 if (tensor_symbol_info[i].pair_ref)
2222 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
2223 // If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use the tape properly).
2224 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2225 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2226 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2227 // For this case, there is no exception.
2228 tensor_blocks[i].unfoldable_except_ref = 0;
2229 } else if (tensor_symbol_info[i].p_ref) {
2230 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2230, __extension__ __PRETTY_FUNCTION__); }))
;
2231 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2232 // If I am a case..of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2233 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2234 // TODO: This check can be lifted if we can fold in the parent graph.
2235 if (-1 == p_ref_is_in_or_out)
2236 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2237 if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be folded as input.
2238 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2239 }
2240 }
2241 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2242 {
2243 if (tensor_symbol_info[i].alias_ref)
2244 {
2245 const int ref = tensor_symbol_info[i].alias_ref - 1;
2246 // If the referenced one is unassigned, mark it as assigned only if the current one is assigned.
2247 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2248 tensor_blocks[ref].flags = 0;
2249 // An alias cannot ref to another alias.
2250 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2250, __extension__ __PRETTY_FUNCTION__); }))
;
2251 tensor_blocks[i].flags = ALIAS;
2252 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2253 if (!tensor_blocks[ref].r_refs)
2254 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2255 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2256 }
2257 }
2258 // Scan again, and if the ref is not assigned, mark the alias as not assigned.
2259 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2260 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2261 {
2262 const int ref = tensor_blocks[i].ref - 1;
2263 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2264 {
2265 // Mark this as unassigned.
2266 tensor_blocks[i].flags = UNASSIGNED;
2267 tensor_blocks[i].ref = 0;
2268 }
2269 }
2270 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2271 {
2272 // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2273 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2274 {
2275 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2276 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2277 // Cache tensor size (align to 16 bytes).
2278 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2279 }
2280 // If there is a p_ref, add the one to the p_refs list.
2281 if (tensor_symbol_info[i].p_ref)
2282 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2283 }
2284 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2285 for (i = 0; i < node->input_size; i++)
2286 {
2287 int d = node->inputs[i];
2288 if (d < 0)
2289 continue;
2290 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2291 d = tensor_symbol_info[d].alias_ref - 1;
2292 tensor_blocks[d].flags |= READ_ONLY;
2293 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2294 continue;
2295 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2295, __extension__ __PRETTY_FUNCTION__
); }))
;
2296 /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2297 * so it lives from the very beginning of the graph life-cycle and ends here). */
2298 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2299 {
2300 for (j = 0; j < source_size; j++)
2301 {
2302 // If the source is connecting to the current node, add it (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2303 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2304 if (cell.i32 && cell.i32[0] > 0)
2305 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2306 }
2307 /* If this is a read-only (based on SSA, if first encountered as read), and this is
2308 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2309 * loop, however, in that case, you need to prevent read-only gets reused for the
2310 * output tensor, which is not obvious how to implement correctly), and it is not
2311 * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2312 * of memory anyway (because on second loop, we want to read the same value out).
2313 * Mark it to the end of the graph. */
2314 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2315 for (j = 0; j < destination_size; j++)
2316 {
2317 // If the destination is connecting to the current node, add it (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2318 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2319 if (cell.i32 && cell.i32[0] > 0)
2320 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2321 }
2322 }
2323 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2324 }
2325 for (i = 0; i < node->output_size; i++)
2326 {
2327 int d = node->outputs[i];
2328 if (d < 0)
2329 continue;
2330 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2331 d = tensor_symbol_info[d].alias_ref - 1;
2332 tensor_blocks[d].flags |= WRITE_ONLY;
2333 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2334 continue;
2335 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2335, __extension__ __PRETTY_FUNCTION__
); }))
;
2336 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2337 }
2338 } ccv_nnc_graph_visit_endfor} }
2339 // For any assign_ref, its life-time is kept until the end and wraps over.
2340 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2341 // If this tensor is not unassigned (or an alias) and it is assigned from somewhere else,
2342 // that "somewhere else" needs to keep its life-time till the end.
2343 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2344 p_node_info && tensor_symbol_info[i].assign_ref)
2345 {
2346 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2347 for (j = 0; j < destination_size; j++)
2348 {
2349 // This logic is meant to be more conservative about which destination we add to.
2350 // As of now, adding everything would most likely be fine. However, doing so
2351 // naively may cause issues in the future. Thus, instead, we only add the
2352 // destination iff either the tensor is not used at all, or the destination is
2353 // on the same stream as the tensor block in some way.
2354 int flag = !tensor_blocks[assign_ref].tail;
2355 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2356 {
2357 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2358 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2359 flag = (cell.i32 && cell.i32[0] > 0);
2360 }
2361 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2362 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2363 }
2364 }
2365 for (i = 0; i < output_size; i++)
2366 {
2367 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2367, __extension__ __PRETTY_FUNCTION__); }))
;
2368 int d = outputs[i].d;
2369 if (d < 0)
2370 continue;
2371 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2372 d = tensor_symbol_info[d].alias_ref - 1;
2373 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2374 continue;
2375 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2375, __extension__ __PRETTY_FUNCTION__
); }))
;
2376 for (j = 0; j < destination_size; j++)
2377 {
2378 int flag = !tensor_blocks[d].tail;
2379 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2380 {
2381 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2382 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2383 flag = (cell.i32 && cell.i32[0] > 0);
2384 }
2385 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2386 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2387 }
2388 }
2389 // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2390 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2391 int x, y;
2392 for (x = 0; x < node->input_size; x++)
2393 for (y = 0; y < node->output_size; y++)
2394 /* Some operations enforces some tensors to be the same for inputs / outputs. */
2395 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2396 {
2397 // If both unassigned, it is fine.
2398 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2399 continue;
2400 int ref = node->inputs[x];
2401 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2401, __extension__ __PRETTY_FUNCTION__); }))
;
2402 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2403 ref = tensor_blocks[ref].ref - 1;
2404 const int node_output_y = node->outputs[y];
2405 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2405, __extension__ __PRETTY_FUNCTION__
); }))
;
2406 // If both are not computable, it is fine, we don't need to enforce.
2407 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2408 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2409 continue;
2410 // Otherwise, enforce and error out if failed.
2411 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2412 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2412, __extension__ __PRETTY_FUNCTION__
); }))
; }
2413 }
2414 } ccv_nnc_graph_visit_endfor} }
2415 // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2416 // we need to make sure enforced tensors are properly assigned, so that we don't bind to a tensor
2417 // that is not enforced in-place (because the tensor enforced in-place will be different from the
2418 // binding one).
2419 for (i = 0; i < tensor_bind_size; i++)
2420 {
2421 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2422 // If there is a tensor bound, then it is unassigned.
2423 if (resolved_symbol.d >= 0)
2424 {
2425 int d = resolved_symbol.d;
2426 // I cannot assert too much at this moment.
2427 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2428 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2429 // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
2430 // It has nothing to do with aliases.
2431 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2432 d = tensor_blocks[d].ref - 1;
2433 // Doesn't work if this is a loop-carried variable.
2434 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2434, __extension__ __PRETTY_FUNCTION__); }))
;
2435 tensor_blocks[d].flags = UNASSIGNED;
2436 tensor_blocks[d].ref = 0; // No need to have ref as well.
2437 }
2438 }
2439 // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and the start / end tensors match).
2440 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2441 int x, y;
2442 for (x = 0; x < node->input_size; x++)
2443 {
2444 /* If the input is not assigned, it can be referenced, find the referenced one */
2445 int ref = node->inputs[x];
2446 if (ref < 0)
2447 continue;
2448 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2449 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2450 ref = tensor_blocks[ref].ref - 1;
2451 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2451, __extension__ __PRETTY_FUNCTION__
); }))
;
2452 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2453 tensor_blocks[ref].tail->rnum == 1)
2454 {
2455 for (y = 0; y < node->output_size; y++)
2456 /* Only proceed if the input symbol is different from the output symbol, */
2457 /* and the input symbol meets the output symbol exactly at the same spot. */
2458 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2459 node->outputs[y] >= 0 &&
2460 ref != node->outputs[y] &&
2461 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2462 {
2463 const int node_output_y = node->outputs[y];
2464 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2465 /* If the dimensions match perfectly, then we can assign y_symbol to x.
2466 * If both of them are aliases, make sure their origins match in size too. */
2467 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2468 {
2469 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2470 // This refers to an alias itself; mark it now and it will be processed later.
2471 if (ref != node->inputs[x])
2472 tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2473 }
2474 }
2475 }
2476 }
2477 } ccv_nnc_graph_visit_endfor} }
2478 // Specifically handle the bypass. This needs to be done after the first pass.
2479 // I need to extend the bypass's life-time to the same as the one I am going with.
2480 // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2481 ccv_nnc_tensor_block_t empty_block = {};
2482 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2483 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2484 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2485 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2486 {
2487 int can_bypass = 1;
2488 for (i = 0; can_bypass && i < node->output_size; i++)
2489 {
2490 int d = node->outputs[i];
2491 if (d < 0)
2492 continue;
2493 if (!tensor_blocks[d].bypass_ref)
2494 continue;
2495 while (tensor_blocks[d].ref)
2496 d = tensor_blocks[d].ref - 1;
2497 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2498 while (tensor_blocks[bypass_ref].ref)
2499 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2500 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2501 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2502 continue;
2503 ccv_array_clear(empty_block.head);
2504 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2505 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2506 ccv_array_clear(empty_block.tail);
2507 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2508 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2509 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2510 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2511 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2512 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2513 // It can only be unfoldable due to the while loop constraint. Check whether this satisfies the while loop constraint.
2514 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2514, __extension__ __PRETTY_FUNCTION__
); }))
;
2515 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2516 while (tensor_blocks[b_ref].ref)
2517 b_ref = tensor_blocks[b_ref].ref - 1;
2518 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2519 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2520 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2521 // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2522 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2523 }
2524 if (can_bypass)
2525 {
2526 for (i = 0; i < node->output_size; i++)
2527 {
2528 int d = node->outputs[i];
2529 if (d < 0)
2530 continue;
2531 if (!tensor_blocks[d].bypass_ref)
2532 continue;
2533 while (tensor_blocks[d].ref)
2534 d = tensor_blocks[d].ref - 1;
2535 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2536 while (tensor_blocks[bypass_ref].ref)
2537 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2538 // The bypass_ref can extend its life-time.
2539 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2540 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2541 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2542 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2543 }
2544 } else {
2545 for (i = 0; i < node->output_size; i++)
2546 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2547 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2548 // Mark this exec as no bypass IO (thus, I need to insert explicit data transfers).
2549 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2550 }
2551 }
2552 } ccv_nnc_graph_visit_endfor
2553 ccv_array_free(empty_block.head);
2554 ccv_array_free(empty_block.tail);
2555 *r_exec_dep = exec_dep;
2556 *r_tensor_blocks = tensor_blocks;
2557}
2558
2559static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2560{
2561 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2562 {
2563 ccv_nnc_cmd_t retval = cmd;
2564 retval.cmd = CCV_NNC_NOOP;
2565 return retval;
2566 }
2567 return cmd;
2568}
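// A minimal sketch (not part of the original file): the callback above is handed to
// ccv_nnc_symbolic_graph_dup later in this file (see line 2866), so that duplicated
// while/case-of sub-graph nodes degrade to noops. The hypothetical helper below,
// toy_apply_subst, only illustrates how such a per-node substitution hook is applied;
// it is not the real dup implementation.
static ccv_nnc_cmd_t toy_apply_subst(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd,
	ccv_nnc_cmd_t (*subst)(const ccv_nnc_graph_exec_symbol_t, const ccv_nnc_cmd_t))
{
	// With no hook installed the command is copied verbatim; otherwise the hook decides.
	return subst ? subst(symbol, cmd) : cmd;
}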
2569
2570static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2571{
2572 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2573 {
2574 if (tensor_symbol_info[input].alias_ref)
2575 {
2576 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2577 assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2578 ccv_nnc_tensor_symbol_t tensor_symbol = {};
2579 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2580 {
2581 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2582 if (tensor_symbol_info[alias_ref].pair_ref)
2583 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2584 .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2585 .graph = dup_graph->pair
2586 });
2587 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2588 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2589 } else {
2590 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2591 tensor_symbol.graph = dup_graph;
2592 }
2593 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2594 if (tensor_symbol_info[input].pair_ref)
2595 ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2596 .d = tensor_symbol_info[input].pair_ref - 1,
2597 .graph = dup_graph->pair
2598 });
2599 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2600 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2601 } else {
2602 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2603 if (tensor_symbol_info[input].pair_ref)
2604 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2605 .d = tensor_symbol_info[input].pair_ref - 1,
2606 .graph = dup_graph->pair
2607 });
2608 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2609 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2610 }
2611 if (tensor_symbol_info[input].bypass_ref)
2612 {
2613 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2614 assert(dup_bypass_ref >= 0);
2615 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2616 symbol_info->bypass_ref = dup_bypass_ref + 1;
2617 }
2618 }
2619 return (ccv_nnc_tensor_symbol_t) {
2620 .d = dup_tensor_block_ref[input * unroll_count],
2621 .graph = dup_graph,
2622 };
2623}
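// A minimal, self-contained sketch (hypothetical toy_* names) of the 1-based reference
// convention the surrounding code relies on: ref fields store "index + 1" so that 0 means
// "no reference", and chains are followed with the `while (ref) idx = ref - 1` pattern.
typedef struct {
	int ref; // 1-based index of the block this one folds into; 0 means none.
} toy_block_t;

static int toy_resolve_ref(const toy_block_t* const blocks, int idx)
{
	// Chase the chain to its root, converting back to 0-based indices at each hop.
	while (blocks[idx].ref)
		idx = blocks[idx].ref - 1;
	return idx;
}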
2624
2625static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2626{
2627 int i;
2628 if (dup_exec_ref[idx * unroll_count] < 0)
2629 {
2630 // Input has to come before output, because output could have a bypass reference to the input.
2631 for (i = 0; i < node->input_size; i++)
2632 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2633 for (i = 0; i < node->output_size; i++)
2634 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2635 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2636 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2637 }
2638 return (ccv_nnc_graph_exec_symbol_t) {
2639 .d = dup_exec_ref[idx * unroll_count],
2640 .graph = dup_graph,
2641 };
2642}
2643
2644static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2645{
2646 int i;
2647 for (i = 0; i < tensor_block_size; i++)
2648 {
2649 if (tensor_blocks[i].head)
2650 ccv_array_free(tensor_blocks[i].head);
2651 if (tensor_blocks[i].tail)
2652 ccv_array_free(tensor_blocks[i].tail);
2653 if (tensor_blocks[i].r_refs)
2654 ccv_array_free(tensor_blocks[i].r_refs);
2655 if (tensor_blocks[i].dup_p_refs)
2656 ccv_array_free(tensor_blocks[i].dup_p_refs);
2657 }
2658 ccfree(tensor_blocks);
2659}
2660
2661// Find tensors that cannot be solved by co-allocating to the same location.
2662static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2663{
2664 int i, j, unroll_count = 0;
2665 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2666 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_symbol_info[i].assign_ref)
2667 {
2668 // This is a parameter, thus, it has to be either an alias or used.
2669 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2670 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2671 // The parameter it assign to has to be either an alias or used.
2672 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2673 // If either of these two (assigner and assignee) is an alias, check to see if they are the same.
2674 // If it is the same, we are good, no need to extend.
2675 int a_ref = i;
2676 while (tensor_blocks[a_ref].ref)
2677 a_ref = tensor_blocks[a_ref].ref - 1;
2678 int b_ref = assign_ref;
2679 while (tensor_blocks[b_ref].ref)
2680 b_ref = tensor_blocks[b_ref].ref - 1;
2681 if (a_ref != b_ref)
2682 {
2683 // If any of the b's head is deterministically later than a's tail
2684 // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2685 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2686 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2687 // It cannot be that both i can hop to j and j can hop to i.
2688 assert(!(a_hop_b > 0 && b_hop_a > 0));
2689 // Can it be folded
2690 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2691 if (a_hop_b || b_hop_a)
2692 {
2693 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2694 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2695 continue;
2696 }
2697 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2698 for (j = 0; c_ref >= 0; j++)
2699 {
2700 while (tensor_blocks[c_ref].ref)
2701 c_ref = tensor_blocks[c_ref].ref - 1;
2702 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2703 }
2704 unroll_count = ccv_max(unroll_count, j + 1);
2705 }
2706 }
2707 // Reset companion_ref if need to unroll.
2708 if (unroll_count)
2709 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2710 tensor_blocks[j].companion_ref = 0;
2711 return unroll_count;
2712}
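// A minimal, self-contained sketch (hypothetical toy_* names, not the library API) of the
// "hop" test used above: two tensor blocks may share one allocation only when one block's
// head comes strictly after the other block's tail in execution order, i.e. their
// life-times do not interfere.
typedef struct {
	int head; // First execution index where the block is alive.
	int tail; // Last execution index where the block is alive.
} toy_interval_t;

static int toy_head_after_tail(const toy_interval_t a, const toy_interval_t b)
{
	return a.head > b.tail; // a starts only after b has ended.
}

static int toy_can_share(const toy_interval_t a, const toy_interval_t b)
{
	// Either ordering is fine; only overlapping life-times forbid folding.
	return toy_head_after_tail(a, b) || toy_head_after_tail(b, a);
}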
2713
2714static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2715{
2716 int i, j, n;
2717 // The inout exec nodes, these are the nodes we are going to extend.
2718 uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2719 int max_input_size = 0;
2720 int max_output_size = 0;
2721 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2722 {
2723 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2724 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2725 }
2726 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2727 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2728 // Doing graph expansion
2729 // It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2730 assert(dup_graph->exec_symbol_info->rnum > 0);
2731 assert(dup_graph->tensor_symbol_info->rnum > 0);
2732#define INCOMING_NODE (1)
2733#define OUTGOING_NODE (2)
2734 // Unroll the graph n times.
2735 for (n = 0; n < unroll_count; n++)
2736 {
2737 int* const dup_exec_ref = r_dup_exec_ref + n;
2738 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2739 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2740 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2741 dup_exec_ref[i * unroll_count] = -1;
2742 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2743 {
2744 // If there is an assign_ref, that means I don't need to dup the tensor.
2745 if (tensor_symbol_info[i].assign_ref)
2746 {
2747 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2748 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2749 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
2750 // If this is a read-only tensor block, no need to duplicate because the value never changes
2751 // (note we handled assign_ref first), therefore, no need to generate duplicate.
2752 dup_tensor_block_ref[i * unroll_count] = i;
2753 else
2754 dup_tensor_block_ref[i * unroll_count] = -1;
2755 }
2756 // Go through the original graph, make copies of the node if it is inout.
2757 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2758 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2759 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2760 if (!node->outgoings)
2761 continue;
2762 for (i = 0; i < node->outgoings->rnum; i++)
2763 {
2764 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2765 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2766 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2767 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2768 }
2769 } ccv_nnc_graph_visit_endfor
2770 // Check that the visited nodes are all marked as either incoming or outgoing.
2771 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2772 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2773 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2774 {
2775 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2776 continue;
2777 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2778 // If this is pure incoming nodes, then I need to concat this one with all original destination node
2779 if (inout[i] == INCOMING_NODE)
2780 for (j = 0; j < dup_destination_size; j++)
2781 {
2782 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2783 .d = dup_destinations[j].d,
2784 .graph = dup_graph,
2785 }, (ccv_nnc_graph_exec_symbol_t) {
2786 .d = dup_exec_ref[i * unroll_count],
2787 .graph = dup_graph,
2788 });
2789 }
2790 }
2791 if (dup_graph->destinations)
2792 ccv_array_clear(dup_graph->destinations);
2793 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2794 {
2795 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2796 continue;
2797 const int d = dup_exec_ref[i * unroll_count];
2798 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2799 // If this has no outgoing node, add to the destination.
2800 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2801 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2802 .graph = dup_graph,
2803 .d = d,
2804 });
2805 }
2806 }
2807#undef INCOMING_NODE
2808#undef OUTGOING_NODE
2809 ccfree(inout);
2810}
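// A minimal sketch (hypothetical toy_* names) of the dup_exec_ref / dup_tensor_block_ref
// layout the unroll code above fills in: a flat table of rnum * unroll_count slots, where
// slot [i * unroll_count + n] holds the 0-based symbol index of the n-th duplicate of
// symbol i in the duplicated graph, or -1 if that duplicate was never created.
static void toy_dup_ref_init(int* const dup_ref, const int rnum, const int unroll_count)
{
	int i;
	for (i = 0; i < rnum * unroll_count; i++)
		dup_ref[i] = -1; // -1 marks "no duplicate yet".
}

static int toy_dup_ref(const int* const dup_ref, const int unroll_count, const int i, const int n)
{
	return dup_ref[i * unroll_count + n]; // n-th duplicate of symbol i, or -1.
}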
2811
2812static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2813{
2814 int i;
2815 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2816 // Now can assign them (The dup) as companion.
2817 // Get to the last one, which we will wrap over.
2818 if (dup_tensor_symbol_info[i].assign_ref)
2819 {
2820 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2821 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2822 assert(dup_tensor_symbol_info[i].assign_ref);
2823 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2824 }
2825}
2826
2827// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2828// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2829// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2830static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2831{
2832 int i, j, k;
2833 for (i = 0; i < p_node_info->output_size; i++)
2834 {
2835 const int d = p_node_info->outputs[i];
2836 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2837 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2838 continue;
2839 for (k = 0; k < destination_size; k++)
2840 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2841 // Add the duplicated destinations to the tensor_block_ref.
2842 for (j = 0; j < unroll_count; j++)
2843 for (k = 0; k < destination_size; k++)
2844 {
2845 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2846 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2847 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2848 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2849 }
2850 }
2851}
2852
2853static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2854{
2855 int i, j;
2856 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2857 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2858 // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2859 // Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2860 // No need to change anything, we are good.
2861 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2862 if (!unroll_count)
2863 return;
2864 // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2865 // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2866 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2867 int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2868 int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2869 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2870 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2871 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2872 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2873 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2874 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2875 // Free out the old exec_dep
2876 ccv_matrix_free(exec_dep);
2877 // and the tensor blocks, prepare for the new.
2878 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2879 // A reverse map to find where the original tensor comes from.
2880 int* dup_tensor_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2881 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2882 dup_tensor_from_ref[i] = -1;
2883 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2884 for (j = 0; j < unroll_count; j++)
2885 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2886 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2887 int* dup_exec_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2888 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2889 dup_exec_from_ref[i] = -1;
2890 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2891 {
2892 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2893 continue;
2894 dup_exec_from_ref[i] = i; // Reference back.
2895 for (j = 0; j < unroll_count; j++)
2896 if (dup_exec_ref[i * unroll_count + j] >= 0)
2897 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2898 }
2899 // Reset all attr.
2900 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2901 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2902 ccv_nnc_graph_visit_free(dup_visit);
2903 ccfree(dup_exec_symbol_info);
2904 ccfree(dup_exec_from_ref);
2905 ccfree(dup_tensor_from_ref);
2906 // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2907 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2908 // Loop over all possible duplications to assign dup_p_ref properly.
2909 for (j = 0; j < unroll_count; j++)
2910 {
2911 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2912 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2913 {
2914 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2915 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2916 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2917 {
2918 if (!tensor_blocks[dup_idx].dup_p_refs)
2919 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2920 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2921 }
2922 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2923 continue;
2924 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2925 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2926 if (p_ref_1_is_in_or_out == 1)
2927 {
2928 if (!tensor_blocks[dup_idx].dup_p_refs)
2929 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2930 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2931 }
2932 }
2933 }
2934 // companion_ref
2935 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2936 // Now can assign them (The dup) as companion.
2937 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && dup_tensor_symbol_info[i].assign_ref)
2938 {
2939 // Get to the last one, which we will wrap over.
2940 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2941 if (assign_ref >= 0)
2942 {
2943 int b_ref = assign_ref;
2944 while (tensor_blocks[b_ref].ref)
2945 b_ref = tensor_blocks[b_ref].ref - 1;
2946 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2947 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2948 // It cannot be that both i can hop to j and j can hop to i.
2949 // And it can hop from one to the other now after duplication.
2950 assert(a_hop_b > 0 || b_hop_a > 0);
2951 tensor_blocks[i].companion_ref = b_ref + 1;
2952 tensor_blocks[b_ref].companion_ref = i + 1;
2953 }
2954 }
2955 ccfree(dup_tensor_symbol_info);
2956 // Extend the dup tensor block ref, prepare for future extensions.
2957 dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2958 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2959 dup_tensor_block_ref[i] = -1;
2960 // Assign out changed properties.
2961 *r_exec_dep = exec_dep;
2962 *r_tensor_blocks = tensor_blocks;
2963 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2964 *r_dup_graph = dup_graph;
2965 *r_unroll_count = unroll_count;
2966 *r_dup_exec_ref = dup_exec_ref;
2967 *r_dup_tensor_block_ref = dup_tensor_block_ref;
2968}
2969
2970static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2971{
2972 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2973 return tensor_block_size;
2974 int i;
2975 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2976 int found_idx = tensor_block_size;
2977 for (i = 0; i < anonymous_block_free_list_cap; i++)
2978 {
2979 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2980 assert(idx < tensor_block_size);
2981 // If the type doesn't match, ignore.
2982 if (tensor_blocks[idx].type != type)
2983 continue;
2984 // Heuristic about how to select the best tensor block to move forward.
2985 // If the size is large enough and no dup_p_refs are requested, I cannot do better than this, just return directly.
2986 if (tensor_blocks[idx].size >= size)
2987 {
2988 if (no_dup_p_refs)
2989 return idx;
2990 // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
2991 // then we cannot do better than this, if that is the case, just return.
2992 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2993 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2994 return idx;
2995 }
2996 int64_t found_idx_size_diff;
2997 int64_t idx_size_diff;
2998 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2999 // Now, compare whether this one or the found_idx one is better.
3000 // At this point, there is no point of comparing the dup_p_refs, we only care about which one
3001 // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3002 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3003 {
3004 found_idx = idx;
3005 continue;
3006 }
3007 // No need to update if found_idx is better than idx.
3008 if (found_idx_size_diff > idx_size_diff)
3009 continue;
3010 // We bias towards the bigger one in case of similar.
3011 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3012 {
3013 found_idx = idx;
3014 continue;
3015 }
3016 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
3017 // On a tie, check which one has tighter life-cycle.
3018 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3019 {
3020 // Check whether the current tensor blocks life-cycle is longer than the previous one.
3021 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3022 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3023 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3024 found_idx = idx;
3025 continue;
3026 }
3027 // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
3028 // We prefer to choose the one that has life-cycle closer to the expected ones.
3029 if (no_dup_p_refs)
3030 {
3031 // Whoever is shorter wins.
3032 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3033 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3034 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3035 found_idx = idx;
3036 continue;
3037 }
3038 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3039 continue;
3040 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3041 {
3042 found_idx = idx;
3043 continue;
3044 }
3045 // If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3046 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3047 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3048 if (idx_after_request && found_idx_after_request)
3049 {
3050 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3051 found_idx = idx;
3052 continue;
3053 } else {
3054 // If we entered this branch, then either idx_after_request is false or found_idx_after_request is false, or both.
3055 // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3056 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3057 if (!found_idx_after_request && (idx_after_request ||
3058 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3059 found_idx = idx;
3060 continue;
3061 }
3062 }
3063 return found_idx;
3064}
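// A minimal sketch (hypothetical toy_* names) of the primary size heuristic used above:
// among free-list candidates of the right type, prefer the block whose size is closest to
// the requested size, and break exact ties toward the larger block. uint64_t is already
// available through the headers this file includes.
static uint64_t toy_size_diff(const uint64_t a, const uint64_t b)
{
	return a > b ? a - b : b - a; // Absolute difference without signed overflow.
}

static int toy_better_fit(const uint64_t candidate, const uint64_t current_best, const uint64_t request)
{
	const uint64_t candidate_diff = toy_size_diff(candidate, request);
	const uint64_t best_diff = toy_size_diff(current_best, request);
	if (candidate_diff != best_diff)
		return candidate_diff < best_diff; // Closer to the requested size wins.
	return candidate > current_best; // On an exact tie, bias toward the bigger block.
}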
3065
3066static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3067{
3068 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3069 return 0;
3070 int i, j, k;
3071 int input_size = 0;
3072 for (i = 0; i < p_node_info->p_while.input_size; i++)
3073 if (p_node_info->p_while.inputs[i] >= 0)
3074 ++input_size;
3075 // If it doesn't have tensor inputs (thus, only special inputs), just return.
3076 if (!input_size)
3077 return 0;
3078 ccv_nnc_tensor_symbol_t inputs[input_size];
3079 input_size = 0;
3080 for (i = 0; i < p_node_info->p_while.input_size; i++)
3081 if (p_node_info->p_while.inputs[i] >= 0)
3082 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3083 .d = p_node_info->p_while.inputs[i],
3084 .graph = symbolic_graph,
3085 };
3086 assert(symbolic_graph->breakpoint_size > 0);
3087 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3088 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3089 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3090 {
3091 // Make a noop copy of the breakpoint, but with some tensor inputs.
3092 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
3093 ccv_array_push(dup_breakpoints, &noop);
3094 // Connect this noop to the outgoing nodes of breakpoints.
3095 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
3096 if (symbol_info->outgoings)
3097 for (j = 0; j < symbol_info->outgoings->rnum; j++)
3098 {
3099 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3100 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3101 .d = d,
3102 .graph = symbolic_graph,
3103 });
3104 }
3105 }
3106 for (i = 0; i < exec_symbol_info_size; i++)
3107 {
3108 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
3109 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
3110 continue;
3111 if (symbol_info->outgoings)
3112 {
3113 const int outgoing_size = symbol_info->outgoings->rnum;
3114 for (j = 0; j < outgoing_size; j++)
3115 {
3116 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3117 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3118 if (d == symbolic_graph->breakpoints[k].d)
3119 {
3120 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
3121 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3122 .d = i,
3123 .graph = symbolic_graph,
3124 }, noop);
3125 // Found, connected, exit.
3126 break;
3127 }
3128 }
3129 }
3130 }
3131 // Add the dup_breakpoints to source if necessary.
3132 assert(symbolic_graph->sources);
3133 const int source_size = symbolic_graph->sources->rnum;
3134 for (i = 0; i < source_size; i++)
3135 {
3136 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3137 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3138 if (d == symbolic_graph->breakpoints[j].d)
3139 {
3140 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3141 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3142 // Found, made, exit.
3143 break;
3144 }
3145 }
3146 // Add the dup_breakpoints to destination if necessary.
3147 assert(symbolic_graph->destinations);
3148 const int destination_size = symbolic_graph->destinations->rnum;
3149 for (i = 0; i < destination_size; i++)
3150 {
3151 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3152 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3153 if (d == symbolic_graph->breakpoints[j].d)
3154 {
3155 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3156 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3157 // Found, made, exit.
3158 break;
3159 }
3160 }
3161 return dup_breakpoints;
3162}
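// Editor's note: an illustrative sketch, not part of the original source. Assuming a single
// breakpoint B inside the while sub-graph, with predecessor P and successor N, the helper above
// rewires the graph roughly as follows:
//
//   before:  P -> B -> N
//   after:   P -> B -> N
//            P -> noop(p_while inputs) -> N
//
// The noop carries the parent while-loop's input tensor symbols so that their liveness covers the
// breakpoint evaluation; if B was listed as a source or destination of the sub-graph, the noop is
// added alongside it, mirroring the source / destination loops above.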
3163
3164// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3165static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3166{
3167 assert(source_size > 0);
3168 assert(destination_size > 0);
3169 // First, fill all the "auto" holes.
3170 // This is the symbol table that with "auto" info filled up.
3171 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3172 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3173 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3174 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3175 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3176 int i, j, k, p, q;
3177 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3178 ccv_sparse_matrix_t* exec_dep;
3179 ccv_nnc_tensor_block_t* tensor_blocks;
3180 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3181 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3182 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3183 // are automatically filled in, and all the sub-graphs are processed.
3184 // There is a last step though, for a while loop, it is parameterized:
3185 // while (x > 5) {
3186 // y = x + 1;
3187 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3188 // we will do our best to avoid to do the actual data copy, what we do here is to check whether y can be x's alias.
3189 // If y can be x's alias, this is good, no other changes required. In above case, y can be x's alias because
3190 // it is an inplace operation.
3191 // But if y cannot be x's alias, for example, this while loop looks like this:
3192 // while (x > 5) {
3193 // y = x + a
3194 // b = x + y
3195 // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3196 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3197 // has dependency on y as well).
3198 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3199 // y = x + a -> b = x + y
3200 // This graph will be extended to look like this:
3201 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3202 // while (x0 > 5) {
3203 // y0 = x0 + a0
3204 // b0 = x0 + y0
3205 // if (y0 > 5) break
3206 // y1 = y0 + b0
3207 // b1 = y0 + y1
3208 // } (y1 => x0, b1 => a0)
3209 // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3210 // with each other now).
3211 // With this algorithm, we don't need to insert any data copy logic, the only thing need is to switch pointers
3212 // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
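/* Editor's note: a minimal standalone sketch (not part of this file) of the pointer-switching idea
 * described above, using plain C arrays in place of tensors; all names below are hypothetical.
 * Once the body is unrolled, each parameterized tensor only needs two physical regions whose roles
 * swap between iterations, so the (y => x) carry-over never copies data, which is the effect the
 * tensor_multiview_t construct achieves by switching pointers. */
#if 0
	enum { LEN = 4 };
	float buf0[LEN] = { 9, 9, 9, 9 }, buf1[LEN] = { 0 };
	float* x = buf0; /* x currently lives in buf0 */
	float* y = buf1; /* y currently lives in buf1 */
	while (x[0] > 5) {
		for (int i = 0; i < LEN; i++)
			y[i] = x[i] - 1; /* the loop body, here simply y = x - 1 */
		float* const t = x; /* (y => x): swap the pointers instead of copying the data over */
		x = y;
		y = t;
	}
#endif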
3213 ccv_nnc_symbolic_graph_t* dup_graph = 0;
3214 int* dup_exec_ref = 0;
3215 int* dup_tensor_block_ref = 0;
3216 int unroll_count = 0;
3217 // In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
3218 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3219 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3220 prep->flags = 0;
3221 // Cannot handle dup a node that is a graph as well.
3222 if (p_exec_symbol_info)
3223 {
3224 prep->flags = p_node_info->flags;
3225 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3226 {
3227 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3228 _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3229 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3230 // TODO: We want to try our best to fit as many of its corresponding inputs / outputs as possible into the companion_ref group.
3231 }
3232 }
3233 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3234 ccv_array_t* anonymous_block_free_list = 0;
3235 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3236 // Record whether this tensor is folded in this round.
3237 uint32_t* const tensor_fold = (uint32_t*)ccmalloc(sizeof(uint32_t) * tensor_fold_size);
3238 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){
3239 for (p = 0; p < node->graph_ref_size; p++)
3240 {
3241 assert(symbolic_graph->sub_graphs);
3242 ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3243 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3244 ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3245 sub_prep->dup_breakpoints = dup_breakpoints;
3246 sub_prep->p = prep;
3247 sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3248 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3249 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3250 for (i = 0; i < s_alloc_prep->block_size; i++)
3251 {
3252 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3253 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3254 if (block_ref < sub_prep->tensor_symbol_info_size)
3255 {
3256 // If this block has a bypass, and its bypass has different p_refs, then it doesn't matter.
3257 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3258 if (s_tensor_blocks[block_ref].bypass_ref)
3259 {
3260 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3261 while (s_tensor_blocks[bypass_ref].ref)
3262 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3263 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3264 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3265 continue;
3266 }
3267 if (s_tensor_blocks[block_ref].p_refs[0])
3268 {
3269 /* If it is already properly assigned, next. */
3270 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3271 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3272 {
3273 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3274 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3275 else {
3276 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3277 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3278 }
3279 }
3280 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3281 if (s_tensor_blocks[block_ref].p_refs[1] &&
3282 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3283 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3284 {
3285 assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3286 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3287 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3288 }
3289 }
3290 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3291 /* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3292 * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3293 * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3294 * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3295 * its life-time to the end of the output tensor. */
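/* Editor's note (illustration, not in the original source): for example, if a sub-graph's anonymous
 * workspace buffer ends up holding data that feeds the parent node's output tensor y, then dup_p_ref
 * records y; merging it into the buffer's dup_p_refs below lets the parent later extend the
 * corresponding anonymous block's life-time to wherever y is last used, instead of ending it at this
 * sub-graph node. */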
3296 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3297 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3298 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3299 ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3300 }
3301 }
3302 }
3303 const int init_tensor_block_size = tensor_block_size;
3304 int rw_anonymous_buffer_size_cap = 0;
3305 int ro_anonymous_buffer_size_cap = 0;
3306 if (anonymous_block_free_list)
3307 ccv_array_clear(anonymous_block_free_list);
3308 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3309 for (p = 0; p < node->graph_ref_size; p++)
3310 {
3311 ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3312 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3313 int rw_anonymous_buffer_size = 0;
3314 int ro_anonymous_buffer_size = 0;
3315 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3316 if (s_alloc_prep->buffers[i].p_refs[0])
3317 {
3318 /* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3319 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3320 /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3321 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3322 assert(p_ref_0_is_in_or_out != 0);
3323 int unref_p_ref_0 = p_ref_0;
3324 while (tensor_blocks[unref_p_ref_0].ref)
3325 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3326 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3327 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3328 if (s_alloc_prep->buffers[i].p_refs[1])
3329 {
3330 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3331 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3332 assert(p_ref_1_is_in_or_out != 0);
3333 int unref_p_ref_1 = p_ref_1;
3334 while (tensor_blocks[unref_p_ref_1].ref)
3335 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3336 /* See above comment for the similar p_ref_0 check. */
3337 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3338 assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3339 int p_ref_t;
3340 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3341 {
3342 CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3343 CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3344 }
3345 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3346 /* If the dimensions match, we can fold. TODO: should the dimensions match perfectly here? */
3347 if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3348 {
3349 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3350 if (folded)
3351 {
3352 p_ref_0 = p_ref_1;
3353 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3354 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3355 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3356 {
3357 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3358 assert(folded && "the subsequent duplicates can be folded too.");
3359 }
3360 }
3361 }
3362 }
3363 /* Only proceed if it is folded here (thus, the input / output tensor can be connected and reuse is not a problem).
3364 * Or if p_ref_0 is the output, it first starts from this node (thus, I have full control over
3365 * its life-cycle). Or if p_ref_0 is the input, it ends in this node (thus, I can take over its
3366 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere else, I cannot change the content
3367 * within its memory region)). Or, if this buffer is used as read-only and we don't have any output
3368 * associated with it, then we are also good. */
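/* Editor's note (added summary): the four OR'ed cases tested below are, in order:
 * 1. the fold bit for unref_p_ref_0 is set (the blocks were just folded above);
 * 2. p_ref_0 is an output and its block's head is this node (it first comes alive here);
 * 3. p_ref_0 is an input and its block's tail is this node (it dies here);
 * 4. the buffer is READ_ONLY (and, per the assert below, has no second p_ref). */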
3369 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3370 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3371 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3372 TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3373 {
3374 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3375 { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3376 /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3377 * is a long argument why that is the case, the digest is, it is much easier to control your output
3378 * than your input). */
3379 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3380 s_alloc_prep->buffers[i].p_refs[1] = 0;
3381 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3382 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3383 tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3384 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3385 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3386 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3387 tensor_blocks[unref_p_ref_0].size;
3388 } else {
3389 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3390 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3391 ++ro_anonymous_buffer_size;
3392 else
3393 rw_anonymous_buffer_size += unroll_count + 1;
3394 }
3395 } else {
3396 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3397 ++ro_anonymous_buffer_size;
3398 else
3399 rw_anonymous_buffer_size += unroll_count + 1;
3400 }
3401 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3402 {
3403 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3404 // All read-write buffer (potentially) can be reused between each case..of branch.
3405 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3406 // Read-only buffer cannot be reused between each case..of branch.
3407 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3408 /* Anonymous block, allocate additional tensor blocks for this. */
3409 /* This is either because this is an internal tensor (don't have p_ref) */
3410 /* or it is an anonymous block itself within the sub graphs of this while graph. */
3411 tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3412 memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3413 if (dup_tensor_block_ref)
3414 dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3415 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3416 if (!s_alloc_prep->buffers[i].p_refs[0])
3417 {
3418 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3419 {
3420 assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3421 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3422 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3423 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3424 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3425 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3426 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3427 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3428 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3429 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3430 if (dup_p_refs && dup_p_refs->rnum > 0)
3431 {
3432 for (j = 0; j < dup_p_refs->rnum; j++)
3433 {
3434 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3435 assert(dup_p_ref >= 0);
3436 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3437 assert(tensor_blocks[dup_p_ref].tail);
3438 // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3439 // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object.
3440 if (tensor_symbol_info[dup_p_ref].p_ref)
3441 {
3442 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3443 assert(p_node_info);
3444 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3445 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3446 {
3447 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3448 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3449 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3450 }
3451 }
3452 if (!tensor_blocks[tensor_block_size].tail)
3453 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3454 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3455 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3456 }
3457 } else {
3458 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3459 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3460 }
3461 for (j = 0; j < source_size; j++)
3462 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3463 /* If this is read-only (based on SSA, i.e., first encountered as a read), and this is a
3464 * sub-graph, mark its life-time to extend to the end of the graph. */
3465 if (p_exec_symbol_info)
3466 for (j = 0; j < destination_size; j++)
3467 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3468 /* If it is read-only, it is self-reflecting. */
3469 for (k = 0; k < unroll_count; k++)
3470 {
3471 for (j = 0; j < destination_size; j++)
3472 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3473 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3474 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3475 assert(symbolic_graph->p);
3476 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3477 }
3478 ++tensor_block_size;
3479 } else {
3480 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3481 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3482 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3483 // Find suitable tensor block from the free list.
3484 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3485 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3486 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3487 if (new_anonymous_tensor_block)
3488 {
3489 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3490 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3491 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3492 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3493 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3494 } else {
3495 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3496 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3497 }
3498 if (dup_p_refs && dup_p_refs->rnum > 0)
3499 {
3500 for (j = 0; j < dup_p_refs->rnum; j++)
3501 {
3502 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3503 assert(dup_p_ref >= 0);
3504 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3505 // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3506 // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object.
3507 if (tensor_symbol_info[dup_p_ref].p_ref)
3508 {
3509 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3510 assert(p_node_info);
3511 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3512 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3513 {
3514 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3515 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3516 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3517 }
3518 }
3519 assert(tensor_blocks[dup_p_ref].tail);
3520 if (!tensor_blocks[tensor_block_idx].tail)
3521 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3522 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3523 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3524 // We have to add it to the wrap around companion_ref as well.
3525 // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3526 // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's
3527 // definition is too free-form, and if we enforce a stronger guarantee on it (such as that it must wrap around), that
3528 // guarantee may break down the line.
3529 if (tensor_blocks[dup_p_ref].companion_ref)
3530 {
3531 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3532 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3533 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3534 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3535 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3536 }
3537 }
3538 } else if (new_anonymous_tensor_block) {
3539 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3540 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3541 }
3542 const int prev_tensor_block_idx = tensor_block_idx;
3543 if (new_anonymous_tensor_block)
3544 {
3545 if (!anonymous_block_free_list)
3546 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3547 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3548 ++tensor_block_size;
3549 }
3550 for (k = 0; k < unroll_count; k++)
3551 {
3552 const int tensor_block_idx = new_anonymous_tensor_block ?
3553 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3554 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3555 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3556 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3557 if (new_anonymous_tensor_block)
3558 {
3559 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3560 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3561 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3562 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3563 /* Attach to duplicated exec for this tensor block. */
3564 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3565 } else {
3566 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3567 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3568 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3569
3570 }
3571 if (dup_p_refs && dup_p_refs->rnum > 0)
3572 {
3573 /* Not nil, not self-reflecting. */
3574 for (j = 0; j < dup_p_refs->rnum; j++)
3575 {
3576 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3577 assert(dup_p_ref >= 0);
3578 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3579 // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3580 // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object.
3581 if (tensor_symbol_info[dup_p_ref].p_ref)
3582 {
3583 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3584 assert(p_node_info);
3585 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3586 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3587 {
3588 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3589 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3590 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3591 }
3592 }
3593 assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3594 const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3595 assert(tensor_blocks[dup_dup_p_ref].tail);
3596 if (!tensor_blocks[tensor_block_idx].tail)
3597 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3598 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3599 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3600 // We have to add it to the wrap around companion_ref as well.
3601 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3602 {
3603 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3604 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3605 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3606 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3607 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3608 }
3609 }
3610 } else if (new_anonymous_tensor_block) {
3611 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3612 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3613 }
3614 if (new_anonymous_tensor_block)
3615 ++tensor_block_size;
3616 }
3617 }
3618 }
3619 }
3620 }
3621 } ccv_nnc_graph_visit_endfor
3622 if (anonymous_block_free_list)
3623 ccv_array_free(anonymous_block_free_list);
3624 ccfree(tensor_fold);
3625 // It is time to guess the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3626 // the allocation dependencies, i.e., which tensor reuses which existing tensor.
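// Editor's note (illustration): an "allocation dependency" here is a record of the form "tensor
// block j reuses the memory previously assigned to tensor block i", which is only possible because
// i is already dead by the time j comes alive.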
3627 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3628 prep->while_count_tensor = 0;
3629 prep->dup_breakpoints = 0;
3630 prep->p = 0;
3631 prep->symbolic_graph = symbolic_graph;
3632 prep->p_idx = symbolic_graph->p_idx;
3633 prep->exec_idx = symbolic_graph->exec_idx;
3634 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3635 prep->sub_preps = sub_preps;
3636 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3637 prep->exec_symbol_info = exec_symbol_info;
3638 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3639 prep->tensor_symbol_info = tensor_symbol_info;
3640 prep->unroll_count = unroll_count;
3641 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3642 prep->tensor_block_size = tensor_block_size;
3643 prep->tensor_blocks = tensor_blocks;
3644 prep->exec_flags = exec_flags;
3645 prep->visit = visit;
3646 prep->alloc_prep = alloc_prep;
3647 if (dup_graph)
3648 ccv_nnc_symbolic_graph_free(dup_graph);
3649 if (dup_exec_ref)
3650 ccfreefree(dup_exec_ref);
3651 return prep;
3652}
3653
3654static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3655{
3656 int i;
3657 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3658 ccfreefree(prep->exec_flags);
3659 for (i = 0; i < prep->sub_prep_size; i++)
3660 if (prep->sub_preps[i])
3661 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3662 if (prep->sub_preps)
3663 ccfreefree(prep->sub_preps);
3664 ccfreefree(prep->tensor_symbol_info);
3665 ccfreefree(prep->exec_symbol_info);
3666 if (prep->dup_tensor_block_ref)
3667 ccfreefree(prep->dup_tensor_block_ref);
3668 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3669 ccv_nnc_graph_visit_free(prep->visit);
3670 ccfreefree(prep);
3671}
3672
3673static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3674{
3675 int i, j;
3676 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
3677 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3678 {
3679 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3680 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3680, __extension__ __PRETTY_FUNCTION__
); }))
;
3681 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3682 for (i = 0; i < node->p_while.input_size; i++)
3683 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3684 {
3685 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3686 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3687 for (j = 0; j < d; j++)
3688 prep = prep->p;
3689 prep->while_count_tensor = 1;
3690 }
3691 }
3692 for (i = 0; i < node->graph_ref_size; i++)
3693 {
3694 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3695 if (graph_ref >= 0)
3696 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3697 }
3698 } ccv_nnc_graph_visit_endfor} }
3699}
3700
3701static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3702{
3703 if (symbol >= 0)
3704 return graph_prep->tensor_arena->vt_tensors[symbol];
3705 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3706 return 0;
3707 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3707, __extension__ __PRETTY_FUNCTION__
); }))
;
3708 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3709 int i;
3710 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3711 for (i = 0; i < d; i++)
3712 prep = prep->p;
3713 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3713, __extension__ __PRETTY_FUNCTION__
); }))
;
3714 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3715}
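
Note (editorial, not part of the analyzer output): the two helpers above treat a while-count tensor symbol not as an index into vt_tensors but as a negative encoding that carries the nesting depth d of the enclosing while graph. The expansions shown above give the check ((uint32_t)symbol & 0xf) == 0xe and the decode (~(uint32_t)symbol) >> 4. The sketch below is a minimal, self-contained illustration; the encode expression is inferred from those two macros only and the real encode macro is not shown in this listing.

#include <assert.h>
#include <stdint.h>

/* Hypothetical encode, derived only from the check and decode expansions shown
 * above: the low nibble of the symbol must be 0xe, and the complement shifted
 * right by 4 must give back the depth d. */
static int32_t encode_while_count_symbol(const uint32_t d)
{
	return (int32_t)~((d << 4) | 0x1); /* low nibble of ~0x1 is 0xe */
}

int main(void)
{
	const uint32_t d = 2; /* two levels up from the current prep */
	const int32_t symbol = encode_while_count_symbol(d);
	assert(((uint32_t)symbol & 0xf) == 0xe); /* CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL */
	assert(((~(uint32_t)symbol) >> 4) == d); /* CCV_NNC_DECODE_WHILE_COUNT_SYMBOL */
	return 0;
}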
3716
3717static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3718{
3719 int i;
3720 int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3721 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3722 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3723 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3724 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3725 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3726 if (graph_execs[i].graph == graph)
3727 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3728 ccfreefree(exec_cvt);
3729}
3730
3731static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3732{
3733 int i, j, k;
3734 ccv_nnc_graph_t* const graph = graph_prep->graph;
3735 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3736 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
1. Uninitialized value stored to field 'graph'
3737 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3738 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3739 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3740 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3741 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3742 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3743 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3744 for (i = 0; i < exec_symbol_info_size; i++)
2. Assuming 'i' is >= 'exec_symbol_info_size'
3. Loop condition is false. Execution continues on line 3753
3745 {
3746 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3747 max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3748 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3749 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3750 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3751 graph_execs[i].graph = 0;
3752 }
3753 for (i = 0; i < graph_prep->sub_prep_size; i++)
4. Assuming 'i' is >= field 'sub_prep_size'
5. Loop condition is false. Execution continues on line 3755
3754 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3755 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
6. '?' condition is true
3756 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
7. '?' condition is true
3757 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
8. '?' condition is true
3758 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3759 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3760 // Create nodes; this is in topological order.
3761 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
9. Assuming '_i_' is >= field 'size'
10. Loop condition is false. Execution continues on line 3834
3762 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
3763 {
3764 for (i = 0; i < node->input_size; i++)
3765 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3766 for (i = 0; i < node->output_size; i++)
3767 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3768 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3769 {
3770 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3771 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3771, __extension__ __PRETTY_FUNCTION__
); }))
;
3772 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3773 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3774 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3775 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3776 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3777 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3778 for (i = 0; i < node->p_while.input_size; i++)
3779 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3780 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3781 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3782 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3783 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3784 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3785 for (i = 0; i < node->output_size; i++)
3786 if (max_outputs[i] && max_outputs[i]->alias_ref)
3787 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3788 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3789 // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3790 for (i = 0; i < node->case_of.argument.offset; i++)
3791 {
3792 ccv_nnc_tensor_t* const update = max_inputs[i];
3793 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3794 continue;
3795 int flag = 0;
3796 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3797 flag = (update == max_inputs[j]);
3798 if (!flag)
3799 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3800 }
3801 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3802 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3803 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3804 {
3805 // Add another graph for data transfer.
3806 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3807 for (i = 0; i < node->output_size; i++)
3808 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3809 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3810 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3811 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3812 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3813 int exec_cvt;
3814 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3815 }
3816 for (i = 0; i < node->graph_ref_size; i++)
3817 {
3818 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3819 if (graph_ref < 0)
3820 continue;
3821 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3822 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3823 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3824 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3825 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3826 }
3827 } else {
3828 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3829 }
3830 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3831 }
3832 } ccv_nnc_graph_visit_endfor} }
3833 // Then connect them.
3834 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
11. Loop condition is false. Execution continues on line 3843
3835 if (node->outgoings)
3836 for (i = 0; i < node->outgoings->rnum; i++)
3837 {
3838 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3839 if (graph_execs[outgoing].graph)
3840 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3841 }
3842 } ccv_nnc_graph_visit_endfor} }
3843 int source_exec_created = 0;
3844 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3845 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3846 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3847 // After the graph is materialized, we need to handle the case where some of these tensors need to be initialized to zero before use.
3848 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
12. Assuming 'i' is < field 'rnum'
3849 {
3850 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
13. Loop condition is true. Entering loop body
14. Assuming the condition is true
3851 {
3852 int ref = i;
3853 while (tensor_symbol_info[ref].alias_ref)
3854 ref = tensor_symbol_info[ref].alias_ref - 1;
3855 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
15. Loop condition is false. Execution continues on line 3855
16. Assuming the condition is false
17. Assuming the condition is false
3856 ref = tensor_blocks[ref].ref - 1;
3857 // This is not computable. It could be that we marked a const tensor as init zero.
3858 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3859 continue;
3860 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3861 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
18. Assuming field 'head' is non-null
19. Assuming field 'rnum' is not equal to 0
20. Taking false branch
3862 continue;
3863 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3864 // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3865 ccv_nnc_graph_exec_t set_exec;
3866 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
21. Taking true branch
3867 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3868 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3869 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3870 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
22. Assuming 'j' is >= field 'rnum'
23. Loop condition is false. Execution continues on line 3879
3871 {
3872 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3873 if (outgoing >= exec_symbol_info_size)
3874 continue;
3875 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3875, __extension__ __PRETTY_FUNCTION__
); }))
;
3876 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3876, __extension__ __PRETTY_FUNCTION__
); }))
;
3877 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3878 }
3879 int flags = 0;
3880 if (alloc_dep[ref])
24. Assuming the condition is true
25. Taking true branch
3881 for (j = 0; j < alloc_dep[ref]->rnum; j++)
26. Assuming 'j' is < field 'rnum'
27. Loop condition is true. Entering loop body
3882 {
3883 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3884 // This is from alloc_dep, it should be computable.
3885 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3885, __extension__ __PRETTY_FUNCTION__
); }))
;
28. Assuming the condition is false
29. Assuming the condition is false
30. Taking true branch
3886 if (tensor_blocks[d].tail)
31. Assuming field 'tail' is non-null
32. Taking true branch
3887 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
33. Assuming 'k' is < field 'rnum'
34. Loop condition is true. Entering loop body
3888 {
3889 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3890 if (incoming >= exec_symbol_info_size)
35. Assuming 'incoming' is < 'exec_symbol_info_size'
36. Taking false branch
3891 continue;
3892 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3892, __extension__ __PRETTY_FUNCTION__
); }))
;
37. Assuming 'incoming' is >= 0
38. Taking true branch
3893 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3893, __extension__ __PRETTY_FUNCTION__
); }))
;
39. Branch condition evaluates to a garbage value
3894 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3895 flags = 1;
3896 }
3897 }
3898 // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3899 if (!flags)
3900 {
3901 if (!source_exec_created)
3902 {
3903 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3904 source_exec_created = 1;
3905 }
3906 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3907 }
3908 }
3909 }
3910 // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views
3911 // (we need that if a multi-view is not associated as an input / output of any exec, which is possible if all execs
3912 // associate with its alias instead).
3913 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3913, __extension__ __PRETTY_FUNCTION__
); }))
;
3914 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3915 {
3916 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3917 // If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3918 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3919 {
3920 const ccv_array_t* const head = tensor_blocks[i].head;
3921 if (head && head->rnum > 0)
3922 for (j = 0; j < head->rnum; j++)
3923 {
3924 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3925 if (idx >= exec_symbol_info_size)
3926 continue;
3927 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3927, __extension__ __PRETTY_FUNCTION__); }))
;
3928 const int d = graph_execs[idx].d;
3929 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3930 int flag = 0;
3931 if (exec_info->tensor_wraps_ref)
3932 {
3933 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3934 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3935 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3936 }
3937 // If none of them set the flag, it needs to be included in the cast.
3938 if (!flag)
3939 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3940 }
3941 }
3942 }
3943 // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3944 // Also, this is needed if you have init-zero execs.
3945 if (source_exec_created || source_size > 1)
3946 {
3947 if (!source_exec_created)
3948 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3949 for (i = 0; i < source_size; i++)
3950 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3951 } else {
3952 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3952, __extension__ __PRETTY_FUNCTION__
); }))
;
3953 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3953, __extension__ __PRETTY_FUNCTION__
); }))
;
3954 graph_exec_arena->source = graph_execs[sources[0].d];
3955 }
3956 if (destination_size == 1)
3957 graph_exec_arena->destination = graph_execs[destinations[0].d];
3958 else {
3959 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3960 for (i = 0; i < destination_size; i++)
3961 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3962 }
3963 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3964 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3965 return graph_exec_arena;
3966}
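
Note (editorial, not part of the analyzer output): the path behind the warning at line 3893 can be read off the numbered steps above. As far as this listing shows, graph_execs[i].graph is only written by the zero-initialization loop at line 3744 and by the exec creation inside the visit loop starting at line 3761; the analyzer's path assumes both are skipped (steps 2-3 and 9-10), so the branch condition graph_execs[incoming].graph inside the assert at line 3893 reads uninitialized memory from the malloc at line 3736. The following is a minimal, self-contained sketch of that pattern with hypothetical names; it is not ccv code.

#include <stdlib.h>

typedef struct {
	int d;
	void* graph; /* stands in for ccv_nnc_graph_exec_t.graph */
} exec_stub_t;

/* On the analyzer's path, size is assumed to be 0 so the clearing loop never
 * runs, yet a later index into the array is assumed to be in range; the branch
 * on .graph then tests an uninitialized value, which is what the checker reports. */
static int has_graph(const int size, const int incoming)
{
	exec_stub_t* const execs = (exec_stub_t*)malloc(sizeof(exec_stub_t) * (size > 0 ? size : 1));
	int i;
	for (i = 0; i < size; i++) /* mirrors the loop at line 3744 */
		execs[i].graph = 0;
	int result = 0;
	if (incoming >= 0 && incoming < size) /* mirrors the checks at lines 3890 / 3892 */
		result = (execs[incoming].graph != 0); /* mirrors the branch at line 3893 */
	free(execs);
	return result;
}

As in the original, the guard incoming < size makes the uninitialized read unreachable when size is 0, so whether the reported path is feasible in practice depends on constraints (the contents of the tail arrays) that this listing does not show.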
3967
3968static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3969{
3970 if (graph_prep->symbolic_graph == pair)
3971 return graph_prep->graph;
3972 int i;
3973 for (i = 0; i < graph_prep->sub_prep_size; i++)
3974 if (graph_prep->sub_preps[i])
3975 {
3976 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3977 if (graph)
3978 return graph;
3979 }
3980 return 0;
3981}
3982
3983static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3984{
3985 int i;
3986 for (i = 0; i < graph_prep->sub_prep_size; i++)
3987 if (graph_prep->sub_preps[i])
3988 {
3989 if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3990 graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3991 }
3992}
3993
3994static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3995{
3996 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3996, __extension__ __PRETTY_FUNCTION__
); }))
;
3997 int i;
3998 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
3999 {
4000 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
4001 continue;
4002 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4003 {
4004 ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4005 .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4006 .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4007 });
4008 if (pair_exec.d >= 0)
4009 ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4010 }
4011 }
4012 for (i = 0; i < graph_prep->sub_prep_size; i++)
4013 if (graph_prep->sub_preps[i])
4014 _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4015}
4016
4017static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4018{
4019 int i;
4020 if (graph_prep->dup_breakpoints)
4021 {
4022 // Stripping the const modifier is only possible because it is a sub-graph.
4023 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4024 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4025 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
4026 ccv_array_free(graph_prep->dup_breakpoints);
4027 graph_prep->dup_breakpoints = 0;
4028 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4029 // Afterwards, we have to regenerate the exec_symbol_info and fill in the information (through symbol_infer).
4030 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4031 // Since exec_symbol_info changed, create a new visit object.
4032 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4032, __extension__ __PRETTY_FUNCTION__
); }))
;
4033 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4033, __extension__ __PRETTY_FUNCTION__); }))
;
4034 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
4035 const int source_size = symbolic_graph->sources->rnum;
4036 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
4037 const int destination_size = symbolic_graph->destinations->rnum;
4038 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4038, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4038, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4038, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
4039 ccv_nnc_graph_visit_free(graph_prep->visit);
4040 graph_prep->visit = visit;
4041 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4041, __extension__ __PRETTY_FUNCTION__
); }))
;
4042 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4043 }
4044 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
4045 for (i = 0; i < node->graph_ref_size; i++)
4046 {
4047 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
4048 if (graph_ref >= 0)
4049 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4050 }
4051 } ccv_nnc_graph_visit_endfor} }
4052}
4053
4054const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4055
4056void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4057{
4058 assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4058, __extension__ __PRETTY_FUNCTION__); }))
;
4059 assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4059, __extension__ __PRETTY_FUNCTION__
); }))
;
4060 assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4060, __extension__ __PRETTY_FUNCTION__
); }))
;
4061 int i;
4062 // Cannot bind the multi-view.
4063 for (i = 0; i < tensor_bind_size; i++)
4064 {
4065 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4065, __extension__ __PRETTY_FUNCTION__
); }))
;
4066 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4066, __extension__ __PRETTY_FUNCTION__
); }))
;
4067 }
4068 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4069 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4070 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4071 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4072 *tensor_arena_ref = tensor_arena;
4073 // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
4074 _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4075 // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
4076 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4077 *graph_ref = graph_prep->graph;
4078 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4079 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4080 _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4081 *graph_exec_arena_ref = graph_exec_arena;
4082 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4083}
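
Note (editorial, not part of the analyzer output): for orientation, a hedged call sketch of the entry point above, using only the signature at line 4056 and ccv_nnc_default_compile_params from line 4054. The wrapper and its arguments are hypothetical; the symbolic graph, outputs, sources and destinations are assumed to be set up by the caller.

#include "ccv_nnc.h"

/* Hypothetical caller of ccv_nnc_symbolic_graph_compile. */
static void compile_and_get_arenas(const ccv_nnc_symbolic_graph_t* const symbolic_graph,
	const ccv_nnc_tensor_symbol_t* const outputs, const int output_size,
	const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size,
	const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_tensor_arena_t* tensor_arena = 0;
	ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
	ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
		0, 0, /* no explicit tensor binds */
		outputs, output_size,
		sources, source_size,
		destinations, destination_size,
		&graph, &tensor_arena, &graph_exec_arena);
	/* The three out-pointers are now owned by the caller and must be released
	 * with the library's matching free routines (not shown in this listing). */
	(void)graph; (void)tensor_arena; (void)graph_exec_arena;
}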
4084
4085static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4086{
4087 // Buffers are inherited from above, no need to dealloc.
4088 int i;
4089 for (i = 0; i < tensor_arena->sub_arena_size; i++)
4090 if (tensor_arena->sub_arenas[i])
4091 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4092 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4093 {
4094 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
(size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i)))
);
4095 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4095, __extension__ __PRETTY_FUNCTION__
); }))
;
4096 ccv_nnc_tensor_multiview_free(*mv);
4097 }
4098 ccv_array_free(tensor_arena->tensor_metadata);
4099 ccv_array_free(tensor_arena->m_tensor_idx);
4100 if (tensor_arena->pb_vt_tensors)
4101 ccfreefree(tensor_arena->pb_vt_tensors);
4102 if (tensor_arena->vt_alias_r_refs_p)
4103 ccfreefree(tensor_arena->vt_alias_r_refs_p);
4104 if (tensor_arena->vt_sizes)
4105 ccfreefree(tensor_arena->vt_sizes);
4106 ccfreefree(tensor_arena);
4107}
4108
4109void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4110{
4111 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
== (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4111, __extension__ __PRETTY_FUNCTION__
); }))
;
4112 assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4112, __extension__ __PRETTY_FUNCTION__
); }))
;
4113 assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4113, __extension__ __PRETTY_FUNCTION__
); }))
;
4114 // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4115 int i;
4116 if (!tensor_arena->pb_vt_tensors)
4117 {
4118 tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4119 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4120 if (tensor_arena->vt_tensors[i])
4121 tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4122 }
4123 if (!tensor_arena->vt_alias_r_refs_p)
4124 {
4125 tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4126 tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4127 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4128 if (tensor_arena->vt_alias_refs[i])
4129 {
4130 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4131 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4131, __extension__ __PRETTY_FUNCTION__
); }))
;
4132 ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many aliases there are.
4133 }
4134 int refp = 0;
4135 for (i = 1; i < tensor_arena->vt_tensor_size; i++) // Allocate a position on vt_alias_r_refs for each tensor that has aliases. It points to the end.
4136 if (tensor_arena->vt_alias_r_refs_p[i])
4137 refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4138 else
4139 tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4140 for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4141 tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4142 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4143 if (tensor_arena->vt_alias_refs[i])
4144 {
4145 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4146 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4146, __extension__ __PRETTY_FUNCTION__
); }))
;
4147 const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4148 assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4148, __extension__ __PRETTY_FUNCTION__); }))
;
4149 tensor_arena->vt_alias_r_refs[pos] = i;
4150 }
4151 }
4152 const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4153 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4154 {
4155 assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4155, __extension__ __PRETTY_FUNCTION__
); }))
; // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4156 assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
4157 ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
4158 (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
4159 } else
4160 { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
4161 if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
4162 { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
4163 tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4164 if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4165 for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4166 {
4167 const int d = tensor_arena->vt_alias_r_refs[i];
4168 if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match; we've reached the end of this alias run.
4169 break;
4170 ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4171 d_tensor->info.datatype = tensor->info.datatype;
4172 d_tensor->info.reserved = tensor->info.reserved;
4173 if (CCV_IS_TENSOR_VIEW(d_tensor))
4174 ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4175 else {
4176 d_tensor->data.u8 = tensor->data.u8;
4177 d_tensor->dataof = tensor->dataof;
4178 }
4179 }
4180}
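
The lazy vt_alias_r_refs construction above (lines 4123-4151) is a counting-sort-style bucket layout: count the aliases per base tensor, turn the counts into end positions with a running sum, then walk the aliases again and place each one by decrementing its base's position. Below is a minimal standalone sketch of that technique with hypothetical names and a toy input; it is not the library's code, and for simplicity it runs the prefix sum from index 0.

#include <stdio.h>

int main(void)
{
	/* Hypothetical toy input: alias_refs[i] is the 1-based index of the tensor that i
	 * aliases, or 0 if i is not an alias (tensors 1, 2, 5 alias 0; tensor 4 aliases 3). */
	enum { N = 6 };
	const int alias_refs[N] = { 0, 1, 1, 0, 4, 1 };
	int r_refs_p[N] = { 0 }; /* per-base counters, then exclusive end positions */
	int r_refs[N];           /* reverse index, grouped by base tensor */
	int i, refp = 0;
	for (i = 0; i < N; i++)
		if (alias_refs[i])
			++r_refs_p[alias_refs[i] - 1]; /* count aliases per base tensor */
	for (i = 0; i < N; i++)                /* running sum turns counts into bucket ends */
		if (r_refs_p[i])
			refp = (r_refs_p[i] += refp);
		else
			r_refs_p[i] = -1;              /* nothing aliases this tensor */
	for (i = refp; i < N; i++)
		r_refs[i] = -1;                    /* slots past the last bucket stay unused */
	for (i = 0; i < N; i++)
		if (alias_refs[i])
			r_refs[--r_refs_p[alias_refs[i] - 1]] = i; /* fill each bucket backwards */
	for (i = 0; i < N; i++)
		printf("r_refs[%d] = %d, bucket start for base %d = %d\n", i, r_refs[i], i, r_refs_p[i]);
	return 0;
}

After the placement pass, each non-negative r_refs_p entry holds its bucket's start, which is exactly the role vt_alias_r_refs_p plays for the alias walk at lines 4164-4169.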
4181
4182void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4183{
4184 if (!tensor_arena->pb_vt_tensors)
4185 return;
4186 int i;
4187 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4188 if (tensor_arena->vt_tensors[i])
4189 tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4190}
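
ccv_nnc_tensor_arena_clear_bindings only works because the first bind snapshots every tensor's original data pointer into pb_vt_tensors (lines 4116-4122) and the clear simply copies them back (lines 4187-4189). A minimal sketch of that snapshot/restore pattern, using hypothetical toy types rather than the real arena structs, could look like this:

#include <stdlib.h>

typedef struct { void* data; } toy_tensor_t;

typedef struct {
	int size;
	toy_tensor_t** tensors; /* stands in for vt_tensors */
	void** snapshot;        /* stands in for pb_vt_tensors, NULL until the first bind */
} toy_bind_arena_t;

static void toy_bind(toy_bind_arena_t* const arena, const int i, void* const external)
{
	int j;
	if (!arena->snapshot) /* allocate on demand, exactly once */
	{
		arena->snapshot = (void**)calloc(arena->size, sizeof(void*));
		for (j = 0; j < arena->size; j++)
			if (arena->tensors[j])
				arena->snapshot[j] = arena->tensors[j]->data;
	}
	arena->tensors[i]->data = external; /* point the tensor at user-provided memory */
}

static void toy_clear_bindings(toy_bind_arena_t* const arena)
{
	int i;
	if (!arena->snapshot)
		return; /* nothing was ever bound */
	for (i = 0; i < arena->size; i++)
		if (arena->tensors[i])
			arena->tensors[i]->data = arena->snapshot[i]; /* restore original pointers */
}

The real bind path additionally re-derives the data pointers of alias tensors from the newly bound base tensor (lines 4164-4179), which this sketch omits.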
4191
4192uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4193{
4194 uint64_t total_size = 0;
4195 int i;
4196 for (i = 0; i < tensor_arena->buffer_size; i++)
4197 total_size += tensor_arena->buffers[i].size;
4198 return total_size;
4199}
4200
4201static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4202{
4203 int i;
4204 if (mv->it)
4205 mv->it->info = params;
4206 for (i = 0; i < mv->repeat + mv->kind; i++)
4207 {
4208 ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
4209 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4210 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4211 else
4212 tensor->info = params;
4213 }
4214}
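
A multiview fans out into repeat + kind children, each of which is either another multiview or a plain tensor, so pushing new parameters down is a straightforward recursion. A simplified standalone sketch with hypothetical toy types (the real function also refreshes the unrolled mv->it tensor) is:

typedef struct toy_node {
	int is_multiview;
	int child_count;            /* stands in for repeat + kind */
	struct toy_node** children; /* stands in for CCV_NNC_MULTIVIEW_DATA(mv) */
	int params;                 /* stands in for ccv_nnc_tensor_param_t */
} toy_node_t;

static void toy_update_params(toy_node_t* const node, const int params)
{
	int i;
	if (!node->is_multiview)
	{
		node->params = params; /* leaf tensor: take the new parameters directly */
		return;
	}
	for (i = 0; i < node->child_count; i++)
		toy_update_params(node->children[i], params); /* recurse into nested multiviews */
}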
4215
4216int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4217{
4218 int i;
4219 assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
4220 if (!tensor_arena->vt_sizes) // Keep the original sizes so we can check against them and detect overflow.
4221 {
4222 tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4223 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4224 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4225 {
4226 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4227 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4228 {
4229 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4230 while (CCV_IS_TENSOR_MULTIVIEW(mv))
4231 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
4232 tensor = (ccv_nnc_tensor_t*)mv;
4233 }
4234 tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4235 }
4236 }
4237 int flag = 0;
4238 for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4239 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4240 {
4241 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4242 ccv_nnc_tensor_param_t params = symbol_info->info;
4243 params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4244 params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4245 flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4246 }
4247 if (flag)
4248 return -1;
4249 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4250 if (tensor_arena->vt_tensors[i])
4251 {
4252 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4253 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4254 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4255 {
4256 assert(!tensor_arena->vt_alias_refs[i]);
4257 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4258 } else if (!tensor_arena->vt_alias_refs[i]) {
4259 ccv_nnc_tensor_param_t params = symbol_info->info;
4260 params.datatype = tensor->info.datatype;
4261 params.reserved = tensor->info.reserved;
4262 tensor->info = params;
4263 } else {
4264 off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4265 ccv_nnc_tensor_param_t params = symbol_info->info;
4266 params.datatype = tensor->info.datatype;
4267 params.reserved = tensor->info.reserved;
4268 tensor->info = params;
4269 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4270 ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4271 if (CCV_IS_TENSOR_VIEW(tensor))
4272 {
4273 ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4274 memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4275 }
4276 }
4277 }
4278 // We should handle sub_tensor_arena here as well; we don't do that at the moment.
4279 assert(!graph->sub_graphs);
4280 return 0;
4281}
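
The overflow guard in ccv_nnc_tensor_arena_reinit boils down to: cache the byte size every non-alias tensor was originally allocated with, and reject (return -1) any new set of shapes that would need more than that. A hedged standalone sketch of just that check, with hypothetical toy names standing in for the real arena fields, is:

#include <stddef.h>
#include <stdlib.h>

/* Hypothetical toy stand-in for the arena's resize bookkeeping. */
typedef struct {
	int count;
	size_t* ceiling;         /* lazily cached original sizes, like vt_sizes */
	const size_t* allocated; /* byte sizes the tensors currently carry */
} toy_resize_arena_t;

/* Return -1 if any requested size exceeds the originally allocated ceiling, else 0.
 * The ceiling is cached on the first call, so later shrinks never lower it. */
static int toy_reinit_check(toy_resize_arena_t* const arena, const size_t* const requested)
{
	int i;
	if (!arena->ceiling)
	{
		arena->ceiling = (size_t*)malloc(sizeof(size_t) * arena->count);
		for (i = 0; i < arena->count; i++)
			arena->ceiling[i] = arena->allocated[i];
	}
	for (i = 0; i < arena->count; i++)
		if (requested[i] > arena->ceiling[i])
			return -1; /* would overflow the memory planned at compile time */
	return 0;
}

Once the check passes, the real function rewrites each tensor's parameters from the symbol info and recomputes alias offsets; only the admission test is sketched here.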
4282
4283void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4284{
4285 assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4286 int i;
4287 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4288 {
4289 const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4290 if (graph_exec.d < 0)
4291 continue;
4292 const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4293 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4294 ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4295 if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, carry over the backend and algorithm from the existing one, which has presumably been autotuned.
4296 {
4297 new_cmd.backend = existing_cmd.backend;
4298 new_cmd.algorithm = existing_cmd.algorithm;
4299 }
4300 ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4301 }
4302}
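
The only subtlety in ccv_nnc_graph_exec_reinit is the merge rule for commands: if the symbolic command is the same op that already sits in the concrete graph, keep the existing backend and algorithm (which have presumably been autotuned); otherwise take the new command as-is. A sketch of that rule with hypothetical toy structs:

typedef struct {
	int cmd;
	int backend;
	int algorithm;
} toy_cmd_t;

static toy_cmd_t toy_merge_cmd(const toy_cmd_t existing, toy_cmd_t updated)
{
	if (updated.cmd == existing.cmd)
	{
		updated.backend = existing.backend;     /* reuse the autotuned backend */
		updated.algorithm = existing.algorithm; /* reuse the autotuned algorithm */
	}
	return updated;
}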
4303
4304void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4305{
4306 int i;
4307 for (i = 0; i < tensor_arena->buffer_size; i++)
4308 {
4309 if (!tensor_arena->buffers[i].ptr)
4310 continue;
4311 const int buffer_type = tensor_arena->buffers[i].type;
4312 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4313#ifdef HAVE_CUDA
4314 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4315 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4316 {
4317 if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4318 tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4319 else
4320 cufree(device_id, tensor_arena->buffers[i].ptr);
4321 } else {
4322 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4323 if (tensor_arena->buffers[i].pin_mem)
4324 cuhostfree(tensor_arena->buffers[i].ptr);
4325 else
4326 ccfree(tensor_arena->buffers[i].ptr);
4327 }
4328#elif defined(HAVE_MPS)
4329 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4330 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4331 {
4332 // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4333 // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4334 // else
4335 mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4336 } else {
4337 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4338 ccfree(tensor_arena->buffers[i].ptr);
4339 }
4340#else
4341 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4342 ccfree(tensor_arena->buffers[i].ptr);
4343#endif
4344 tensor_arena->buffers[i].ptr = 0;
4345 }
4346 // For now, the life-cycle of the disposers is tied to the buffers; it may end before the tensor arena is deallocated.
4347 if (tensor_arena->disposers)
4348 {
4349 for (i = 0; i < tensor_arena->disposers->rnum; i++)
4350 {
4351 ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
4352 disposer->dispose(disposer->ptr, disposer->userdata);
4353 }
4354 ccv_array_free(tensor_arena->disposers);
4355 tensor_arena->disposers = 0;
4356 }
4357}
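
The disposer list drained above is a simple deferred-cleanup registry: every entry bundles a pointer, a userdata cookie, and a dispose callback, and freeing the buffers invokes each callback exactly once before the list itself is released. A self-contained sketch of that pattern with hypothetical toy names:

#include <stdlib.h>
#include <stdio.h>

typedef struct {
	void* ptr;
	void* userdata;
	void (*dispose)(void* ptr, void* userdata);
} toy_disposer_t;

static void toy_dispose_free(void* ptr, void* userdata)
{
	(void)userdata;
	free(ptr); /* example policy: the registered pointer owns heap memory */
}

int main(void)
{
	toy_disposer_t disposers[1];
	int i;
	disposers[0].ptr = malloc(64);
	disposers[0].userdata = NULL;
	disposers[0].dispose = toy_dispose_free;
	for (i = 0; i < 1; i++)
		disposers[i].dispose(disposers[i].ptr, disposers[i].userdata); /* run each callback once */
	printf("all disposers invoked\n");
	return 0;
}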
4358
4359void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4360{
4361 ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4362 _ccv_nnc_tensor_arena_free(tensor_arena);
4363}
4364
4365void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4366{
4367 int i;
4368 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4369 if (graph_exec_arena->sub_arenas[i])
4370 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4371 ccfree(graph_exec_arena);
4372}