Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3759, column 7
The left operand of '==' is a garbage value
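
A note on the warning above: "The left operand of '==' is a garbage value" is emitted by the core checkers when a value that may be uninitialized reaches a comparison. The following is a minimal, hypothetical sketch (not taken from this file; all names are illustrative) of the pattern this class of diagnostic typically flags:

#include <stdio.h>

/* Hypothetical example, unrelated to ccv: `hop` is only assigned when
 * `found` is non-zero, so the `hop == 0` comparison reads an
 * uninitialized (garbage) value on the `found == 0` path. */
static int lookup(int found)
{
	int hop; /* not initialized on every path */
	if (found)
		hop = 1;
	if (hop == 0) /* analyzer: left operand of '==' is a garbage value */
		return -1;
	return hop;
}

int main(void)
{
	printf("%d\n", lookup(1));
	return 0;
}

Initializing `hop` at its declaration, or covering the `!found` path explicitly, silences this kind of diagnostic.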

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/18 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -D HAVE_CUDA_SM80 -I /usr/local/include -internal-isystem /usr/local/lib/clang/18/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2024-09-13-112407-53539-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12
13// MARK - Level-3 API
14
15typedef struct {
16 int flags;
17 int type;
18 int pin_mem; // This memory need to be pinned.
19 int ref; // Reference to another tensor block. Start with 1.
20 int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21 int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22 int companion_ref; // Reference to another block with which this one shares the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one where companion_ref <= i as the primary and companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
23 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24 ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25 uint64_t size; // The size of the tensor expected.
26 int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28 ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29 ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34enum {
35 UNASSIGNED = 0x1,
36 ALIAS = 0x2,
37 READ_ONLY = 0x4,
38 WRITE_ONLY = 0x8,
39 READ_WRITE = 0xc,
40 ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
41 UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42 UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43};
44
45#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
46#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
47#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
48#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
50#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
51#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
52#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
53#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
54#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
56#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
58#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
60
61#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62
63// Holds additional information about the exe nodes.
64typedef struct {
65 int flags;
66} ccv_nnc_graph_exec_flag_t;
67
68enum {
69 CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
70};
71
72typedef struct {
73 int index;
74 int oc;
75 int type;
76 uint64_t size;
77} ccv_nnc_tensor_opt_t;
78
79// We first sort the same type together (because they won't be reused at all).
80// And then we sort by size, after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83#undef more_than
84typedef struct {
85 int idx;
86 int hop;
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90#undef less_than
91
92// If b has items overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95 assert(a);
96 assert(b);
97 int x, y;
98 for (x = 0; x < b->rnum; x++)
99 {
100 const int p = *(int*)ccv_array_get(b, x);
101 int flag = 0;
102 // In extreme cases where a is a superset of b, then a is still after b, we are good.
103 for (y = 0; !flag && y < a->rnum; y++)
104 {
105 const int q = *(int*)ccv_array_get(a, y);
106 flag = (p == q);
107 }
108 if (!flag)
109 for (y = 0; y < a->rnum; y++)
110 {
111 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112 if (!cell.i32 || cell.i32[0] == 0)
113 return 0;
114 }
115 }
116 // If b->rnum == 0, a is after b for sure.
117 // Otherwise, if a->rnum == 0, we don't check any, but if b->rnum > 0, then we cannot say a is after b.
118 // If both a->rnum > 0 and b->rnum > 0, the above logic should have checked all of them.
119 return (a->rnum > 0 || b->rnum == 0);
120}
121
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124 assert(a);
125 assert(b);
126 int x, y, max_hop = 0;
127 for (x = 0; x < a->rnum; x++)
128 for (y = 0; y < b->rnum; y++)
129 {
130 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
131 if (!cell.i32 || cell.i32[0] == 0)
132 return 0;
133 max_hop = ccv_max(cell.i32[0], max_hop);
134 }
135 // We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
136 // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
137 return max_hop;
138}
139
140// If every a's head is deterministically after b's tail
141static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
142{
143 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
144}
145
146typedef struct {
147 ccv_array_t** alloc_dep;
148 int vt_block_size;
149 int buffer_size;
150 int block_size;
151 int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
152 struct {
153 int type; // The type from tensor blocks.
154 int pin_mem; // Whether this is pinned memory.
155 int flags; // The flags (currently for READ_ONLY or not).
156 uint64_t size; // The size of the buffer allocated.
157 int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
158 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
159 }* buffers;
160 struct {
161 int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
162 int block_ref; // A reference to which block in the given tensor_block to use.
163 uint64_t offset; // The offset of this block.
164 }* blocks;
165} ccv_nnc_tensor_alloc_prep_t;
166
167typedef struct ccv_nnc_symbolic_graph_prep_s {
168 int flags;
169 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
170 int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
171 int exec_idx;
172 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
173 int tensor_symbol_info_size;
174 int exec_symbol_info_size;
175 int tensor_block_size;
176 int sub_prep_size;
177 ccv_nnc_tensor_block_t* tensor_blocks;
178 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
179 ccv_nnc_graph_exec_flag_t* exec_flags;
180 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
181 int* dup_tensor_block_ref;
182 ccv_nnc_graph_visit_t* visit;
183 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
184 struct ccv_nnc_symbolic_graph_prep_s* p;
185 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
186 // Structures that don't require to be freed after deallocation.
187 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
188 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
189 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
190 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
191} ccv_nnc_symbolic_graph_prep_t;
192
193typedef struct {
194 int oc;
195 ccv_array_t* itf;
196} ccv_nnc_tensor_block_adjacent_t;
197
198static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
199{
200 // Compute how many dis-continuous buffers are needed.
201 // We prefer to have several dis-continuous buffers instead of one big buffer because
202 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
203 // to fully utilize memory.
204 int i, j, k;
205 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
206 int allocable_tensor_size = 0, available_tensor_size = 0;
207 for (i = 0; i < tensor_block_size; i++)
208 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
209 {
210 // Tensors that we need the header info.
211 ++available_tensor_size;
212 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
213 // Tensors that we actually need to allocate (exclude the alias).
214 ++allocable_tensor_size;
215 }
216 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
217 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
218 ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
219 // Overlap count.
220 for (i = 0; i < tensor_block_size; i++)
221 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
222 for (j = i + 1; j < tensor_block_size; j++)
223 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
224 {
225 // Check to see if they interfere (default to yes).
226 // If any of the i's head is deterministically later than j's tail
227 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
228 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
229 if (i_hop_j > 0)
230 {
231 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
232 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
233 }
234 const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
235 if (j_hop_i > 0)
236 {
237 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
238 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
239 }
240 // It cannot be that both i can hop to j and j can hop to i.
241 assert(!(i_hop_j > 0 && j_hop_i > 0));
242 if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
243 {
244 if (!adj[i].itf)
245 adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
246 ccv_array_push(adj[i].itf, &j);
247 ++adj[i].oc;
248 if (!adj[j].itf)
249 adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
250 ccv_array_push(adj[j].itf, &i);
251 ++adj[j].oc;
252 }
253 }
254 const int exec_dep_rows = exec_dep->rows;
255 ccv_matrix_free(exec_dep);
256 ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
257 int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
258 uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
259 uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
260 uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
261 int num_assigned = 0;
262 // I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
263 // Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
264 // The first channel denotes the bytes available for allocation,
265 // the second channel denotes the offset available for the allocation,
266 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
267 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
268 for (j = 0; j < allocable_tensor_size;)
269 {
270 // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
271 uint64_t max_size = 0;
272 ccv_array_clear(opt);
273 int current_type = 0; // Deal with one type at a time.
274 for (i = 0; i < tensor_block_size; i++)
275 if (tensor_blocks[i].size >= max_size &&
276 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
277 IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
278 (!current_type || tensor_blocks[i].type == current_type))
279 {
280 ccv_nnc_tensor_opt_t a = {
281 .size = tensor_blocks[i].size,
282 .index = i,
283 .oc = adj[i].oc,
284 .type = tensor_blocks[i].type,
285 };
286 assert(a.type);
287 current_type = a.type; // Now we know the primary type we should deal with.
288 if (tensor_blocks[i].companion_ref)
289 {
290 const int companion_ref = tensor_blocks[i].companion_ref - 1;
291 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
292 a.oc += adj[companion_ref].oc;
293 }
294 // In case we have a tie, take them all in the array.
295 if (a.size > max_size)
296 ccv_array_clear(opt), max_size = a.size;
297 ccv_array_push(opt, &a);
298 }
299 assert(opt->rnum > 0);
300 // Order opt array by the oc because type and size should be equal at this point.
301 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
302 // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
303 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
304 uint64_t min_val[2] = {
305 0, 0
306 };
307 if (j > 0)
308 {
309 for (i = 0; i < opt->rnum; i++)
310 {
311 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
312 if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
313 continue;
314 // Now, determine the order between a and c. After this, we can always check whether y
315 // can hop to the earliest one and if the latest one can hop to x.
316 // The earliest one will be called p and the latest one will be called q.
317 int p = a.index;
318 int q = a.index;
319 if (tensor_blocks[a.index].companion_ref)
320 {
321 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
322 if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
323 continue;
324 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
325 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
326 p = companion_ref;
327 else {
328 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
329 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
330 q = companion_ref;
331 else { // Otherwise, b is in between p and q.
332 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
333 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
334 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
335 }
336 }
337 }
338 assert(tensor_blocks[q].type == tensor_blocks[p].type);
339 const int type = tensor_blocks[p].type;
340 // y is always earlier than x, but this is hard to assert now.
341 // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
342 // Thus, the hop between y and x (through a) should be smallest ones.
343 // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
344 // out of q. For these nodes, we try to verify whether they form a connection (by checking against
345 // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
346 int y_size = 0;
347 ccv_nnc_tensor_hop_t* const y_buf = buf;
348#define for_block(y, val) do { \
349 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
350 y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
351 .idx = y + 1, .hop = ((int*)val)[0] \
352 }; \
353 } while(0)
354 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
355 if (y_vector)
356 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
357#undef for_block
358 assert(y_size <= tensor_block_size);
359 int x_size = 0;
360 ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
361#define for_block(x, val) do { \
362 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
363 x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
364 .idx = x + 1, .hop = ((int*)val)[0] \
365 }; \
366 } while(0)
367 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
368 if (x_vector)
369 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
370#undef for_block
371 assert(y_size + x_size <= tensor_block_size);
372 int x, y;
373 _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
374 for (y = 0; y < y_size; y++)
375 {
376 const int hop = exec_dep_rows + y_buf[y].hop;
377 if (hop >= min_hop)
378 break;
379 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
380 if (val.u64 && val.u64[0] >= a.size)
381 {
382 min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
383 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
384 break;
385 }
386 }
387 _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
388 for (x = 0; x < x_size; x++)
389 {
390 const int hop = exec_dep_rows + x_buf[x].hop;
391 if (hop >= min_hop)
392 break;
393 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
394 if (val.u64 && val.u64[0] >= a.size)
395 {
396 min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
397 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
398 break;
399 }
400 }
401 const int x_min_hop = x_buf[0].hop;
402 for (y = 0; y < y_size; y++)
403 {
404 const int y_hop_p_v = y_buf[y].hop;
405 if (y_hop_p_v + x_min_hop >= min_hop)
406 break;
407 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
408 if (y_vector)
409 {
410 for (x = 0; x < x_size; x++)
411 {
412 const int q_hop_x_v = x_buf[x].hop;
413 const int hop = y_hop_p_v + q_hop_x_v;
414 if (hop >= min_hop)
415 break;
416 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
417 if (val.u64 && val.u64[0] >= a.size)
418 {
419 min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
420 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
421 break;
422 }
423 }
424 }
425 }
426 // If I found a place, stop, and exit.
427 if (min_y > 0 || min_x < tensor_block_size + 1)
428 {
429 min_i = i;
430 break;
431 }
432 // There is no space to insert this block, mark it as such.
433 tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
434 if (tensor_blocks[a.index].companion_ref)
435 {
436 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
437 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
438 }
439 }
440 }
441 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
442 // and default to largest size available.
443 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
444 if (min_i == -1)
445 {
446 allocated_size[num_assigned] = a.size;
447 ++num_assigned;
448 }
449 int assign_group = num_assigned;
450 if (min_y > 0)
451 {
452 assign_group = assigned[min_y - 1];
453 // The y and x should belong to the same assigned group.
454 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
455 } else if (min_x < tensor_block_size + 1)
456 assign_group = assigned[min_x - 1];
457 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
458 if (min_y != 0 || min_x != tensor_block_size + 1)
459 {
460 uint64_t val[2] = {
461 min_val[0], min_val[1]
462 };
463 assert(val[0] >= a.size);
464 val[0] -= a.size;
465 val[1] = val[1] + a.size; // Move the offset to the next one.
466 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
467 }
468 int strings[3];
469 strings[0] = a.index + 1;
470 int string_size = 1;
471 // Assign out the designated companion if it exists.
472 if (tensor_blocks[a.index].companion_ref)
473 {
474 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
475 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
476 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
477 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
478 {
479 for (i = 0; i < string_size; i++)
480 strings[i + 1] = strings[i];
481 strings[0] = companion_ref + 1;
482 } else {
483 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
484 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
485 strings[string_size] = companion_ref + 1;
486 else {
487 // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
488 assert(string_size == 2);
489 strings[2] = strings[1];
490 strings[1] = companion_ref + 1;
491 }
492 }
493 ++string_size;
494 }
495 // Assign out and update oc.
496 for (i = 0; i < string_size; i++)
497 {
498 const int index = strings[i] - 1;
499 // Assign out the selected one.
500 assigned[index] = assign_group;
501 // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
502 allocated_offset[index] = min_val[1];
503 if (adj[index].itf)
504 for (k = 0; k < adj[index].itf->rnum; k++)
505 {
506 const int d = *(int*)ccv_array_get(adj[index].itf, k);
507 if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
508 --adj[d].oc;
509 }
510 }
511 uint64_t val[2] = {
512 a.size, min_val[1]
513 };
514 uint64_t consumed_size = 0;
515 // Go over from min_y to string_size (excluding min_x).
516 for (i = 0; i < string_size; i++)
517 {
518 const uint64_t size = tensor_blocks[strings[i] - 1].size;
519 assert(size <= a.size);
520 // Update consumed size if it is bigger than "size".
521 if (size > consumed_size)
522 {
523 val[0] = size - consumed_size;
524 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
525 consumed_size = size;
526 val[1] = min_val[1] + consumed_size;
527 }
528 // If it consumed all the flow, break out.
529 if (consumed_size == a.size)
530 break;
531 }
532 for (i = 0; i < string_size; i++)
533 {
534 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
535 uint64_t val[2] = {
536 i_size, min_val[1]
537 };
538 uint64_t consumed_size = 0;
539 for (k = i + 1; k < string_size; k++)
540 {
541 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
542 // Update consumed size if it is bigger than "size".
543 if (size > consumed_size)
544 {
545 val[0] = size - consumed_size;
546 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
547 consumed_size = size;
548 val[1] = min_val[1] + consumed_size;
549 }
550 // If it consumed all the flow, break out.
551 if (consumed_size == i_size)
552 break;
553 }
554 val[0] = i_size - consumed_size;
555 // Still have residual, flow it to min_x.
556 if (val[0] > 0)
557 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
558 }
559 if (min_i == -1)
560 {
561 // If we decide to insert a new edge, simply mark anyone that does not interfere with it to be redone.
562 const int p = strings[0] - 1;
563 const int q = strings[string_size - 1] - 1;
564 const int type = tensor_blocks[p].type;
565#define for_block(y, val) do { \
566 if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
567 { \
568 tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
569 if (tensor_blocks[y].companion_ref) \
570 { \
571 const int companion_ref = tensor_blocks[y].companion_ref - 1; \
572 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
573 } \
574 } \
575 } while(0)
576 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
577 if (y_vector)
578 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
579#undef for_block
580#define for_block(x, val) do { \
581 if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
582 { \
583 tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
584 if (tensor_blocks[x].companion_ref) \
585 { \
586 const int companion_ref = tensor_blocks[x].companion_ref - 1; \
587 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
588 } \
589 } \
590 } while(0)
591 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
592 if (x_vector)
593 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
594#undef for_block
595 }
596 j += string_size;
597 }
598 ccfreefree(tensor_block_cannot_insert);
599 ccfreefree(buf);
600 ccv_array_free(opt);
601 ccv_matrix_free(tensor_df);
602 ccv_matrix_free(tensor_dt);
603#define for_block(y, x, val) do { \
604 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
605 { \
606 if (!alloc_dep[x - 1]) \
607 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
608 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
609 } \
610 } while (0)
611 CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch
((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t
_i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__
((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF
); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0
; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_
= (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const
_v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 ||
!_v_->size) continue; for (_j_ = 0; _j_ < _v_->size
; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 +
(_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else
{ switch ((((alloc)->type) & 0xFF000)) { case CCV_32S
: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->
size; __attribute__((unused)) const size_t _c_ = (((alloc)->
type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
} while (0)
;
612#undef for_block
613 ccv_matrix_free(alloc);
614 for (i = 0; i < tensor_block_size; i++)
615 if (adj[i].itf)
616 ccv_array_free(adj[i].itf);
617 ccfreefree(adj);
618 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
619 alloc_prep->alloc_dep = alloc_dep;
620 alloc_prep->vt_block_size = tensor_block_size;
621 alloc_prep->buffer_size = num_assigned;
622 alloc_prep->block_size = available_tensor_size;
623 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
624 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
625 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
626 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
627 for (i = 0; i < num_assigned; i++)
628 alloc_prep->buffers[i].size = allocated_size[i];
629 if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
630 {
631 size_t total_size = 0;
632 for (i = 0; i < num_assigned; i++)
633 total_size += allocated_size[i];
634 PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf
("Total buffer size of %zu to be allocated\n", total_size); fflush
(stdout); } } while (0)
;
635 }
636 ccfreefree(allocated_size);
637 j = 0;
638 // Assigning out the tensors (in case of sharing tensors / in-place ops).
639 for (i = 0; i < tensor_block_size; i++)
640 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
641 {
642 alloc_prep->blocks[j].block_ref = i;
643 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
644 {
645 alloc_prep->vt_blocks[i] = j;
646 // Also, set its allocations.
647 assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 647, __extension__ __PRETTY_FUNCTION__
); }))
;
648 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
649 alloc_prep->blocks[j].offset = allocated_offset[i];
650 if (!alloc_prep->buffers[buffer_ref].type)
651 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
652 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
653 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
654 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 654, __extension__ __PRETTY_FUNCTION__
); }))
;
655 } else {
656 alloc_prep->vt_blocks[i] = -1;
657 alloc_prep->blocks[j].buffer_ref = -1;
658 alloc_prep->blocks[j].offset = 0;
659 }
660 ++j;
661 } else
662 alloc_prep->vt_blocks[i] = -1;
663 ccfreefree(allocated_offset);
664 ccfreefree(assigned);
665 return alloc_prep;
666}
667
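// Illustrative sketch (not part of ccv_nnc_symbolic_graph_compile.c): the function above packs its
// blocks, buffers and vt_blocks arrays behind one allocation, biggest element type first (lines 618-625).
// A minimal standalone version of that carving pattern; all names here are hypothetical.
#include <stdlib.h>

typedef struct { int block_ref, buffer_ref; unsigned long long offset; } example_block_t;
typedef struct { unsigned long long size; int type; } example_buffer_t;
typedef struct {
	example_block_t* blocks;
	example_buffer_t* buffers;
	int* vt_blocks;
} example_prep_t;

static example_prep_t* example_prep_new(const int block_size, const int buffer_size, const int vt_size)
{
	example_prep_t* const prep = (example_prep_t*)malloc(sizeof(example_prep_t) + sizeof(example_block_t) * block_size + sizeof(example_buffer_t) * buffer_size + sizeof(int) * vt_size);
	prep->blocks = (example_block_t*)(prep + 1); // Carve sub-arrays out of the same allocation, biggest element type first.
	prep->buffers = (example_buffer_t*)(prep->blocks + block_size);
	prep->vt_blocks = (int*)(prep->buffers + buffer_size);
	return prep;
}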
668static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
669{
670 int i;
671 for (i = 0; i < alloc_prep->vt_block_size; i++)
672 if (alloc_prep->alloc_dep[i])
673 ccv_array_free(alloc_prep->alloc_dep[i]);
674 for (i = 0; i < alloc_prep->buffer_size; i++)
675 if (alloc_prep->buffers[i].dup_p_refs)
676 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
677 ccfreefree(alloc_prep->alloc_dep);
678 ccfreefree(alloc_prep);
679}
680
681// Simple allocator from ccv_array_t.
682static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
683{
684 int pos = tensor_metadata->rnum;
685 int rsize = (size + 15) / 16;
686 ccv_array_resize(tensor_metadata, pos + rsize);
687 return (pos << 1) + 1;
688}
689
690static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
691{
692 assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 692, __extension__ __PRETTY_FUNCTION__
); }))
;
693 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)))
;
694}
695
696#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
697
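// Illustrative sketch (not part of this file): the metadata "position" scheme above. A position is a
// 16-byte slot index shifted left by one with the low bit set, so it can always be told apart from a
// real (at least 2-byte aligned) tensor pointer. A hypothetical standalone version, assuming the same
// 16-byte slot size as the allocator above:
#include <stddef.h>
#include <stdint.h>

static int example_pos_new(int* const slot_count, const size_t size)
{
	const int pos = *slot_count;
	*slot_count += (int)((size + 15) / 16); // Round the requested size up to 16-byte slots.
	return (pos << 1) + 1; // The low bit marks "this is a position, not a pointer".
}

static int example_is_pos(const void* const ptr)
{
	return (int)((uintptr_t)ptr & 1); // Real pointers are aligned, so their low bit is 0.
}

static void* example_get(uint8_t* const metadata, const int pos)
{
	return metadata + 16 * (size_t)(pos >> 1); // Decode the slot index back into an address.
}
// For example, a first 48-byte request occupies slots 0..2 and returns position 1; the next request
// starts at slot 3 and returns 7, and (7 >> 1) recovers slot 3.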
698static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
699{
700 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
701 if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
702 return vt_tensor;
703 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
704 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
705 {
706 const int alias_ref = tensor->alias_ref;
707 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
708 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
709 }
710 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
711 {
712 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
713 int i;
714 const int count = mv->kind + mv->repeat;
715 for (i = 0; i < count; i++)
716 {
717 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1)
)
718 {
719 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
720 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
721 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
722 }
723 }
724 // No need to recursively rewire the parent pointer, otherwise we would be in a deep rewire.
725 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
726 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
727 if (mv->sp)
728 for (i = 0; i < mv->sp->rnum; i++)
729 {
730 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
731 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
732 {
733 const int pos = (int)(intptr_t)*tensor;
734 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
735 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 735, __extension__ __PRETTY_FUNCTION__
); }))
;
736 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
737 }
738 }
739 }
740 return tensor;
741}
742
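// Illustrative sketch (not part of this file): why positions are stored at all and rewired later.
// While the metadata array may still grow (and presumably relocate its storage), pointer-typed fields
// hold tagged positions; once the array is final, a rewire pass such as the one above replaces every
// tagged position with the real address. A hypothetical field would be patched like this:
#include <stddef.h>
#include <stdint.h>

typedef struct example_tensor { struct example_tensor* alias; } example_tensor_t;

static example_tensor_t* example_rewire_field(uint8_t* const metadata, example_tensor_t* const field)
{
	if (!((uintptr_t)field & 1)) // Already a real pointer: nothing to do.
		return field;
	const int pos = (int)(intptr_t)field;
	return (example_tensor_t*)(metadata + 16 * (size_t)(pos >> 1)); // Decode the tagged position.
}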
743typedef struct {
744 const uint8_t* ptr;
745 int pos;
746} ccv_nnc_tensor_block_pos_t;
747
748static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
749{
750 int i;
751 int unref_block_ref = block_ref;
752 while (prep->tensor_blocks[unref_block_ref].ref)
753 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
754 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
755 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 755, __extension__ __PRETTY_FUNCTION__); }))
;
756 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
== prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 756, __extension__ __PRETTY_FUNCTION__
); }))
;
757 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
758 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
759 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
760 for (i = idx - 1; i >= 0; i--)
761 {
762 assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
(p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 762, __extension__ __PRETTY_FUNCTION__); }))
;
763 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
764 const int unroll_count = graph_prep->unroll_count;
765 if (ch[i]) // Prefer the dup side of things.
766 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
767 int unref_p_ref = p_ref;
768 while (graph_prep->tensor_blocks[unref_p_ref].ref)
769 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
770 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
771 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
772 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
773 // If the buffer already exists, prefer that.
774 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
775 if (ptr)
776 {
777 // If any remaining part of the path is not covered from 0, I cannot possibly
778 // have a pointer from the buffer (that can only happen if it is not a dup).
779 for (--i; i >= 0; i--)
780 if (ch[i] != 0)
781 return 0;
782 // Try to find the created tensor block pos in the array, just linear scan.
783 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
784 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
785 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
786 ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
787 return tv_pos;
788 }
789 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
790 }
791 return 0;
792}
793
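// Illustrative sketch (not part of this file): the search above starts from the block's offset in its
// own level and walks outward through the parent preps, adding each level's block offset, until it
// reaches a level whose buffer already has a real pointer; the tensor then lives at that pointer plus
// the accumulated offset. A generic version of the walk, with hypothetical types:
#include <stdint.h>

typedef struct example_level {
	const struct example_level* parent; // NULL at the root.
	uint64_t block_offset;              // Offset of this block inside the enclosing buffer.
	uint8_t* buffer_ptr;                // Non-NULL once this level's buffer is allocated.
} example_level_t;

static uint8_t* example_resolve(const example_level_t* level)
{
	uint64_t offset = 0;
	for (; level; level = level->parent)
	{
		offset += level->block_offset;
		if (level->buffer_ptr) // The first level with an allocated buffer wins.
			return level->buffer_ptr + offset;
	}
	return 0; // No buffer allocated along this path yet.
}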
794// Descend from the root to the prep level, and compose the multi-view from there.
795static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
796{
797 assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 797, __extension__ __PRETTY_FUNCTION__); }))
;
798 int i;
799 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
800 const int unroll_count = prep->unroll_count;
801 if (prep == graph_prep)
802 {
803 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
804 if (!data_pos)
805 return -1;
806 // Based on ch, go all the way back to find the exact pointer to compose.
807 if (// !assign_update && // If I plan to receive the assign update, we don't need multiple receivers. Just one tensor to receive the update is enough.
808 prep->dup_tensor_block_ref &&
809 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
810 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
811 {
812 int pos[unroll_count + 1];
813 pos[0] = data_pos;
814 for (i = 0; i < unroll_count; i++)
815 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
816 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
817 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
818 ccv_nnc_tensor_t* data[unroll_count + 1];
819 for (i = 0; i < unroll_count + 1; i++)
820 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
821 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
822 for (i = 0; i < unroll_count + 1; i++)
823 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
824 *pos_ref = mv_pos;
825 } else {
826 *pos_ref = data_pos;
827 }
828 if (preserve)
829 {
830 // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv;
831 // at any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
832 // an mv of K11; when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
833 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
834 // arena is allocated).
835 // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether we passed a raw pointer directly or
836 // an mv structure above. If we passed an mv structure, we just pass it through here. If we passed a raw pointer,
837 // we need to wrap it in a K01 structure.
838 // Why didn't we wrap it directly as a K11 with mv->data[0] pointing to the assigned tv pointer and mv->data[1]
839 // pointing to the raw pointer (as ptr_ref)? The reason is we don't know whether the assigned tv points to one
840 // memory region or is managed by a multi-view tensor, which could point to different memory regions.
841 int prev_mv_pos = *pos_ref;
842 if (prev_mv_pos == -1)
843 {
844 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
845 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
846 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
847 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
848 tv,
849 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
850 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
851 }
852 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
853 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
854 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
855 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
856 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
857 (ccv_nnc_tensor_t*)prev_mv,
858 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
859 prev_mv->p = (void*)(intptr_t)mv_pos;
860 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
861 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
862 *pos_ref = mv_pos;
863 }
864 return 0;
865 }
866 ch[idx] = 0;
867 int pos[unroll_count + 1];
868 pos[0] = 0;
869 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
870 assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 870, __extension__ __PRETTY_FUNCTION__); }))
;
871 for (i = 0; i < unroll_count; i++)
872 {
873 ch[idx] = i + 1;
874 pos[i + 1] = 0;
875 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
876 if (dup_retval < 0)
877 {
878 assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 878, __extension__ __PRETTY_FUNCTION__); }))
;
879 break;
880 }
881 }
882 // If current prep has no dup.
883 if (i == 0)
884 {
885 *pos_ref = pos[0];
886 return 0;
887 }
888 ccv_nnc_tensor_t* data[unroll_count + 1];
889 // Compose to a new multiview.
890 for (i = 0; i < unroll_count + 1; i++)
891 { assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
(pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 891, __extension__ __PRETTY_FUNCTION__); }))
; }
892 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
893 for (i = 0; i < unroll_count + 1; i++)
894 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
895 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
896 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
897 for (i = 0; i < unroll_count + 1; i++)
898 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
899 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
900 for (i = 0; i < unroll_count + 1; i++)
901 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
902 *pos_ref = mv_pos;
903 return 0;
904}
905
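// Illustrative sketch (not part of this file): the recursion above uses ch[] to record which
// duplicated ("unrolled") copy is chosen at every level above the target prep: choice 0 first, then
// each duplicate, with the per-choice results composed into one multi-view. A stripped-down version
// of the enumeration, where the hypothetical leaf() stands in for resolving a position at the target:
static int example_enumerate(int* const ch, const int level, const int level_count, const int unroll_count, int (*leaf)(const int* const ch, void* const context), void* const context)
{
	if (level == level_count)
		return leaf(ch, context); // Reached the target prep: resolve one concrete position.
	int i;
	ch[level] = 0; // Choice 0: the non-duplicated block at this level.
	int sum = example_enumerate(ch, level + 1, level_count, unroll_count, leaf, context);
	for (i = 0; i < unroll_count; i++)
	{
		ch[level] = i + 1; // Then each duplicated ("unrolled") copy at this level.
		const int dup = example_enumerate(ch, level + 1, level_count, unroll_count, leaf, context);
		if (dup < 0) // This level has no duplicate: keep only the choice-0 result.
			break;
		sum += dup; // Stand-in for composing the per-choice results into one multi-view.
	}
	ch[level] = 0;
	return sum;
}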
906static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
907{
908 int i;
909 int is_input = 0;
910 assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 910, __extension__ __PRETTY_FUNCTION__); }))
;
911 for (i = 0; i < node->input_size && !is_input; i++)
912 if (p_ref == node->inputs[i])
913 is_input = 1;
914 int is_output = 0;
915 for (i = 0; i < node->output_size && !is_output; i++)
916 if (p_ref == node->outputs[i])
917 is_output = 1;
918 // Prefer it is an output if it is both the input and the output.
919 if (is_output)
920 return 1;
921 if (is_input)
922 return -1;
923 return 0;
924}
925
926static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
927{
928 // No need to check whether to preserve if this is not a while loop.
929 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
930 return 0;
931 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 931, __extension__ __PRETTY_FUNCTION__
); }))
;
932 // If it is unassigned, no need to preserve.
933 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
934 return 0;
935 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
936 // If p is not input, no need to preserve at all.
937 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
938 return 0;
939 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
940 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 940, __extension__ __PRETTY_FUNCTION__); }))
;
941 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 941, __extension__ __PRETTY_FUNCTION__
); }))
;
942 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
943 // If the buffer is a truly read-only one, no need to preserve.
944 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
945 return 0;
946 /* This needs a detailed explanation: what does preserve mean?
947 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
948 * also used outside of the while loop, we cannot reuse the memory region of x
949 * inside the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming
950 * y uses the same memory region as x). The way to work around this is to use a different
951 * memory region for y = x + 1, but for the first iteration, have x point to the
952 * original. During the allocation process, the way to identify whether x should preserve
953 * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
954 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
955 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
956 * it is the input tensor whenever that is possible. A tensor block can point to two parent
957 * tensors, one is the input tensor, one is the output tensor. p_refs[0] should be the input
958 * tensor whenever that is possible. */
959 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
960 return 0;
961 // Otherwise, return 1 because we now need to preserve.
962 return 1;
963}
964
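// Illustrative sketch (not part of this file): the scenario the comment above describes, with plain
// floats standing in for tensors. If y aliased the storage of the outer x inside
// while { y = x + 1 } (y => x), iteration 0 would clobber a value that is still needed outside the
// loop; preserving x for the first iteration avoids that.
#include <stdio.h>

int main(void)
{
	const float x_outer = 3; // x is also used outside of the while loop and must survive it.
	float x = x_outer, y = 0;
	int t;
	for (t = 0; t < 4; t++)
	{
		y = x + 1; // With a separate region for y, x_outer is untouched at t == 0.
		x = y;     // The (y => x) carry-over of the parameterized loop.
	}
	printf("x_outer = %g (preserved), y = %g\n", x_outer, y);
	return 0;
}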
965static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
966{
967 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 967, __extension__ __PRETTY_FUNCTION__
); }))
;
968 // If it is unassigned, no need to preserve.
969 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
970 return 0;
971 // Only tape vars need to force broadcast; otherwise we already share the same memory region.
972 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
973 return 0;
974 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
975 // If p is not output, no need to broadcast at all.
976 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
977 return 0;
978 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
979 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 979, __extension__ __PRETTY_FUNCTION__); }))
;
980 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 980, __extension__ __PRETTY_FUNCTION__
); }))
;
981 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
982 // If the buffer is a truly read-only one, no need to broadcast.
983 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
984 return 0;
985 // Otherwise, return 1 because we now need to force broadcast for this tape var.
986 return 1;
987}
988
989static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
990{
991 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 991, __extension__ __PRETTY_FUNCTION__); }))
;
992 int i;
993 for (i = 0; i < mv->kind + mv->repeat; i++)
994 if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
995 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = tensor;
996 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
997 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i], tensor);
998}
999
1000static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1001{
1002 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1002, __extension__ __PRETTY_FUNCTION__); }))
;
1003 int i;
1004 if (mv->sp)
1005 for (i = 0; i < mv->sp->rnum; i++)
1006 {
1007 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
1008 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1009 {
1010 const int pos = (int)(intptr_t)*tensor;
1011 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1012 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 1012, __extension__ __PRETTY_FUNCTION__
); }))
;
1013 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1014 }
1015 }
1016 for (i = 0; i < mv->kind + mv->repeat; i++)
1017 {
1018 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]) & 1)
)
1019 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1020 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]->alias_ref) & 1)
)
1021 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref);
1022 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1023 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1024 }
1025}
1026
1027static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1028{
1029 // Go to the root of the graph.
1030 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1031 int i;
1032 for (i = 1; prep->p; i++)
1033 prep = prep->p;
1034 // Root graph should have no dup tensor blocks.
1035 assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1035, __extension__ __PRETTY_FUNCTION__); }))
;
1036 const int c = i;
1037 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1038 prep = graph_prep;
1039 preps[c - 1] = prep;
1040 for (i = 0; prep->p; i++)
1041 preps[c - 2 - i] = prep = prep->p;
1042 int ch[c]; // A variable-length array recording our selections as we recurse from top to bottom.
1043 memset(ch, 0, sizeof(int) * c);
1044 int pos = 0;
1045 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1046 assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
(ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1046, __extension__ __PRETTY_FUNCTION__); }))
; // This should never be modified.
1047 assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
> 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1047, __extension__ __PRETTY_FUNCTION__); }))
;
1048 return pos;
1049}
1050
1051static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1052{
1053 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1054 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1055 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1056 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1057 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1058 tv,
1059 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1060 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1061 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = tensor;
1062 return mv_pos;
1063}
1064
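// Illustrative sketch (not part of this file): what the two-slot multi-view built above means at
// unwrap time, per the earlier comment at line 832: slot 0 (a placeholder until the parent arena fills
// it in) is selected only at loop 0, and every later iteration selects slot 1. A minimal selector over
// hypothetical data:
static void* example_k11_select(void* const data[2], const unsigned long iteration)
{
	return data[iteration == 0 ? 0 : 1]; // Preserve: the original value is only visible at iteration 0.
}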
1065static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1066{
1067 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1068 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1069 if (!is_multiview)
1070 return pos;
1071 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1072 {
1073 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1074 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1075 }
1076 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1077 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1078 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1079 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1080 new_tensor->dataof = tensor.dataof;
1081 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1082 new_tensor->alias_ref = (uintptr_t)pos;
1083 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1084 return new_pos;
1085}
1086
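// Illustrative sketch (not part of this file): "flattening" above follows data[0] down to a leaf
// tensor and publishes a plain (non-multi-view) tensor over the same memory, then registers it with
// the multi-view so it is re-pointed on every broadcast. The descent itself, over hypothetical types:
typedef struct example_view {
	int is_multiview;           // Non-zero for a multi-view node.
	struct example_view* first; // data[0] of a multi-view node.
	void* data;                 // Payload pointer of a leaf tensor.
} example_view_t;

static void* example_flatten(const example_view_t* v)
{
	while (v->is_multiview)
		v = v->first; // Descend through data[0] until a leaf tensor is reached.
	return v->data;   // The flat tensor borrows this leaf's current pointer.
}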
1087static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1088{
1089 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1090 // What it references is not an alias itself.
1091 assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1091, __extension__ __PRETTY_FUNCTION__
); }))
;
1092 const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1093 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1094 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1094, __extension__ __PRETTY_FUNCTION__
); }))
;
1095 // Will use that to determine whether to insert a reference or not.
1096 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1097 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1098 {
1099 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1100 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1101 }
1102 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1103 // If there is no ofs and the stride is packed (matches dim), we take a shortcut and just init as a normal tensor.
1104 int pos;
1105 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1106 ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1107 {
1108 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1109 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1110 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1111 tensor->dataof = alias_tensor.dataof;
1112 } else {
1113 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1114 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1115 // Otherwise initialize a tensor view
1116 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1117 tensor_view->alias_ref = (uintptr_t)alias_pos;
1118 }
1119 vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1120 if (is_multiview)
1121 {
1122 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1123 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1124 }
1125}
1126
1127static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1128{
1129 // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1130 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1131 {
1132 const int ref = tensor_blocks[block_ref].alias_ref - 1;
1133 if (!vt_tensors[ref])
1134 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1135 vt_tensors[block_ref] = vt_tensors[ref];
1136 return;
1137 }
1138 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1138, __extension__ __PRETTY_FUNCTION__
); }))
;
1139 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1140 // If vt_tensors is not set for it, this must be a ref with an alias_ref (through folding). If that is the case, do this recursively until all aliases are assigned.
1141 if (!vt_tensors[alias_ref])
1142 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1143 _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1144}
1145
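// Illustrative sketch (not part of this file): the recursion above resolves chains (a folded ref that
// points at an alias, or an alias whose base is itself unresolved) by assigning the referenced entry
// first and then deriving the current one from it. A generic dependency walk, with hypothetical names:
static void example_resolve(const int* const ref_of, void** const resolved, void* (*derive)(void* const base), const int i)
{
	if (resolved[i]) // Already assigned: nothing to do.
		return;
	const int ref = ref_of[i];
	if (ref < 0) // No reference to follow; base entries are assigned elsewhere.
		return;
	if (!resolved[ref])
		example_resolve(ref_of, resolved, derive, ref); // Assign what it references first.
	resolved[i] = derive(resolved[ref]); // Then derive this entry from it.
}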
1146// Turn a linear pointer into an object storage handle (such as an MTLBuffer).
1147#ifdef HAVE_MPS
1148static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1149{
1150 mpobjfree(0, ptr);
1151}
1152#endif
1153
1154typedef struct {
1155 size_t size;
1156 void* obj;
1157} tensor_arena_obj_track_t;
1158
1159typedef struct {
1160 void* ptr;
1161 off_t offset;
1162 size_t size;
1163} obj_ptr_key_t;
1164
1165static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1166{
1167 return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1168}
1169
1170static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1171{
1172 return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1173}
1174
1175KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)
1176
1177static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1178{
1179 if (params.dim[0] == 0)
1180 return 0;
1181#ifdef HAVE_MPS
1182 if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
1183 {
1184 int ret;
1185  const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * ccv_nnc_tensor_count(params);
1186 const obj_ptr_key_t key = {
1187 .ptr = ptr,
1188 .offset = offset,
1189 .size = size,
1190 };
1191  khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret);
1192 if (ret != 0)
1193 {
1194 void* obj = mpobjcreate(ptr, offset, size);
1195 if (!tensor_arena->disposers)
1196 tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1197 ccv_nnc_arena_disposer_t disposer = {
1198 .ptr = obj,
1199 .userdata = 0,
1200 .dispose = _ccv_nnc_tensor_arena_obj_dispose
1201 };
1202 ccv_array_push(tensor_arena->disposers, &disposer);
1203   kh_val(obj_ptr_map, k) = obj;
1204 return obj;
1205 } else
1206   return kh_val(obj_ptr_map, k);
1207 }
1208#endif
1209 return ptr + offset;
1210}
1211
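// --- Editor's sketch (not part of ccv_nnc_symbolic_graph_compile.c) ---
// The helper above memoizes one backing object per distinct (ptr, offset, size) key through a
// khash map: kh_put() reserves the bucket, a non-zero ret means the key is new and the object
// must be created, ret == 0 means the cached object can be reused. The self-contained sketch
// below shows the same insert-or-reuse idiom with klib's stock int64 map; make_object() is a
// hypothetical stand-in for mpobjcreate(), not a ccv API.
#include <stdint.h>
#include <stdlib.h>
#include "khash.h"

KHASH_MAP_INIT_INT64(cache, void*)

static void* make_object(const int64_t key) // Hypothetical: allocate whatever the key describes.
{
	int64_t* const obj = (int64_t*)malloc(sizeof(int64_t));
	*obj = key;
	return obj;
}

static void* get_or_create(khash_t(cache)* const map, const int64_t key)
{
	int ret;
	const khiter_t k = kh_put(cache, map, key, &ret); // Reserve (or find) the bucket for this key.
	if (ret != 0) // Key not seen before (empty or previously deleted bucket): create and memoize.
		kh_val(map, k) = make_object(key);
	return kh_val(map, k); // Either the freshly created object or the cached one.
}
// In the real helper the freshly created object is also pushed onto tensor_arena->disposers so it
// can be released when the arena is freed.
// --- End editor's sketch ---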
1212static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1213{
1214 // All tensors are assigned out now; num_assigned is the number of dis-continuous buffers.
1215 // Each tensor has its designated buffer in the assigned array, and its offset in allocated_offset.
1216 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1217 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1218 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1219 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1220 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1221 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1222 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1223 const int unroll_count = graph_prep->unroll_count;
1224 int i, j;
1225 for (i = 0; i < tensor_symbol_info_size; i++)
1226  for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && j < unroll_count; j++)
1227  {
1228   const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1229   if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1230    TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i]);
1231  }
1232 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1233 graph_prep->tensor_arena = tensor_arena;
1234 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1235 tensor_arena->buffers = (void*)(tensor_arena + 1);
1236 tensor_arena->buffer_size = alloc_prep->buffer_size;
1237 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1238 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1239 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1240 tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1241 tensor_arena->pb_vt_tensors = 0;
1242 tensor_arena->vt_alias_r_refs_p = 0;
1243 tensor_arena->vt_alias_r_refs = 0;
1244 tensor_arena->vt_sizes = 0;
1245 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1246 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1247 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1248 tensor_arena->allocator.context.free = allocator.context.free;
1249 tensor_arena->allocator.isa = allocator.isa;
1250 tensor_arena->disposers = 0;
1251 // Copy alias_ref info back to the tensor arena.
1252 for (i = 0; i < tensor_symbol_info_size; i++)
1253 tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1254 // Do the buffer copies.
1255 for (i = 0; i < alloc_prep->buffer_size; i++)
1256 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1257 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1258 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1259 if (graph_prep->while_count_tensor)
1260 {
1261 // If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1262 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1263  assert((0 << 1) + 1 == pos); // pos must be 0 position.
1264  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1265  *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1266 }
1267 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1268 if (p_arena && p_graph_prep)
1269 {
1270 // Don't need to allocate the actual buffer, just use the pointer from the above.
1271  PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1272 for (i = 0; i < tensor_arena->buffer_size; i++)
1273 {
1274 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1275 int unref_p_ref = p_ref;
1276 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1277 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1278   assert(unref_p_ref >= 0);
1279 const int p_unroll_count = p_graph_prep->unroll_count;
1280 if (p_graph_prep->dup_tensor_block_ref &&
1281 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1282 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1283 {
1284 // This condition means in the parent graph, we point to multiple tensor blocks for the same
1285 // buffer, therefore, we cannot have one single pointer assigned in this case.
1286 // Later we will handle this by generate ccv_tensor_multiview_t structure.
1287 tensor_arena->buffers[i].ptr = 0;
1288    PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1289 continue;
1290 }
1291 // Otherwise, find the actual buffer pointer.
1292 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1293   assert(vt_ref >= 0);
1294 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1295 if (!p_arena->buffers[buffer_ref].ptr)
1296 {
1297 // Pass it down as 0 ptr.
1298 tensor_arena->buffers[i].ptr = 0;
1299    PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1300 continue;
1301 }
1302 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1303 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1304   PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1305 }
1306 } else {
1307 // Now, allocate actual buffers.
1308  PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1309 for (i = 0; i < tensor_arena->buffer_size; i++)
1310 {
1311 const int buffer_type = tensor_arena->buffers[i].type;
1312   const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1313#ifdef HAVE_CUDA
1314   if (memory_type == CCV_TENSOR_GPU_MEMORY)
1315   {
1316    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1317    if (allocator.isa && allocator.isa->alloc)
1318     tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1319    else
1320     tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1321    PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1322 } else {
1323    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1324    if (tensor_arena->buffers[i].pin_mem)
1325     tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1326    else
1327     ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1328    PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1329 }
1330#elif defined(HAVE_MPS)
1331 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1332 {
1333    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1334    // if (allocator.isa && allocator.isa->alloc)
1335    //  tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1336    // else
1337    tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1338    PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1339   } else {
1340    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1341    ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1342    PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1343   }
1344#else
1345   assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1346   ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1347   PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1348#endif
1349   assert(tensor_arena->buffers[i].ptr);
1350 }
1351 }
1352 // Go over sub_preps and allocate arenas for them. Do it this early because
1353 // we may reference tensors from sub arenas, the reason why we need to reference
1354 // tensors from sub arenas is because for output tensors, sub arena's tensor
1355 // will have automatic reference updates.
1356 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1357 if (graph_prep->sub_preps[i])
1358 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1359 else
1360 tensor_arena->sub_arenas[i] = 0;
1361 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1362 // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1363 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1364#ifdef HAVE_MPS
1365 khash_t(obj_ptr)* obj_ptr_map = kh_init(obj_ptr);
1366#else
1367 khash_t(obj_ptr)* obj_ptr_map = 0;
1368#endif
1369 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1370 if (tensor_arena->sub_arenas[i])
1371 {
1372   assert(graph_prep->sub_preps[i]);
1373   const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1374   const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1375   if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1376    for (j = 0; j < node->output_size; j++)
1377    {
1378     const int idx = node->outputs[j];
1379     const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1380     assert(s_idx >= 0);
1381     ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1382     assert(sub_arena_out_tensors[idx] == 0);
1383     ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1384     // Only assign if it is a multiview tensor.
1385     if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1386      (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)))
1387 sub_arena_out_tensors[idx] = sub_tensor;
1388 }
1389 }
1390 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1391 for (i = 0; i < tensor_symbol_info_size; i++)
1392  if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1393 {
1394 const int vt_ref = alloc_prep->vt_blocks[i];
1395 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1396 // Either we have dup_tensor_block_ref in current layer, or we have that in
1397 // previous layer, therefore, cannot really find the buffer ptr.
1398 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1399 ((graph_prep->dup_tensor_block_ref &&
1400 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1401 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1402 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1403 {
1404   assert(graph_prep->p); // This must be in a sub-graph.
1405 // If this is an input tensor, and it need to be preserved, wait until when we go through inputs to preserve.
1406 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1407 continue;
1408 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1409 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1410 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1411  } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1412 // When we want to allocate, we don't really need to if it need force broadcast, because we will handle that later.
1413 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1414 // If already created, use the same tensor, and continue.
1415 // Having ptr.
1416 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1417 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1418 // Also, set its allocations.
1419 // Since tensor view is bit compatible with tensor, we can just cast.
1420 void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1421 *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1422   assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1423 // If we need to force broadcast, we need to wrap it in a multiview.
1424 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1425 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1426 {
1427 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1428 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1429 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1430 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1431 tv,
1432 }, 0, 1, graph_prep->graph, mv);
1433    CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1434 pos = mv_pos;
1435 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1436 }
1437 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1438 }
1439 }
1440#ifdef HAVE_MPS
1441 kh_destroy(obj_ptr, obj_ptr_map);
1442#endif
1443 // Handle binded tensors. First handle cases without aliases.
1444 for (i = 0; i < tensor_bind_size; i++)
1445 {
1446  assert(tensor_binds[i].tensor);
1447 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1448 if (resolved_symbol.d >= 0)
1449 {
1450 int d = resolved_symbol.d;
1451   if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1452 continue;
1453 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1454 // It has nothing to do with alias.
1455   while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1456 d = tensor_blocks[d].ref - 1;
1457 // For binded tensors, it shouldn't be assigned yet.
1458 // If it is assigned, the pointer should match the ones from the binded tensor.
1459 // This can only happen if an enforced in-place tensor is binded twice. If that
1460 // happens, we need to make sure it is binded to the same location.
1461   assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1462 // See above assertion.
1463 if (tensor_arena->vt_tensors[d])
1464 continue;
1465   if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1466 {
1467 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1468 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1469 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1470 if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1471     for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1472      { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1473    // It is OK to be just as a whole smaller or equal to the binded one.
1474    assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1475 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1476 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1477 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1478 } else {
1479 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1480 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1481 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1482 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1483 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1484 tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1485 tv->dataof = tensor_binds[i].tensor->dataof;
1486 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1487 }
1488 }
1489 }
1490 // Handle binded tensors. We handle alias here so it can reference to binded tensors.
1491 for (i = 0; i < tensor_bind_size; i++)
1492 {
1493  assert(tensor_binds[i].tensor);
1494 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1495 if (resolved_symbol.d >= 0)
1496 {
1497 int d = resolved_symbol.d;
1498   if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1499 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1500 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1501 // It has nothing to do with alias.
1502   while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1503 d = tensor_blocks[d].ref - 1;
1504 if (tensor_arena->vt_tensors[d])
1505 continue;
1506 // Assert original alias has no ofs. Otherwise our binding will be problematic.
1507   for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1508    { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0); }
1509   if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1510   {
1511    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1512    ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1513    ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1514    if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1515     for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1516      { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1517    // It is OK to be just as a whole smaller or equal to the binded one.
1518    assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1519 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1520 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1521 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1522 } else {
1523 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1524 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1525 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1526 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1527 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1528 tv->data = tensor_binds[i].tensor->data;
1529 tv->dataof = tensor_binds[i].tensor->dataof;
1530 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1531 }
1532 }
1533 }
1534 // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1535 // Avoiding refs that actually is an alias.
1536 for (i = 0; i < tensor_symbol_info_size; i++)
1537 // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1538  if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1539  {
1540   int ref = tensor_blocks[i].ref - 1;
1541   while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && tensor_blocks[ref].ref)
1542    ref = tensor_blocks[ref].ref - 1;
1543   assert(tensor_arena->vt_tensors[ref]);
1544 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1545 }
1546 // Now after refs assigned out, handle the case I need to preserve because I am a sub graph of while loop.
1547 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1548 {
1549  assert(graph_prep->p);
1550 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1551 const int p_idx = graph_prep->p_idx - 1;
1552 for (i = 0; i < node->input_size; i++)
1553 {
1554 const int idx = node->inputs[i];
1555   int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1556   assert(!tensor_blocks[block_ref].ref);
1557 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1558 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1559 continue;
1560   assert(vt_ref >= 0);
1561   const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1562   assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1563   assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1564 // Either we have dup_tensor_block_ref in current layer, or we have that in
1565 // previous layer, therefore, cannot really find the buffer ptr.
1566 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1567 ((graph_prep->dup_tensor_block_ref &&
1568 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1569 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1570 !tensor_arena->buffers[buffer_ref].ptr))
1571 {
1572 // We haven't allocated anything for this yet.
1573    assert(tensor_arena->vt_tensors[block_ref] == 0);
1574 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1575 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1576 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1577 } else {
1578 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1579 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1580 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1581 }
1582 }
1583 }
1584 // For case..of statement, the output is a phi variable, thus, if we take the skip branch, we will select the original input.
1585 // This created the multi-view tensor to achieve that.
1586 for (i = 0; i < tensor_symbol_info_size; i++)
1587 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1588 {
1589 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1590 // Create phi multi-view.
1591 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1592 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1593 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1594 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1595 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1596 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1597 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1598 intv,
1599 outv,
1600   }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1601   CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1602   CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1603 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1604 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1605 }
1606 // Now it is time to handle alias.
1607 for (i = 0; i < alloc_prep->block_size; i++)
1608 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1609 {
1610 const int block_ref = alloc_prep->blocks[i].block_ref;
1611   if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]) && !tensor_arena->vt_tensors[block_ref])
1612 {
1613 // Assigning out the tensor aliases.
1614    assert(tensor_symbol_info[block_ref].alias_ref);
1615 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1616 }
1617 }
1618 // Now assigning out the rest of alias refs.
1619 for (i = 0; i < tensor_symbol_info_size; i++)
1620 // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1621  if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1622  {
1623   int ref = tensor_blocks[i].alias_ref - 1;
1624   assert(tensor_arena->vt_tensors[ref]);
1625 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1626 }
1627 // Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1628 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1629 if (tensor_arena->sub_arenas[i])
1630 {
1631 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1632 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1633 for (j = 0; j < node->input_size; j++)
1634 {
1635 const int idx = node->inputs[j];
1636    const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1637    if (s_idx < 0)
1638     continue;
1639    ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1640    // Only do the replacement if it is a multi-view tensor.
1641    // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1642    if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1643    {
1644     // It cannot be binded tensor.
1645     assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1646 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1647 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1648 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1649 // If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1650 // to this tensor.
1651     if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1652 {
1653 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1654 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1655 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1656 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1657 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1658      ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)[0]);
1659      while (CCV_IS_TENSOR_MULTIVIEW(tv))
1660       tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1661 *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1662 ref_tensor->data = tv->data;
1663 ref_tensor->dataof = tv->dataof;
1664 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1665 } else
1666 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1667 }
1668 }
1669 }
1670 // After alias created, for case..of statement, we now revert back to flat tensor rather than multi-view.
1671 // No worries though, this new tensor is subscribed for the phi multi-view. More over, we have logic
1672 // when initialize case..of node, which will take the phi multi-view again.
1673 for (i = 0; i < tensor_symbol_info_size; i++)
1674 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1675 {
1676   assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1677   ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1678   assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1679 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1680 }
1681 // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1682 for (i = 0; i < tensor_symbol_info_size; i++)
1683 if (tensor_arena->vt_tensors[i])
1684 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1685 // Associate multiview tensors from sub arena to the parent.
1686 if (sub_arena_out_tensors)
1687 {
1688 for (i = 0; i < alloc_prep->block_size; i++)
1689 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1690 {
1691 const int block_ref = alloc_prep->blocks[i].block_ref;
1692    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1693 continue;
1694 int sub_arena_ref = block_ref;
1695    if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1696 {
1697 // Assigning out the tensor aliases.
1698     assert(tensor_symbol_info[block_ref].alias_ref);
1699     const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1700     // What it references is not an alias.
1701     assert(tensor_arena->vt_tensors[alias_ref]);
1702 sub_arena_ref = alias_ref;
1703 if (!sub_arena_out_tensors[sub_arena_ref])
1704 continue;
1705 }
1706 if (!sub_arena_out_tensors[sub_arena_ref])
1707 continue;
1708    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1709    assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1710 // This is only possible if the vt_tensors is a phi node.
1711 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1712 {
1713 // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1714 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1715     assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1716     assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1717     CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1718     ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1719 } else {
1720 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1721 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1722 }
1723 }
1724 }
1725 // Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1726 // 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1727 // 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1728 // Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1729 // to the output of assign_ref tensor.
1730 for (i = 0; i < tensor_symbol_info_size; i++)
1731 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1732 {
1733 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1734 ccv_nnc_tensor_t* assign_tensor;
1735 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1736   assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1737 else
1738 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1739 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1740 }
1741 // After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1742 for (i = 0; i < tensor_bind_size; i++)
1743 {
1744  assert(tensor_binds[i].tensor);
1745 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1746 if (resolved_symbol.d >= 0)
1747 {
1748 int d = resolved_symbol.d;
1749 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1750 // It has nothing to do with alias.
1751   while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1752 d = tensor_blocks[d].ref - 1;
1753 // Note we don't trace back on alias. This is intentional.
1754   assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1755 }
1756 }
1757 if (sub_arena_out_tensors)
1758  ccfree(sub_arena_out_tensors);
1759 // Rewire sub arena's tensor references.
1760 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1761 if (tensor_arena->sub_arenas[i])
1762 {
1763 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1764 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1765 for (j = 0; j < node->input_size; j++)
1766 {
1767 const int idx = node->inputs[j];
1768    const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1769 if (s_idx < 0)
1770 continue;
1771 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1772 // Only do the replacement if it is a multi-view tensor.
1773 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1774    if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor))
1775 {
1776 // This is binded tensor, bind it now.
1777     if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1778 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1779 else
1780 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1781 }
1782 }
1783 }
1784 return tensor_arena;
1785}
1786
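// --- Editor's sketch (not part of ccv_nnc_symbolic_graph_compile.c) ---
// Throughout _ccv_nnc_tensor_arena_new(), vt_tensors[] temporarily holds *positions* into the
// growable tensor_metadata array rather than real pointers; _ccv_nnc_tensor_metadata_rewire()
// later swaps each position for the final pointer. The CCV_NNC_IS_METADATA_POS() expansion
// ("(uintptr_t)(p) & 1") and the assert "(0 << 1) + 1 == pos" above suggest the tagging scheme
// sketched here: a position is an odd integer, so it can never be mistaken for an aligned tensor
// pointer. The exact (index << 1) | 1 encoding below is an assumption for illustration only.
#include <assert.h>
#include <stdint.h>

static inline void* pos_encode(const int index)
{
	return (void*)((((intptr_t)index) << 1) | 1); // Low bit set marks "still a metadata position".
}

static inline int is_metadata_pos(const void* const p)
{
	return (int)((uintptr_t)p & 1); // Real tensor pointers are aligned, so their low bit is 0.
}

static inline int pos_decode(const void* const p)
{
	assert(is_metadata_pos(p));
	return (int)((intptr_t)p >> 1); // Recover the index once we know this is a tagged position.
}
// --- End editor's sketch ---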
1787static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1788{
1789 assert(graph);
1790 if ((intptr_t)graph == tensor_arena->graph_ref)
1791 {
1792  assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size);
1793 return tensor_arena->vt_tensors[pair_ref];
1794 }
1795 int i;
1796 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1797 if (tensor_arena->sub_arenas[i])
1798 {
1799 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1800 if (tensor)
1801 return tensor;
1802 }
1803 return 0;
1804}
1805
1806static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1807{
1808 if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
1809 tensor->type |= CCV_TAPE_ALLOC;
1810 else {
1811 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1812 mv->type |= CCV_TAPE_ALLOC;
1813 int i;
1814 for (i = 0; i < mv->repeat + mv->kind; i++)
1815 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1816 }
1817}
1818
1819static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1820{
1821 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1821, __extension__ __PRETTY_FUNCTION__
); }))
;
1822 int i;
1823 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1824 {
1825 if (graph_prep->tensor_symbol_info[i].pair_ref)
1826 {
1827 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1828 // No need to continue checking this if it is from its pair.
1829 continue;
1830 }
1831 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1832 {
1833 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1834 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1835 {
1836 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1837 if (vt_ref >= 0 &&
1838 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1839 continue;
1840 }
1841 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1842 }
1843 }
1844 for (i = 0; i < graph_prep->sub_prep_size; i++)
1845 if (graph_prep->sub_preps[i])
1846 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1847}
1848
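// Record that exec node idx touches this tensor block by updating the block's head / tail arrays.
// head keeps only the earliest execs and tail only the latest execs with respect to the partial
// order in exec_dep: an existing entry is replaced when idx strictly precedes (for head) or
// strictly follows (for tail) it, duplicates are collapsed, and idx is appended only when it is
// unordered with respect to every entry already present.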
1849static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1850{
1851 int i, found = 0;
1852 // Try to insert head.
1853 ccv_array_t* head = tensor_blocks.head;
1854 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1854, __extension__ __PRETTY_FUNCTION__); }))
;
1855 for (i = 0; i < head->rnum;)
1856 {
1857 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1858 if (head_idx == idx)
1859 {
1860 found = 1;
1861 break;
1862 }
1863 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1864 if (cell.i32 && cell.i32[0] > 0)
1865 {
1866 /* If the current node is the parent of the head node, check if we found it or not. */
1867 /* If not found, replace the current one. */
1868 if (!found)
1869 {
1870 found = 1;
1871 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1872 } else {
1873 /* Remove the current one, change the rnum. */
1874 if (i < head->rnum - 1)
1875 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1876 --head->rnum;
1877 continue;
1878 }
1879 } else {
1880 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1881 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1882 if (cell.i32 && cell.i32[0] > 0)
1883 {
1884 found = 1;
1885 break;
1886 }
1887 }
1888 /* Advancing i. */
1889 ++i;
1890 }
1891 /* If not found, push this idx to the end of the array. */
1892 if (!found)
1893 ccv_array_push(head, &idx);
1894 // Try to insert tail.
1895 found = 0;
1896 ccv_array_t* tail = tensor_blocks.tail;
1897 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1897, __extension__ __PRETTY_FUNCTION__); }))
;
1898 for (i = 0; i < tail->rnum;)
1899 {
1900 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1901 if (tail_idx == idx)
1902 {
1903 found = 1;
1904 break;
1905 }
1906 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1907 if (cell.i32 && cell.i32[0] > 0)
1908 {
1909 /* If the current node is the child of the tail node, check if we found it or not. */
1910 /* If not found, replace the current one. */
1911 if (!found)
1912 {
1913 found = 1;
1914 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1915 } else {
1916 /* Remove the current one, change the rnum. */
1917 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1918 --tail->rnum;
1919 continue;
1920 }
1921 } else {
1922 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1923 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1924 if (cell.i32 && cell.i32[0] > 0)
1925 {
1926 found = 1;
1927 break;
1928 }
1929 }
1930 /* Advancing i. */
1931 ++i;
1932 }
1933 /* If not found, push this idx to the end of the array. */
1934 if (!found)
1935 ccv_array_push(tail, &idx);
1936}
1937
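// Resolve a tensor symbol to the concrete tensor allocated for it. If the symbol belongs to this
// arena's graph, look it up directly, unwrapping a multi-view tensor by following its ->it pointer
// (or its first view) down to a plain tensor; otherwise search the sub-arenas recursively.
// Returns 0 if the symbol has no tensor here.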
1938ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1939{
1940 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1941 {
1942 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1942, __extension__ __PRETTY_FUNCTION__
); }))
;
1943 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1944 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1945 {
1946 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1947 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1948 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1949 return (ccv_nnc_tensor_t*)mv;
1950 }
1951 return tensor;
1952 }
1953 int i;
1954 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1955 if (tensor_arena->sub_arenas[i])
1956 {
1957 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1958 if (tensor)
1959 return tensor;
1960 }
1961 return 0;
1962}
1963
1964ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1965{
1966 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1967 {
1968 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1968, __extension__ __PRETTY_FUNCTION__
); }))
;
1969 return graph_exec_arena->graph_execs[symbol.d];
1970 }
1971 int i;
1972 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1973 if (graph_exec_arena->sub_arenas[i])
1974 {
1975 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1976 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
1977 return exec;
1978 }
1979 return (ccv_nnc_graph_exec_t){}; // 0.
1980}
1981
1982ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1983{
1984 return graph_exec_arena->source;
1985}
1986
1987ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1988{
1989 return graph_exec_arena->destination;
1990}
1991
1992// Check whether the head is the beginning of this block.
1993static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1994{
1995 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 1995, __extension__ __PRETTY_FUNCTION__
); }))
;
1996 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
1997}
1998
1999// Check whether the tail is the end of this block.
2000static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2001{
2002 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2002, __extension__ __PRETTY_FUNCTION__
); }))
;
2003 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
2004}
2005
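// Fold is only attempted when both blocks are computable (neither alias nor unassigned), the input
// block p_ref_0 has exactly one tail exec that equals the single head exec of the output block
// p_ref_1, the two blocks share the same type, and the relevant unfoldable flags permit it.
// On success, p_ref_1 is marked unassigned with ref pointing back to p_ref_0, whose life-time is
// extended to cover p_ref_1's tail.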
2006// Make two tensor blocks one. Return 1 if that happened.
2007static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2008{
2009 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2010 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2011 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2012 tensor_blocks[p_ref_0].tail->rnum == 1 &&
2013 tensor_blocks[p_ref_1].head->rnum == 1 &&
2014 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2015 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
2016 {
2017 // If the two parent refs match (thus, they meet at the same node), we can concatenate the two blocks and mark one as a ref. This is very similar to in-place operation combining.
2018 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2018, __extension__ __PRETTY_FUNCTION__); }))
;
2019 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2019, __extension__ __PRETTY_FUNCTION__); }))
;
2020 ccv_array_free(tensor_blocks[p_ref_0].tail);
2021 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2022 if (tensor_blocks[p_ref_1].p_refs[0])
2023 {
2024 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2024, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_refs, otherwise we cannot merge.
2025 if (!tensor_blocks[p_ref_0].p_refs[0])
2026 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2027 else
2028 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2029 }
2030 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2031 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
2032 ccv_array_free(tensor_blocks[p_ref_1].head);
2033 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2034 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
2035 // No need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it were set, we could not fold right now.
2036 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
2037 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2038 if (!tensor_blocks[p_ref_0].r_refs)
2039 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2040 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2041 tensor_blocks[p_ref_1].size = 0;
2042 tensor_blocks[p_ref_1].head = 0;
2043 tensor_blocks[p_ref_1].tail = 0;
2044 return 1;
2045 }
2046 return 0;
2047}
2048
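// Build the two structures the allocator works from: exec_dep, a sparse matrix where cell (a, b) > 0
// records that exec a transitively depends on exec b (the stored value is the longest hop count from
// b to a), and tensor_blocks, one entry per tensor symbol with its flags, size and head / tail
// liveness execs filled in.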
2049static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2050{
2051 int i, j, k;
2052 // Generate exec dependencies (or, in other words, partial ordering of executions).
2053 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2054 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2055 int buf_size;
2056 if (p_node_info)
2057 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2057, __extension__ __PRETTY_FUNCTION__
); }))
; }
2058#define for_block(x, val) \
2059 do { \
2060 if (((int32_t*)val)[0] > 0) \
2061 { \
2062 buf[buf_size * 2] = x; \
2063 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2064 ++buf_size; \
2065 } \
2066 } while (0)
2067 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
2068 buf_size = 0; /* save all its parent deps to this buffer */
2069 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2070 if (vector)
2071 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
2072 if (!node->outgoings)
2073 continue;
2074 for (i = 0; i < node->outgoings->rnum; i++)
2075 {
2076 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2077 const int32_t one = 1;
2078 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2079 /* If not found, set it. If the current node is the destination node, there is no need
2080 * to set itself as a parent of subsequent nodes because of its terminal nature. */
2081 if (!cell.i32 || cell.i32[0] == 0)
2082 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2083 if (buf_size > 0)
2084 {
2085 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2086 assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2086, __extension__ __PRETTY_FUNCTION__); }))
;
2087 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2088 {
2089 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2090 /* If not found, set */
2091 if (!cell.i32 || cell.i32[0] == 0)
2092 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2093 else {
2094 /* Otherwise, set to the longest one */
2095 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
2096 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2097 }
2098 }
2099 }
2100 }
2101 } ccv_nnc_graph_visit_endfor} }
2102#undef for_block
2103 ccfreefree(buf);
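 // At this point exec_dep is fully populated. Because each node copies its parents' entries with the
 // hop count incremented by one and keeps the maximum on conflict, a simple chain a -> b -> c ends up
 // with cell(b, a) == 1, cell(c, b) == 1 and cell(c, a) == 2, i.e. the longest path length in edges.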
2104 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2105 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2106 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2107 // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
2108 // happens that I have to loop through all relevant nodes to find out if one is used or not.
2109 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2110 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2111 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2112 for (i = 0; i < node->input_size; i++)
2113 if (node->inputs[i] >= 0)
2114 {
2115 tensor_blocks[node->inputs[i]].flags = 0;
2116 // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned memory.
2117 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2118 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2119 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2120 tensor_blocks[node->inputs[i]].pin_mem = 1;
2121 }
2122 for (i = 0; i < node->output_size; i++)
2123 if (node->outputs[i] >= 0)
2124 {
2125 tensor_blocks[node->outputs[i]].flags = 0;
2126 // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned memory.
2127 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2128 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2129 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2130 tensor_blocks[node->outputs[i]].pin_mem = 1;
2131 }
2132 } ccv_nnc_graph_visit_endfor} }
2133 if (p_node_info)
2134 {
2135 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2135, __extension__ __PRETTY_FUNCTION__
); }))
;
2136 // Mark it as used if it is used in either input or output.
2137 for (i = 0; i < p_node_info->input_size; i++)
2138 if (p_node_info->inputs[i] >= 0)
2139 {
2140 const int d = p_node_info->inputs[i];
2141 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2142 {
2143 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2144 if (dd >= 0) // If this exists in this sub-graph, great.
2145 tensor_blocks[dd].flags = 0;
2146 }
2147 }
2148 for (i = 0; i < p_node_info->output_size; i++)
2149 if (p_node_info->outputs[i] >= 0)
2150 {
2151 const int d = p_node_info->outputs[i];
2152 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2153 {
2154 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2155 if (dd >= 0) // If this exists in this sub-graph, great.
2156 tensor_blocks[dd].flags = 0;
2157 }
2158 }
2159 }
2160 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2161 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2162 {
2163 // Check no tensor info is auto now.
2164 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2164, __extension__ __PRETTY_FUNCTION__
); }))
;
2165 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter,
2166 // therefore, its life-cycle almost certainly won't concatenate properly with the tensor to
2167 // fold to).
2168 if (tensor_symbol_info[i].assign_ref)
2169 {
2170 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2171 // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
2172 // it keeps its own representation, which is not the case for output).
2173 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2174 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2175 // But the tensor it comes from cannot be folded as input, because it cannot be overwritten at any time.
2176 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2177 // It also cannot be folded as output (except into i), because we need to keep its own representation.
2178 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2179 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2179, __extension__ __PRETTY_FUNCTION__
); }))
;
2180 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2181 for (j = 0; j < unroll_count; j++)
2182 {
2183 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
2184 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2185 }
2186 if (tensor_blocks[assign_ref].bypass_ref)
2187 {
2188 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2189 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2190 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2191 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2192 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2193 // On the other hand, it can be folded into the except_ref for the bypass_ref.
2194 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2195 if (dup_tensor_from_ref)
2196 {
2197 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2198 if (bypass_from_ref >= 0)
2199 {
2200 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
2201 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
2202 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2202, __extension__ __PRETTY_FUNCTION__
); }))
;
2203 for (j = 0; j < unroll_count - 1; j++)
2204 {
2205 // Mark every incarnation as un-foldable.
2206 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
2207 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
2208 }
2209 }
2210 }
2211 }
2212 }
2213 }
2214 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2215 {
2216 // If it has a pair reference, we don't need to allocate this tensor at all,
2217 // set it to be unassigned.
2218 if (tensor_symbol_info[i].pair_ref)
2219 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
2220 // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use the tape properly).
2221 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2222 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2223 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2224 // For this case, there is no exception.
2225 tensor_blocks[i].unfoldable_except_ref = 0;
2226 } else if (tensor_symbol_info[i].p_ref) {
2227 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2227, __extension__ __PRETTY_FUNCTION__); }))
;
2228 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2229 // If this is a case..of graph, and this tensor is an input from the parent graph, it cannot be folded as input.
2230 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2231 // TODO: This check can be lifted if we can fold in the parent graph.
2232 if (-1 == p_ref_is_in_or_out)
2233 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2234 if (1 == p_ref_is_in_or_out) // If p_ref is an output, it cannot be folded as input.
2235 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2236 }
2237 }
2238 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2239 {
2240 if (tensor_symbol_info[i].alias_ref)
2241 {
2242 const int ref = tensor_symbol_info[i].alias_ref - 1;
2243 // If the referenced one is unassigned, mark it as assigned only if the current one is assigned.
2244 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2245 tensor_blocks[ref].flags = 0;
2246 // An alias cannot refer to another alias.
2247 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2247, __extension__ __PRETTY_FUNCTION__); }))
;
2248 tensor_blocks[i].flags = ALIAS;
2249 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2250 if (!tensor_blocks[ref].r_refs)
2251 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2252 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2253 }
2254 }
2255 // Scan again, and if the ref is not assigned, mark the alias as not assigned.
2256 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2257 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2258 {
2259 const int ref = tensor_blocks[i].ref - 1;
2260 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2261 {
2262 // Mark this as unassigned.
2263 tensor_blocks[i].flags = UNASSIGNED;
2264 tensor_blocks[i].ref = 0;
2265 }
2266 }
2267 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2268 {
2269 // If this tensor is not expected to be unassigned, allocate the head and tail arrays.
2270 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2271 {
2272 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2273 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2274 // Cache tensor size (align to 16 bytes).
2275 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2276 }
2277 // If there is a p_ref, add it to the p_refs list.
2278 if (tensor_symbol_info[i].p_ref)
2279 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2280 }
2281 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2282 for (i = 0; i < node->input_size; i++)
2283 {
2284 int d = node->inputs[i];
2285 if (d < 0)
2286 continue;
2287 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2288 d = tensor_symbol_info[d].alias_ref - 1;
2289 tensor_blocks[d].flags |= READ_ONLY;
2290 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2291 continue;
2292 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2292, __extension__ __PRETTY_FUNCTION__
); }))
;
2293 /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2294 * so it lives from the very beginning of the graph life-cycle and ends here). */
2295 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2296 {
2297 for (j = 0; j < source_size; j++)
2298 {
2299 // If the source is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2300 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2301 if (cell.i32 && cell.i32[0] > 0)
2302 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2303 }
2304 /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a
2305 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2306 * loop; however, in that case, you need to prevent the read-only tensor from being reused for the
2307 * output tensor, which is not obvious how to implement correctly), and it is not
2308 * an assign_ref from anywhere (not a parameterized loop), then we cannot reuse this region
2309 * of memory anyway (because on the second loop, we want to read the same value out).
2310 * Mark it to the end of the graph. */
2311 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2312 for (j = 0; j < destination_size; j++)
2313 {
2314 // If the destination is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2315 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2316 if (cell.i32 && cell.i32[0] > 0)
2317 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2318 }
2319 }
2320 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2321 }
2322 for (i = 0; i < node->output_size; i++)
2323 {
2324 int d = node->outputs[i];
2325 if (d < 0)
2326 continue;
2327 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2328 d = tensor_symbol_info[d].alias_ref - 1;
2329 tensor_blocks[d].flags |= WRITE_ONLY;
2330 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2331 continue;
2332 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2332, __extension__ __PRETTY_FUNCTION__
); }))
;
2333 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2334 }
2335 } ccv_nnc_graph_visit_endfor} }
2336 // For any assign_ref, its life-time is kept until the end and wraps over.
2337 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2338 // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2339 // that "somewhere else" needs to keep its life-time till the end.
2340 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2341 p_node_info && tensor_symbol_info[i].assign_ref)
2342 {
2343 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2344 for (j = 0; j < destination_size; j++)
2345 {
2346 // This logic is to be more conservative about which destinations we add.
2347 // As of now, adding everything is most likely fine. However, it may
2348 // cause issues in the future to do so naively. Thus, instead, we only add
2349 // the destination iff either the tensor is not used at all, or the
2350 // destination is on the same stream as the tensor block in some way.
2351 int flag = !tensor_blocks[assign_ref].tail;
2352 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2353 {
2354 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2355 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2356 flag = (cell.i32 && cell.i32[0] > 0);
2357 }
2358 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream as this tensor block somehow.
2359 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2360 }
2361 }
2362 for (i = 0; i < output_size; i++)
2363 {
2364 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2364, __extension__ __PRETTY_FUNCTION__); }))
;
2365 int d = outputs[i].d;
2366 if (d < 0)
2367 continue;
2368 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2369 d = tensor_symbol_info[d].alias_ref - 1;
2370 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2371 continue;
2372 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2372, __extension__ __PRETTY_FUNCTION__
); }))
;
2373 for (j = 0; j < destination_size; j++)
2374 {
2375 int flag = !tensor_blocks[d].tail;
2376 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2377 {
2378 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2379 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2380 flag = (cell.i32 && cell.i32[0] > 0);
2381 }
2382 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream as this tensor block somehow.
2383 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2384 }
2385 }
2386 // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2387 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2388 int x, y;
2389 for (x = 0; x < node->input_size; x++)
2390 for (y = 0; y < node->output_size; y++)
2391 /* Some operations enforces some tensors to be the same for inputs / outputs. */
2392 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2393 {
2394 // If both unassigned, it is fine.
2395 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2396 continue;
2397 int ref = node->inputs[x];
2398 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2398, __extension__ __PRETTY_FUNCTION__); }))
;
2399 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2400 ref = tensor_blocks[ref].ref - 1;
2401 const int node_output_y = node->outputs[y];
2402 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2402, __extension__ __PRETTY_FUNCTION__
); }))
;
2403 // If both are not computable, it is fine, we don't need to enforce.
2404 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2405 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2406 continue;
2407 // Otherwise, enforce and error out if failed.
2408 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2409 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2409, __extension__ __PRETTY_FUNCTION__
); }))
; }
2410 }
2411 } ccv_nnc_graph_visit_endfor} }
2412 // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2413 // we need to make sure enforced tensors are properly assigned, so that we don't bind to a tensor
2414 // that is not enforced in-place (because the tensor enforced in-place will be different from the
2415 // bound one).
2416 for (i = 0; i < tensor_bind_size; i++)
2417 {
2418 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2419 // If there is a tensor bound, then it is unassigned.
2420 if (resolved_symbol.d >= 0)
2421 {
2422 int d = resolved_symbol.d;
2423 // I cannot assert too much at this moment.
2424 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2425 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2426 // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
2427 // It has nothing to do with aliases.
2428 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2429 d = tensor_blocks[d].ref - 1;
2430 // Doesn't work if this is a loop-carrying variable.
2431 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2431, __extension__ __PRETTY_FUNCTION__); }))
;
2432 tensor_blocks[d].flags = UNASSIGNED;
2433 tensor_blocks[d].ref = 0; // No need to have ref as well.
2434 }
2435 }
2436 // Maximize tensor reuse by collapsing tensors that allow in-place operations (and where the start / end tensors match).
2437 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2438 int x, y;
2439 for (x = 0; x < node->input_size; x++)
2440 {
2441 /* If the input is not assigned, it may reference another tensor; find the referenced one. */
2442 int ref = node->inputs[x];
2443 if (ref < 0)
2444 continue;
2445 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2446 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2447 ref = tensor_blocks[ref].ref - 1;
2448 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2448, __extension__ __PRETTY_FUNCTION__
); }))
;
2449 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2450 tensor_blocks[ref].tail->rnum == 1)
2451 {
2452 for (y = 0; y < node->output_size; y++)
2453 /* Only proceed if the input symbol is different from the output symbol, */
2454 /* and the input symbol meets the output symbol exactly at the same spot. */
2455 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2456 node->outputs[y] >= 0 &&
2457 ref != node->outputs[y] &&
2458 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2459 {
2460 const int node_output_y = node->outputs[y];
2461 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2462 /* If the dimensions match perfectly, then we can assign y_symbol to x.
2463 * If both of them are aliases, make sure their origins match in size too. */
2464 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2465 {
2466 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2467 // This refers to an alias itself; mark it now and it will be processed later.
2468 if (ref != node->inputs[x])
2469 tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2470 }
2471 }
2472 }
2473 }
2474 } ccv_nnc_graph_visit_endfor} }
2475 // Specifically handle the bypass. This needs to be done after the first pass.
2476 // I need to extend the bypass' life-time to match the one I am going with.
2477 // It is important that we visit these nodes and assign bypass_ref to its dependents in topological order.
2478 ccv_nnc_tensor_block_t empty_block = {};
2479 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2480 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2481 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2482 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2483 {
2484 int can_bypass = 1;
2485 for (i = 0; can_bypass && i < node->output_size; i++)
2486 {
2487 int d = node->outputs[i];
2488 if (d < 0)
2489 continue;
2490 if (!tensor_blocks[d].bypass_ref)
2491 continue;
2492 while (tensor_blocks[d].ref)
2493 d = tensor_blocks[d].ref - 1;
2494 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2495 while (tensor_blocks[bypass_ref].ref)
2496 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2497 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2498 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2499 continue;
2500 ccv_array_clear(empty_block.head);
2501 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2502 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2503 ccv_array_clear(empty_block.tail);
2504 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2505 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2506 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2507 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2508 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2509 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2510 // It can only be unfoldable due to the while constraint. Check whether this satisfies the while-loop constraint.
2511 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2511, __extension__ __PRETTY_FUNCTION__
); }))
;
2512 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2513 while (tensor_blocks[b_ref].ref)
2514 b_ref = tensor_blocks[b_ref].ref - 1;
2515 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2516 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2517 // These two can be assigned to the same region of memory without issue (because their life-times don't interfere),
2518 // even after we extend the life-time of bypass_ref. Then we are in good shape.
2519 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2520 }
2521 if (can_bypass)
2522 {
2523 for (i = 0; i < node->output_size; i++)
2524 {
2525 int d = node->outputs[i];
2526 if (d < 0)
2527 continue;
2528 if (!tensor_blocks[d].bypass_ref)
2529 continue;
2530 while (tensor_blocks[d].ref)
2531 d = tensor_blocks[d].ref - 1;
2532 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2533 while (tensor_blocks[bypass_ref].ref)
2534 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2535 // The bypass_ref can extend its life-time.
2536 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2537 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2538 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2539 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2540 }
2541 } else {
2542 for (i = 0; i < node->output_size; i++)
2543 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2544 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2545 // Mark this exec as no-bypass IO (thus, I need to insert explicit data transfers).
2546 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2547 }
2548 }
2549 } ccv_nnc_graph_visit_endfor
2550 ccv_array_free(empty_block.head);
2551 ccv_array_free(empty_block.tail);
2552 *r_exec_dep = exec_dep;
2553 *r_tensor_blocks = tensor_blocks;
2554}
2555
2556static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2557{
2558 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2559 {
2560 ccv_nnc_cmd_t retval = cmd;
2561 retval.cmd = CCV_NNC_NOOP;
2562 return retval;
2563 }
2564 return cmd;
2565}
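The helper above is the substitution callback passed to ccv_nnc_symbolic_graph_dup later in this file (line 2863): when the graph is duplicated for unrolling, every sub-graph node (CCV_NNC_GRAPH_FORWARD / CCV_NNC_GRAPH_BACKWARD) is rewritten into a CCV_NNC_NOOP, since sub-graph placement is handled separately. A minimal sketch of the same substitute-while-copying pattern, using hypothetical mock types (the mock_* names below are invented stand-ins, not ccv_nnc API):

/* Illustrative sketch only; mock_* names are invented stand-ins. */
typedef struct { int cmd; } mock_cmd_t;
typedef mock_cmd_t (*mock_subst_f)(int node_index, mock_cmd_t cmd);

enum { MOCK_NOOP = 0, MOCK_SUB_GRAPH = 1 };

static mock_cmd_t mock_subst_sub_graph_with_noop(int node_index, mock_cmd_t cmd)
{
	(void)node_index;
	if (cmd.cmd == MOCK_SUB_GRAPH)
		cmd.cmd = MOCK_NOOP; /* The duplicate carries no nested graphs. */
	return cmd;
}

/* Copy node commands, letting the callback rewrite each one on the way. */
static void mock_dup(const mock_cmd_t* src, mock_cmd_t* dst, int n, mock_subst_f subst)
{
	int i;
	for (i = 0; i < n; i++)
		dst[i] = subst ? subst(i, src[i]) : src[i];
}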
2566
2567static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2568{
2569 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2570 {
2571 if (tensor_symbol_info[input].alias_ref)
2572 {
2573 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2574 assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2575 ccv_nnc_tensor_symbol_t tensor_symbol = {};
2576 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2577 {
2578 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2579 if (tensor_symbol_info[alias_ref].pair_ref)
2580 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2581 .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2582 .graph = dup_graph->pair
2583 });
2584 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2585 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2586 } else {
2587 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2588 tensor_symbol.graph = dup_graph;
2589 }
2590 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2591 if (tensor_symbol_info[input].pair_ref)
2592 ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2593 .d = tensor_symbol_info[input].pair_ref - 1,
2594 .graph = dup_graph->pair
2595 });
2596 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2597 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2598 } else {
2599 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2600 if (tensor_symbol_info[input].pair_ref)
2601 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2602 .d = tensor_symbol_info[input].pair_ref - 1,
2603 .graph = dup_graph->pair
2604 });
2605 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2606 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2607 }
2608 if (tensor_symbol_info[input].bypass_ref)
2609 {
2610 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2611 assert(dup_bypass_ref >= 0);
2612 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2613 symbol_info->bypass_ref = dup_bypass_ref + 1;
2614 }
2615 }
2616 return (ccv_nnc_tensor_symbol_t) {
2617 .d = dup_tensor_block_ref[input * unroll_count],
2618 .graph = dup_graph,
2619 };
2620}
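A note on the indexing used here and in the functions below: dup_tensor_block_ref (and, analogously, dup_exec_ref) is a flat array of symbol_count * unroll_count entries laid out as [symbol * unroll_count + n], where slot n holds the id of the n-th unrolled copy and -1 means no copy exists yet; the unroll loop at lines 2734-2736 offsets the base pointer by n, so the function above only ever touches slot [input * unroll_count]. A small self-contained sketch of that convention, under the layout assumption just stated:

#include <assert.h>

/* ref has symbol_count * unroll_count entries; -1 marks "not duplicated yet". */
static int dup_ref_get(const int* ref, int unroll_count, int symbol, int n)
{
	assert(n >= 0 && n < unroll_count);
	return ref[symbol * unroll_count + n];
}

static void dup_ref_set(int* ref, int unroll_count, int symbol, int n, int dup_symbol)
{
	assert(n >= 0 && n < unroll_count);
	ref[symbol * unroll_count + n] = dup_symbol;
}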
2621
2622static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2623{
2624 int i;
2625 if (dup_exec_ref[idx * unroll_count] < 0)
2626 {
2627 // Input has to come before output, because output could have a bypass reference to the input.
2628 for (i = 0; i < node->input_size; i++)
2629 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2630 for (i = 0; i < node->output_size; i++)
2631 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2632 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2633 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2634 }
2635 return (ccv_nnc_graph_exec_symbol_t) {
2636 .d = dup_exec_ref[idx * unroll_count],
2637 .graph = dup_graph,
2638 };
2639}
2640
2641static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2642{
2643 int i;
2644 for (i = 0; i < tensor_block_size; i++)
2645 {
2646 if (tensor_blocks[i].head)
2647 ccv_array_free(tensor_blocks[i].head);
2648 if (tensor_blocks[i].tail)
2649 ccv_array_free(tensor_blocks[i].tail);
2650 if (tensor_blocks[i].r_refs)
2651 ccv_array_free(tensor_blocks[i].r_refs);
2652 if (tensor_blocks[i].dup_p_refs)
2653 ccv_array_free(tensor_blocks[i].dup_p_refs);
2654 }
2655 ccfree(tensor_blocks);
2656}
2657
2658// Find tensors that cannot be solved by co-allocating to the same location.
2659static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2660{
2661 int i, j, unroll_count = 0;
2662 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2663 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_symbol_info[i].assign_ref)
2664 {
2665 // This is a parameter, thus, it has to be either an alias or used.
2666 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2667 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2668 // The parameter it assigns to has to be either an alias or used.
2669 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2670 // If either of these two (assigner and assignee) is an alias, check to see if they are the same.
2671 // If it is the same, we are good, no need to extend.
2672 int a_ref = i;
2673 while (tensor_blocks[a_ref].ref)
2674 a_ref = tensor_blocks[a_ref].ref - 1;
2675 int b_ref = assign_ref;
2676 while (tensor_blocks[b_ref].ref)
2677 b_ref = tensor_blocks[b_ref].ref - 1;
2678 if (a_ref != b_ref)
2679 {
2680 // If any of the b's head is deterministically later than a's tail
2681 // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2682 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2683 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2684 // It cannot be that both i can hop to j and j can hop to i.
2685 assert(!(a_hop_b > 0 && b_hop_a > 0));
2686 // Can it be folded
2687 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2688 if (a_hop_b || b_hop_a)
2689 {
2690 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2691 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2692 continue;
2693 }
2694 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2695 for (j = 0; c_ref >= 0; j++)
2696 {
2697 while (tensor_blocks[c_ref].ref)
2698 c_ref = tensor_blocks[c_ref].ref - 1;
2699 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2700 }
2701 unroll_count = ccv_max(unroll_count, j + 1);
2702 }
2703 }
2704 // Reset companion_ref if need to unroll.
2705 if (unroll_count)
2706 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2707 tensor_blocks[j].companion_ref = 0;
2708 return unroll_count;
2709}
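The function above determines how many times the loop body must be unrolled: when a loop-carried pair (tensor i and its assign_ref target) can be neither folded nor co-located, the loop at lines 2695-2700 walks the assign_ref chain starting from the assignee, and the unroll count becomes that chain length plus one. A hedged, simplified model of the chain walk (it deliberately ignores the ref folding and the lifetime checks the real code performs):

/* assign_ref[i] is 1-based, 0 meaning "none", mirroring the symbol info above.
 * Returns the unroll count contributed by the chain that starts at `start`. */
static int mock_chain_unroll_count(const int* assign_ref, int symbol_count, int start)
{
	int j = 0;
	int c = assign_ref[start] - 1;
	while (c >= 0 && j < symbol_count) /* The bound only guards this mock against cycles. */
	{
		c = assign_ref[c] - 1;
		j++;
	}
	return j + 1;
}

For the common single-carried-variable case (one value written over its previous iteration and nothing else chained), the chain past the assignee is empty, so the count is 1: the graph is duplicated once, which amounts to double buffering.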
2710
2711static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2712{
2713 int i, j, n;
2714 // The inout exec nodes, these are the nodes we are going to extend.
2715 uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2716 int max_input_size = 0;
2717 int max_output_size = 0;
2718 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2719 {
2720 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2721 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2722 }
2723 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2724 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2725 // Doing graph expansion
2726 // It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2727 assert(dup_graph->exec_symbol_info->rnum > 0);
2728 assert(dup_graph->tensor_symbol_info->rnum > 0);
2729#define INCOMING_NODE (1)
2730#define OUTGOING_NODE (2)
2731 // Unroll the graph n times.
2732 for (n = 0; n < unroll_count; n++)
2733 {
2734 int* const dup_exec_ref = r_dup_exec_ref + n;
2735 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2736 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2737 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2738 dup_exec_ref[i * unroll_count] = -1;
2739 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2740 {
2741 // If there is an assign_ref, that means I don't need to dup the tensor.
2742 if (tensor_symbol_info[i].assign_ref)
2743 {
2744 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2745 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2746 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
2747 // If this is a read-only tensor block, no need to duplicate because the value never changes
2748 // (note we handled assign_ref first), therefore, no need to generate duplicate.
2749 dup_tensor_block_ref[i * unroll_count] = i;
2750 else
2751 dup_tensor_block_ref[i * unroll_count] = -1;
2752 }
2753 // Go through the original graph, make copies of the node if it is inout.
2754 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2755 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2756 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2757 if (!node->outgoings)
2758 continue;
2759 for (i = 0; i < node->outgoings->rnum; i++)
2760 {
2761 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2762 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2763 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2764 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2765 }
2766 } ccv_nnc_graph_visit_endfor
2767 // Check that the visited nodes are all marked as either incoming or outgoing.
2768 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2769 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2770 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2771 {
2772 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2773 continue;
2774 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2775 // If this is a pure incoming node, then I need to concat this one with all original destination nodes
2776 if (inout[i] == INCOMING_NODE)
2777 for (j = 0; j < dup_destination_size; j++)
2778 {
2779 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2780 .d = dup_destinations[j].d,
2781 .graph = dup_graph,
2782 }, (ccv_nnc_graph_exec_symbol_t) {
2783 .d = dup_exec_ref[i * unroll_count],
2784 .graph = dup_graph,
2785 });
2786 }
2787 }
2788 if (dup_graph->destinations)
2789 ccv_array_clear(dup_graph->destinations);
2790 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2791 {
2792 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2793 continue;
2794 const int d = dup_exec_ref[i * unroll_count];
2795 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2796 // If this has no outgoing node, add to the destination.
2797 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2798 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2799 .graph = dup_graph,
2800 .d = d,
2801 });
2802 }
2803 }
2804#undef INCOMING_NODE
2805#undef OUTGOING_NODE
2806 ccfree(inout);
2807}
2808
2809static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2810{
2811 int i;
2812 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2813 // Now can assign them (The dup) as companion.
2814 // Get to the last one, which we will wrap over.
2815 if (dup_tensor_symbol_info[i].assign_ref)
2816 {
2817 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2818 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2819 assert(dup_tensor_symbol_info[i].assign_ref);
2820 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2821 }
2822}
2823
2824// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2825// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2826// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2827static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2828{
2829 int i, j, k;
2830 for (i = 0; i < p_node_info->output_size; i++)
2831 {
2832 const int d = p_node_info->outputs[i];
2833 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2834 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2835 continue;
2836 for (k = 0; k < destination_size; k++)
2837 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2838 // Add the duplicated destinations to the tensor_block_ref.
2839 for (j = 0; j < unroll_count; j++)
2840 for (k = 0; k < destination_size; k++)
2841 {
2842 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2843 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2844 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2845 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2846 }
2847 }
2848}
2849
2850static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2851{
2852 int i, j;
2853 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2854 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2855 // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2856 // Unfortunately, I cannot do this analysis on the block folding done for sub-graphs, because we do sub-graph placement later.
2857 // No need to change anything, we are good.
2858 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2859 if (!unroll_count)
2860 return;
2861 // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2862 // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2863 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2864 int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2865 int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2866 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2867 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2868 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2869 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2870 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2871 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2872 // Free out the old exec_dep
2873 ccv_matrix_free(exec_dep);
2874 // and the tensor blocks, prepare for the new.
2875 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2876 // A reverse map to find where the original tensor comes from.
2877 int* dup_tensor_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2878 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2879 dup_tensor_from_ref[i] = -1;
2880 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2881 for (j = 0; j < unroll_count; j++)
2882 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2883 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2884 int* dup_exec_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2885 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2886 dup_exec_from_ref[i] = -1;
2887 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2888 {
2889 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2890 continue;
2891 dup_exec_from_ref[i] = i; // Reference back.
2892 for (j = 0; j < unroll_count; j++)
2893 if (dup_exec_ref[i * unroll_count + j] >= 0)
2894 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2895 }
2896 // Reset all attr.
2897 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2898 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2899 ccv_nnc_graph_visit_free(dup_visit);
2900 ccfree(dup_exec_symbol_info);
2901 ccfree(dup_exec_from_ref);
2902 ccfree(dup_tensor_from_ref);
2903 // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2904 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2905 // Loop over all possible duplications to assign dup_p_ref properly.
2906 for (j = 0; j < unroll_count; j++)
2907 {
2908 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2909 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2910 {
2911 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2912 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2913 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2914 {
2915 if (!tensor_blocks[dup_idx].dup_p_refs)
2916 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2917 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2918 }
2919 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2920 continue;
2921 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2922 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2923 if (p_ref_1_is_in_or_out == 1)
2924 {
2925 if (!tensor_blocks[dup_idx].dup_p_refs)
2926 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2927 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2928 }
2929 }
2930 }
2931 // companion_ref
2932 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2933 // Now can assign them (The dup) as companion.
2934 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && dup_tensor_symbol_info[i].assign_ref)
2935 {
2936 // Get to the last one, which we will wrap over.
2937 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2938 if (assign_ref >= 0)
2939 {
2940 int b_ref = assign_ref;
2941 while (tensor_blocks[b_ref].ref)
2942 b_ref = tensor_blocks[b_ref].ref - 1;
2943 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2944 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2945 // It cannot be that both i can hop to j and j can hop to i.
2946 // And it can hop from one to another now after duplication.
2947 assert(a_hop_b > 0 || b_hop_a > 0);
2948 tensor_blocks[i].companion_ref = b_ref + 1;
2949 tensor_blocks[b_ref].companion_ref = i + 1;
2950 }
2951 }
2952 ccfree(dup_tensor_symbol_info);
2953 // Extend the dup tensor block ref, prepare for future extensions.
2954 dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2955 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2956 dup_tensor_block_ref[i] = -1;
2957 // Assign out changed properties.
2958 *r_exec_dep = exec_dep;
2959 *r_tensor_blocks = tensor_blocks;
2960 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2961 *r_dup_graph = dup_graph;
2962 *r_unroll_count = unroll_count;
2963 *r_dup_exec_ref = dup_exec_ref;
2964 *r_dup_tensor_block_ref = dup_tensor_block_ref;
2965}
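Besides the forward maps, the function above also builds reverse maps (lines 2877-2894): dup_tensor_from_ref and dup_exec_from_ref take a symbol id in the duplicated graph back to the original symbol it was unrolled from, with -1 for symbols that have no originating counterpart. A minimal sketch of that inversion, assuming the same [i * unroll_count + j] layout described earlier:

#include <stdlib.h>

/* Build from_ref[dup_symbol] = original symbol for every duplicated symbol. */
static int* mock_build_from_ref(const int* dup_ref, int orig_count, int unroll_count, int dup_count)
{
	int i, j;
	int* from_ref = (int*)malloc(sizeof(int) * dup_count);
	for (i = 0; i < dup_count; i++)
		from_ref[i] = -1; /* Freshly created symbols map back to nothing. */
	for (i = 0; i < orig_count; i++)
		for (j = 0; j < unroll_count; j++)
			if (dup_ref[i * unroll_count + j] >= 0)
				from_ref[dup_ref[i * unroll_count + j]] = i;
	return from_ref;
}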
2966
2967static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2968{
2969 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2970 return tensor_block_size;
2971 int i;
2972 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2973 int found_idx = tensor_block_size;
2974 for (i = 0; i < anonymous_block_free_list_cap; i++)
2975 {
2976 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2977 assert(idx < tensor_block_size);
2978 // If the type doesn't match, ignore.
2979 if (tensor_blocks[idx].type != type)
2980 continue;
2981 // Heuristic about how to select the best tensor block to move forward.
2982 // If the size is larger and there are no dup_p_refs, I cannot do better than this, just return directly.
2983 if (tensor_blocks[idx].size >= size)
2984 {
2985 if (no_dup_p_refs)
2986 return idx;
2987 // Otherwise, if the current tensor block's dup_p_refs is after (or at) the requested dup_p_refs,
2988 // we cannot do better than this, so just return.
2989 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2990 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2991 return idx;
2992 }
2993 int64_t found_idx_size_diff;
2994 int64_t idx_size_diff;
2995 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2996 // Now, compare whether this one or the found_idx one is better.
2997 // At this point, there is no point of comparing the dup_p_refs, we only care about which one
2998 // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
2999 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3000 {
3001 found_idx = idx;
3002 continue;
3003 }
3004 // No need to update if found_idx is better than idx.
3005 if (found_idx_size_diff > idx_size_diff)
3006 continue;
3007 // We bias towards the bigger one in case of a tie.
3008 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3009 {
3010 found_idx = idx;
3011 continue;
3012 }
3013 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
3014 // On a tie, check which one has tighter life-cycle.
3015 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3016 {
3017 // Check whether the current tensor block's life-cycle is longer than the previous one.
3018 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3019 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3020 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3021 found_idx = idx;
3022 continue;
3023 }
3024 // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
3025 // We prefer to choose the one that has life-cycle closer to the expected ones.
3026 if (no_dup_p_refs)
3027 {
3028 // Whoever is shorter wins.
3029 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3030 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3031 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3032 found_idx = idx;
3033 continue;
3034 }
3035 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3036 continue;
3037 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3038 {
3039 found_idx = idx;
3040 continue;
3041 }
3042 // If both cover the requested dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3043 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3044 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3045 if (idx_after_request && found_idx_after_request)
3046 {
3047 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3048 found_idx = idx;
3049 continue;
3050 } else {
3051 // If we entered this branch, either idx_after_request is false or found_idx_after_request is false, or both.
3052 // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3053 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3054 if (!found_idx_after_request && (idx_after_request ||
3055 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3056 found_idx = idx;
3057 continue;
3058 }
3059 }
3060 return found_idx;
3061}
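The loop above is a best-fit style search over previously created anonymous blocks: a block that is already big enough (and unconstrained by dup_p_refs) wins immediately, otherwise the candidate whose size is closest to the request is kept, breaking ties toward the larger block and then toward the more compatible life-cycle. A stripped-down sketch of only the size portion of that heuristic (types, exec_dep and dup_p_refs are deliberately left out), reusing the same llabs comparison:

#include <stdint.h>
#include <stdlib.h>

/* Pick the free block whose size is closest to `size`, preferring the larger
 * block on a tie. Returns n when the list is empty, mirroring the convention
 * of returning tensor_block_size for "not found" above. */
static int mock_best_size_fit(const uint64_t* block_sizes, int n, uint64_t size)
{
	int i, found = n;
	int64_t found_diff = 0;
	for (i = 0; i < n; i++)
	{
		const int64_t diff = llabs((int64_t)block_sizes[i] - (int64_t)size);
		if (found == n || diff < found_diff ||
			(diff == found_diff && block_sizes[i] > block_sizes[found]))
		{
			found = i;
			found_diff = diff;
		}
	}
	return found;
}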
3062
3063static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3064{
3065 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3066 return 0;
3067 int i, j, k;
3068 int input_size = 0;
3069 for (i = 0; i < p_node_info->p_while.input_size; i++)
3070 if (p_node_info->p_while.inputs[i] >= 0)
3071 ++input_size;
3072 // If it doesn't have tensor inputs (thus, only special inputs), just return.
3073 if (!input_size)
3074 return 0;
3075 ccv_nnc_tensor_symbol_t inputs[input_size];
3076 input_size = 0;
3077 for (i = 0; i < p_node_info->p_while.input_size; i++)
3078 if (p_node_info->p_while.inputs[i] >= 0)
3079 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3080 .d = p_node_info->p_while.inputs[i],
3081 .graph = symbolic_graph,
3082 };
3083 assert(symbolic_graph->breakpoint_size > 0);
3084 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3085 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3086 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3087 {
3088 // Make a noop copy of the breakpoint, but with some tensor inputs.
3089 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
3090 ccv_array_push(dup_breakpoints, &noop);
3091 // Connect this noop to the outgoing nodes of breakpoints.
3092 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
3093 if (symbol_info->outgoings)
3094 for (j = 0; j < symbol_info->outgoings->rnum; j++)
3095 {
3096 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3097 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3098 .d = d,
3099 .graph = symbolic_graph,
3100 });
3101 }
3102 }
3103 for (i = 0; i < exec_symbol_info_size; i++)
3104 {
3105 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
3106 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
3107 continue;
3108 if (symbol_info->outgoings)
3109 {
3110 const int outgoing_size = symbol_info->outgoings->rnum;
3111 for (j = 0; j < outgoing_size; j++)
3112 {
3113 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3114 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3115 if (d == symbolic_graph->breakpoints[k].d)
3116 {
3117 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
3118 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3119 .d = i,
3120 .graph = symbolic_graph,
3121 }, noop);
3122 // Found, connected, exit.
3123 break;
3124 }
3125 }
3126 }
3127 }
3128 // Add the dup_breakpoints to source if necessary.
3129 assert(symbolic_graph->sources);
3130 const int source_size = symbolic_graph->sources->rnum;
3131 for (i = 0; i < source_size; i++)
3132 {
3133 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3134 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3135 if (d == symbolic_graph->breakpoints[j].d)
3136 {
3137 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3138 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3139 // Found, made, exit.
3140 break;
3141 }
3142 }
3143 // Add the dup_breakpoints to destination if necessary.
3144 assert(symbolic_graph->destinations);
3145 const int destination_size = symbolic_graph->destinations->rnum;
3146 for (i = 0; i < destination_size; i++)
3147 {
3148 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3149 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3150 if (d == symbolic_graph->breakpoints[j].d)
3151 {
3152 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3153 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3154 // Found, made, exit.
3155 break;
3156 }
3157 }
3158 return dup_breakpoints;
3159}
3160
3161// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3162static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3163{
3164 assert(source_size > 0);
3165 assert(destination_size > 0);
3166 // First, fill all the "auto" holes.
3167 // This is the symbol table that with "auto" info filled up.
3168 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3169 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3170 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3171 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3172 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3173 int i, j, k, p, q;
3174 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3175 ccv_sparse_matrix_t* exec_dep;
3176 ccv_nnc_tensor_block_t* tensor_blocks;
3177 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3178 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3179 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3180 // are automatically filled in, and all the sub-graphs are processed.
3181 // There is a last step though, for a while loop, it is parameterized:
3182 // while (x > 5) {
3183 // y = x + 1;
3184 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3185 // we will do our best to avoid the actual data copy; what we do here is to check whether y can be x's alias.
3186 // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
3187 // it is an inplace operation.
3188 // But if y cannot be x's alias, for example, this while loop looks like this:
3189 // while (x > 5) {
3190 // y = x + a
3191 // b = x + y
3192 // } (y => x, b => a) // This means after this loop is done, y's value is copied to x and b's value is copied to a.
3193 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3194 // has dependency on y as well).
3195 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3196 // y = x + a -> b = x + y
3197 // This graph will be extended to look like this:
3198 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3199 // while (x0 > 5) {
3200 // y0 = x0 + a0
3201 // b0 = x0 + y0
3202 // if (y0 > 5) break
3203 // y1 = y0 + b0
3204 // b1 = y0 + y1
3205 // } (y1 => x0, b1 => a0)
3206 // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
3207 // with each other now).
3208 // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
3209 // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
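// Illustrative sketch (not the ccv API, just the idea): with two physical buffers the carry-over
// (y => x) degenerates into a pointer swap per iteration instead of a copy, e.g.:
//   float buf0[N], buf1[N]; float* x = buf0; float* y = buf1;
//   while (x[0] > 5) { compute(y, x); float* t = x; x = y; y = t; } /* swap roles, no memcpy */
// tensor_multiview_t plays the role of that pointer swap across the unrolled body described above.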
3210 ccv_nnc_symbolic_graph_t* dup_graph = 0;
3211 int* dup_exec_ref = 0;
3212 int* dup_tensor_block_ref = 0;
3213 int unroll_count = 0;
3214 // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3215 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3216 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3217 prep->flags = 0;
3218 // Cannot handle duplicating a node that is a graph as well.
3219 if (p_exec_symbol_info)
3220 {
3221 prep->flags = p_node_info->flags;
3222 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3223 {
3224 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3225 _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3226 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3227 // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3228 }
3229 }
3230 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3231 ccv_array_t* anonymous_block_free_list = 0;
3232 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3233 // Record whether this tensor is folded in this round.
3234 uint32_t* const tensor_fold = (uint32_t*)ccmalloc(sizeof(uint32_t) * tensor_fold_size);
3235 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3236 for (p = 0; p < node->graph_ref_size; p++)
3237 {
3238 assert(symbolic_graph->sub_graphs);
3239 ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3240 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3241 ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3242 sub_prep->dup_breakpoints = dup_breakpoints;
3243 sub_prep->p = prep;
3244 sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3245 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3246 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3247 for (i = 0; i < s_alloc_prep->block_size; i++)
3248 {
3249 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3250 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3251 if (block_ref < sub_prep->tensor_symbol_info_size)
3252 {
3253 // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3254 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3255 if (s_tensor_blocks[block_ref].bypass_ref)
3256 {
3257 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3258 while (s_tensor_blocks[bypass_ref].ref)
3259 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3260 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3261 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3262 continue;
3263 }
3264 if (s_tensor_blocks[block_ref].p_refs[0])
3265 {
3266 /* If it is already properly assigned, next. */
3267 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3268 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3269 {
3270 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3271 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3272 else {
3273 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3274 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3275 }
3276 }
3277 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3278 if (s_tensor_blocks[block_ref].p_refs[1] &&
3279 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3280 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3281 {
3282 assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3283 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3284 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3285 }
3286 }
3287 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3288 /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of the anonymous block
3289 * which by default only has its life-cycle shared with this sub-graph node. The reason to extend is that
3290 * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3291 * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3292 * its life-time to the end of the output tensor. */
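/* A note on the loop below: it takes the union of every block's dup_p_refs into the owning buffer's
 * dup_p_refs (deduplicated via ccv_array_add_unique_int), so the buffer inherits the longest required
 * life-time among all blocks folded into it. */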
3293 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3294 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3295 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3296 ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3297 }
3298 }
3299 }
3300 const int init_tensor_block_size = tensor_block_size;
3301 int rw_anonymous_buffer_size_cap = 0;
3302 int ro_anonymous_buffer_size_cap = 0;
3303 if (anonymous_block_free_list)
3304 ccv_array_clear(anonymous_block_free_list);
3305 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3306 for (p = 0; p < node->graph_ref_size; p++)
3307 {
3308 ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3309 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3310 int rw_anonymous_buffer_size = 0;
3311 int ro_anonymous_buffer_size = 0;
3312 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3313 if (s_alloc_prep->buffers[i].p_refs[0])
3314 {
3315 /* Reduce 2 p_refs, if there are two, to 1 p_ref (by doing block folding). */
3316 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3317 /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3318 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3319 assert(p_ref_0_is_in_or_out != 0);
3320 int unref_p_ref_0 = p_ref_0;
3321 while (tensor_blocks[unref_p_ref_0].ref)
3322 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3323 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3324 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3325 if (s_alloc_prep->buffers[i].p_refs[1])
3326 {
3327 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3328 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3329 assert(p_ref_1_is_in_or_out != 0);
3330 int unref_p_ref_1 = p_ref_1;
3331 while (tensor_blocks[unref_p_ref_1].ref)
3332 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3333 /* See above comment for the similar p_ref_0 check. */
3334 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3335 assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3336 int p_ref_t;
3337 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3338 {
3339 CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3340 CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3341 }
3342 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3343 /* If the dimension matches, can fold. TODO: should the dimension match perfectly here? */
3344 if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3345 {
3346 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3347 if (folded)
3348 {
3349 p_ref_0 = p_ref_1;
3350 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3351 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3352 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3353 {
3354 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3355 assert(folded && "the subsequent duplicates can be folded too.");
3356 }
3357 }
3358 }
3359 }
3360 /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem),
3361 * or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3362 * its life-cycle), or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3363 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3364 * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3365 * associated with it, then we are good. */
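/* The four disjuncts of the condition below encode exactly these cases, in order: already folded
 * (tensor_fold bit set), p_ref_0 is the output whose life starts at this node, p_ref_0 is the input
 * whose life ends at this node, or the buffer is read-only; anything else takes the anonymous-buffer
 * path further down. */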
3366 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3367 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3368 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3369 TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3370 {
3371 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3372 { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3373 /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3374 * is a long argument why that is the case, the digest is, it is much easier to control your output
3375 * than your input). */
3376 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3377 s_alloc_prep->buffers[i].p_refs[1] = 0;
3378 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3379 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3380 tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3381 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3382 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3383 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3384 tensor_blocks[unref_p_ref_0].size;
3385 } else {
3386 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3387 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3388 ++ro_anonymous_buffer_size;
3389 else
3390 rw_anonymous_buffer_size += unroll_count + 1;
3391 }
3392 } else {
3393 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3394 ++ro_anonymous_buffer_size;
3395 else
3396 rw_anonymous_buffer_size += unroll_count + 1;
3397 }
3398 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3399 {
3400 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3401 // All read-write buffer (potentially) can be reused between each case..of branch.
3402 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3403 // Read-only buffer cannot be reused between each case..of branch.
3404 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3405 /* Anonymous block, allocate additional tensor blocks for this. */
3406 /* This is either because this is an internal tensor (don't have p_ref) */
3407 /* or it is an anonymous block itself within the sub graphs of this while graph. */
3408 tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3409 memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3410 if (dup_tensor_block_ref)
3411 dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3412 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3413 if (!s_alloc_prep->buffers[i].p_refs[0])
3414 {
3415 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3416 {
3417 assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3418 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3419 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3420 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3421 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3422 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3423 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3424 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3425 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3426 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3427 if (dup_p_refs && dup_p_refs->rnum > 0)
3428 {
3429 for (j = 0; j < dup_p_refs->rnum; j++)
3430 {
3431 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3432 assert(dup_p_ref >= 0);
3433 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3434 assert(tensor_blocks[dup_p_ref].tail);
3435 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3436 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3437 if (tensor_symbol_info[dup_p_ref].p_ref)
3438 {
3439 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3440 assert(p_node_info);
3441 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3442 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3443 {
3444 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3445 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3446 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3447 }
3448 }
3449 if (!tensor_blocks[tensor_block_size].tail)
3450 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3451 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3452 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3453 }
3454 } else {
3455 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3456 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3457 }
3458 for (j = 0; j < source_size; j++)
3459 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3460 /* If this is read-only (based on SSA, if first encountered as read), and this is a
3461 * sub-graph, mark it to the end of the graph. */
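/* Concretely, the code below keeps this read-only block alive across the whole body: its head is this
 * node, every source exec is attached, and, when this is a sub-graph (p_exec_symbol_info is set),
 * every destination exec is attached as well. */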
3462 if (p_exec_symbol_info)
3463 for (j = 0; j < destination_size; j++)
3464 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3465 /* If it is read-only, it is self-reflecting. */
3466 for (k = 0; k < unroll_count; k++)
3467 {
3468 for (j = 0; j < destination_size; j++)
3469 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3470 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3471 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3472 assert(symbolic_graph->p);
3473 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3474 }
3475 ++tensor_block_size;
3476 } else {
3477 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3478 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3479 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3480 // Find suitable tensor block from the free list.
3481 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3482 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3483 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3484 if (new_anonymous_tensor_block)
3485 {
3486 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3487 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3488 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3489 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3490 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3491 } else {
3492 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3493 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3494 }
3495 if (dup_p_refs && dup_p_refs->rnum > 0)
3496 {
3497 for (j = 0; j < dup_p_refs->rnum; j++)
3498 {
3499 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3500 assert(dup_p_ref >= 0);
3501 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3502 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3503 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3504 if (tensor_symbol_info[dup_p_ref].p_ref)
3505 {
3506 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3507 assert(p_node_info);
3508 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3509 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3510 {
3511 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3512 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3513 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3514 }
3515 }
3516 assert(tensor_blocks[dup_p_ref].tail);
3517 if (!tensor_blocks[tensor_block_idx].tail)
3518 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3519 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3520 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3521 // We have to add it to the wrap-around companion_ref as well.
3522 // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3523 // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's
3524 // definition is too free-form and if we enforce a stronger guarantee on this (such as it must wrap around), this
3525 // guarantee may be broken down the line.
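// The head/tail attachments below therefore pin this anonymous block to the companion block's whole
// span, which is what makes the in-between space effectively occupied as noted above.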
3526 if (tensor_blocks[dup_p_ref].companion_ref)
3527 {
3528 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3529 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3530 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3531 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3532 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3533 }
3534 }
3535 } else if (new_anonymous_tensor_block) {
3536 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3537 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3538 }
3539 const int prev_tensor_block_idx = tensor_block_idx;
3540 if (new_anonymous_tensor_block)
3541 {
3542 if (!anonymous_block_free_list)
3543 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3544 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3545 ++tensor_block_size;
3546 }
3547 for (k = 0; k < unroll_count; k++)
3548 {
3549 const int tensor_block_idx = new_anonymous_tensor_block ?
3550 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3551 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3552 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3553 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3554 if (new_anonymous_tensor_block)
3555 {
3556 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3557 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3558 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3559 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3560 /* Attach to duplicated exec for this tensor block. */
3561 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3562 } else {
3563 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3564 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3565 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3566
3567 }
3568 if (dup_p_refs && dup_p_refs->rnum > 0)
3569 {
3570 /* Not nil, not self-reflecting. */
3571 for (j = 0; j < dup_p_refs->rnum; j++)
3572 {
3573 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3574 assert(dup_p_ref >= 0);
3575 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3576 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3577 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3578 if (tensor_symbol_info[dup_p_ref].p_ref)
3579 {
3580 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3581 assert(p_node_info);
3582 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3583 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3584 {
3585 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3586 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3587 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3588 }
3589 }
3590 assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3591 const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3592 assert(tensor_blocks[dup_dup_p_ref].tail);
3593 if (!tensor_blocks[tensor_block_idx].tail)
3594 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3595 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3596 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3597 // We have to add it to the wrap-around companion_ref as well.
3598 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3599 {
3600 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3601 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3602 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3603 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3604 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3605 }
3606 }
3607 } else if (new_anonymous_tensor_block) {
3608 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3609 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3610 }
3611 if (new_anonymous_tensor_block)
3612 ++tensor_block_size;
3613 }
3614 }
3615 }
3616 }
3617 }
3618 } ccv_nnc_graph_visit_endfor
3619 if (anonymous_block_free_list)
3620 ccv_array_free(anonymous_block_free_list);
3621 ccfree(tensor_fold);
3622 // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_prep will return
3623 // the allocation dependencies, thus, which tensor reuses the memory of an existing tensor.
3624 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3625 prep->while_count_tensor = 0;
3626 prep->dup_breakpoints = 0;
3627 prep->p = 0;
3628 prep->symbolic_graph = symbolic_graph;
3629 prep->p_idx = symbolic_graph->p_idx;
3630 prep->exec_idx = symbolic_graph->exec_idx;
3631 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3632 prep->sub_preps = sub_preps;
3633 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3634 prep->exec_symbol_info = exec_symbol_info;
3635 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3636 prep->tensor_symbol_info = tensor_symbol_info;
3637 prep->unroll_count = unroll_count;
3638 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3639 prep->tensor_block_size = tensor_block_size;
3640 prep->tensor_blocks = tensor_blocks;
3641 prep->exec_flags = exec_flags;
3642 prep->visit = visit;
3643 prep->alloc_prep = alloc_prep;
3644 if (dup_graph)
3645 ccv_nnc_symbolic_graph_free(dup_graph);
3646 if (dup_exec_ref)
3647 ccfree(dup_exec_ref);
3648 return prep;
3649}
3650
3651static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3652{
3653 int i;
3654 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3655 ccfree(prep->exec_flags);
3656 for (i = 0; i < prep->sub_prep_size; i++)
3657 if (prep->sub_preps[i])
3658 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3659 if (prep->sub_preps)
3660 ccfree(prep->sub_preps);
3661 ccfree(prep->tensor_symbol_info);
3662 ccfree(prep->exec_symbol_info);
3663 if (prep->dup_tensor_block_ref)
3664 ccfree(prep->dup_tensor_block_ref);
3665 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3666 ccv_nnc_graph_visit_free(prep->visit);
3667 ccfree(prep);
3668}
3669
3670static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3671{
3672 int i, j;
3673 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
3674 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3675 {
3676 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3677 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3677, __extension__ __PRETTY_FUNCTION__
); }))
;
3678 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3679 for (i = 0; i < node->p_while.input_size; i++)
3680 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3681 {
3682 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3683 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3684 for (j = 0; j < d; j++)
3685 prep = prep->p;
3686 prep->while_count_tensor = 1;
3687 }
3688 }
3689 for (i = 0; i < node->graph_ref_size; i++)
3690 {
3691 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3692 if (graph_ref >= 0)
3693 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3694 }
3695 } ccv_nnc_graph_visit_endfor} }
3696}
3697
3698static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3699{
3700 if (symbol >= 0)
3701 return graph_prep->tensor_arena->vt_tensors[symbol];
3702 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3703 return 0;
3704 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3704, __extension__ __PRETTY_FUNCTION__
); }))
;
3705 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3706 int i;
3707 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3708 for (i = 0; i < d; i++)
3709 prep = prep->p;
3710 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3710, __extension__ __PRETTY_FUNCTION__
); }))
;
3711 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3712}
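The decode macros expanded above treat a tensor symbol whose low nibble is 0xe as a while-count reference, and recover the nesting depth with (~symbol) >> 4; the depth is then used to walk up prep->p that many levels. A small round-trip sketch using exactly those two decode expressions (the encode helper is only an illustration, not the library's macro):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Decode expressions copied from the expansions above. */
#define IS_WHILE_COUNT(sym) (((uint32_t)(sym) & 0xf) == 0xe)
#define DECODE_DEPTH(sym) ((~(uint32_t)(sym)) >> 4)

/* Hypothetical encoder: any value whose bitwise complement equals
 * (depth << 4) | 1 satisfies both decoders above. */
static int32_t encode_while_count(const uint32_t depth)
{
	return (int32_t)~((depth << 4) | 0x1);
}

int main(void)
{
	const int32_t sym = encode_while_count(2); /* while-count owned two graph levels up */
	assert(IS_WHILE_COUNT(sym));
	assert(DECODE_DEPTH(sym) == 2);
	printf("symbol %d decodes to depth %u\n", sym, DECODE_DEPTH(sym));
	return 0;
}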
3713
3714static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3715{
3716 int i;
3717 int* const exec_cvt = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
3718 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3719 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3720 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3721 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3722 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3723 if (graph_execs[i].graph == graph)
3724 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3725 ccfree(exec_cvt);
3726}
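ccv_nnc_graph_topsort topologically sorts the graph's execs and reports the old-to-new index mapping in exec_cvt, so every ccv_nnc_graph_exec_t handle recorded before the sort (source, destination, and the per-symbol graph_execs) has to be remapped, which is all the loop above does. A tiny standalone sketch of the same remapping pattern (data is illustrative):

#include <stdio.h>

int main(void)
{
	/* exec_cvt[old] == new, e.g. as produced by a topological sort. */
	const int exec_cvt[4] = { 2, 0, 3, 1 };
	int handles[3] = { 0, 2, 3 }; /* handles recorded before the sort */
	for (int i = 0; i < 3; i++)
		handles[i] = exec_cvt[handles[i]];
	for (int i = 0; i < 3; i++)
		printf("%d ", handles[i]); /* prints: 2 3 1 */
	printf("\n");
	return 0;
}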
3727
3728static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3729{
3730 int i, j, k;
3731 ccv_nnc_graph_t* const graph = graph_prep->graph;
3732 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3733 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
Step 1: Uninitialized value stored to field 'graph'
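The single ccmalloc on line 3733 carves one heap block into the arena header, the graph_execs slots (the -1 suggests a one-element trailing array in the struct), and the sub_arenas pointer table that line 3737 places right after those slots. malloc does not zero this memory, so graph_execs[i].graph only becomes defined once the loop on lines 3741-3749 stores 0 into it. A minimal sketch of the same over-allocation layout, with assumed member names:

#include <stdlib.h>

typedef struct { void* graph; int d; } exec_t; /* stand-in for ccv_nnc_graph_exec_t */
typedef struct {
	int size;
	exec_t execs[1]; /* one-element array, over-allocated below */
} arena_t;

static arena_t* arena_new(const int n, const int sub)
{
	/* One allocation: header + (n - 1) extra exec slots + sub-arena pointer table. */
	arena_t* const arena = (arena_t*)malloc(sizeof(arena_t) + sizeof(exec_t) * (n - 1) + sizeof(void*) * sub);
	arena->size = n;
	/* Without this loop the exec slots hold indeterminate bytes, which is what
	 * the analyzer later tracks into the read of .graph. */
	for (int i = 0; i < n; i++)
		arena->execs[i].graph = 0;
	return arena;
}

int main(void)
{
	arena_t* const arena = arena_new(4, 1);
	free(arena);
	return 0;
}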
3734 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3735 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3736 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3737 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3738 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3739 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3740 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3741 for (i = 0; i < exec_symbol_info_size; i++)
Step 2: Assuming 'i' is >= 'exec_symbol_info_size'
Step 3: Loop condition is false. Execution continues on line 3750
3742 {
3743 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3744 max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3745 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3746 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3747 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3748 graph_execs[i].graph = 0;
3749 }
3750 for (i = 0; i < graph_prep->sub_prep_size; i++)
Step 4: Assuming 'i' is >= field 'sub_prep_size'
Step 5: Loop condition is false. Execution continues on line 3752
3751 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3752 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
Step 6: '?' condition is true
3753 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
Step 7: '?' condition is true
3754 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
Step 8: '?' condition is true
3755 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3756 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3757 // Create nodes; this is in topological order.
3758 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
Step 9: Assuming '_i_' is < field 'size'
Step 10: Loop condition is true. Entering loop body
3759 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
Step 11: The left operand of '==' is a garbage value
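On the analyzer's path, step 2 assumes exec_symbol_info_size is 0, so the loop on lines 3741-3749 never stores into graph_execs, yet steps 9-10 still enter the visit loop; the read of graph_execs[idx].graph on line 3759 is therefore flagged as garbage. In the real graph the visit only contains execs of this symbolic graph, so a non-empty visit should imply exec_symbol_info_size > 0, but that invariant is not visible to the checker; asserting exec_symbol_info_size, or zero-initializing the slots (e.g. with cccalloc), would likely silence the report. A reduced, self-contained sketch of the flagged pattern (illustrative names; it deliberately reads indeterminate memory on this path):

#include <stdlib.h>

typedef struct { void* graph; } exec_t;

int main(void)
{
	const int exec_size = 0;  /* step 2: the analyzer assumes the symbol count is 0  */
	const int visit_size = 1; /* steps 9-10: but the visit is assumed to have a node */
	exec_t* const execs = (exec_t*)malloc(sizeof(exec_t) * (exec_size + 1));
	for (int i = 0; i < exec_size; i++) /* initialization loop: never runs on this path */
		execs[i].graph = 0;
	for (int i = 0; i < visit_size; i++)
		if (execs[i].graph == 0)    /* reads an indeterminate value, as at line 3759 */
			execs[i].graph = (void*)1;
	free(execs);
	return 0;
}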
3760 {
3761 for (i = 0; i < node->input_size; i++)
3762 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3763 for (i = 0; i < node->output_size; i++)
3764 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3765 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3766 {
3767 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3768 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3768, __extension__ __PRETTY_FUNCTION__
); }))
;
3769 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3770 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3771 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3772 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3773 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3774 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3775 for (i = 0; i < node->p_while.input_size; i++)
3776 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3777 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3778 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3779 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3780 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3781 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3782 for (i = 0; i < node->output_size; i++)
3783 if (max_outputs[i] && max_outputs[i]->alias_ref)
3784 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3785 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3786 // Check whether this is already covered in the inputs, if not, need to be covered in the update.
3787 for (i = 0; i < node->case_of.argument.offset; i++)
3788 {
3789 ccv_nnc_tensor_t* const update = max_inputs[i];
3790 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3791 continue;
3792 int flag = 0;
3793 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3794 flag = (update == max_inputs[j]);
3795 if (!flag)
3796 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3797 }
3798 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3799 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3800 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3801 {
3802 // Add another graph for data transfer.
3803 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3804 for (i = 0; i < node->output_size; i++)
3805 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3806 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3807 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3808 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3809 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3810 int exec_cvt;
3811 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3812 }
3813 for (i = 0; i < node->graph_ref_size; i++)
3814 {
3815 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3816 if (graph_ref < 0)
3817 continue;
3818 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3819 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3820 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3821 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3822 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3823 }
3824 } else {
3825 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3826 }
3827 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3828 }
3829 } ccv_nnc_graph_visit_endfor} }
3830 // Then connect them.
3831 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3832 if (node->outgoings)
3833 for (i = 0; i < node->outgoings->rnum; i++)
3834 {
3835 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3836 if (graph_execs[outgoing].graph)
3837 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3838 }
3839 } ccv_nnc_graph_visit_endfor} }
3840 int source_exec_created = 0;
3841 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3842 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3843 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3844 // After the graph is materialized, we need to handle the case where some of these tensors need to be initialized to zero (or one) before use.
3845 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3846 {
3847 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
3848 {
3849 int ref = i;
3850 while (tensor_symbol_info[ref].alias_ref)
3851 ref = tensor_symbol_info[ref].alias_ref - 1;
3852 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
3853 ref = tensor_blocks[ref].ref - 1;
3854 // This is not computable. It could be that we marked a const tensor as init zero.
3855 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3856 continue;
3857 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3858 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3859 continue;
3860 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3861 // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3862 ccv_nnc_graph_exec_t set_exec;
3863 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3864 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3865 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3866 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3867 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3868 {
3869 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3870 if (outgoing >= exec_symbol_info_size)
3871 continue;
3872 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3872, __extension__ __PRETTY_FUNCTION__
); }))
;
3873 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3873, __extension__ __PRETTY_FUNCTION__
); }))
;
3874 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3875 }
3876 int flags = 0;
3877 if (alloc_dep[ref])
3878 for (j = 0; j < alloc_dep[ref]->rnum; j++)
3879 {
3880 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3881 // This is from alloc_dep, it should be computable.
3882 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3882, __extension__ __PRETTY_FUNCTION__
); }))
;
3883 if (tensor_blocks[d].tail)
3884 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3885 {
3886 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3887 if (incoming >= exec_symbol_info_size)
3888 continue;
3889 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3889, __extension__ __PRETTY_FUNCTION__
); }))
;
3890 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3890, __extension__ __PRETTY_FUNCTION__
); }))
;
3891 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3892 flags = 1;
3893 }
3894 }
3895 // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3896 if (!flags)
3897 {
3898 if (!source_exec_created)
3899 {
3900 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3901 source_exec_created = 1;
3902 }
3903 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3904 }
3905 }
3906 }
3907 // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views
3908 // (we need that if the multi-view is not associated as an input / output of any exec, which is possible if all execs
3909 // associate with its alias).
3910 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3910, __extension__ __PRETTY_FUNCTION__
); }))
;
3911 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3912 {
3913 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3914 // If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3915 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3916 {
3917 const ccv_array_t* const head = tensor_blocks[i].head;
3918 if (head && head->rnum > 0)
3919 for (j = 0; j < head->rnum; j++)
3920 {
3921 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3922 if (idx >= exec_symbol_info_size)
3923 continue;
3924 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3924, __extension__ __PRETTY_FUNCTION__); }))
;
3925 const int d = graph_execs[idx].d;
3926 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3927 int flag = 0;
3928 if (exec_info->tensor_wraps_ref)
3929 {
3930 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3931 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3932 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3933 }
3934 // If none of the tensor wraps set the flag, it needs to be included in the broadcast.
3935 if (!flag)
3936 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3937 }
3938 }
3939 }
3940 // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3941 // Also, this is needed if you have init-zero execs.
3942 if (source_exec_created || source_size > 1)
3943 {
3944 if (!source_exec_created)
3945 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3946 for (i = 0; i < source_size; i++)
3947 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3948 } else {
3949 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3949, __extension__ __PRETTY_FUNCTION__
); }))
;
3950 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3950, __extension__ __PRETTY_FUNCTION__
); }))
;
3951 graph_exec_arena->source = graph_execs[sources[0].d];
3952 }
3953 if (destination_size == 1)
3954 graph_exec_arena->destination = graph_execs[destinations[0].d];
3955 else {
3956 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3957 for (i = 0; i < destination_size; i++)
3958 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3959 }
3960 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3961 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3962 return graph_exec_arena;
3963}
3964
3965static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3966{
3967 if (graph_prep->symbolic_graph == pair)
3968 return graph_prep->graph;
3969 int i;
3970 for (i = 0; i < graph_prep->sub_prep_size; i++)
3971 if (graph_prep->sub_preps[i])
3972 {
3973 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3974 if (graph)
3975 return graph;
3976 }
3977 return 0;
3978}
3979
3980static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3981{
3982 int i;
3983 for (i = 0; i < graph_prep->sub_prep_size; i++)
3984 if (graph_prep->sub_preps[i])
3985 {
3986 if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3987 graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3988 }
3989}
3990
3991static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3992{
3993 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3993, __extension__ __PRETTY_FUNCTION__
); }))
;
3994 int i;
3995 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
3996 {
3997 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
3998 continue;
3999 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4000 {
4001 ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4002 .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4003 .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4004 });
4005 if (pair_exec.d >= 0)
4006 ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4007 }
4008 }
4009 for (i = 0; i < graph_prep->sub_prep_size; i++)
4010 if (graph_prep->sub_preps[i])
4011 _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4012}
4013
4014static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4015{
4016 int i;
4017 if (graph_prep->dup_breakpoints)
4018 {
4019 // Stripping the const modifier is only possible because it is a sub-graph.
4020 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4021 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4022 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
4023 ccv_array_free(graph_prep->dup_breakpoints);
4024 graph_prep->dup_breakpoints = 0;
4025 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4026 // Afterwards, we have to regenerate the exec_symbol_info and fill in the information (through symbol_infer).
4027 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4028 // Since exec_symbol_info changed, create a new visit object.
4029 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4029, __extension__ __PRETTY_FUNCTION__
); }))
;
4030 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4030, __extension__ __PRETTY_FUNCTION__); }))
;
4031 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
4032 const int source_size = symbolic_graph->sources->rnum;
4033 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
4034 const int destination_size = symbolic_graph->destinations->rnum;
4035 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4035, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4035, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4035, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
4036 ccv_nnc_graph_visit_free(graph_prep->visit);
4037 graph_prep->visit = visit;
4038 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); }))
;
4039 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4040 }
4041 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
4042 for (i = 0; i < node->graph_ref_size; i++)
4043 {
4044 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
4045 if (graph_ref >= 0)
4046 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4047 }
4048 } ccv_nnc_graph_visit_endfor} }
4049}
4050
4051const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4052
4053void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4054{
4055 assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4055, __extension__ __PRETTY_FUNCTION__); }))
;
4056 assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4056, __extension__ __PRETTY_FUNCTION__
); }))
;
4057 assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4057, __extension__ __PRETTY_FUNCTION__
); }))
;
4058 int i;
4059 // Cannot bind the multi-view.
4060 for (i = 0; i < tensor_bind_size; i++)
4061 {
4062 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); }))
;
4063 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4063, __extension__ __PRETTY_FUNCTION__
); }))
;
4064 }
4065 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4066 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4067 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4068 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4069 *tensor_arena_ref = tensor_arena;
4070 // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
4071 _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4072 // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
4073 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4074 *graph_ref = graph_prep->graph;
4075 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4076 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4077 _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4078 *graph_exec_arena_ref = graph_exec_arena;
4079 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4080}
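For context, the whole pipeline above (prep, tensor arena, exec arena, topsort, pair fix-ups) hangs off this one public entry point. A minimal sketch of a caller, using only the signature shown here plus ccv_nnc_default_compile_params; the wrapper itself is illustrative and not part of the library:

#include "ccv_nnc.h"

/* Compile sg with default parameters and no explicit tensor binds; the caller
 * owns and must later free the returned graph and the two arenas. */
static ccv_nnc_graph_t* compile_with_defaults(const ccv_nnc_symbolic_graph_t* const sg,
	const ccv_nnc_tensor_symbol_t* const outputs, const int output_size,
	ccv_nnc_tensor_arena_t** const tensor_arena, ccv_nnc_graph_exec_arena_t** const graph_exec_arena)
{
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_symbolic_graph_compile(sg, ccv_nnc_default_compile_params,
		0, 0, /* tensor_binds, tensor_bind_size */
		outputs, output_size,
		ccv_nnc_symbolic_graph_sources(sg), ccv_nnc_symbolic_graph_source_size(sg),
		ccv_nnc_symbolic_graph_destinations(sg), ccv_nnc_symbolic_graph_destination_size(sg),
		&graph, tensor_arena, graph_exec_arena);
	return graph;
}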
4081
4082static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4083{
4084 // Buffers are inherited from above, no need to dealloc.
4085 int i;
4086 for (i = 0; i < tensor_arena->sub_arena_size; i++)
4087 if (tensor_arena->sub_arenas[i])
4088 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4089 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4090 {
4091 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
(size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i)))
);
4092 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4092, __extension__ __PRETTY_FUNCTION__
); }))
;
4093 ccv_nnc_tensor_multiview_free(*mv);
4094 }
4095 ccv_array_free(tensor_arena->tensor_metadata);
4096 ccv_array_free(tensor_arena->m_tensor_idx);
4097 if (tensor_arena->pb_vt_tensors)
4098 ccfree(tensor_arena->pb_vt_tensors);
4099 if (tensor_arena->vt_alias_r_refs_p)
4100 ccfree(tensor_arena->vt_alias_r_refs_p);
4101 if (tensor_arena->vt_sizes)
4102 ccfree(tensor_arena->vt_sizes);
4103 ccfree(tensor_arena);
4104}
4105
4106void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4107{
4108 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
== (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4108, __extension__ __PRETTY_FUNCTION__
); }))
;
4109 assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4109, __extension__ __PRETTY_FUNCTION__
); }))
;
4110 assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4110, __extension__ __PRETTY_FUNCTION__
); }))
;
4111 // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4112 int i;
4113 if (!tensor_arena->pb_vt_tensors)
4114 {
4115 tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4116 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4117 if (tensor_arena->vt_tensors[i])
4118 tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4119 }
4120 if (!tensor_arena->vt_alias_r_refs_p)
4121 {
4122 tensor_arena->vt_alias_r_refs_p = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4123 tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4124 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4125 if (tensor_arena->vt_alias_refs[i])
4126 {
4127 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4128 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4128, __extension__ __PRETTY_FUNCTION__
); }))
;
4129 ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many aliases there are.
4130 }
4131 int refp = 0;
4132 for (i = 1; i < tensor_arena->vt_tensor_size; i++) // For each base with aliases, compute its position in vt_alias_r_refs; the value points to the end of its bucket.
4133 if (tensor_arena->vt_alias_r_refs_p[i])
4134 refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4135 else
4136 tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4137 for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4138 tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4139 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4140 if (tensor_arena->vt_alias_refs[i])
4141 {
4142 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4143 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4143, __extension__ __PRETTY_FUNCTION__
); }))
;
4144 const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4145 assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4145, __extension__ __PRETTY_FUNCTION__); }))
;
4146 tensor_arena->vt_alias_r_refs[pos] = i;
4147 }
4148 }
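The lazily built index above is a counting sort: count how many aliases point at each base tensor, turn the non-zero counts into end-of-bucket offsets (vt_alias_r_refs_p), then place every alias index by decrementing its bucket's cursor, so vt_alias_r_refs ends up holding the aliases of each base grouped together. A standalone sketch of the same bucketing on toy data:

#include <stdio.h>

#define N 6

int main(void)
{
	/* refs[i] is the base index that alias i points at, or -1 for non-aliases. */
	const int refs[N] = { -1, 0, 0, -1, 3, 0 };
	int bucket_end[N] = { 0 }; /* plays the role of vt_alias_r_refs_p */
	int out[N];                /* plays the role of vt_alias_r_refs   */
	int i, total = 0;
	for (i = 0; i < N; i++)
		out[i] = -1;
	for (i = 0; i < N; i++) /* 1) count aliases per base */
		if (refs[i] >= 0)
			++bucket_end[refs[i]];
	for (i = 0; i < N; i++) /* 2) running totals: bucket_end[i] is one past bucket i */
		if (bucket_end[i])
			total = (bucket_end[i] += total);
	for (i = 0; i < N; i++) /* 3) place from the end of each bucket */
		if (refs[i] >= 0)
			out[--bucket_end[refs[i]]] = i;
	for (i = 0; i < N; i++)
		printf("%d ", out[i]); /* 5 2 1 4 -1 -1: aliases of 0 first, then of 3 */
	printf("\n");
	return 0;
}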
4149 const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4150 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4151 {
4152 assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4152, __extension__ __PRETTY_FUNCTION__
); }))
; // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4153 assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
4154 ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
4155 (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
4156 } else
4157 { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
4158 if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
4159 { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
4160 tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4161 if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4162 for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4163 {
4164 const int d = tensor_arena->vt_alias_r_refs[i];
4165 if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4166 break;
4167 ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4168 d_tensor->info.datatype = tensor->info.datatype;
4169 d_tensor->info.reserved = tensor->info.reserved;
4170 if (CCV_IS_TENSOR_VIEW(d_tensor))
4171 ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4172 else {
4173 d_tensor->data.u8 = tensor->data.u8;
4174 d_tensor->dataof = tensor->dataof;
4175 }
4176 }
4177}
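/* Illustrative sketch, not part of the original file: the vt_alias_r_refs bookkeeping
 * above is a counting-sort style grouping. First count how many aliases point at each
 * base tensor, prefix-sum the counts so each slot points at the end of its group, then
 * walk the aliases again and place each one by decrementing its group's cursor. The
 * standalone helper below shows the same technique on plain arrays; the names
 * example_group_by_counting, group_of and placed are hypothetical. */
static void example_group_by_counting(const int* const group_of, const int n, const int ngroups, int* const cursor, int* const placed)
{
	int i, run = 0;
	for (i = 0; i < ngroups; i++)
		cursor[i] = 0;
	for (i = 0; i < n; i++)
		++cursor[group_of[i]]; // Count members per group, like ++vt_alias_r_refs_p[alias_ref].
	for (i = 0; i < ngroups; i++)
		run = (cursor[i] += run); // Prefix sum: cursor[i] now points one past the end of group i.
	for (i = 0; i < n; i++)
		placed[--cursor[group_of[i]]] = i; // Fill each group from its end, like --vt_alias_r_refs_p[alias_ref].
}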
4178
4179void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4180{
4181 if (!tensor_arena->pb_vt_tensors)
4182 return;
4183 int i;
4184 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4185 if (tensor_arena->vt_tensors[i])
4186 tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4187}
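/* Usage sketch (hypothetical, not from the original file): re-bind an external tensor to
 * a symbol before a run, then restore the arena's saved data pointers afterwards.
 * Assumes a tensor arena and a symbol obtained elsewhere (e.g. from
 * ccv_nnc_symbolic_graph_compile); pb_vt_tensors is only populated once a binding has
 * been made, so ccv_nnc_tensor_arena_clear_bindings is a no-op before that. */
static void example_bind_and_clear(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, ccv_nnc_tensor_t* const input)
{
	ccv_nnc_tensor_bind_symbol(tensor_arena, symbol, input); // Point the symbol's tensor (and its aliases) at input's memory.
	/* ... run the concrete graph here ... */
	ccv_nnc_tensor_arena_clear_bindings(tensor_arena); // Restore the pre-binding data pointers saved in pb_vt_tensors.
}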
4188
4189uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4190{
4191 uint64_t total_size = 0;
4192 int i;
4193 for (i = 0; i < tensor_arena->buffer_size; i++)
4194 total_size += tensor_arena->buffers[i].size;
4195 return total_size;
4196}
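/* Usage sketch (hypothetical): report how much backing memory the compiled graph reserved
 * across all of its buffers. Assumes <stdio.h>; the cast avoids depending on a PRIu64
 * format macro. */
#include <stdio.h>
static void example_report_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
{
	printf("tensor arena backing storage: %llu bytes\n", (unsigned long long)ccv_nnc_tensor_arena_size(tensor_arena));
}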
4197
4198static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4199{
4200 int i;
4201 if (mv->it)
4202 mv->it->info = params;
4203 for (i = 0; i < mv->repeat + mv->kind; i++)
4204 {
4205 ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
4206 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4207 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4208 else
4209 tensor->info = params;
4210 }
4211}
4212
4213int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4214{
4215 int i;
4216 assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
4217 if (!tensor_arena->vt_sizes) // Keep the original size so we can check against it to see if we will overflow.
4218 {
4219 tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4220 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4221 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4222 {
4223 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4224 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4225 {
4226 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4227 while (CCV_IS_TENSOR_MULTIVIEW(mv))
4228 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
4229 tensor = (ccv_nnc_tensor_t*)mv;
4230 }
4231 tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4232 }
4233 }
4234 int flag = 0;
4235 for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4236 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4237 {
4238 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4239 ccv_nnc_tensor_param_t params = symbol_info->info;
4240 params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4241 params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4242 flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4243 }
4244 if (flag)
4245 return -1;
4246 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4247 if (tensor_arena->vt_tensors[i])
4248 {
4249 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4250 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4251 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4252 {
4253 assert(!tensor_arena->vt_alias_refs[i]);
4254 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4255 } else if (!tensor_arena->vt_alias_refs[i]) {
4256 ccv_nnc_tensor_param_t params = symbol_info->info;
4257 params.datatype = tensor->info.datatype;
4258 params.reserved = tensor->info.reserved;
4259 tensor->info = params;
4260 } else {
4261 off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4262 ccv_nnc_tensor_param_t params = symbol_info->info;
4263 params.datatype = tensor->info.datatype;
4264 params.reserved = tensor->info.reserved;
4265 tensor->info = params;
4266 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4267 ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4268 if (CCV_IS_TENSOR_VIEW(tensor))
4269 ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4270 }
4271 }
4272 // We should handle sub_tensor_arena; we don't do that at the moment.
4273 assert(!graph->sub_graphs);
4274 return 0;
4275}
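/* Illustrative sketch (hypothetical): the overflow guard inside ccv_nnc_tensor_arena_reinit
 * above. The first reinit caches each non-alias tensor's byte size in vt_sizes; later
 * reinits return -1 if any symbol's new parameters would need more bytes than that cached
 * size, because the backing buffers were carved out for the original shapes and cannot
 * grow in place. example_params_fit is a made-up name for the per-tensor check. */
static int example_params_fit(const size_t cached_size, ccv_nnc_tensor_param_t params, const ccv_nnc_tensor_t* const existing)
{
	params.datatype = existing->info.datatype; // Keep the datatype the arena actually uses,
	params.reserved = existing->info.reserved; // mirroring what the reinit loop above does.
	return ccv_nnc_tensor_data_size(params) <= cached_size; // 1 if the new shape still fits in place.
}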
4276
4277void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4278{
4279 assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4280 int i;
4281 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4282 {
4283 const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4284 if (graph_exec.d < 0)
4285 continue;
4286 const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4287 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4288 ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4289 if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replace the backend and algorithm with the existing ones, which have presumably been autotuned.
4290 {
4291 new_cmd.backend = existing_cmd.backend;
4292 new_cmd.algorithm = existing_cmd.algorithm;
4293 }
4294 ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4295 }
4296}
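/* Usage sketch (hypothetical): after changing tensor symbol shapes on the symbolic graph,
 * try to reuse the existing allocation and executable graph. A non-zero return from
 * ccv_nnc_tensor_arena_reinit means at least one tensor no longer fits, so the caller has
 * to fall back to a full ccv_nnc_symbolic_graph_compile pass (not shown here). */
static int example_try_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
{
	if (ccv_nnc_tensor_arena_reinit(tensor_arena, symbolic_graph) != 0)
		return -1; // New shapes exceed the originally reserved sizes; recompile instead.
	ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph); // Refresh commands, keeping autotuned backend/algorithm where the cmd matches.
	return 0;
}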
4297
4298void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4299{
4300 int i;
4301 for (i = 0; i < tensor_arena->buffer_size; i++)
4302 {
4303 if (!tensor_arena->buffers[i].ptr)
4304 continue;
4305 const int buffer_type = tensor_arena->buffers[i].type;
4306 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4307#ifdef HAVE_CUDA
4308 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4309 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4310 {
4311 if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4312 tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4313 else
4314 cufree(device_id, tensor_arena->buffers[i].ptr);
4315 } else {
4316 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4317 if (tensor_arena->buffers[i].pin_mem)
4318 cuhostfree(tensor_arena->buffers[i].ptr);
4319 else
4320 ccfree(tensor_arena->buffers[i].ptr);
4321 }
4322#elif defined(HAVE_MPS)
4323 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4324 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4325 {
4326 // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4327 // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4328 // else
4329 mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4330 } else {
4331 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4332 ccfree(tensor_arena->buffers[i].ptr);
4333 }
4334#else
4335 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4336 ccfree(tensor_arena->buffers[i].ptr);
4337#endif
4338 tensor_arena->buffers[i].ptr = 0;
4339 }
4340 // For now, the life-cycle of the disposers lives with the buffer. It may end before the tensor arena deallocates.
4341 if (tensor_arena->disposers)
4342 {
4343 for (i = 0; i < tensor_arena->disposers->rnum; i++)
4344 {
4345 ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
4346 disposer->dispose(disposer->ptr, disposer->userdata);
4347 }
4348 ccv_array_free(tensor_arena->disposers);
4349 tensor_arena->disposers = 0;
4350 }
4351}
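/* Usage sketch (hypothetical): release the backing buffers early, e.g. once the graph's
 * outputs have been copied out, while keeping the arena bookkeeping around; the arena
 * itself still has to be freed later. Calling both is safe here because freed buffer
 * pointers are zeroed and skipped on the second pass. */
static void example_early_buffer_release(ccv_nnc_tensor_arena_t* const tensor_arena)
{
	ccv_nnc_tensor_arena_buffer_free(tensor_arena); // Drop CPU/GPU backing memory and run any registered disposers.
	/* ... the arena's metadata can still be inspected here ... */
	ccv_nnc_tensor_arena_free(tensor_arena); // Final teardown of the arena structure itself.
}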
4352
4353void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4354{
4355 ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4356 _ccv_nnc_tensor_arena_free(tensor_arena);
4357}
4358
4359void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4360{
4361 int i;
4362 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4363 if (graph_exec_arena->sub_arenas[i])
4364 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4365 ccfree(graph_exec_arena);
4366}
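/* Usage sketch (hypothetical): a typical teardown order once a compiled graph is no longer
 * needed. ccv_nnc_graph_free comes from the wider public API declared in ccv_nnc.h; the
 * two arena destructors are the ones defined in this file. */
static void example_teardown(ccv_nnc_graph_t* const graph, ccv_nnc_tensor_arena_t* const tensor_arena, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
{
	ccv_nnc_graph_free(graph); // The concrete graph does not own the arenas, so free each separately.
	ccv_nnc_tensor_arena_free(tensor_arena); // Releases any remaining buffers, then the arena bookkeeping.
	ccv_nnc_graph_exec_arena_free(graph_exec_arena); // Recursively frees sub-arenas, then itself.
}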