Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3514, column 7
The left operand of '==' is a garbage value
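For context: this diagnostic comes from the analyzer's core checkers and fires when an indeterminate (uninitialized) value reaches one side of a binary operator. The snippet below is a minimal, self-contained sketch of that defect class only; it is not the code at line 3514 (which is outside the excerpt shown below), and every name in it is hypothetical.

#include <stdio.h>

/* Hypothetical helper: writes *out only when it succeeds. */
static int lookup(int key, int* out)
{
	if (key > 0)
	{
		*out = key * 2;
		return 1;
	}
	return 0; /* Failure path leaves *out untouched. */
}

int main(void)
{
	int v; /* Not initialized. */
	lookup(-1, &v); /* Fails here, so v still holds an indeterminate value. */
	if (v == 0) /* Analyzer: "The left operand of '==' is a garbage value". */
		printf("v is zero\n");
	return 0;
}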

Annotated Source Code

clang -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -target-feature +sse2 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -resource-dir /usr/local/lib/clang/8.0.0 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_UCONTEXT -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D USE_DISPATCH -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -I /usr/local/include -internal-isystem /usr/local/include -internal-isystem /usr/local/lib/clang/8.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdebug-compilation-dir /home/liu/buildslave/linux-x64-runtests/build/lib/nnc -ferror-limit 19 -fmessage-length 0 -fblocks -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -o /home/liu/buildslave/public_html/analyze/2019-07-03-215927-77989-1 -x c ccv_nnc_symbolic_graph_compile.c -faddrsig
1#include "ccv_nnc.h"
2#include "ccv_nnc_easy.h"
3#include "ccv_nnc_internal.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA
6#include "gpu/ccv_nnc_compat.h"
7#endif
8#include "_ccv_nnc_graph.h"
9#include "_ccv_nnc_symbolic_graph.h"
10
11#pragma mark - Level-3 API
12
13typedef struct {
14 int flags;
15 int type;
16 int pin_mem; // This memory needs to be pinned.
17 int ref; // Reference to another tensor block. Start with 1.
18 int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
19 int companion_ref; // Reference to another block with which this one shares the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
20 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
21 ccv_array_t* r_refs; // If this is referenced by another block, the array points back to these blocks. Start with 1.
22 uint64_t size; // The size of the tensor expected.
23 int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
24 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
25 ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
26 ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
27} ccv_nnc_tensor_block_t; // Tensor Arena Block
28
29#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
30
31enum {
32 UNASSIGNED = 0x1,
33 ALIAS = 0x2,
34 READ_ONLY = 0x4,
35 WRITE_ONLY = 0x8,
36 READ_WRITE = 0xc,
37 ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
38 UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
39 UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
40};
41
42#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
43#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
44#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
45#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
46#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
47#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
48#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
49#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
50#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
51#define TENSOR_SET_ANONYMOUS(t) (t.flags = (t.flags & ~0x10 | ANONYMOUS))
52#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
53#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
54#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
55#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
56#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
57
58#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
59
60// Holds additional information about the exe nodes.
61typedef struct {
62 int flags;
63} ccv_nnc_graph_exec_flag_t;
64
65enum {
66 CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
67};
68
69typedef struct {
70 int index;
71 int companion; // The companion node index (the node that doesn't interfere with current one).
72 int oc;
73 int type;
74 uint64_t size;
75} ccv_nnc_tensor_opt_t;
76
77// We first sort the same type together (because they won't be reused at all).
78// And then we sort by size; after that, sort by oc.
79#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
80static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
81#undef more_than
82
83// If b has items overlap with a, a is still after b (inclusive).
84static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
85{
86 assert(a);
87 assert(b);
88 int x, y;
89 for (x = 0; x < b->rnum; x++)
90 {
91 const int p = *(int*)ccv_array_get(b, x);
92 int flag = 0;
93 // In extreme cases where a is a superset of b, a is still after b; we are good.
94 for (y = 0; !flag && y < a->rnum; y++)
95 {
96 const int q = *(int*)ccv_array_get(a, y);
97 flag = (p == q);
98 }
99 if (!flag)
100 for (y = 0; y < a->rnum; y++)
101 {
102 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
103 if (!cell.i32 || cell.i32[0] == 0)
104 return 0;
105 }
106 }
107 // If b->rnum == 0, a is after b for sure.
108 // Otherwise, if a->rnum == 0, we don't check anything, but if b->rnum > 0, then we cannot say a is after b.
109 // If both a->rnum > 0 and b->rnum > 0, the logic above has checked everything.
110 return (a->rnum > 0 || b->rnum == 0);
111}
112
113static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
114{
115 assert(a);
116 assert(b);
117 int x, y, max_hop = 0;
118 for (x = 0; x < a->rnum; x++)
119 for (y = 0; y < b->rnum; y++)
120 {
121 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
122 if (!cell.i32 || cell.i32[0] == 0)
123 return 0;
124 max_hop = ccv_max(cell.i32[0], max_hop);
125 }
126 // Since we've entered this nested for-loop, a is now verifiably, deterministically after b.
127 // The max hop also denotes, in that case, at most how many hops we need to get from a to b.
128 return max_hop;
129}
130
131// If every one of a's heads is deterministically after b's tail
132static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
133{
134 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
135}
136
137typedef struct {
138 ccv_array_t** alloc_dep;
139 int vt_block_size;
140 int buffer_size;
141 int block_size;
142 int* vt_blocks; // A reference to the block, because blocks only contains available blocks (thus, it doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
143 struct {
144 int type; // The type from tensor blocks.
145 int pin_mem; // Whether this is pinned memory.
146 int flags; // The flags (currently for READ_ONLY or not).
147 uint64_t size; // The size of the buffer allocated.
148 int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; I do use two in the code as a temporary placeholder.
149 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
150 }* buffers;
151 struct {
152 int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
153 int block_ref; // A reference to which block in the given tensor_block to use.
154 uint64_t offset; // The offset of this block.
155 }* blocks;
156} ccv_nnc_tensor_alloc_prep_t;
157
158typedef struct ccv_nnc_symbolic_graph_prep_s {
159 int flags;
160 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
161 int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
162 int exec_idx;
163 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
164 int tensor_symbol_info_size;
165 int exec_symbol_info_size;
166 int tensor_block_size;
167 int sub_prep_size;
168 ccv_nnc_tensor_block_t* tensor_blocks;
169 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
170 ccv_nnc_graph_exec_flag_t* exec_flags;
171 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
172 int* dup_tensor_block_ref;
173 ccv_nnc_graph_visit_t* visit;
174 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
175 struct ccv_nnc_symbolic_graph_prep_s* p;
176 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
177 // Structures that don't need to be freed after deallocation.
178 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
179 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
180 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
181 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
182} ccv_nnc_symbolic_graph_prep_t;
183
184static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
185{
186 // Compute how many discontinuous buffers are needed.
187 // We prefer to have several discontinuous buffers instead of one big buffer because
188 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
189 // to fully utilize memory.
190 int i, j, k;
191 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
192 int allocable_tensor_size = 0, available_tensor_size = 0;
193 for (i = 0; i < tensor_block_size; i++)
194 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
195 {
196 // Tensors that we need the header info.
197 ++available_tensor_size;
198 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
199 // Tensors that we actually need to allocate (exclude the alias).
200 ++allocable_tensor_size;
201 }
202 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
203 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
204 ccv_sparse_matrix_t* const tensor_itf = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_8U | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
205 // Overlap count.
206 for (i = 0; i < tensor_block_size; i++)
207 for (j = i + 1; j < tensor_block_size; j++)
208 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
209 {
210 // Check to see if they interfere (default to yes).
211 // If any of the i's head is deterministically later than j's tail
212 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
213 const uint8_t one = 1;
214 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
215 if (i_hop_j > 0)
216 {
217 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
218 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
219 }
220 const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
221 if (j_hop_i > 0)
222 {
223 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
224 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
225 }
226 // It cannot be that both i can hop to j and j can hop to i.
227 assert(!(i_hop_j > 0 && j_hop_i > 0));
228 if (!i_hop_j && !j_hop_i)
229 ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
230 }
231 int* const oc = (int*)cccalloc(tensor_block_size, sizeof(int));
232 for (i = 0; i < tensor_block_size; i++)
233 for (j = 0; j < tensor_block_size; j++)
234 // If these two tensors are still alive, analyze them.
235 if (i != j && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
236 {
237 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(i, j), ccv_max(i, j));
238 // If their life time overlaps, compute how many tensors it overlap.
239 if (cell.u8 && cell.u8[0] == 1)
240 ++oc[i];
241 }
242 int* const buf = (int*)ccmalloc(sizeof(int) * tensor_block_size);
243 int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
244 uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
245 uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
246 int num_assigned = 0;
247 // I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
248 // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
249 // The first channel denotes the bytes available for allocation,
250 // the second channel denotes the offset available for the allocation.
251 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
252 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
253 for (j = 0; j < allocable_tensor_size;)
254 {
255 // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
256 int max_oc = 0;
257 uint64_t max_size = 0;
258 ccv_array_clear(opt);
259 int current_type = 0; // Deal with one type at a time.
260 for (i = 0; i < tensor_block_size; i++)
261 if (oc[i] >= max_oc &&
262 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
263 IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
264 (!current_type || tensor_blocks[i].type == current_type))
265 {
266 ccv_nnc_tensor_opt_t a = {
267 .size = tensor_blocks[i].size,
268 .index = i,
269 .companion = -1, // If already have a designated companion, use that.
270 .oc = oc[i],
271 .type = tensor_blocks[i].type,
272 };
273 assert(a.type);
274 current_type = a.type; // Now we know the primary type we should deal with.
275 if (tensor_blocks[i].companion_ref)
276 {
277 const int companion_ref = tensor_blocks[i].companion_ref - 1;
278 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
279 a.oc += oc[companion_ref];
280 }
281 // In case we have a tie, take them all in the array.
282 if (a.oc > max_oc || (a.oc == max_oc && a.size > max_size))
283 ccv_array_clear(opt), max_oc = a.oc, max_size = a.size;
284 ccv_array_push(opt, &a);
285 }
286 assert(opt->rnum > 0);
287 // Go through the opt array, find all tensors that don't interfere with it and have a larger tensor size than it.
288 // Push them with the "companion" into the opt array as well.
289 const int rnum = opt->rnum;
290 for (i = 0; i < rnum; i++)
291 {
292 // Copy it out, because after insertion, it may hold invalid pointer.
293 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
294 assert(a.companion == -1);
295 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
296 for (k = 0; k < tensor_block_size; k++)
297 // Find non-overlapping tensor that has larger size (of course, is unassigned and is not one with designated companion).
298 if (k != a.index && !tensor_blocks[k].companion_ref &&
299 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]) && !assigned[k] &&
300 tensor_blocks[k].size > a.size && tensor_blocks[k].type == a.type)
301 {
302 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(a.index, k), ccv_max(a.index, k));
303 // Good, push to opt array.
304 if (cell.u8 && cell.u8[0] == 1)
305 continue;
306 if (companion_ref >= 0)
307 {
308 assert(companion_ref != k);
309 // Have to make sure k doesn't interfere with the designated companion as well.
310 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(companion_ref, k), ccv_max(companion_ref, k));
311 if (cell.u8 && cell.u8[0] == 1)
312 continue;
313 }
314 ccv_nnc_tensor_opt_t b = a;
315 b.companion = k;
316 b.oc = a.oc + oc[k];
317 b.size = tensor_blocks[k].size;
318 ccv_array_push(opt, &b);
319 }
320 }
321 // Order opt array by the oc because type and size should be equal at this point.
322 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
323 // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
324 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep->rows * 3;
325 uint64_t min_val[2] = {
326 0, 0
327 };
328 for (i = 0; i < opt->rnum; i++)
329 {
330 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
331 // Now, determine the order between a and c. After this, we can always check whether y
332 // can hop to the earliest one and if the latest one can hop to x.
333 // The earliest one will be called p and the latest one will be called q.
334 int p = a.index;
335 int q = a.index;
336 if (a.companion >= 0)
337 {
338 const ccv_numeric_data_t a_hop_c = ccv_get_sparse_matrix_cell(tensor_dt, a.companion, a.index);
339 const ccv_numeric_data_t c_hop_a = ccv_get_sparse_matrix_cell(tensor_dt, a.index, a.companion);
340 assert((a_hop_c.i32 && a_hop_c.i32[0] > 0 && (c_hop_a.i32 == 0 || c_hop_a.i32[0] == 0)) ||
341 ((a_hop_c.i32 == 0 || a_hop_c.i32[0] == 0) && c_hop_a.i32 && c_hop_a.i32[0] > 0));
342 if (a_hop_c.i32 && a_hop_c.i32[0] > 0)
343 q = a.companion;
344 else
345 p = a.companion;
346 }
347 if (tensor_blocks[a.index].companion_ref)
348 {
349 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
350 assert(a.companion != companion_ref);
351 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
352 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
353 p = companion_ref;
354 else {
355 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
356 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
357 q = companion_ref;
358 else { // Otherwise, b is in between p and q.
359 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
360 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
361 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
362 }
363 }
364 }
365 assert(tensor_blocks[q].type == tensor_blocks[p].type);
366 const int type = tensor_blocks[p].type;
367 // y is always earlier than x, but this is hard to assert now.
368 // If this edge satisfies the requirement, we now need to find the ones with the tightest possible bounds.
369 // Thus, the hop between y and x (through a) should be the smallest one.
370 // We optimize this by first finding all allocated nodes that come into p, and all allocated nodes that
371 // go out of q. For these nodes, we try to verify whether they form a connection (by checking against the
372 // alloc sparse matrix). If they do, we try to see whether we can insert with the tightest bound.
373 int y_size = 0;
374 int* const y_buf = buf;
375#define for_block(y, val) do { \
376 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type) \
377 y_buf[y_size++] = y + 1; \
378 } while(0)
379 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
380 if (y_vector)
381 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
382#undef for_block
383 assert(y_size <= tensor_block_size);
384 int x_size = 0;
385 int* const x_buf = buf + y_size;
386#define for_block(x, val) do { \
387 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type) \
388 x_buf[x_size++] = x + 1; \
389 } while(0)
390 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
391 if (x_vector)
392 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
393#undef for_block
394 assert(y_size + x_size <= tensor_block_size);
395 int x, y;
396 for (y = 0; y < y_size; y++)
397 {
398 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y], tensor_block_size + 1);
399 if (val.u64 && val.u64[0] >= a.size)
400 {
401 const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
402 assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
403 const int hop = exec_dep->rows + y_hop_p.i32[0];
404 if (hop < min_hop)
405 min_y = y_buf[y], min_x = tensor_block_size + 1, min_hop = hop,
406 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
407 }
408 }
409 for (x = 0; x < x_size; x++)
410 {
411 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x]);
412 if (val.u64 && val.u64[0] >= a.size)
413 {
414 const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
415 assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
416 const int hop = exec_dep->rows + q_hop_x.i32[0];
417 if (hop < min_hop)
418 min_y = 0, min_x = x_buf[x], min_hop = hop,
419 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
420 }
421 }
422 for (y = 0; y < y_size; y++)
423 {
424 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y]);
425 if (y_vector)
426 for (x = 0; x < x_size; x++)
427 {
428 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x]);
429 if (val.u64 && val.u64[0] >= a.size)
430 {
431 const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
432 const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
433 assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
434 assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
435 const int hop = y_hop_p.i32[0] + q_hop_x.i32[0];
436 if (hop < min_hop)
437 min_y = y_buf[y], min_x = x_buf[x], min_hop = hop,
438 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
439 }
440 }
441 }
442 // If I found a place, stop, and exit.
443 if (min_y > 0 || min_x < tensor_block_size + 1)
444 {
445 min_i = i;
446 break;
447 }
448 }
449 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
450 // and default to largest size available.
451 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
452 if (min_i == -1)
453 {
454 allocated_size[num_assigned] = a.size;
455 ++num_assigned;
456 }
457 int assign_group = num_assigned;
458 if (min_y > 0)
459 {
460 assign_group = assigned[min_y - 1];
461 // The y and x should belong to the same assigned group.
462 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
463 } else if (min_x < tensor_block_size + 1)
464 assign_group = assigned[min_x - 1];
465 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
466 if (min_y != 0 || min_x != tensor_block_size + 1)
467 {
468 uint64_t val[2] = {
469 min_val[0], min_val[1]
470 };
471 assert(val[0] >= a.size);
472 val[0] -= a.size;
473 val[1] = val[1] + a.size; // Move the offset to the next one.
474 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
475 }
476 int strings[3];
477 strings[0] = a.index + 1;
478 int string_size = 1;
479 // Assign out companion as well.
480 if (a.companion >= 0)
481 {
482 const ccv_numeric_data_t a_hop_c = ccv_get_sparse_matrix_cell(tensor_dt, a.companion, a.index);
483 if (a_hop_c.i32 && a_hop_c.i32[0] > 0)
484 strings[1] = a.companion + 1;
485 else {
486 strings[1] = strings[0];
487 strings[0] = a.companion + 1;
488 }
489 ++string_size;
490 }
491 // Assign out designated companion if it exists.
492 if (tensor_blocks[a.index].companion_ref && a.companion != tensor_blocks[a.index].companion_ref - 1)
493 {
494 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
496 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498 {
499 for (i = 0; i < string_size; i++)
500 strings[i + 1] = strings[i];
501 strings[0] = companion_ref + 1;
502 } else {
503 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505 strings[string_size] = companion_ref + 1;
506 else {
507 // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
508 assert(string_size == 2);
509 strings[2] = strings[1];
510 strings[1] = companion_ref + 1;
511 }
512 }
513 ++string_size;
514 }
515 // Assign out and update oc.
516 for (i = 0; i < string_size; i++)
517 {
518 const int index = strings[i] - 1;
519 // Assign out the selected one.
520 assigned[index] = assign_group;
521 // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522 allocated_offset[index] = min_val[1];
523 for (k = 0; k < tensor_block_size; k++)
524 if (!assigned[k] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]))
525 {
526 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(k, index), ccv_max(k, index));
527 if (cell.u8 && cell.u8[0] == 1)
528 --oc[k];
529 }
530 }
531 uint64_t val[2] = {
532 a.size, min_val[1]
533 };
534 uint64_t consumed_size = 0;
535 // Go over from min_y to string_size (excluding min_x).
536 for (i = 0; i < string_size; i++)
537 {
538 const uint64_t size = tensor_blocks[strings[i] - 1].size;
539 assert(size <= a.size);
540 // Update consumed size if it is bigger than "size".
541 if (size > consumed_size)
542 {
543 val[0] = size - consumed_size;
544 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545 consumed_size = size;
546 val[1] = min_val[1] + consumed_size;
547 }
548 // If it consumed all the flow, break out.
549 if (consumed_size == a.size)
550 break;
551 }
552 for (i = 0; i < string_size; i++)
553 {
554 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555 uint64_t val[2] = {
556 i_size, min_val[1]
557 };
558 uint64_t consumed_size = 0;
559 for (k = i + 1; k < string_size; k++)
560 {
561 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
562 // Update consumed size if it is bigger than "size".
563 if (size > consumed_size)
564 {
565 val[0] = size - consumed_size;
566 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567 consumed_size = size;
568 val[1] = min_val[1] + consumed_size;
569 }
570 // If it consumed all the flow, break out.
571 if (consumed_size == i_size)
572 break;
573 }
574 val[0] = i_size - consumed_size;
575 // Still have residual, flow it to min_x.
576 if (val[0] > 0)
577 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578 }
579 j += string_size;
580 }
581 ccfree(buf);
582 ccv_array_free(opt);
583 ccv_matrix_free(tensor_df);
584 ccv_matrix_free(tensor_dt);
585 ccv_matrix_free(tensor_itf);
586#define for_block(y, x, val) do { \
587 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
588 { \
589 if (!alloc_dep[x - 1]) \
590 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
591 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
592 } \
593 } while (0)
594 CCV_SPARSE_FOREACH(alloc, for_block);
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
} while (0)
;
595#undef for_block
596 ccv_matrix_free(alloc);
597 ccfreefree(oc);
598 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
599 alloc_prep->alloc_dep = alloc_dep;
600 alloc_prep->vt_block_size = tensor_block_size;
601 alloc_prep->buffer_size = num_assigned;
602 alloc_prep->block_size = available_tensor_size;
603 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
604 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
605 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
606 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
607 for (i = 0; i < num_assigned; i++)
608 alloc_prep->buffers[i].size = allocated_size[i];
609 ccfreefree(allocated_size);
610 j = 0;
611 // Assigning out the tensors (in case of sharing tensors / in-place ops).
612 for (i = 0; i < tensor_block_size; i++)
613 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
614 {
615 alloc_prep->blocks[j].block_ref = i;
616 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
617 {
618 alloc_prep->vt_blocks[i] = j;
619 // Also, set its allocations.
620 assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 620, __extension__ __PRETTY_FUNCTION__
); }))
;
621 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
622 alloc_prep->blocks[j].offset = allocated_offset[i];
623 if (!alloc_prep->buffers[buffer_ref].type)
624 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
625 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
626 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
627 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 627, __extension__ __PRETTY_FUNCTION__
); }))
;
628 } else {
629 alloc_prep->vt_blocks[i] = -1;
630 alloc_prep->blocks[j].buffer_ref = -1;
631 alloc_prep->blocks[j].offset = 0;
632 }
633 ++j;
634 } else
635 alloc_prep->vt_blocks[i] = -1;
636 ccfreefree(allocated_offset);
637 ccfreefree(assigned);
638 return alloc_prep;
639}
640
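The alloc_prep object built above is carved out of a single allocation: the ccv_nnc_tensor_alloc_prep_t header is followed, in the same malloc'ed region, by the blocks array, the buffers array, and the vt_blocks index, ordered (per the comment at line 603) from the biggest element type to the smallest. A minimal sketch of that packing pattern, with hypothetical stand-in field types rather than the real ones:

#include <stdlib.h>
#include <string.h>

typedef struct { int block_ref, buffer_ref; unsigned long long offset; } block_t; /* stand-in for alloc_prep->blocks[0] */
typedef struct { unsigned long long size; int type; } buffer_t;                   /* stand-in for alloc_prep->buffers[0] */

typedef struct {
	block_t* blocks;
	buffer_t* buffers;
	int* vt_blocks;
} prep_t;

static prep_t* prep_new(const int block_size, const int buffer_size, const int vt_block_size)
{
	/* One allocation; the trailing arrays are ordered from the biggest element type to the smallest. */
	prep_t* const prep = (prep_t*)malloc(sizeof(prep_t) + sizeof(block_t) * block_size + sizeof(buffer_t) * buffer_size + sizeof(int) * vt_block_size);
	prep->blocks = (block_t*)(prep + 1);
	prep->buffers = (buffer_t*)(prep->blocks + block_size);
	prep->vt_blocks = (int*)(prep->buffers + buffer_size);
	memset(prep->buffers, 0, sizeof(buffer_t) * buffer_size);
	return prep; /* a single free(prep) releases everything at once */
}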
641static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
642{
643 int i;
644 for (i = 0; i < alloc_prep->vt_block_size; i++)
645 if (alloc_prep->alloc_dep[i])
646 ccv_array_free(alloc_prep->alloc_dep[i]);
647 for (i = 0; i < alloc_prep->buffer_size; i++)
648 if (alloc_prep->buffers[i].dup_p_refs)
649 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
650 ccfreefree(alloc_prep->alloc_dep);
651 ccfreefree(alloc_prep);
652}
653
654// Simple allocator from ccv_array_t.
655static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
656{
657 int pos = tensor_metadata->rnum;
658 int rsize = (size + 15) / 16;
659 ccv_array_resize(tensor_metadata, pos + rsize);
660 return (pos << 1) + 1;
661}
662
663static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
664{
665 assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 665, __extension__ __PRETTY_FUNCTION__
); }))
;
666 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)))
;
667}
668
669#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
670
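The tensor metadata array doubles as a simple bump allocator, and the handle it returns is a tagged integer rather than a pointer: _ccv_nnc_tensor_metadata_pos_new returns (pos << 1) + 1, so the low bit marks the value as a metadata position (presumably workable because real tensor pointers are aligned and therefore have a zero low bit), and _ccv_nnc_tensor_metadata_get shifts it back out. A minimal sketch of the encode/decode round trip, with hypothetical helper names:

#include <assert.h>
#include <stdint.h>

/* Hypothetical helpers mirroring the (pos << 1) + 1 encoding used above. */
static inline intptr_t pos_encode(const int index) { return ((intptr_t)index << 1) + 1; }
static inline int pos_is_metadata(const void* const ptr) { return (uintptr_t)ptr & 1; } /* same test as CCV_NNC_IS_METADATA_POS */
static inline int pos_decode(const intptr_t pos) { return (int)(pos >> 1); }

int main(void)
{
	const intptr_t pos = pos_encode(42);
	assert(pos_is_metadata((void*)pos)); /* low bit set: this is an index, not a pointer */
	assert(pos_decode(pos) == 42);       /* shifting recovers the array index */
	return 0;
}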
671static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
672{
673 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
674 if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
675 return vt_tensor;
676 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
677 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
678 {
679 const int alias_ref = tensor->alias_ref;
680 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
681 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
682 }
683 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
684 {
685 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
686 int i;
687 const int count = mv->kind + mv->repeat;
688 for (i = 0; i < count; i++)
689 {
690 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1)
)
691 {
692 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
693 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
694 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
695 }
696 }
697 // No need to recursively rewire the parent pointer, otherwise we end up in a deep rewire.
698 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
699 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
700 if (mv->sp)
701 for (i = 0; i < mv->sp->rnum; i++)
702 {
703 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
704 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
705 {
706 const int pos = (int)(intptr_t)*tensor;
707 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
708 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 708, __extension__ __PRETTY_FUNCTION__
); }))
;
709 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
710 }
711 }
712 }
713 return tensor;
714}
715
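Tagged positions rather than pointers are stored while the arena is being assembled, presumably because the metadata ccv_array_t can still grow (and relocate its storage) on every _ccv_nnc_tensor_metadata_pos_new call; only once the array is final does _ccv_nnc_tensor_metadata_rewire walk the structures and swap each tagged position for a now-stable pointer. A short sketch of the same two-phase pattern under that assumption, with a hypothetical node type:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical node whose `next` field holds either a real pointer or a tagged position. */
typedef struct node { struct node* next; } node_t;

/* Phase 2: once the backing array will no longer grow, patch tagged positions into pointers. */
static void rewire(char* const base, const size_t elem_size, node_t* const node)
{
	if ((uintptr_t)node->next & 1) /* low bit set: still an index into the metadata array */
		node->next = (node_t*)(base + elem_size * (size_t)((uintptr_t)node->next >> 1));
}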
716typedef struct {
717 const uint8_t* ptr;
718 int pos;
719} ccv_nnc_tensor_block_pos_t;
720
721static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
722{
723 int i;
724 int unref_block_ref = block_ref;
725 while (prep->tensor_blocks[unref_block_ref].ref)
726 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
727 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
728 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 728, __extension__ __PRETTY_FUNCTION__); }))
;
729 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
== prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 729, __extension__ __PRETTY_FUNCTION__
); }))
;
730 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
731 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
732 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
733 for (i = idx - 1; i >= 0; i--)
734 {
735 assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
(p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 735, __extension__ __PRETTY_FUNCTION__); }))
;
736 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
737 const int unroll_count = graph_prep->unroll_count;
738 if (ch[i]) // Prefer the dup side of things.
739 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
740 int unref_p_ref = p_ref;
741 while (graph_prep->tensor_blocks[unref_p_ref].ref)
742 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
743 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
744 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
745 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
746 // If the buffer already exists, prefer that.
747 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
748 if (ptr)
749 {
750 // If I have any remaining path that is not covered from 0, I cannot possibly
751 // have any pointer from buffer (that can only happen if it is not dup).
752 for (--i; i >= 0; i--)
753 if (ch[i] != 0)
754 return 0;
755 // Try to find the created tensor block pos in the array, just linear scan.
756 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
757 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
758 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr + offset, params, 0);
759 return tv_pos;
760 }
761 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
762 }
763 return 0;
764}
765
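What _ccv_nnc_tensor_multiview_find_pos effectively does is resolve a nested tensor block to a raw address by accumulating offsets while climbing the chain of parent preps until it reaches a buffer that already has a pointer; the wrapped tensor is then created at base + offset. A condensed, library-free sketch of that resolution, with a hypothetical per-level type:

#include <stdint.h>

/* Hypothetical per-level view: each level contributes an offset and may or may not own a mapped buffer. */
typedef struct { uint64_t offset; uint8_t* base; } level_t;

/* Walk from the innermost level outward; stop at the first level whose buffer is actually mapped. */
static uint8_t* resolve(const level_t* const levels, const int level_count)
{
	uint64_t offset = 0;
	int i;
	for (i = level_count - 1; i >= 0; i--)
	{
		offset += levels[i].offset;
		if (levels[i].base)
			return levels[i].base + offset; /* same shape as buffers[buffer_ref].ptr + offset above */
	}
	return 0; /* no pointer available yet; the caller falls back to multiview composition */
}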
766 // Descend from the root to the prep level, and compose the multiview from there.
767static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
768{
769 assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 769, __extension__ __PRETTY_FUNCTION__); }))
;
770 int i;
771 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
772 const int unroll_count = prep->unroll_count;
773 if (prep == graph_prep)
774 {
775 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
776 if (!data_pos)
777 return -1;
778 // Based on ch, go all the way back to find the exact pointer to compose.
 779 if (// !assign_update && // If I plan to receive an assign update, we don't need multiple receivers. Just one tensor to receive the update is enough.
780 prep->dup_tensor_block_ref &&
781 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
782 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
783 {
784 int pos[unroll_count + 1];
785 pos[0] = data_pos;
786 for (i = 0; i < unroll_count; i++)
787 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
788 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
789 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
790 ccv_nnc_tensor_t* data[unroll_count + 1];
791 for (i = 0; i < unroll_count + 1; i++)
792 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
793 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
794 for (i = 0; i < unroll_count + 1; i++)
795 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
796 *pos_ref = mv_pos;
797 } else {
798 *pos_ref = data_pos;
799 }
800 if (preserve)
801 {
 802 // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv;
 803 // at any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
 804 // a mv of K11, so that when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
 805 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
 806 // arena is allocated).
 807 // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
 808 // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
 809 // it in a K01 structure.
 810 // Why didn't we wrap it directly as mv->data[0] pointing to the assigned tv pointer and mv->data[1] pointing
 811 // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
 812 // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
813 int prev_mv_pos = *pos_ref;
814 if (prev_mv_pos == -1)
815 {
816 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
817 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
818 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
819 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
820 tv,
821 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
822 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
823 }
824 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
825 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
826 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
827 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
828 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
829 (ccv_nnc_tensor_t*)prev_mv,
830 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
831 prev_mv->p = (void*)(intptr_t)mv_pos;
832 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
833 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
834 *pos_ref = mv_pos;
835 }
836 return 0;
837 }
838 ch[idx] = 0;
839 int pos[unroll_count + 1];
840 pos[0] = 0;
841 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
842 assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 842, __extension__ __PRETTY_FUNCTION__); }))
;
843 for (i = 0; i < unroll_count; i++)
844 {
845 ch[idx] = i + 1;
846 pos[i + 1] = 0;
847 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
848 if (dup_retval < 0)
849 {
850 assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 850, __extension__ __PRETTY_FUNCTION__); }))
;
851 break;
852 }
853 }
854 // If current prep has no dup.
855 if (i == 0)
856 {
857 *pos_ref = pos[0];
858 return 0;
859 }
860 ccv_nnc_tensor_t* data[unroll_count + 1];
861 // Compose to a new multiview.
862 for (i = 0; i < unroll_count + 1; i++)
863 { assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
(pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 863, __extension__ __PRETTY_FUNCTION__); }))
; }
864 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865 for (i = 0; i < unroll_count + 1; i++)
866 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
867 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
868 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
869 for (i = 0; i < unroll_count + 1; i++)
870 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
871 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
872 for (i = 0; i < unroll_count + 1; i++)
873 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
874 *pos_ref = mv_pos;
875 return 0;
876}
877
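The K01/K11/K0N names above are only described loosely by the comments in this function; under that reading, the first `kind` views are special entries used for the earliest iterations (e.g. the preserved input at loop 0) and the remaining `repeat` views are cycled through afterwards. A toy, purely illustrative unwrap under that assumption (this is not the library's actual routine):

#include <stdint.h>

/* Toy illustration only, based on the K11 description in the comments above:
 * the first `kind` views are used for the first `kind` iterations, then the `repeat`
 * views cycle. Assumes repeat > 0. This is an assumption, not ccv's unwrap code. */
typedef struct { int kind, repeat; void* data[8]; } toy_multiview_t;

static void* toy_unwrap(const toy_multiview_t* const mv, const uint64_t loop_count)
{
	if (loop_count < (uint64_t)mv->kind)
		return mv->data[loop_count];                                  /* e.g. K11 at loop 0 -> data[0] */
	return mv->data[mv->kind + (loop_count - mv->kind) % mv->repeat]; /* e.g. K11 at loop >= 1 -> data[1] */
}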
878static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
879{
880 int i;
881 int is_input = 0;
882 assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 882, __extension__ __PRETTY_FUNCTION__); }))
;
883 for (i = 0; i < node->input_size && !is_input; i++)
884 if (p_ref == node->inputs[i])
885 is_input = 1;
886 int is_output = 0;
887 for (i = 0; i < node->output_size && !is_output; i++)
888 if (p_ref == node->outputs[i])
889 is_output = 1;
 890 // Prefer to treat it as an output if it is both an input and an output.
891 if (is_output)
892 return 1;
893 if (is_input)
894 return -1;
895 return 0;
896}
897
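The helper above encodes its answer as a tri-state: 1 when p_ref appears among the node's outputs (outputs win when a symbol is both), -1 when it is only an input, and 0 when it is neither; the preserve check below requires -1 (strictly an input), while the force-broadcast check requires 1. A toy illustration of the same return convention:

#include <assert.h>

/* Toy stand-in with the same convention: 1 = output (preferred), -1 = input only, 0 = neither. */
static int input_or_output(const int ref, const int* const inputs, const int input_size, const int* const outputs, const int output_size)
{
	int i, is_input = 0, is_output = 0;
	for (i = 0; i < input_size; i++)
		is_input |= (ref == inputs[i]);
	for (i = 0; i < output_size; i++)
		is_output |= (ref == outputs[i]);
	return is_output ? 1 : (is_input ? -1 : 0);
}

int main(void)
{
	const int inputs[] = { 3, 5 }, outputs[] = { 5, 7 };
	assert(input_or_output(5, inputs, 2, outputs, 2) == 1);  /* both input and output: treated as output */
	assert(input_or_output(3, inputs, 2, outputs, 2) == -1); /* input only */
	assert(input_or_output(9, inputs, 2, outputs, 2) == 0);  /* neither */
	return 0;
}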
898static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
899{
900 // No need to check whether to preserve if this is not a while loop.
901 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
902 return 0;
903 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 903, __extension__ __PRETTY_FUNCTION__
); }))
;
904 // If it is unassigned, no need to preserve.
905 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
906 return 0;
907 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
908 // If p is not input, no need to preserve at all.
909 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
910 return 0;
911 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
912 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 912, __extension__ __PRETTY_FUNCTION__); }))
;
913 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 913, __extension__ __PRETTY_FUNCTION__
); }))
;
914 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
915 // If the buffer is a truly read-only one, no need to preserve.
916 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
917 return 0;
 918 /* This needs a detailed explanation: what does preserve mean?
 919 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
 920 * also used outside of the while loop, we cannot reuse the memory region of x
 921 * inside the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming
 922 * y uses the same memory region as x). The way to work around this is to use a different
 923 * memory region for y = x + 1, but for the first iteration, have x point to the
 924 * original. During the allocation process, the way to identify whether x should preserve
 925 * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
 926 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
 927 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
 928 * it is the input tensor whenever that is possible. A tensor block can point to two parent
 929 * tensors, one is the input tensor, one is the output tensor. p_refs[0] should be the input
 930 * tensor whenever that is possible. */
931 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
932 return 0;
933 // Otherwise, return 1 because we now need to preserve.
934 return 1;
935}
936
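The comment block above is the crux of preserve: if y and x share one region across loop iterations while x is still live outside the loop, the first y = x + 1 write clobbers x. A tiny, library-free illustration of the aliasing hazard the preserve logic guards against:

#include <stdio.h>

int main(void)
{
	float x = 1.f;         /* x is also needed after the loop */
	float* y = &x;         /* naive reuse: y aliases x's memory region */
	int i;
	for (i = 0; i < 3; i++)
		*y = x + 1.f;      /* while-loop body y = x + 1 (y => x carries y back into x) */
	/* x was overwritten on the very first iteration; its pre-loop value 1.f is gone. */
	printf("x = %g\n", x); /* prints 4, not 1 */
	return 0;
}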
937static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
938{
939 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 939, __extension__ __PRETTY_FUNCTION__
); }))
;
 940 // If it is unassigned, no need to broadcast.
941 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
942 return 0;
 943 // Only tape vars need to force broadcast; otherwise we already share the same memory region.
944 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
945 return 0;
946 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
947 // If p is not output, no need to broadcast at all.
948 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
949 return 0;
950 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
951 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 951, __extension__ __PRETTY_FUNCTION__); }))
;
952 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 952, __extension__ __PRETTY_FUNCTION__
); }))
;
953 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
954 // If the buffer is a truly read-only one, no need to broadcast.
955 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
956 return 0;
957 // Otherwise, return 1 because we now need to force broadcast for this tape var.
958 return 1;
959}
960
961static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
962{
963 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 963, __extension__ __PRETTY_FUNCTION__); }))
;
964 int i;
965 for (i = 0; i < mv->kind + mv->repeat; i++)
966 if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
967 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = tensor;
968 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
969 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i], tensor);
970}
971
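_ccv_nnc_tensor_multiview_full_pos recursively replaces every CCV_NNC_TENSOR_PLACEHOLDER slot in a multiview tree with the real tensor once the parent arena finally has it. A small sketch of the same recursive fill, with hypothetical types that share a leading flag word (mirroring how CCV_IS_TENSOR_MULTIVIEW reads the first int of either structure):

#include <stdint.h>

#define PLACEHOLDER ((void*)(intptr_t)0x10) /* sentinel standing in for CCV_NNC_TENSOR_PLACEHOLDER */

/* Hypothetical types: both begin with an is_view flag, like ccv tensors sharing a type word. */
typedef struct { int is_view; } header_t;
typedef struct { int is_view; header_t* data[4]; int count; } view_t;

static void fill(view_t* const v, header_t* const tensor)
{
	int i;
	for (i = 0; i < v->count; i++)
		if ((void*)v->data[i] == PLACEHOLDER)
			v->data[i] = tensor;               /* substitute the real tensor into the reserved slot */
		else if (v->data[i]->is_view)
			fill((view_t*)v->data[i], tensor); /* recurse into nested views */
}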
972static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
973{
974 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 974, __extension__ __PRETTY_FUNCTION__); }))
;
975 int i;
976 if (mv->sp)
977 for (i = 0; i < mv->sp->rnum; i++)
978 {
979 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
980 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
981 {
982 const int pos = (int)(intptr_t)*tensor;
983 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
984 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 984, __extension__ __PRETTY_FUNCTION__
); }))
;
985 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
986 }
987 }
988 for (i = 0; i < mv->kind + mv->repeat; i++)
989 {
990 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]) & 1)
)
991 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
992 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]->alias_ref) & 1)
)
993 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref);
994 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
995 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
996 }
997}
998
999static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1000{
1001 // Go to the root of the graph.
1002 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1003 int i;
1004 for (i = 1; prep->p; i++)
1005 prep = prep->p;
1006 // Root graph should have no dup tensor blocks.
1007 assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1007, __extension__ __PRETTY_FUNCTION__); }))
;
1008 const int c = i;
1009 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1010 prep = graph_prep;
1011 preps[c - 1] = prep;
1012 for (i = 0; prep->p; i++)
1013 preps[c - 2 - i] = prep = prep->p;
 1014 int ch[c]; // Variable-length array to record our selections as we recurse from top to bottom.
1015 memset(ch, 0, sizeof(int) * c);
1016 int pos = 0;
1017 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1018 assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
(ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1018, __extension__ __PRETTY_FUNCTION__); }))
; // This should never be modified.
1019 assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
> 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1019, __extension__ __PRETTY_FUNCTION__); }))
;
1020 return pos;
1021}
1022
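The function above first measures the depth of the prep hierarchy by walking parent pointers, then fills preps[] so that index 0 is the root and the last entry is the current prep, before recursing down with the ch[] selection array. A self-contained sketch of that root-first chain construction, with a hypothetical level type:

#include <stddef.h>

typedef struct level { struct level* p; } level_t; /* hypothetical prep with a parent pointer */

/* Build a root-first chain[] from a leaf node, the same shape as the preps[] array above. */
static int chain_from_leaf(level_t* const leaf, level_t** const chain, const int max_depth)
{
	int depth = 1;
	level_t* node = leaf;
	for (; node->p; node = node->p)
		depth++;               /* measure depth first (the root has no parent) */
	if (depth > max_depth)
		return -1;
	int i = depth - 1;
	for (node = leaf; node; node = node->p)
		chain[i--] = node;     /* fill from the leaf backward so chain[0] is the root */
	return depth;
}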
1023static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1024{
1025 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1026 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1027 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1028 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1029 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1030 tv,
1031 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1032 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1033 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = tensor;
1034 return mv_pos;
1035}
1036
1037static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1038{
1039 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1040 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1041 if (!is_multiview)
1042 return pos;
1043 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1044 {
1045 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1046 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1047 }
1048 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1049 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1050 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1051 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1052 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1053 new_tensor->alias_ref = (uintptr_t)pos;
1054 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1055 return new_pos;
1056}
1057
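_ccv_nnc_tensor_flat_if_multiview peels a multiview by repeatedly following data[0] until it reaches a plain tensor, then creates an ordinary tensor over that memory, points its alias_ref back at the multiview, and registers it via ccv_nnc_tensor_synchronize_to_multiview so it stays in sync as the loop advances. A sketch of the peeling step alone, with a hypothetical node type:

/* Hypothetical: follow the first view of each nested multiview down to the underlying plain tensor. */
typedef struct node { int is_view; struct node* first; } node_t;

static node_t* peel(node_t* n)
{
	while (n->is_view)
		n = n->first; /* data[0] at each level */
	return n;
}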
1058static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1059{
 1060 // All tensors are assigned out; now num_assigned is the number of discontinuous buffers.
 1061 // Each tensor has its designation in the assigned array, and its offset in allocated_offset.
1062 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1063 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1064 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1065 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1066 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1067 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1068 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1069 const int unroll_count = graph_prep->unroll_count;
1070 int i, j;
1071 for (i = 0; i < tensor_symbol_info_size; i++)
1072 for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1073 {
1074 const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1075 if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1076 TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
)
;
1077 }
1078 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size);
1079 graph_prep->tensor_arena = tensor_arena;
1080 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1081 tensor_arena->buffers = (void*)(tensor_arena + 1);
1082 tensor_arena->buffer_size = alloc_prep->buffer_size;
1083 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1084 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1085 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1086 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1087 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1088 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1089 for (i = 0; i < alloc_prep->buffer_size; i++)
1090 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1091 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1092 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1093 if (graph_prep->while_count_tensor)
1094 {
 1095 // If we need to have a while count tensor, allocate that first and set its pointer to point to the while_count variable.
1096 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1097 assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1097, __extension__ __PRETTY_FUNCTION__
); }))
; // pos must be 0 position.
1098 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1099 *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1100 }
1101 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
&& !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
&& p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1101, __extension__ __PRETTY_FUNCTION__
); }))
;
1102 if (p_arena && p_graph_prep)
1103 {
1104 // Don't need to allocate the actual buffer, just use the pointer from the above.
1105 PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer assignment for sub arena %p (parent %p)\n",
tensor_arena, p_arena); fflush(stdout); } } while (0)
;
1106 for (i = 0; i < tensor_arena->buffer_size; i++)
1107 {
1108 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1109 int unref_p_ref = p_ref;
1110 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1111 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1112 assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1112, __extension__ __PRETTY_FUNCTION__
); }))
;
1113 const int p_unroll_count = p_graph_prep->unroll_count;
1114 if (p_graph_prep->dup_tensor_block_ref &&
1115 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1116 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1117 {
1118 // This condition means in the parent graph, we point to multiple tensor blocks for the same
1119 // buffer, therefore, we cannot have one single pointer assigned in this case.
 1120 // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1121 tensor_arena->buffers[i].ptr = 0;
1122 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1123 continue;
1124 }
1125 // Otherwise, find the actual buffer pointer.
1126 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1127 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1127, __extension__ __PRETTY_FUNCTION__); }))
;
1128 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1129 if (!p_arena->buffers[buffer_ref].ptr)
1130 {
1131 // Pass it down as 0 ptr.
1132 tensor_arena->buffers[i].ptr = 0;
1133 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1134 continue;
1135 }
1136 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1137 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1138 PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
(0)
;
1139 }
1140 } else {
1141 // Now, allocate actual buffers.
1142 PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0)
;
1143 for (i = 0; i < tensor_arena->buffer_size; i++)
1144 {
1145 const int buffer_type = tensor_arena->buffers[i].type;
1146 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1147#ifdef HAVE_CUDA1
1148 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1149 {
1150 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1151 tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1152 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1153 } else {
1154 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1154, __extension__ __PRETTY_FUNCTION__
); }))
;
1155 if (tensor_arena->buffers[i].pin_mem)
1156 tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1157 else
1158 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1159 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1160 }
1161#else
1162 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1162, __extension__ __PRETTY_FUNCTION__
); }))
;
1163 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1164 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1165#endif
1166 assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1166, __extension__ __PRETTY_FUNCTION__); }))
;
1167 }
1168 }
 1169 // Go over sub_preps and allocate arenas for them. Do this early because
 1170 // we may reference tensors from sub arenas; we need to reference
 1171 // tensors from sub arenas because, for output tensors, the sub arena's tensor
 1172 // will have automatic reference updates.
1173 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1174 if (graph_prep->sub_preps[i])
1175 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], tensor_arena, tensor_binds, tensor_bind_size);
1176 else
1177 tensor_arena->sub_arenas[i] = 0;
1178 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1179 // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1180 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1181 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1182 if (tensor_arena->sub_arenas[i])
1183 {
1184 assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1184, __extension__ __PRETTY_FUNCTION__
); }))
;
1185 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1186 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1187 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1188 for (j = 0; j < node->output_size; j++)
1189 {
1190 const int idx = node->outputs[j];
1191 const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1;
1192 assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
(s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1192, __extension__ __PRETTY_FUNCTION__); }))
;
1193 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1194 assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1194, __extension__ __PRETTY_FUNCTION__); }))
;
1195 ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1196 // Only assign if it is a multiview tensor.
1197 if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1198 (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1199 sub_arena_out_tensors[idx] = sub_tensor;
1200 }
1201 }
1202 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1203 for (i = 0; i < tensor_symbol_info_size; i++)
1204 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
1205 {
1206 const int vt_ref = alloc_prep->vt_blocks[i];
1207 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1208 // Either we have dup_tensor_block_ref in current layer, or we have that in
1209 // previous layer, therefore, cannot really find the buffer ptr.
 1210 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1211 ((graph_prep->dup_tensor_block_ref &&
1212 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1213 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1214 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1215 {
1216 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1216, __extension__ __PRETTY_FUNCTION__
); }))
; // This must be in a sub-graph.
 1217 // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1218 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1219 continue;
1220 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1221 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1222 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1223 } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
 1224 // When we want to allocate, we don't really need to if it needs a force broadcast, because we will handle that later.
1225 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1226 // If already created, use the same tensor, and continue.
1227 // Having ptr.
1228 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1229 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1230 // Also, set its allocations.
1231 // Since tensor view is bit compatible with tensor, we can just cast.
1232 *tensor = ccv_nnc_tensor(tensor_arena->buffers[buffer_ref].ptr + offset, tensor_symbol_info[i].info, 0);
1233 assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1233, __extension__ __PRETTY_FUNCTION__
); }))
;
1234 // If we need to force broadcast, we need to wrap it in a multiview.
1235 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1236 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1237 {
1238 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1239 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1240 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1241 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1242 tv,
1243 }, 0, 1, graph_prep->graph, mv);
1244 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1245 pos = mv_pos;
1246 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1247 }
1248 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1249 }
1250 }
 1251 // Handle bound tensors. We handle them here so an alias can reference a bound tensor.
1252 for (i = 0; i < tensor_bind_size; i++)
1253 {
1254 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1254, __extension__ __PRETTY_FUNCTION__
); }))
;
1255 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1256 if (resolved_symbol.d >= 0)
1257 {
1258 int d = resolved_symbol.d;
1259 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1260 d = tensor_blocks[d].ref - 1;
1261 // For bound tensors, it shouldn't be assigned yet.
1262 // If it is assigned, the pointer should match the one from the bound tensor.
1263 // This can only happen if an enforced in-place tensor is bound twice. If that
1264 // happens, we need to make sure it is bound to the same location.
1265 assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1265, __extension__ __PRETTY_FUNCTION__
); }))
;
1266 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1267 {
1268 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1269 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1270 memcpy(tv, tensor_binds[i].tensor, sizeof(ccv_nnc_tensor_view_t));
1271 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1272 } else {
1273 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1274 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1275 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.ptr, tensor_binds[i].tensor->info, 0);
1276 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1277 }
1278 }
1279 }
1280 // Assign out refs. Refs are the simple ones, so we should handle them first (because they point to exactly the same metadata and the same region).
1281 for (i = 0; i < tensor_symbol_info_size; i++)
1282 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1283 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i])
1284 {
1285 int ref = tensor_blocks[i].ref - 1;
1286 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1287 ref = tensor_blocks[ref].ref - 1;
1288 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1288, __extension__ __PRETTY_FUNCTION__); }))
;
1289 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1290 }
1291 // Now that refs are assigned out, handle the case where I need to preserve because I am a sub-graph of a while loop.
1292 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1293 {
1294 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1294, __extension__ __PRETTY_FUNCTION__
); }))
;
1295 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1296 const int p_idx = graph_prep->p_idx - 1;
1297 for (i = 0; i < node->input_size; i++)
1298 {
1299 const int idx = node->inputs[i];
1300 int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx)))
- 1;
1301 assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1301, __extension__ __PRETTY_FUNCTION__); }))
;
1302 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1303 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1304 continue;
1305 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1305, __extension__ __PRETTY_FUNCTION__); }))
;
1306 const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1307 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1307, __extension__ __PRETTY_FUNCTION__); }))
;
1308 assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1308, __extension__ __PRETTY_FUNCTION__
); }))
;
1309 // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1310 // previous layer; therefore, we cannot really find the buffer ptr.
1311 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1312 ((graph_prep->dup_tensor_block_ref &&
1313 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1314 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1315 !tensor_arena->buffers[buffer_ref].ptr))
1316 {
1317 // We haven't allocated anything for this yet.
1318 assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1318, __extension__ __PRETTY_FUNCTION__
); }))
;
1319 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1320 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1321 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1322 } else {
1323 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1324 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1325 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1326 }
1327 }
1328 }
1329 // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1330 // This creates the multi-view tensor to achieve that.
1331 for (i = 0; i < tensor_symbol_info_size; i++)
1332 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1333 {
1334 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1335 // Create phi multi-view.
1336 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1337 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1338 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1339 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1340 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1341 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1342 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1343 intv,
1344 outv,
1345 }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1346 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1347 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1348 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1349 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1350 }
1351 // Now it is time to handle alias.
1352 for (i = 0; i < alloc_prep->block_size; i++)
1353 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1354 {
1355 const int block_ref = alloc_prep->blocks[i].block_ref;
1356 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1357 {
1358 // Assigning out the tensor aliases.
1359 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1359, __extension__ __PRETTY_FUNCTION__
); }))
;
1360 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1361 // What it references is not an alias.
1362 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1362, __extension__ __PRETTY_FUNCTION__
); }))
;
1363 // If this is not a metadata position, then it is a bound tensor.
1364 if (!CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[alias_ref])((uintptr_t)(tensor_arena->vt_tensors[alias_ref]) & 1))
1365 {
1366 int pos;
1367 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1368 memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(8)) == 0)
1369 {
1370 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1371 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1372 *tensor = ccv_nnc_tensor(tensor_arena->vt_tensors[alias_ref]->data.u8, tensor_symbol_info[block_ref].info, 0);
1373 } else {
1374 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1375 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1376 // Otherwise initialize a tensor view
1377 *tensor_view = ccv_nnc_tensor_view(tensor_arena->vt_tensors[alias_ref], tensor_symbol_info[block_ref].info.dim, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].inc);
1378 tensor_view->alias_ref = (uintptr_t)tensor_arena->vt_tensors[alias_ref];
1379 }
1380 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1381 continue;
1382 }
1383 const int alias_pos = (int)(intptr_t)tensor_arena->vt_tensors[alias_ref];
1384 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1385 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1385, __extension__ __PRETTY_FUNCTION__
); }))
;
1386 // Will use that to determine whether insert reference or not.
1387 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1388 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1389 {
1390 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1391 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1392 }
1393 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1394 // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1395 int pos;
1396 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1397 memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(8)) == 0)
1398 {
1399 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1400 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1401 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1402 } else {
1403 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1404 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1405 // Otherwise initialize a tensor view
1406 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info.dim, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].inc);
1407 tensor_view->alias_ref = (uintptr_t)alias_pos;
1408 }
1409 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1410 if (is_multiview)
1411 {
1412 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1413 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1414 }
1415 }
1416 }
1417 // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1418 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1419 if (tensor_arena->sub_arenas[i])
1420 {
1421 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1422 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1423 for (j = 0; j < node->input_size; j++)
1424 {
1425 const int idx = node->inputs[j];
1426 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1427 if (s_idx < 0)
1428 continue;
1429 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1430 // Only do the replacement if it is a multi-view tensor.
1431 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its peer.
1432 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1433 {
1434 // It cannot be a bound tensor.
1435 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1435, __extension__ __PRETTY_FUNCTION__
); }))
;
1436 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1437 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1438 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1439 // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1440 // to this tensor.
1441 if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1442 {
1443 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1444 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1445 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1446 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1447 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1448 ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
: (multiview)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]);
1449 while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1450 tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]);
1451 *ref_tensor = ccv_nnc_tensor(tv->data.ptr, tv->info, 0);
1452 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1453 } else
1454 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1455 }
1456 }
1457 }
1458 // After aliases are created, for the case..of statement, we now revert back to a flat tensor rather than a multi-view.
1459 // No worries though, this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1460 // when initializing the case..of node, which will take the phi multi-view again.
1461 for (i = 0; i < tensor_symbol_info_size; i++)
1462 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1463 {
1464 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
& 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1464, __extension__ __PRETTY_FUNCTION__
); }))
;
1465 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1466 assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1466, __extension__ __PRETTY_FUNCTION__); }))
;
1467 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1468 }
1469 // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1470 for (i = 0; i < tensor_symbol_info_size; i++)
1471 if (tensor_arena->vt_tensors[i])
1472 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1473 // Associate multiview tensors from sub arena to the parent.
1474 if (sub_arena_out_tensors)
1475 {
1476 for (i = 0; i < alloc_prep->block_size; i++)
1477 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1478 {
1479 const int block_ref = alloc_prep->blocks[i].block_ref;
1480 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1481 continue;
1482 int sub_arena_ref = block_ref;
1483 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1484 {
1485 // Assigning out the tensor aliases.
1486 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1486, __extension__ __PRETTY_FUNCTION__
); }))
;
1487 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1488 // What it references is not an alias.
1489 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1489, __extension__ __PRETTY_FUNCTION__
); }))
;
1490 sub_arena_ref = alias_ref;
1491 if (!sub_arena_out_tensors[sub_arena_ref])
1492 continue;
1493 }
1494 if (!sub_arena_out_tensors[sub_arena_ref])
1495 continue;
1496 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1497 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1497, __extension__ __PRETTY_FUNCTION__); }))
;
1498 // This is only possible if the vt_tensors entry is a phi node.
1499 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1500 {
1501 // For a phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, set that to be the receiver of the broadcast.
1502 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1503 assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1503, __extension__ __PRETTY_FUNCTION__); }))
;
1504 assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1504, __extension__ __PRETTY_FUNCTION__
); }))
;
1505 CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]->alias_ref = (uintptr_t)mv;
1506 ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]);
1507 } else {
1508 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1509 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1510 }
1511 }
1512 }
1513 // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1514 // 1). From sub_arena_out_tensors: it is possible that it is now pointing to an area this arena doesn't know about.
1515 // 2). From a phi multi-view: in this case, this arena won't know which memory it is going to use beforehand.
1516 // Therefore, for the above two scenarios, the tensor that has assign_ref, even if it is a multiview tensor, needs to subscribe
1517 // to the output of the assign_ref tensor.
1518 for (i = 0; i < tensor_symbol_info_size; i++)
1519 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1520 {
1521 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1522 ccv_nnc_tensor_t* assign_tensor;
1523 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1524 assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1525 else
1526 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1527 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1528 }
1529 if (sub_arena_out_tensors)
1530 ccfreefree(sub_arena_out_tensors);
1531 // Rewire sub arena's tensor references.
1532 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1533 if (tensor_arena->sub_arenas[i])
1534 {
1535 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1536 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1537 for (j = 0; j < node->input_size; j++)
1538 {
1539 const int idx = node->inputs[j];
1540 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1541 if (s_idx < 0)
1542 continue;
1543 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1544 // Only do the replacement if it is a multi-view tensor.
1545 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its peer.
1546 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1547 {
1548 // This is a bound tensor; bind it now.
1549 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1550 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1551 else
1552 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1553 }
1554 }
1555 }
1556 return tensor_arena;
1557}
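A note on the (ccv_nnc_tensor_t*)(intptr_t)pos casts and CCV_NNC_IS_METADATA_POS checks used throughout the arena construction above: while the metadata pool can still grow (and therefore relocate), tensors are referred to by integer positions tagged in the low bit instead of raw pointers, and only the final pass through _ccv_nnc_tensor_metadata_rewire turns them into stable addresses. The following standalone sketch illustrates that low-bit tagging idea in isolation; it is not the library's implementation, and the pool_t/entry_t types and function names are invented for the example.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// A growable pool (illustrative stand-in for the tensor metadata array); entries are
// addressed by "positions" (index with bit 0 set) so references stay valid even when
// the backing store is reallocated.
typedef struct { double value; } entry_t;
typedef struct { entry_t* data; int count, capacity; } pool_t;

static int pool_new_pos(pool_t* pool)
{
	if (pool->count == pool->capacity)
	{
		pool->capacity = pool->capacity ? pool->capacity * 2 : 4;
		pool->data = (entry_t*)realloc(pool->data, sizeof(entry_t) * pool->capacity);
	}
	// Tag the index with bit 0 so it can be told apart from a real (aligned) pointer.
	return (pool->count++ << 1) | 1;
}

static int is_pos(const void* ref) { return ((uintptr_t)ref & 1); }

static entry_t* pool_get(pool_t* pool, int pos)
{
	assert(pos & 1);
	return pool->data + (pos >> 1);
}

// Once the pool stops growing, rewire a tagged position into a stable pointer.
static entry_t* pool_rewire(pool_t* pool, void* ref)
{
	return is_pos(ref) ? pool_get(pool, (int)(intptr_t)ref) : (entry_t*)ref;
}

int main(void)
{
	pool_t pool = {0};
	void* refs[3];
	int i;
	for (i = 0; i < 3; i++)
	{
		const int pos = pool_new_pos(&pool); // May reallocate pool.data...
		pool_get(&pool, pos)->value = i * 1.5;
		refs[i] = (void*)(intptr_t)pos; // ...so keep the position, not the pointer.
	}
	for (i = 0; i < 3; i++) // All allocations done: safe to rewire into pointers.
		printf("%f\n", pool_rewire(&pool, refs[i])->value);
	free(pool.data);
	return 0;
}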
1558
1559static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_peer_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int peer_ref)
1560{
1561 assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1561, __extension__ __PRETTY_FUNCTION__); }))
;
1562 if ((intptr_t)graph == tensor_arena->graph_ref)
1563 {
1564 assert(peer_ref >= 0 && peer_ref < tensor_arena->vt_tensor_size)((void) sizeof ((peer_ref >= 0 && peer_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (peer_ref >=
0 && peer_ref < tensor_arena->vt_tensor_size) ;
else __assert_fail ("peer_ref >= 0 && peer_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1564, __extension__ __PRETTY_FUNCTION__
); }))
;
1565 return tensor_arena->vt_tensors[peer_ref];
1566 }
1567 int i;
1568 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1569 if (tensor_arena->sub_arenas[i])
1570 {
1571 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_peer_ref(tensor_arena->sub_arenas[i], graph, peer_ref);
1572 if (tensor)
1573 return tensor;
1574 }
1575 return 0;
1576}
1577
1578static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1579{
1580 if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1581 tensor->type |= CCV_TAPE_ALLOC;
1582 else {
1583 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1584 mv->type |= CCV_TAPE_ALLOC;
1585 int i;
1586 for (i = 0; i < mv->repeat + mv->kind; i++)
1587 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1588 }
1589}
1590
1591static void _ccv_nnc_tensor_arena_fixup_peer_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1592{
1593 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1593, __extension__ __PRETTY_FUNCTION__
); }))
;
1594 int i;
1595 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1596 {
1597 if (graph_prep->tensor_symbol_info[i].peer_ref)
1598 {
1599 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_peer_ref(root_arena, graph_prep->symbolic_graph->peer, graph_prep->tensor_symbol_info[i].peer_ref - 1);
1600 // No need to continue checking this if it is from its peer.
1601 continue;
1602 }
1603 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1604 {
1605 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1606 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1607 {
1608 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1609 if (vt_ref >= 0 &&
1610 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1611 continue;
1612 }
1613 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1614 }
1615 }
1616 for (i = 0; i < graph_prep->sub_prep_size; i++)
1617 if (graph_prep->sub_preps[i])
1618 _ccv_nnc_tensor_arena_fixup_peer_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1619}
1620
1621static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1622{
1623 int i, found = 0;
1624 // Try to insert head.
1625 ccv_array_t* head = tensor_blocks.head;
1626 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1626, __extension__ __PRETTY_FUNCTION__); }))
;
1627 for (i = 0; i < head->rnum;)
1628 {
1629 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1630 if (head_idx == idx)
1631 {
1632 found = 1;
1633 break;
1634 }
1635 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1636 if (cell.i32 && cell.i32[0] > 0)
1637 {
1638 /* If the current node is the parent of the head node, check if we found it or not. */
1639 /* If not found, replace the current one. */
1640 if (!found)
1641 {
1642 found = 1;
1643 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1644 } else {
1645 /* Remove the current one, change the rnum. */
1646 if (i < head->rnum - 1)
1647 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1648 --head->rnum;
1649 continue;
1650 }
1651 } else {
1652 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1653 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1654 if (cell.i32 && cell.i32[0] > 0)
1655 {
1656 found = 1;
1657 break;
1658 }
1659 }
1660 /* Advancing i. */
1661 ++i;
1662 }
1663 /* If not found, push this idx to the end of the array. */
1664 if (!found)
1665 ccv_array_push(head, &idx);
1666 // Try to insert tail.
1667 found = 0;
1668 ccv_array_t* tail = tensor_blocks.tail;
1669 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1669, __extension__ __PRETTY_FUNCTION__); }))
;
1670 for (i = 0; i < tail->rnum;)
1671 {
1672 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1673 if (tail_idx == idx)
1674 {
1675 found = 1;
1676 break;
1677 }
1678 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1679 if (cell.i32 && cell.i32[0] > 0)
1680 {
1681 /* If the current node is the child of the tail node, check if we found it or not. */
1682 /* If not found, replace the current one. */
1683 if (!found)
1684 {
1685 found = 1;
1686 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1687 } else {
1688 /* Remove the current one, change the rnum. */
1689 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1690 --tail->rnum;
1691 continue;
1692 }
1693 } else {
1694 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1695 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1696 if (cell.i32 && cell.i32[0] > 0)
1697 {
1698 found = 1;
1699 break;
1700 }
1701 }
1702 /* Advancing i. */
1703 ++i;
1704 }
1705 /* If not found, push this idx to the end of the array. */
1706 if (!found)
1707 ccv_array_push(tail, &idx);
1708}
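_ccv_nnc_tensor_block_add_exec above keeps the head and tail arrays of a tensor block minimal under the partial order recorded in exec_dep: a new node replaces any existing head it precedes, is dropped if an existing head already precedes it, and is appended only when it is incomparable with everything kept so far (the tail array is maintained symmetrically, with the direction reversed). Below is a minimal standalone sketch of the same frontier maintenance, assuming a hard-coded reaches() table in place of the exec_dep sparse matrix; the names head_add and reaches are invented for illustration.

#include <stdio.h>

// reaches(a, b) answers "does a run before b?" for a small hard-coded DAG:
//   0 -> 1 -> 3, 0 -> 2 -> 3
static int reaches(int a, int b)
{
	static const int reach[4][4] = {
		{ 0, 1, 1, 1 },
		{ 0, 0, 0, 1 },
		{ 0, 0, 0, 1 },
		{ 0, 0, 0, 0 },
	};
	return reach[a][b];
}

// Insert idx into head[0..*rnum), keeping only nodes not dominated by another head.
static void head_add(int* head, int* rnum, int idx)
{
	int i, found = 0;
	for (i = 0; i < *rnum;)
	{
		if (head[i] == idx) { found = 1; break; }
		if (reaches(idx, head[i])) // idx runs earlier: replace (or drop a duplicate).
		{
			if (!found) { found = 1; head[i] = idx; }
			else { head[i] = head[*rnum - 1]; --*rnum; continue; }
		} else if (reaches(head[i], idx)) { found = 1; break; } // Already covered by this head.
		++i;
	}
	if (!found)
		head[(*rnum)++] = idx; // Unordered w.r.t. every existing head: keep both.
}

int main(void)
{
	int head[8], rnum = 0, i;
	head_add(head, &rnum, 1);
	head_add(head, &rnum, 2); // 1 and 2 are incomparable: both stay.
	head_add(head, &rnum, 0); // 0 precedes both: the frontier collapses to { 0 }.
	for (i = 0; i < rnum; i++)
		printf("%d ", head[i]);
	printf("\n"); // Prints: 0
	return 0;
}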
1709
1710ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1711{
1712 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1713 {
1714 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1714, __extension__ __PRETTY_FUNCTION__
); }))
;
1715 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1716 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1717 {
1718 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1719 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1720 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1721 return (ccv_nnc_tensor_t*)mv;
1722 }
1723 return tensor;
1724 }
1725 int i;
1726 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1727 if (tensor_arena->sub_arenas[i])
1728 {
1729 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1730 if (tensor)
1731 return tensor;
1732 }
1733 return 0;
1734}
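ccv_nnc_tensor_from_symbol, like _ccv_nnc_tensor_arena_find_peer_ref above and ccv_nnc_graph_exec_from_symbol below, resolves a symbol by first comparing the symbol's graph against this arena's graph_ref and otherwise recursing into the sub-arenas until one of them claims the graph. The following is a minimal standalone sketch of that lookup shape, with an invented arena_t type and string payloads standing in for tensors; it is an illustration, not the library's code.

#include <stdio.h>

// A nested arena: each level serves exactly one graph and may own sub-arenas.
typedef struct arena_s {
	int graph_id;              // Which graph this arena belongs to.
	const char** slots;        // Per-symbol payloads for that graph.
	int slot_size;
	struct arena_s** subs;     // Sub-arenas for sub-graphs (entries may be NULL).
	int sub_size;
} arena_t;

// Resolve (graph_id, d) by checking this arena first, then recursing into sub-arenas.
static const char* arena_find(const arena_t* arena, int graph_id, int d)
{
	int i;
	if (arena->graph_id == graph_id)
		return (d >= 0 && d < arena->slot_size) ? arena->slots[d] : 0;
	for (i = 0; i < arena->sub_size; i++)
		if (arena->subs[i])
		{
			const char* found = arena_find(arena->subs[i], graph_id, d);
			if (found)
				return found;
		}
	return 0;
}

int main(void)
{
	const char* inner_slots[] = { "inner-0", "inner-1" };
	arena_t inner = { 2, inner_slots, 2, 0, 0 };
	arena_t* subs[] = { &inner };
	const char* outer_slots[] = { "outer-0" };
	arena_t outer = { 1, outer_slots, 1, subs, 1 };
	printf("%s\n", arena_find(&outer, 2, 1)); // Prints "inner-1".
	return 0;
}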
1735
1736ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1737{
1738 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1739 {
1740 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1740, __extension__ __PRETTY_FUNCTION__
); }))
;
1741 return graph_exec_arena->graph_execs[symbol.d];
1742 }
1743 int i;
1744 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1745 if (graph_exec_arena->sub_arenas[i])
1746 {
1747 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1748 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
1749 return exec;
1750 }
1751 return (ccv_nnc_graph_exec_t){}; // 0.
1752}
1753
1754ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1755{
1756 return graph_exec_arena->source;
1757}
1758
1759ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1760{
1761 return graph_exec_arena->destination;
1762}
1763
1764// Check whether the head is the beginning of this block.
1765static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1766{
1767 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 1767, __extension__ __PRETTY_FUNCTION__
); }))
;
1768 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
1769}
1770
1771// Check whether the tail is the end of this block.
1772static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
1773{
1774 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 1774, __extension__ __PRETTY_FUNCTION__
); }))
;
1775 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
1776}
1777
1778// Make two tensor blocks one. Return 1 if that happened.
1779static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
1780{
1781 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
1782 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
1783 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
1784 tensor_blocks[p_ref_0].tail->rnum == 1 &&
1785 tensor_blocks[p_ref_1].head->rnum == 1 &&
1786 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
1787 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
1788 {
1789 // If the two parent refs match (thus, they meet at the same node), we can concatenate them and mark one as a ref. This is very similar to in-place operation combining.
1790 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 1790, __extension__ __PRETTY_FUNCTION__); }))
;
1791 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 1791, __extension__ __PRETTY_FUNCTION__); }))
;
1792 ccv_array_free(tensor_blocks[p_ref_0].tail);
1793 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
1794 if (tensor_blocks[p_ref_1].p_refs[0])
1795 {
1796 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1796, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_refs, otherwise we cannot merge.
1797 if (!tensor_blocks[p_ref_0].p_refs[0])
1798 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
1799 else
1800 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
1801 }
1802 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
1803 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
1804 ccv_array_free(tensor_blocks[p_ref_1].head);
1805 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
1806 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
1807 // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
1808 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
1809 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
1810 if (!tensor_blocks[p_ref_0].r_refs)
1811 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
1812 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
1813 tensor_blocks[p_ref_1].size = 0;
1814 tensor_blocks[p_ref_1].head = 0;
1815 tensor_blocks[p_ref_1].tail = 0;
1816 return 1;
1817 }
1818 return 0;
1819}
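_ccv_nnc_tensor_blocks_try_fold above concatenates two tensor blocks when the producer's only tail node coincides with the consumer's only head node, so that the output can reuse the input's allocation: the surviving block extends its tail, and the folded block is marked unassigned with a ref back to the survivor. Here is a toy standalone sketch of that live-range folding, assuming a simplified block_t with single head/tail indices (the real code keeps head/tail arrays and many more flags); it is an illustration, not the library's logic.

#include <stdio.h>

// A toy tensor block: a live range [head, tail] over execution order plus a ref
// that, once folded, points at the block whose memory it reuses (1-based, 0 = none).
typedef struct { int head, tail, ref, size; } block_t;

// Fold the output block b1 into the input block b0 when b0 dies exactly where b1
// is born, so the two can share one allocation (in-place style reuse).
static int try_fold(block_t* blocks, int b0, int b1)
{
	if (blocks[b0].tail != blocks[b1].head)
		return 0; // Life cycles don't meet at a single node: cannot fold.
	blocks[b0].tail = blocks[b1].tail; // b0 now lives until b1's last use.
	blocks[b1].ref = b0 + 1; // b1 becomes a reference to b0 ...
	blocks[b1].size = 0;     // ... and no longer needs its own allocation.
	return 1;
}

int main(void)
{
	// exec 0 writes block 0, exec 2 reads block 0 and writes block 1, exec 4 reads block 1.
	block_t blocks[2] = { { 0, 2, 0, 64 }, { 2, 4, 0, 64 } };
	if (try_fold(blocks, 0, 1))
		printf("folded: block 0 now spans [%d, %d], block 1 -> ref %d\n",
			blocks[0].head, blocks[0].tail, blocks[1].ref);
	return 0;
}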
1820
1821static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
1822{
1823 int i, j, k;
1824 // Generate exec dependencies (or, in other words, partial ordering of executions).
1825 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
1826 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
1827 int buf_size;
1828 if (p_node_info)
1829 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1829, __extension__ __PRETTY_FUNCTION__
); }))
; }
1830#define for_block(x, val) \
1831 do { \
1832 if (((int32_t*)val)[0] > 0) \
1833 { \
1834 buf[buf_size * 2] = x; \
1835 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
1836 ++buf_size; \
1837 } \
1838 } while (0)
1839 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
1840 buf_size = 0; /* save all its parent deps to this buffer */
1841 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
1842 if (vector)
1843 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
1844 if (!node->outgoings)
1845 continue;
1846 for (i = 0; i < node->outgoings->rnum; i++)
1847 {
1848 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
1849 const int32_t one = 1;
1850 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
1851 /* If not found, set, if the current node is the destination node, no need
1852 * set itself as parent of subsequent nodes because its terminal nature. */
1853 if (!term && (!cell.i32 || cell.i32[0] == 0))
1854 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
1855 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
1856 {
1857 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
1858 /* If not found, set */
1859 if (!cell.i32 || cell.i32[0] == 0)
1860 ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
1861 else {
1862 /* Otherwise, set to the longest one */
1863 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
1864 ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
1865 }
1866 }
1867 }
1868 } ccv_nnc_graph_visit_endfor} }
1869#undef for_block
1870 ccfreefree(buf);
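The visit loop above builds exec_dep as a transitive ancestor table: for every outgoing edge it records the direct dependency with weight 1, then copies each of the current node's ancestors over to the outgoing node with its hop count plus one, keeping the maximum when an entry already exists. The standalone sketch below is not part of this file; a dense N-by-N array replaces the sparse matrix, and add_edge must be called in topological order of the source node, which the graph visit guarantees here.

#include <stdio.h>

#define N 4

// dep[child][ancestor] = longest hop count from ancestor to child (0 = unreachable).
// dep is static, so it starts zero-initialized.
static int dep[N][N];

static void add_edge(int from, int to)
{
	int a;
	// The direct edge contributes one hop...
	if (dep[to][from] < 1)
		dep[to][from] = 1;
	// ...and every ancestor of `from` becomes an ancestor of `to`, one hop further away.
	for (a = 0; a < N; a++)
		if (dep[from][a] && dep[to][a] < dep[from][a] + 1)
			dep[to][a] = dep[from][a] + 1;
}

int main(void)
{
	int i, j;
	add_edge(0, 1);
	add_edge(0, 2);
	add_edge(1, 2);
	add_edge(2, 3);
	for (i = 0; i < N; i++)
		for (j = 0; j < N; j++)
			if (dep[i][j])
				printf("node %d depends on node %d (longest path %d)\n", i, j, dep[i][j]);
	return 0;
}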
1871 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
1872 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
1873 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
1874 // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
1875 // happens that I have to loop through all relevant nodes to find out whether one is used or not.
1876 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1877 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
1878 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
1879 for (i = 0; i < node->input_size; i++)
1880 if (node->inputs[i] >= 0)
1881 {
1882 tensor_blocks[node->inputs[i]].flags = 0;
1883 // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1884 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1885 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
1886 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
1887 tensor_blocks[node->inputs[i]].pin_mem = 1;
1888 }
1889 for (i = 0; i < node->output_size; i++)
1890 if (node->outputs[i] >= 0)
1891 {
1892 tensor_blocks[node->outputs[i]].flags = 0;
1893 // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1894 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1895 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
1896 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
1897 tensor_blocks[node->outputs[i]].pin_mem = 1;
1898 }
1899 } ccv_nnc_graph_visit_endfor} }
1900 if (p_node_info)
1901 {
1902 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 1902, __extension__ __PRETTY_FUNCTION__
); }))
;
1903 // Mark it as used if it is used in either input or output.
1904 for (i = 0; i < p_node_info->input_size; i++)
1905 if (p_node_info->inputs[i] >= 0)
1906 {
1907 const int d = p_node_info->inputs[i];
1908 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1909 {
1910 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
1911 if (dd >= 0) // If this exists in this sub-graph, great.
1912 tensor_blocks[dd].flags = 0;
1913 }
1914 }
1915 for (i = 0; i < p_node_info->output_size; i++)
1916 if (p_node_info->outputs[i] >= 0)
1917 {
1918 const int d = p_node_info->outputs[i];
1919 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1920 {
1921 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
1922 if (dd >= 0) // If this exists in this sub-graph, great.
1923 tensor_blocks[dd].flags = 0;
1924 }
1925 }
1926 }
1927 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1928 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
1929 {
1930 // Check no tensor info is auto now.
1931 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1931, __extension__ __PRETTY_FUNCTION__
); }))
;
1932 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter;
1933 // therefore, its life-cycle almost certainly won't concatenate properly with the tensor to
1934 // fold to.)
1935 if (tensor_symbol_info[i].assign_ref)
1936 {
1937 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
1938 // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
1939 // it keeps its own representation, which is not the case for output).
1940 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
1941 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1942 // But the tensor it comes from cannot be folded as input, because it must not be overwritten at any time.
1943 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
1944 // It also cannot be folded as output (except into i), because we need to keep its own representation.
1945 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
1946 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1946, __extension__ __PRETTY_FUNCTION__
); }))
;
1947 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
1948 for (j = 0; j < unroll_count; j++)
1949 {
1950 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
1951 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
1952 }
1953 if (tensor_blocks[assign_ref].bypass_ref)
1954 {
1955 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
1956 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
1957 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
1958 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
1959 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
1960 // On the other hand, it can be folded into the except_ref for the bypass_ref.
1961 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
1962 if (dup_tensor_from_ref)
1963 {
1964 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
1965 if (bypass_from_ref >= 0)
1966 {
1967 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
1968 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
1969 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1969, __extension__ __PRETTY_FUNCTION__
); }))
;
1970 for (j = 0; j < unroll_count - 1; j++)
1971 {
1972 // Mark every incarnation as unfoldable.
1973 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
1974 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
1975 }
1976 }
1977 }
1978 }
1979 }
1980 }
1981 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1982 {
1983 // If it has a peer reference, we don't need to allocate this tensor at all,
1984 // set it to be unassigned.
1985 if (tensor_symbol_info[i].peer_ref)
1986 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
1987 // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use the tape properly).
1988 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
1989 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
1990 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
1991 // For this case, there is no exception.
1992 tensor_blocks[i].unfoldable_except_ref = 0;
1993 } else if (tensor_symbol_info[i].p_ref) {
1994 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 1994, __extension__ __PRETTY_FUNCTION__); }))
;
1995 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
1996 // If this is a case..of graph, and this tensor is an input from the parent graph, it cannot be folded as input.
1997 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
1998 // TODO: This check can be lifted if we can fold in the parent graph.
1999 if (-1 == p_ref_is_in_or_out)
2000 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2001 if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be folded as input.
2002 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2003 }
2004 }
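// [Editor's note] As the expanded macros above show, the low two bits of tensor_blocks[i].flags
// (mask 0x3) hold the block state (ordinary / ALIAS / UNASSIGNED) and the 0xc bits hold read/write
// intent. A purely illustrative predicate mirroring the TENSOR_EXPECT_COMPUTABLE expansion (not
// part of the library):
//   static int example_is_computable(const ccv_nnc_tensor_block_t block)
//   {
//     return (block.flags & 0x3) != ALIAS && (block.flags & 0x3) != UNASSIGNED;
//   }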
2005 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2006 {
2007 if (tensor_symbol_info[i].alias_ref)
2008 {
2009 const int ref = tensor_symbol_info[i].alias_ref - 1;
2010 // If the referenced one is unassigned, mark it as assigned only if the current one is assigned.
2011 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2012 tensor_blocks[ref].flags = 0;
2013 // An alias cannot refer to another alias.
2014 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2014, __extension__ __PRETTY_FUNCTION__); }))
;
2015 tensor_blocks[i].flags = ALIAS;
2016 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2017 if (!tensor_blocks[ref].r_refs)
2018 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2019 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2020 }
2021 }
2022 // Scan again, and if the ref is not assigned, mark the alias as not assigned.
2023 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2024 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2025 {
2026 const int ref = tensor_blocks[i].ref - 1;
2027 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2028 {
2029 // Mark this as unassigned.
2030 tensor_blocks[i].flags = UNASSIGNED;
2031 tensor_blocks[i].ref = 0;
2032 }
2033 }
2034 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2035 {
2036 // If this tensor is not expected to be unassigned, allocate its head and tail arrays.
2037 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2038 {
2039 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2040 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2041 // Cache tensor size (align to 16 bytes).
2042 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2043 }
2044 // If there is a p_ref, add it to the p_refs list.
2045 if (tensor_symbol_info[i].p_ref)
2046 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2047 }
2048 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2049 for (i = 0; i < node->input_size; i++)
2050 {
2051 int d = node->inputs[i];
2052 if (d < 0)
2053 continue;
2054 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2055 d = tensor_symbol_info[d].alias_ref - 1;
2056 tensor_blocks[d].flags |= READ_ONLY;
2057 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2058 continue;
2059 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2059, __extension__ __PRETTY_FUNCTION__
); }))
;
2060 /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2061 * so it lives from the very beginning of the graph life-cycle and ends here). */
2062 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2063 {
2064 for (j = 0; j < source_size; j++)
2065 {
2066 // If the source connects to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2067 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2068 if (cell.i32 && cell.i32[0] > 0)
2069 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2070 }
2071 /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a
2072 * sub-graph (TODO: this condition can be lifted for a case..of that is never in a while
2073 * loop; however, in that case, you need to prevent the read-only block from being reused for the
2074 * output tensor, which is not obvious how to implement correctly), and it is not an
2075 * assign_ref from anywhere (not a parameterized loop), then we cannot reuse this region
2076 * of memory anyway (because on the second loop, we want to read the same value out).
2077 * Mark it to the end of the graph. */
2078 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2079 for (j = 0; j < destination_size; j++)
2080 {
2081 // If the destination connects to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2082 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2083 if (cell.i32 && cell.i32[0] > 0)
2084 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2085 }
2086 }
2087 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2088 }
2089 for (i = 0; i < node->output_size; i++)
2090 {
2091 int d = node->outputs[i];
2092 if (d < 0)
2093 continue;
2094 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2095 d = tensor_symbol_info[d].alias_ref - 1;
2096 tensor_blocks[d].flags |= WRITE_ONLY;
2097 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2098 continue;
2099 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2099, __extension__ __PRETTY_FUNCTION__
); }))
;
2100 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2101 }
2102 } ccv_nnc_graph_visit_endfor} }
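// [Editor's note] The head/tail arrays filled in above hold the exec indices where a block's
// life-time starts and ends along the dependency order. A minimal, hypothetical helper (not part
// of the library) to ask whether a block has been given any life-time yet:
//   static int example_block_has_lifetime(const ccv_nnc_tensor_block_t block)
//   {
//     return block.head && block.head->rnum > 0;
//   }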
2103 // For any assign_ref, its life-time is kept until the end and wraps over.
2104 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2105 // If this tensor is not unassigned (or an alias) and it is assigned from somewhere else,
2106 // that "somewhere else" needs to keep its life-time till the end.
2107 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2108 p_node_info && tensor_symbol_info[i].assign_ref)
2109 {
2110 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2111 for (j = 0; j < destination_size; j++)
2112 {
2113 // This logic is meant to be more conservative about which destinations we add to.
2114 // As of now, adding everything is most likely fine. However, doing so naively may
2115 // cause issues in the future. Thus, instead, we only add
2116 // the destination iff either the tensor is not used at all, or the
2117 // destination is on the same stream as the tensor block in some way.
2118 int flag = !tensor_blocks[assign_ref].tail;
2119 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2120 {
2121 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2122 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2123 flag = (cell.i32 && cell.i32[0] > 0);
2124 }
2125 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream as this tensor block somehow.
2126 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2127 }
2128 }
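// [Editor's note] The destination checks above all reduce to one reachability question against the
// exec_dep sparse matrix: a positive hop count in cell (later, earlier) means "earlier" reaches
// "later". A hypothetical helper (the name is illustrative, not library API):
//   static int example_exec_reaches(const ccv_sparse_matrix_t* const exec_dep, const int later, const int earlier)
//   {
//     const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, later, earlier);
//     return (cell.i32 && cell.i32[0] > 0);
//   }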
2129 for (i = 0; i < output_size; i++)
2130 {
2131 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2131, __extension__ __PRETTY_FUNCTION__); }))
;
2132 int d = outputs[i].d;
2133 if (d < 0)
2134 continue;
2135 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2136 d = tensor_symbol_info[d].alias_ref - 1;
2137 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2138 continue;
2139 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2139, __extension__ __PRETTY_FUNCTION__
); }))
;
2140 for (j = 0; j < destination_size; j++)
2141 {
2142 int flag = !tensor_blocks[d].tail;
2143 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2144 {
2145 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2146 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2147 flag = (cell.i32 && cell.i32[0] > 0);
2148 }
2149 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream as this tensor block somehow.
2150 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2151 }
2152 }
2153 // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2154 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2155 int x, y;
2156 for (x = 0; x < node->input_size; x++)
2157 for (y = 0; y < node->output_size; y++)
2158 /* Some operations enforce some tensors to be the same for inputs / outputs. */
2159 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2160 {
2161 // If both are unassigned, it is fine.
2162 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2163 continue;
2164 int ref = node->inputs[x];
2165 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2165, __extension__ __PRETTY_FUNCTION__); }))
;
2166 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2167 ref = tensor_blocks[ref].ref - 1;
2168 const int node_output_y = node->outputs[y];
2169 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2169, __extension__ __PRETTY_FUNCTION__
); }))
;
2170 // If neither is computable, it is fine; we don't need to enforce.
2171 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2172 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2173 continue;
2174 // Otherwise, enforce and error out if failed.
2175 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2176 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2176, __extension__ __PRETTY_FUNCTION__
); }))
; }
2177 }
2178 } ccv_nnc_graph_visit_endfor} }
2179 // Ignore tensors that are already bound, whether they are used or not. We do it here because
2180 // we need to make sure enforced tensors are properly assigned, so that we don't bind to a tensor
2181 // that is not enforced in-place (because the tensor enforced in-place will be different from the
2182 // bound one).
2183 for (i = 0; i < tensor_bind_size; i++)
2184 {
2185 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2186 // If a tensor is bound here, then it is unassigned.
2187 if (resolved_symbol.d >= 0)
2188 {
2189 int d = resolved_symbol.d;
2190 // If it is unused, this is not an alias.
2191 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2192 d = tensor_blocks[d].ref - 1;
2193 // This doesn't work if it is a loop-carried variable.
2194 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2194, __extension__ __PRETTY_FUNCTION__); }))
;
2195 tensor_blocks[d].flags = UNASSIGNED;
2196 tensor_blocks[d].ref = 0; // No need to have ref as well.
2197 }
2198 }
2199 // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and the start / end tensors match).
2200 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2201 int x, y;
2202 for (x = 0; x < node->input_size; x++)
2203 {
2204 /* If the input is not assigned, it may reference another tensor; find the referenced one. */
2205 int ref = node->inputs[x];
2206 if (ref < 0)
2207 continue;
2208 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2209 ref = tensor_blocks[ref].ref - 1;
2210 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2210, __extension__ __PRETTY_FUNCTION__
); }))
;
2211 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2212 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2213 tensor_blocks[ref].tail->rnum == 1)
2214 for (y = 0; y < node->output_size; y++)
2215 /* Only proceed if the input symbol is different from the output symbol, */
2216 /* and the input symbol meets the output symbol exactly at the same spot. */
2217 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2218 node->outputs[y] >= 0 &&
2219 ref != node->outputs[y] &&
2220 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2221 {
2222 const int node_output_y = node->outputs[y];
2223 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2224 /* If the dimensions match perfectly, then we can assign y_symbol to x. */
2225 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(8)) == 0)
2226 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2227 }
2228 }
2229 } ccv_nnc_graph_visit_endfor} }
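// [Editor's note] The memcmp gate above only folds two symbols when every dimension matches. An
// equivalent, purely illustrative helper (not in the library; CCV_NNC_MAX_DIM_ALLOC expands to 8
// per the annotation above):
//   static int example_dim_match(const ccv_nnc_tensor_symbol_info_t a, const ccv_nnc_tensor_symbol_info_t b)
//   {
//     int i;
//     for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC; i++)
//       if (a.info.dim[i] != b.info.dim[i])
//         return 0;
//     return 1;
//   }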
2230 // Specifically handle the bypass. This needs to be done after the first pass.
2231 // I need to extend the bypass life-time to match the one I am going with.
2232 // It is important that we visit these nodes and assign bypass_ref to their dependents in topological order.
2233 ccv_nnc_tensor_block_t empty_block = {};
2234 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2235 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2236 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2237 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2238 {
2239 int can_bypass = 1;
2240 for (i = 0; can_bypass && i < node->output_size; i++)
2241 {
2242 int d = node->outputs[i];
2243 if (d < 0)
2244 continue;
2245 if (!tensor_blocks[d].bypass_ref)
2246 continue;
2247 while (tensor_blocks[d].ref)
2248 d = tensor_blocks[d].ref - 1;
2249 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2250 while (tensor_blocks[bypass_ref].ref)
2251 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2252 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2253 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2254 continue;
2255 ccv_array_clear(empty_block.head);
2256 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2257 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2258 ccv_array_clear(empty_block.tail);
2259 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2260 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2261 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2262 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2263 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2264 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2265 // It can only be unfoldable due to the while constraint. Check whether this satisfies the while loop constraint.
2266 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2266, __extension__ __PRETTY_FUNCTION__
); }))
;
2267 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2268 while (tensor_blocks[b_ref].ref)
2269 b_ref = tensor_blocks[b_ref].ref - 1;
2270 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2271 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2272 // These two can be assigned to the same region of memory without issue (because their life-times don't interfere)
2273 // even after we extend the life-time of bypass_ref. Then we are in good shape.
2274 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2275 }
2276 if (can_bypass)
2277 {
2278 for (i = 0; i < node->output_size; i++)
2279 {
2280 int d = node->outputs[i];
2281 if (d < 0)
2282 continue;
2283 if (!tensor_blocks[d].bypass_ref)
2284 continue;
2285 while (tensor_blocks[d].ref)
2286 d = tensor_blocks[d].ref - 1;
2287 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2288 while (tensor_blocks[bypass_ref].ref)
2289 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2290 // The bypass_ref can extend its life-time.
2291 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2292 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2293 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2294 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2295 }
2296 } else {
2297 for (i = 0; i < node->output_size; i++)
2298 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2299 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2300 // Mark this exec as no-bypass IO (thus, I need to insert an explicit data transfer).
2301 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2302 }
2303 }
2304 } ccv_nnc_graph_visit_endfor} }
2305 ccv_array_free(empty_block.head);
2306 ccv_array_free(empty_block.tail);
2307 *r_exec_dep = exec_dep;
2308 *r_tensor_blocks = tensor_blocks;
2309}
2310
2311static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2312{
2313 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2314 {
2315 ccv_nnc_cmd_t retval = cmd;
2316 retval.cmd = CCV_NNC_NOOP;
2317 return retval;
2318 }
2319 return cmd;
2320}
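// [Editor's note] A hedged usage sketch: this substitution callback is handed to
// ccv_nnc_symbolic_graph_dup further below (see the call at line 2618), so duplicating a graph
// while stubbing out its sub-graphs looks like:
//   ccv_nnc_symbolic_graph_t* const dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
// The signature here is inferred from that call site, not restated from the public header.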
2321
2322static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2323{
2324 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2325 {
2326 if (tensor_symbol_info[input].alias_ref)
2327 {
2328 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2329 assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2329, __extension__ __PRETTY_FUNCTION__
); }))
;
2330 ccv_nnc_tensor_symbol_t tensor_symbol = {};
2331 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2332 {
2333 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2334 if (tensor_symbol_info[alias_ref].peer_ref)
2335 ccv_nnc_tensor_symbol_set_peer(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2336 .d = tensor_symbol_info[alias_ref].peer_ref - 1,
2337 .graph = dup_graph->peer
2338 });
2339 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2340 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2341 } else {
2342 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2343 tensor_symbol.graph = dup_graph;
2344 }
2345 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].inc, tensor_symbol_info[input].info, 0);
2346 if (tensor_symbol_info[input].peer_ref)
2347 ccv_nnc_tensor_symbol_set_peer(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2348 .d = tensor_symbol_info[input].peer_ref - 1,
2349 .graph = dup_graph->peer
2350 });
2351 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2352 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2353 } else {
2354 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2355 if (tensor_symbol_info[input].peer_ref)
2356 ccv_nnc_tensor_symbol_set_peer(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2357 .d = tensor_symbol_info[input].peer_ref - 1,
2358 .graph = dup_graph->peer
2359 });
2360 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2361 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2362 }
2363 if (tensor_symbol_info[input].bypass_ref)
2364 {
2365 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2366 assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2366, __extension__ __PRETTY_FUNCTION__
); }))
;
2367 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])))
;
2368 symbol_info->bypass_ref = dup_bypass_ref + 1;
2369 }
2370 }
2371 return (ccv_nnc_tensor_symbol_t) {
2372 .d = dup_tensor_block_ref[input * unroll_count],
2373 .graph = dup_graph,
2374 };
2375}
2376
2377static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2378{
2379 int i;
2380 if (dup_exec_ref[idx * unroll_count] < 0)
2381 {
2382 // Input has to come before output, because an output could have a bypass reference to the input.
2383 for (i = 0; i < node->input_size; i++)
2384 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2385 for (i = 0; i < node->output_size; i++)
2386 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2387 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2388 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2389 }
2390 return (ccv_nnc_graph_exec_symbol_t) {
2391 .d = dup_exec_ref[idx * unroll_count],
2392 .graph = dup_graph,
2393 };
2394}
2395
2396static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2397{
2398 int i;
2399 for (i = 0; i < tensor_block_size; i++)
2400 {
2401 if (tensor_blocks[i].head)
2402 ccv_array_free(tensor_blocks[i].head);
2403 if (tensor_blocks[i].tail)
2404 ccv_array_free(tensor_blocks[i].tail);
2405 if (tensor_blocks[i].r_refs)
2406 ccv_array_free(tensor_blocks[i].r_refs);
2407 if (tensor_blocks[i].dup_p_refs)
2408 ccv_array_free(tensor_blocks[i].dup_p_refs);
2409 }
2410 ccfreefree(tensor_blocks);
2411}
2412
2413 // Find tensors whose constraints cannot be solved by co-allocating them to the same location.
2414static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2415{
2416 int i, j, unroll_count = 0;
2417 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2418 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2419 {
2420 // This is a parameter, thus it has to be either an alias or used.
2421 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
& 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2421, __extension__ __PRETTY_FUNCTION__
); }))
;
2422 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2423 // The parameter it assigns to has to be either an alias or used.
2424 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2424, __extension__ __PRETTY_FUNCTION__
); }))
;
2425 // If either of the two (assigner and assignee) is an alias, check to see if they resolve to the same block.
2426 // If they are the same, we are good; no need to extend.
2427 int a_ref = i;
2428 while (tensor_blocks[a_ref].ref)
2429 a_ref = tensor_blocks[a_ref].ref - 1;
2430 int b_ref = assign_ref;
2431 while (tensor_blocks[b_ref].ref)
2432 b_ref = tensor_blocks[b_ref].ref - 1;
2433 if (a_ref != b_ref)
2434 {
2435 // If any of b's heads is deterministically later than a's tail,
2436 // or any of b's tails is deterministically earlier than a's head, they don't interfere.
2437 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2438 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2439 // It cannot be that both i can hop to j and j can hop to i.
2440 assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
> 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2440, __extension__ __PRETTY_FUNCTION__
); }))
;
2441 // Can it be folded?
2442 // These two can be assigned to the same region of memory without issue (because their life-times don't interfere).
2443 if (a_hop_b || b_hop_a)
2444 {
2445 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2446 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2447 continue;
2448 }
2449 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2450 for (j = 0; c_ref >= 0; j++)
2451 {
2452 while (tensor_blocks[c_ref].ref)
2453 c_ref = tensor_blocks[c_ref].ref - 1;
2454 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2455 }
2456 unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
= (j + 1); (_a > _b) ? _a : _b; })
;
2457 }
2458 }
2459 // Reset companion_ref if we need to unroll.
2460 if (unroll_count)
2461 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2462 tensor_blocks[j].companion_ref = 0;
2463 return unroll_count;
2464}
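// [Editor's sketch] The unroll count above is driven by how long the assign_ref chain is once a
// conflicting (non-foldable) pair is found: the j loop walks the remainder of b's chain, so longer
// loop-carried chains force more unrolled copies. Below is a simplified, hypothetical chain walk
// (not library code; it ignores the tensor_blocks[].ref resolution done above and assumes the
// chain is acyclic):
static int _editor_example_assign_chain_length(const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, int ref)
{
	int len = 0;
	// assign_ref is stored off-by-one; 0 means "none", so the walk stops when ref drops below 0.
	while (ref >= 0)
	{
		ref = tensor_symbol_info[ref].assign_ref - 1;
		++len;
	}
	return len;
}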
2465
2466static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2467{
2468 int i, j, n;
2469 // The inout exec nodes; these are the nodes we are going to extend.
2470 uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2471 int max_input_size = 0;
2472 int max_output_size = 0;
2473 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2474 {
2475 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
;
2476 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
;
2477 }
2478 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
2479 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
2480 // Doing graph expansion.
2481 // It goes without saying that we must have more than one tensor / exec (otherwise I cannot use 0 as "no exec ref").
2482 assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2482, __extension__ __PRETTY_FUNCTION__
); }))
;
2483 assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2483, __extension__ __PRETTY_FUNCTION__
); }))
;
2484#define INCOMING_NODE (1)
2485#define OUTGOING_NODE (2)
2486 // Unroll the graph n times.
2487 for (n = 0; n < unroll_count; n++)
2488 {
2489 int* const dup_exec_ref = r_dup_exec_ref + n;
2490 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2491 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2492 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2493 dup_exec_ref[i * unroll_count] = -1;
2494 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2495 {
2496 // If there is an assign_ref, that means I don't need to dup the tensor.
2497 if (tensor_symbol_info[i].assign_ref)
2498 {
2499 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2500 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2501 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&& TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2502 // If this is a read-only tensor block, there is no need to duplicate it because the value never changes
2503 // (note we handled assign_ref first); therefore, no duplicate needs to be generated.
2504 dup_tensor_block_ref[i * unroll_count] = i;
2505 else
2506 dup_tensor_block_ref[i * unroll_count] = -1;
2507 }
2508 // Go through the original graph, make copies of the node if it is inout.
2509 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2510 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2511 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2512 if (!node->outgoings)
2513 continue;
2514 for (i = 0; i < node->outgoings->rnum; i++)
2515 {
2516 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2517 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2518 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2519 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2520 }
2521 } ccv_nnc_graph_visit_endfor} }
2522 // Check that the visited nodes are all marked as either incoming or outgoing.
2523 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2524 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2525 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2526 {
2527 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2528 continue;
2529 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2529, __extension__ __PRETTY_FUNCTION__
); }))
;
2530 // If this is a pure incoming node, then I need to concat it with all original destination nodes.
2531 if (inout[i] == INCOMING_NODE)
2532 for (j = 0; j < dup_destination_size; j++)
2533 {
2534 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2535 .d = dup_destinations[j].d,
2536 .graph = dup_graph,
2537 }, (ccv_nnc_graph_exec_symbol_t) {
2538 .d = dup_exec_ref[i * unroll_count],
2539 .graph = dup_graph,
2540 });
2541 }
2542 }
2543 if (dup_graph->destinations)
2544 ccv_array_clear(dup_graph->destinations);
2545 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2546 {
2547 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2548 continue;
2549 const int d = dup_exec_ref[i * unroll_count];
2550 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
+ (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)))
;
2551 // If this has no outgoing node, add it to the destinations.
2552 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2553 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2554 .graph = dup_graph,
2555 .d = d,
2556 });
2557 }
2558 }
2559#undef INCOMING_NODE
2560#undef OUTGOING_NODE
2561 ccfreefree(inout);
2562}
2563
2564static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2565{
2566 int i;
2567 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic_graph is the old graph and tensor_blocks is the old tensor blocks.
2568 // Now we can assign them (the dups) as companions.
2569 // Get to the last one, which we will wrap over.
2570 if (dup_tensor_symbol_info[i].assign_ref)
2571 {
2572 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2573 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2574 assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2574, __extension__ __PRETTY_FUNCTION__
); }))
;
2575 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2576 }
2577}
2578
2579 // If the tensor blocks are the outputs of this graph, their life-times should be extended to the end of this graph.
2580 // However, it is not that simple if the graph is unrolled. For an unrolled graph, they need to reach the end of
2581 // the "original" graph and all of its duplicated ends (for their duplicated tensor blocks).
2582static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2583{
2584 int i, j, k;
2585 for (i = 0; i < p_node_info->output_size; i++)
2586 {
2587 const int d = p_node_info->outputs[i];
2588 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx)))
- 1;
2589 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
!((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED))
)
2590 continue;
2591 for (k = 0; k < destination_size; k++)
2592 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2593 // Add the duplicated destinations to the tensor_block_ref.
2594 for (j = 0; j < unroll_count; j++)
2595 for (k = 0; k < destination_size; k++)
2596 {
2597 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2598 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2599 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2600 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2601 }
2602 }
2603}
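// [Editor's sketch] dup_exec_ref and dup_tensor_block_ref are flat arrays used as 2-D tables
// indexed by [original index][unroll copy], as the "* unroll_count + j" arithmetic above shows.
// A purely illustrative accessor (not part of the library):
static int _editor_example_dup_ref(const int* const dup_ref, const int unroll_count, const int orig_idx, const int copy)
{
	// A negative entry means no duplicate exists for this copy.
	return dup_ref[orig_idx * unroll_count + copy];
}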
2604
2605static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2606{
2607 int i, j;
2608 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2609 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2610 // Count the blocks that cannot simply be solved with either in-place tensor block folding or by using the same memory region.
2611 // Unfortunately, I cannot apply this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2612 // If there are none, no need to change anything; we are good.
2613 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2614 if (!unroll_count)
2615 return;
2616 // We have conditions that cannot be satisfied with the simple solution (allocating to the same memory region).
2617 // Doing graph expansion: first duplicate the old graph, but replace all sub-graphs with noops.
2618 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2619 int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2620 int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2621 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2622 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2623 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2624 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
dup_graph->exec_symbol_info->rnum + _incoming_edges_ >
1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
= (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_
[2] = { (dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_
= 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0
; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
_idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r == 1) continue
; _incomings_[_idx_].r = 1; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_
]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (
_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) { (
(void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->sources->rnum); _exist_size_[1] = 0; _p_
= 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0
) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r == 2) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((dup_graph->exec_symbol_info)->data))
+ (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; _exists_[
_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) =
(_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ <
(dup_graph->destinations->rnum); _i_++) { ((void) sizeof
((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph
->destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ? 1 : 0
), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d; } _exist_size_[0] = (dup_graph->destinations->rnum)
; _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[
_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 2) continue; _incomings_[_idx_].r = 3
; if (_incomings_[_idx_].edges > 0) for (_j_ = 0; _j_ <
_incomings_[_idx_].c; _j_++) { const int d = _edges_[_incomings_
[_idx_].edges - 1 + _j_]; _exists_[_q_][_exist_size_[_q_]] = d
; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_
) = (_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations
->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
= 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 4; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
3 && _d_ < (dup_graph->destinations->rnum))
{ _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
< ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 3 && _d_ <
(dup_graph->destinations->rnum)) { _exists_[_q_][_exist_size_
[_q_]] = d; ++_exist_size_[_q_]; } } } ++_i_; } ((_i_) = (_p_
), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (dup_graph
->destinations->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 4) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
<= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2624, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
2625 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2626 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2627 // Free out the old exec_dep
2628 ccv_matrix_free(exec_dep);
2629 // and the tensor blocks, prepare for the new.
2630 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2631 // A reverse map to find where the original tensor comes from.
2632 int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2633 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2634 dup_tensor_from_ref[i] = -1;
2635 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2636 for (j = 0; j < unroll_count; j++)
2637 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2638 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2639 int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2640 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2641 dup_exec_from_ref[i] = -1;
2642 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2643 {
2644 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2645 continue;
2646 dup_exec_from_ref[i] = i; // Reference back.
2647 for (j = 0; j < unroll_count; j++)
2648 if (dup_exec_ref[i * unroll_count + j] >= 0)
2649 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2650 }
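	// A minimal worked illustration of the two reverse maps built above, assuming unroll_count == 2 and
	// (hypothetical indices, for illustration only) that exec symbol 3 was duplicated into exec symbols 7
	// and 11: dup_exec_ref[3 * 2 + 0] == 7 and dup_exec_ref[3 * 2 + 1] == 11, so the loop records
	// dup_exec_from_ref[7] == dup_exec_from_ref[11] == 3, while dup_exec_from_ref[3] == 3 because each
	// live exec also references back to itself. dup_tensor_from_ref mirrors this for tensor symbols,
	// except that original tensors are not mapped back to themselves (their entries stay at -1).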
2651 // Reset all attr.
2652 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2653 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2654 ccv_nnc_graph_visit_free(dup_visit);
2655 ccfreefree(dup_exec_symbol_info);
2656 ccfreefree(dup_exec_from_ref);
2657 ccfreefree(dup_tensor_from_ref);
2658	// Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2659 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2660 // Loop over all possible duplications to assign dup_p_ref properly.
2661 for (j = 0; j < unroll_count; j++)
2662 {
2663 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2664 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2665 {
2666 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2667 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2668 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2669 {
2670 if (!tensor_blocks[dup_idx].dup_p_refs)
2671 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2672 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2673 }
2674 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2675 continue;
2676 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2677 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2678 if (p_ref_1_is_in_or_out == 1)
2679 {
2680 if (!tensor_blocks[dup_idx].dup_p_refs)
2681 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2682 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2683 }
2684 }
2685 }
2686 // companion_ref
2687 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2688		// Now we can assign them (the dup) as companions.
2689 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2690 {
2691 // Get to the last one, which we will wrap over.
2692 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2693 if (assign_ref >= 0)
2694 {
2695 int b_ref = assign_ref;
2696 while (tensor_blocks[b_ref].ref)
2697 b_ref = tensor_blocks[b_ref].ref - 1;
2698 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2699 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2700			// It cannot be that both i can hop to j and j can hop to i.
2701			// And it is possible to hop from one to the other now after duplication.
2702 assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2702, __extension__ __PRETTY_FUNCTION__); }))
;
2703 tensor_blocks[i].companion_ref = b_ref + 1;
2704 tensor_blocks[b_ref].companion_ref = i + 1;
2705 }
2706 }
2707 ccfreefree(dup_tensor_symbol_info);
2708 // Extend the dup tensor block ref, prepare for future extensions.
2709 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2710 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2711 dup_tensor_block_ref[i] = -1;
2712 // Assign out changed properties.
2713 *r_exec_dep = exec_dep;
2714 *r_tensor_blocks = tensor_blocks;
2715 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2716 *r_dup_graph = dup_graph;
2717 *r_unroll_count = unroll_count;
2718 *r_dup_exec_ref = dup_exec_ref;
2719 *r_dup_tensor_block_ref = dup_tensor_block_ref;
2720}
2721
2722static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2723{
2724 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2725 return tensor_block_size;
2726 int i;
2727 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2728 int found_idx = tensor_block_size;
2729 for (i = 0; i < anonymous_block_free_list_cap; i++)
2730 {
2731 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)))
;
2732 assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 2732, __extension__ __PRETTY_FUNCTION__
); }))
;
2733 // If the type doesn't match, ignore.
2734 if (tensor_blocks[idx].type != type)
2735 continue;
2736 // Heuristic about how to select the best tensor block to move forward.
2737		// If the size is larger and there are no dup_p_refs requested, I cannot do better than this, just return directly.
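		// A condensed reading of the selection order below (a summary of the existing branches, not new logic):
		// 1. skip blocks of a different type; 2. a block that already covers the requested size wins outright
		// when no dup_p_refs were requested or its dup_p_refs are after (or at) the requested ones; 3. otherwise
		// compare candidates by how close their size is to the request, biasing towards the bigger block when
		// the size differences tie; 4. on a full size tie, fall back to the dup_p_refs life-cycle comparisons below.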
2738 if (tensor_blocks[idx].size >= size)
2739 {
2740 if (no_dup_p_refs)
2741 return idx;
2742			// Otherwise, only if the current tensor block's dup_p_refs is after (or at) the requested dup_p_refs
2743			// can we not do better than this; if that is the case, just return.
2744 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2745 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2746 return idx;
2747 }
2748 int64_t found_idx_size_diff;
2749 int64_t idx_size_diff;
2750 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2751 // Now, compare whether this one or the found_idx one is better.
2752			// At this point, there is no point in comparing the dup_p_refs; we only care about which one
2753			// is closer to the size we request. Only on a tie do the dup_p_refs become important again.
2754 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
2755 {
2756 found_idx = idx;
2757 continue;
2758 }
2759 // No need to update if found_idx is better than idx.
2760 if (found_idx_size_diff > idx_size_diff)
2761 continue;
2762		// We bias towards the bigger one when the size differences tie.
2763 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
2764 {
2765 found_idx = idx;
2766 continue;
2767 }
2768 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
== tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 2768, __extension__ __PRETTY_FUNCTION__
); }))
;
2769 // On a tie, check which one has tighter life-cycle.
2770 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
2771 {
2772			// Check whether the current tensor block's life-cycle is longer than the previous one.
2773 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
2774 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
2775 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2776 found_idx = idx;
2777 continue;
2778 }
2779		// Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
2780		// We prefer to choose the one that has a life-cycle closer to the expected ones.
2781 if (no_dup_p_refs)
2782 {
2783 // Whoever is shorter wins.
2784 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
2785 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
2786 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
2787 found_idx = idx;
2788 continue;
2789 }
2790 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
2791 continue;
2792 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
2793 {
2794 found_idx = idx;
2795 continue;
2796 }
2797		// If both cover the requested dup_p_refs, we prefer the shorter one; otherwise we prefer the longer one.
2798 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
2799 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
2800 if (idx_after_request && found_idx_after_request)
2801 {
2802 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
2803 found_idx = idx;
2804 continue;
2805 } else {
2806			// If we entered this branch, either idx_after_request is false or found_idx_after_request is false, or both.
2807			// If found_idx_after_request is true, we are currently doing fine; no need to proceed.
2808 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
2809 if (!found_idx_after_request && (idx_after_request ||
2810 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2811 found_idx = idx;
2812 continue;
2813 }
2814 }
2815 return found_idx;
2816}
2817
2818static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
2819{
2820 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
2821 return 0;
2822 int i, j, k;
2823 int input_size = 0;
2824 for (i = 0; i < p_node_info->p_while.input_size; i++)
2825 if (p_node_info->p_while.inputs[i] >= 0)
2826 ++input_size;
2827	// If it doesn't have tensor inputs (thus, only special inputs), just return.
2828 if (!input_size)
2829 return 0;
2830 ccv_nnc_tensor_symbol_t inputs[input_size];
2831 input_size = 0;
2832 for (i = 0; i < p_node_info->p_while.input_size; i++)
2833 if (p_node_info->p_while.inputs[i] >= 0)
2834 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
2835 .d = p_node_info->p_while.inputs[i],
2836 .graph = symbolic_graph,
2837 };
2838 assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
> 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2838, __extension__ __PRETTY_FUNCTION__
); }))
;
2839 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
2840 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
2841 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
2842 {
2843 // Make a noop copy of the breakpoint, but with some tensor inputs.
2844 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
2845 ccv_array_push(dup_breakpoints, &noop);
2846 // Connect this noop to the outgoing nodes of breakpoints.
2847 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(symbolic_graph->breakpoints[i].d)))
;
2848 if (symbol_info->outgoings)
2849 for (j = 0; j < symbol_info->outgoings->rnum; j++)
2850 {
2851 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
2852 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
2853 .d = d,
2854 .graph = symbolic_graph,
2855 });
2856 }
2857 }
2858 for (i = 0; i < exec_symbol_info_size; i++)
2859 {
2860 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
2861 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2862 continue;
2863 if (symbol_info->outgoings)
2864 {
2865 const int outgoing_size = symbol_info->outgoings->rnum;
2866 for (j = 0; j < outgoing_size; j++)
2867 {
2868 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
2869 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
2870 if (d == symbolic_graph->breakpoints[k].d)
2871 {
2872 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)))
;
2873 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
2874 .d = i,
2875 .graph = symbolic_graph,
2876 }, noop);
2877 // Found, connected, exit.
2878 break;
2879 }
2880 }
2881 }
2882 }
2883	// Add the dup_breakpoints to source if necessary.
2884 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 2884, __extension__ __PRETTY_FUNCTION__
); }))
;
2885 const int source_size = symbolic_graph->sources->rnum;
2886 for (i = 0; i < source_size; i++)
2887 {
2888 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(i)))
)->d;
2889 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
2890 if (d == symbolic_graph->breakpoints[j].d)
2891 {
2892 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)))
;
2893 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
2894 // Found, made, exit.
2895 break;
2896 }
2897 }
2898	// Add the dup_breakpoints to destination if necessary.
2899 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 2899, __extension__ __PRETTY_FUNCTION__); }))
;
2900 const int destination_size = symbolic_graph->destinations->rnum;
2901 for (i = 0; i < destination_size; i++)
2902 {
2903 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(i)))
)->d;
2904 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
2905 if (d == symbolic_graph->breakpoints[j].d)
2906 {
2907 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)))
;
2908 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
2909 // Found, made, exit.
2910 break;
2911 }
2912 }
2913 return dup_breakpoints;
2914}
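// The returned array (a sketch of its contract, as read from the code above): one noop exec symbol per
// breakpoint, pushed in breakpoint order, so callers can index it by the breakpoint index; it is stored
// on the sub-graph prep as dup_breakpoints further below. A 0 return means no duplication was needed.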
2915
2916 // Plan out how we allocate tensors (should I do optimizations on the graph here or not at all?).
2917static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
2918{
2919 assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ (
{ if (source_size > 0) ; else __assert_fail ("source_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2919, __extension__ __PRETTY_FUNCTION__
); }))
;
2920 assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__
({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2920, __extension__ __PRETTY_FUNCTION__
); }))
;
2921 // First, fill all the "auto" holes.
2922	// This is the symbol table with the "auto" info filled up.
2923 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
2924 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
2925 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
2926 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= (symbolic_graph->exec_symbol_info->rnum + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2926, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } int _exist_size_
[2] = { (source_size), 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_
[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ <
_exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_
][_i_]; if (_incomings_[_idx_].r == 1) continue; _incomings_[
_idx_].r = 1; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*
)(((char*)((symbolic_graph->exec_symbol_info)->data)) +
(size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; _exists_[_q_][_exist_size_[_q_]] = d;
++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_
) = (_i_)); } for (_i_ = 0; _i_ < (source_size); _i_++) { (
(void) sizeof (((sources)[_i_].graph == symbolic_graph) ? 1 :
0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2926, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _exist_size_[0
] = (source_size); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; int
_bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[
_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const
int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r
== 2) continue; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((
ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((symbolic_graph
->exec_symbol_info)->data)) + (size_t)(symbolic_graph->
exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((symbolic_graph
->exec_symbol_info)->data)) + (size_t)(symbolic_graph->
exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t*)(
(void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)))
; if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_
]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (
_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2926, __extension__ __PRETTY_FUNCTION__); })); _exists_[0][
_i_] = (destinations)[_i_].d; } _exist_size_[0] = (destination_size
); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_
[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ <
_exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_
][_i_]; if (_incomings_[_idx_].r != 2) continue; _incomings_[
_idx_].r = 3; if (_incomings_[_idx_].edges > 0) for (_j_ =
0; _j_ < _incomings_[_idx_].c; _j_++) { const int d = _edges_
[_incomings_[_idx_].edges - 1 + _j_]; _exists_[_q_][_exist_size_
[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (
_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (destination_size
); _i_++) { ((void) sizeof (((destinations)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((destinations)[_i_].graph ==
symbolic_graph) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2926, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(destinations)[_i_].d].d = 1; } for (_i_ =
0; _i_ < (source_size); _i_++) { ((void) sizeof (((sources
)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__ ({ if
((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2926, __extension__ __PRETTY_FUNCTION__); })); _exists_[0][
_i_] = (sources)[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] =
(source_size); _exist_size_[1] = 0; int _d_ = 0; while (_exist_size_
[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ <
_exist_size_[_p_];) { const int32_t _idx_ = _exists_[_p_][_i_
]; _visit_->node[_visit_->size].index = ((_idx_)); _visit_
->node[_visit_->size].term = ((_incomings_[_idx_].d)); ++
_visit_->size;; if (_incomings_[_idx_].d) { ++_d_; _incomings_
[_idx_].r = 4; } if (((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((symbolic_graph->exec_symbol_info)->data)) +
(size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 3 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 3 && _d_ < (destination_size)) { _exists_
[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } } ++_i_
; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_
= 0; _i_ < (destination_size); _i_++) { ((void) sizeof ((
(destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2926, __extension__ __PRETTY_FUNCTION__); })); if (_incomings_
[(destinations)[_i_].d].r == 4) continue; if (!(0)) { ((void)
sizeof ((_incomings_[(destinations)[_i_].d].c == 0) ? 1 : 0)
, __extension__ ({ if (_incomings_[(destinations)[_i_].d].c ==
0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2926, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2926, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
2927 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
2928 int i, j, k, p, q;
2929 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
2930 ccv_sparse_matrix_t* exec_dep;
2931 ccv_nnc_tensor_block_t* tensor_blocks;
2932 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
2933 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2934 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
2935 // are automatically filled in, and all the sub-graphs are processed.
2936	// There is a last step though; for a while loop, it is parameterized:
2937 // while (x > 5) {
2938 // y = x + 1;
2939 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
2940 // we will do our best to avoid to do the actual data copy, what we do here is to check whether y can be x's alias.
2941	// If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
2942	// it is an inplace operation.
2943 // But if y cannot be x's alias, for example, this while loop looks like this:
2944 // while (x > 5) {
2945 // y = x + a
2946 // b = x + y
2947 // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
2948 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
2949 // has dependency on y as well).
2950 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
2951 // y = x + a -> b = x + y
2952 // This graph will be extended to look like this:
2953 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
2954 // while (x0 > 5) {
2955 // y0 = x0 + a0
2956 // b0 = x0 + y0
2957 // if (y0 > 5) break
2958 // y1 = y0 + b0
2959 // b1 = y0 + y1
2960 // } (y1 => x0, b1 => a0)
2961	// After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
2962	// with each other now).
2963	// With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
2964	// which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0)).
2965 ccv_nnc_symbolic_graph_t* dup_graph = 0;
2966 int* dup_exec_ref = 0;
2967 int* dup_tensor_block_ref = 0;
2968 int unroll_count = 0;
2969	// In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
2970 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
2971 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
2972 prep->flags = 0;
2973	// Cannot handle duplicating a node that is a graph as well.
2974 if (p_exec_symbol_info)
2975 {
2976 prep->flags = p_node_info->flags;
2977 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
2978 {
2979 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
2980 _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
, symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
2981 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
2982			// TODO: We want to try our best to fit as many of its corresponding inputs / outputs into the companion_ref group as possible.
2983 }
2984 }
2985 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
2986 ccv_array_t* anonymous_block_free_list = 0;
2987 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
2988 // Record whether this tensor is folded in this round.
2989 uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size);
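	// A small worked example of the bit layout, assuming tensor_block_size == 33: (33 + 31) >> 5 == 2, so
	// two 32-bit words are allocated, and block i is tracked at tensor_fold[i >> 5] & (1u << (i & 0x1f))
	// (the same indexing used when the fold bit is set further below).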
2990 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2991 for (p = 0; p < node->graph_ref_size; p++)
2992 {
2993 assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__
({ if (symbolic_graph->sub_graphs) ; else __assert_fail (
"symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c"
, 2993, __extension__ __PRETTY_FUNCTION__); }))
;
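		// CCV_NNC_GRAPH_REF(node) resolves to the heap-allocated graph ref array when present and to the
		// inline one otherwise (see its expansion in the lines below); the stored indices appear to be
		// 1-based, hence the "- 1" when indexing sub_graphs.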
2994 ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (
node)->_inline_graph_ref)[p] - 1)))
;
2995 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
2996 ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t
)(sub_graph->sources)->rsize * (size_t)(0)))
, sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + (
size_t)(sub_graph->destinations)->rsize * (size_t)(0)))
, sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
2997 sub_prep->dup_breakpoints = dup_breakpoints;
2998 sub_prep->p = prep;
2999 sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[p] - 1] = sub_prep;
3000 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3001 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3002 for (i = 0; i < s_alloc_prep->block_size; i++)
3003 {
3004 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3005 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3006 if (block_ref < sub_prep->tensor_symbol_info_size)
3007 {
3008					// If this block has a bypass, and its bypass has different p_refs, then it doesn't matter.
3009 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3010 if (s_tensor_blocks[block_ref].bypass_ref)
3011 {
3012 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3013 while (s_tensor_blocks[bypass_ref].ref)
3014 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3015 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3016 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3017 continue;
3018 }
3019 if (s_tensor_blocks[block_ref].p_refs[0])
3020 {
3021 /* If it is already properly assigned, next. */
3022 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3023 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3024 {
3025 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3026 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3027 else {
3028 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3028, __extension__ __PRETTY_FUNCTION__
); }))
;
3029 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3030 }
3031 }
3032 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3033 if (s_tensor_blocks[block_ref].p_refs[1] &&
3034 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3035 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3036 {
3037 assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[
0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref
].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]"
, "ccv_nnc_symbolic_graph_compile.c", 3037, __extension__ __PRETTY_FUNCTION__
); }))
;
3038 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3038, __extension__ __PRETTY_FUNCTION__
); }))
;
3039 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3040 }
3041 }
3042 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3043				/* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of an anonymous block,
3044				 * which by default only has a life-cycle shared with this sub-graph node. The reason to extend it is that
3045				 * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3046				 * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3047				 * its life-time to the end of the output tensor. */
3048 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3049 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3050 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3051 ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)->
data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)->
rsize * (size_t)(j)))
);
3052 }
3053 }
3054 }
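		// After the loop above, each sub-graph buffer carries whatever p_refs could be propagated up from its
		// tensor blocks (at most two), or a dup_p_refs array merged from them; the second pass over graph_ref
		// below starts from those p_refs to decide which parent tensor block, if any, each buffer can reuse.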
3055 const int init_tensor_block_size = tensor_block_size;
3056 int rw_anonymous_buffer_size_cap = 0;
3057 int ro_anonymous_buffer_size_cap = 0;
3058 if (anonymous_block_free_list)
3059 ccv_array_clear(anonymous_block_free_list);
3060 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3061 for (p = 0; p < node->graph_ref_size; p++)
3062 {
3063 ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[p] - 1];
3064 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3065 int rw_anonymous_buffer_size = 0;
3066 int ro_anonymous_buffer_size = 0;
3067 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3068 if (s_alloc_prep->buffers[i].p_refs[0])
3069 {
3070				/* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
3071 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3072				/* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3073 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3074 assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__
({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3074, __extension__ __PRETTY_FUNCTION__
); }))
;
3075 int unref_p_ref_0 = p_ref_0;
3076 while (tensor_blocks[unref_p_ref_0].ref)
3077 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3078 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3079 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3079, __extension__ __PRETTY_FUNCTION__); }))
;
3080 if (s_alloc_prep->buffers[i].p_refs[1])
3081 {
3082 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3083 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3084 assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__
({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3084, __extension__ __PRETTY_FUNCTION__
); }))
;
3085 int unref_p_ref_1 = p_ref_1;
3086 while (tensor_blocks[unref_p_ref_1].ref)
3087 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3088 /* See above comment for the similar p_ref_0 check. */
3089 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 3089, __extension__ __PRETTY_FUNCTION__); }))
;
3090 assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out"
, "ccv_nnc_symbolic_graph_compile.c", 3090, __extension__ __PRETTY_FUNCTION__
); }))
;
3091 int p_ref_t;
3092 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3093 {
3094 CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t
))
;
3095 CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1
), (unref_p_ref_1) = (p_ref_t))
;
3096 }
3097 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3098 /* If the dimension matches, can fold. */
3099 if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(8)) == 0)
3100 {
3101 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3102 if (folded)
3103 {
3104 p_ref_0 = p_ref_1;
3105 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3106 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3107 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3108 {
3109 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3110 assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too."
) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too."
) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\""
, "ccv_nnc_symbolic_graph_compile.c", 3110, __extension__ __PRETTY_FUNCTION__
); }))
;
3111 }
3112 }
3113 }
3114 }
3115 /* Only proceed if it is folded here (thus, the input / output tensor can be connected, and reuse is not a problem).
3116 * Or if p_ref_0 is the output, it is first started from this node (thus, I have full control over
3117 * its life-cycle). Or if p_ref_0 is the input, it is ended in this node (thus, I can take over its
3118 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3119 * within its memory region)). Unless this buffer is used as read-only and we don't have any output
3120 * associated with it, then we are good. */
3121 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3122 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3123 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3124 TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3125 {
3126 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3127 { assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ?
1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs
[1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3127, __extension__ __PRETTY_FUNCTION__
); }))
; }
3128 /* p_ref_0 is either the only one or the output tensor; we always prefer the output tensor (there
3129 * is a long argument for why that is the case; the gist is that it is much easier to control your output
3130 * than your input). */
3131 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3132 s_alloc_prep->buffers[i].p_refs[1] = 0;
3133 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3134 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3134, __extension__ __PRETTY_FUNCTION__); }))
;
3135 tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep
->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size
) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a
: _b; })
;
3136 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3137 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3138 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3139 tensor_blocks[unref_p_ref_0].size;
3140 } else {
3141 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3142 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3143 ++ro_anonymous_buffer_size;
3144 else
3145 ++rw_anonymous_buffer_size;
3146 }
3147 } else {
3148 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3149 ++ro_anonymous_buffer_size;
3150 else
3151 ++rw_anonymous_buffer_size;
3152 }
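The fold decision above keys off tensor_fold, the word-indexed bitset set at line 3106 and tested at line 3121. A minimal standalone sketch of that idiom follows; the helper names and the calloc sizing are mine, not the library's:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical helpers mirroring the tensor_fold bookkeeping used at lines
 * 3106 and 3121: bit i lives in word i >> 5, at position i & 0x1f. */
static inline void bitset_set(uint32_t* const bits, const int i)
{
	bits[i >> 5] |= (1u << (i & 0x1f));
}

static inline int bitset_test(const uint32_t* const bits, const int i)
{
	return (bits[i >> 5] & (1u << (i & 0x1f))) != 0;
}

int main(void)
{
	const int n = 100; /* e.g. tensor_block_size */
	/* One uint32_t covers 32 blocks, hence (n + 31) >> 5 words, all cleared. */
	uint32_t* const fold = (uint32_t*)calloc((n + 31) >> 5, sizeof(uint32_t));
	bitset_set(fold, 37);                 /* mark block 37 as folded */
	const int ok = bitset_test(fold, 37); /* 1: the fold is recorded */
	free(fold);
	return ok ? 0 : 1;
}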
3153 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3154 {
3155 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3156 // All read-write buffers can (potentially) be reused between each case..of branch.
3157 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3158 // Read-only buffers cannot be reused between each case..of branch.
3159 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3160 /* Anonymous block, allocate additional tensor blocks for this. */
3161 /* This is either because this is an internal tensor (it doesn't have a p_ref) */
3162 /* or it is an anonymous block itself within the sub-graphs of this while graph. */
3163 tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3164 memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3165 if (dup_tensor_block_ref)
3166 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3167 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3168 if (!s_alloc_prep->buffers[i].p_refs[0])
3169 {
3170 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3171 {
3172 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = (tensor_blocks[tensor_block_size
].flags & ~0x10 | ANONYMOUS))
;
3173 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3174 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3175 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3176 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3177 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3178 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3179 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3180 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3181 if (dup_p_refs && dup_p_refs->rnum > 0)
3182 {
3183 for (j = 0; j < dup_p_refs->rnum; j++)
3184 {
3185 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3186 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3186, __extension__ __PRETTY_FUNCTION__
); }))
;
3187 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3187, __extension__ __PRETTY_FUNCTION__
); }))
;
3188 assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3188, __extension__ __PRETTY_FUNCTION__); }))
;
3189 // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to
3190 // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object.
3191 if (tensor_symbol_info[dup_p_ref].p_ref)
3192 {
3193 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3194 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3194, __extension__ __PRETTY_FUNCTION__); }))
;
3195 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3196 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3197 {
3198 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3199 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3200 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3201 }
3202 }
3203 if (!tensor_blocks[tensor_block_size].tail)
3204 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3205 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3206 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
(size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k)))
, tensor_blocks[tensor_block_size]);
3207 }
3208 } else {
3209 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3210 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3211 }
3212 for (j = 0; j < source_size; j++)
3213 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3214 /* If this is read-only (based on SSA: first encountered as a read) and this is a
3215 * sub-graph, mark its lifetime to extend to the end of the graph. */
3216 if (p_exec_symbol_info)
3217 for (j = 0; j < destination_size; j++)
3218 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3219 /* If it is read-only, it is self-reflecting. */
3220 for (k = 0; k < unroll_count; k++)
3221 {
3222 for (j = 0; j < destination_size; j++)
3223 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3224 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3225 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3226 assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__
({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p"
, "ccv_nnc_symbolic_graph_compile.c", 3226, __extension__ __PRETTY_FUNCTION__
); }))
;
3227 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3228 }
3229 ++tensor_block_size;
3230 } else {
3231 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3232 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3233 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3234 // Find suitable tensor block from the free list.
3235 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = (tensor_blocks[tensor_block_idx
].flags & ~0x10 | ANONYMOUS))
;
3236 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3237 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3238 if (new_anonymous_tensor_block)
3239 {
3240 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3241 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3242 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3243 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3244 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3245 } else {
3246 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3247 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
_a : _b; })
;
3248 }
3249 if (dup_p_refs && dup_p_refs->rnum > 0)
3250 {
3251 for (j = 0; j < dup_p_refs->rnum; j++)
3252 {
3253 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3254 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3254, __extension__ __PRETTY_FUNCTION__
); }))
;
3255 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3255, __extension__ __PRETTY_FUNCTION__
); }))
;
3256 // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to
3257 // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object.
3258 if (tensor_symbol_info[dup_p_ref].p_ref)
3259 {
3260 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3261 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3261, __extension__ __PRETTY_FUNCTION__); }))
;
3262 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3263 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3264 {
3265 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3266 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3267 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3268 }
3269 }
3270 assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3270, __extension__ __PRETTY_FUNCTION__); }))
;
3271 if (!tensor_blocks[tensor_block_idx].tail)
3272 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3273 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3274 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
(size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k)))
, tensor_blocks[tensor_block_idx]);
3275 // We have to add it to the wrap around companion_ref as well.
3276 // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3277 // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's
3278 // definition is too free-form, and if we enforce a stronger guarantee on this (such as that it must wrap around), that
3279 // guarantee may be broken down the line.
3280 if (tensor_blocks[dup_p_ref].companion_ref)
3281 {
3282 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3283 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3284 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3285 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3286 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3287 }
3288 }
3289 } else if (new_anonymous_tensor_block) {
3290 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3291 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3292 }
3293 const int prev_tensor_block_idx = tensor_block_idx;
3294 if (new_anonymous_tensor_block)
3295 {
3296 if (!anonymous_block_free_list)
3297 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3298 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3299 ++tensor_block_size;
3300 }
3301 for (k = 0; k < unroll_count; k++)
3302 {
3303 const int tensor_block_idx = new_anonymous_tensor_block ?
3304 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3305 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3306 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = (tensor_blocks[tensor_block_idx
].flags & ~0x10 | ANONYMOUS))
;
3307 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3308 if (new_anonymous_tensor_block)
3309 {
3310 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3311 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3312 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3313 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3314 /* Attach to duplicated exec for this tensor block. */
3315 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3316 } else {
3317 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3318 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
_a : _b; })
;
3319 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3320
3321 }
3322 if (dup_p_refs && dup_p_refs->rnum > 0)
3323 {
3324 /* Not nil, not self-reflecting. */
3325 for (j = 0; j < dup_p_refs->rnum; j++)
3326 {
3327 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3328 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3328, __extension__ __PRETTY_FUNCTION__
); }))
;
3329 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3329, __extension__ __PRETTY_FUNCTION__
); }))
;
3330 // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to
3331 // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object.
3332 if (tensor_symbol_info[dup_p_ref].p_ref)
3333 {
3334 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3335 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3335, __extension__ __PRETTY_FUNCTION__); }))
;
3336 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3337 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3338 {
3339 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3340 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3341 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3342 }
3343 }
3344 assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count
+ k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count
+ k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref
[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref
[dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail
("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref"
, "ccv_nnc_symbolic_graph_compile.c", 3344, __extension__ __PRETTY_FUNCTION__
); }))
;
3345 const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3346 assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0),
__extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else
__assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3346, __extension__ __PRETTY_FUNCTION__); }))
;
3347 if (!tensor_blocks[tensor_block_idx].tail)
3348 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3349 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3350 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data
)) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3351 // We have to add it to the wrap around companion_ref as well.
3352 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3353 {
3354 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3355 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3356 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3357 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3358 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3359 }
3360 }
3361 } else if (new_anonymous_tensor_block) {
3362 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3363 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3364 }
3365 if (new_anonymous_tensor_block)
3366 ++tensor_block_size;
3367 }
3368 }
3369 }
3370 }
3371 }
3372 } ccv_nnc_graph_visit_endfor} }
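The anonymous-block handling above (lines 3232 through 3248, plus the free-list push at lines 3296 to 3299) amounts to: look for a previously created anonymous block that can be reused, widen it if the new request is larger, and only append a fresh block when nothing is reusable. A simplified sketch of that shape, with hypothetical names and a deliberately naive fit test, assuming the real reuse check additionally consults exec_dep and dup_p_refs:

#include <stdio.h>

/* Illustrative stand-in for the anonymous-block reuse above; not the library's logic. */
typedef struct {
	int type;
	int pin_mem;
	unsigned long long size;
} block_t;

static int find_or_append_block(block_t* const blocks, int* const count, const int* const free_list, const int free_count, const int type, const unsigned long long size, const int pin_mem)
{
	int i;
	for (i = 0; i < free_count; i++)
		if (blocks[free_list[i]].type == type)
		{
			const int idx = free_list[i]; /* reuse: widen to the larger request */
			if (blocks[idx].size < size)
				blocks[idx].size = size;
			blocks[idx].pin_mem = blocks[idx].pin_mem || pin_mem;
			return idx;
		}
	const int idx = (*count)++; /* nothing reusable: append a new anonymous block */
	blocks[idx].type = type;
	blocks[idx].size = size;
	blocks[idx].pin_mem = pin_mem;
	return idx;
}

int main(void)
{
	block_t blocks[4] = { { 0, 0, 0 } };
	int count = 0;
	int free_list[4];
	free_list[0] = find_or_append_block(blocks, &count, free_list, 0, 1, 64, 0);
	/* A second request of the same type reuses index 0 and grows it to 128 bytes. */
	const int idx = find_or_append_block(blocks, &count, free_list, 1, 1, 128, 1);
	printf("count=%d idx=%d size=%llu\n", count, idx, blocks[idx].size);
	return 0;
}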
3373 if (anonymous_block_free_list)
3374 ccv_array_free(anonymous_block_free_list);
3375 ccfreefree(tensor_fold);
3376 // It is time to guess the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3377 // the allocation dependencies, i.e. which tensor reuses the memory of which existing tensor.
3378 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new(exec_dep, tensor_blocks, tensor_block_size);
3379 ccv_matrix_free(exec_dep);
3380 prep->while_count_tensor = 0;
3381 prep->dup_breakpoints = 0;
3382 prep->p = 0;
3383 prep->symbolic_graph = symbolic_graph;
3384 prep->p_idx = symbolic_graph->p_idx;
3385 prep->exec_idx = symbolic_graph->exec_idx;
3386 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3387 prep->sub_preps = sub_preps;
3388 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3389 prep->exec_symbol_info = exec_symbol_info;
3390 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3391 prep->tensor_symbol_info = tensor_symbol_info;
3392 prep->unroll_count = unroll_count;
3393 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3394 prep->tensor_block_size = tensor_block_size;
3395 prep->tensor_blocks = tensor_blocks;
3396 prep->exec_flags = exec_flags;
3397 prep->visit = visit;
3398 prep->alloc_prep = alloc_prep;
3399 if (dup_graph)
3400 ccv_nnc_symbolic_graph_free(dup_graph);
3401 if (dup_exec_ref)
3402 ccfreefree(dup_exec_ref);
3403 return prep;
3404}
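The growth of tensor_blocks at lines 3163 and 3164 uses a common idiom: realloc to the new capacity, then memset only the freshly added tail so existing entries keep their state. A minimal sketch of that idiom with hypothetical names:

#include <stdlib.h>
#include <string.h>

typedef struct { int flags; unsigned long long size; } item_t;

/* Grow an array from old_count to new_count elements and zero only the
 * elements that were just added, leaving the existing ones untouched. */
static item_t* grow_and_zero(item_t* items, const int old_count, const int new_count)
{
	items = (item_t*)realloc(items, sizeof(item_t) * new_count);
	if (!items)
		return 0;
	memset(items + old_count, 0, sizeof(item_t) * (new_count - old_count));
	return items;
}

int main(void)
{
	item_t* items = grow_and_zero(0, 0, 8); /* realloc(0, ...) behaves like malloc */
	items = grow_and_zero(items, 8, 16);    /* items[8..15] are now zeroed */
	free(items);
	return 0;
}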
3405
3406static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3407{
3408 int i;
3409 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3410 ccfreefree(prep->exec_flags);
3411 for (i = 0; i < prep->sub_prep_size; i++)
3412 if (prep->sub_preps[i])
3413 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3414 if (prep->sub_preps)
3415 ccfreefree(prep->sub_preps);
3416 ccfreefree(prep->tensor_symbol_info);
3417 ccfreefree(prep->exec_symbol_info);
3418 if (prep->dup_tensor_block_ref)
3419 ccfreefree(prep->dup_tensor_block_ref);
3420 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3421 ccv_nnc_graph_visit_free(prep->visit);
3422 ccfreefree(prep);
3423}
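_ccv_nnc_symbolic_graph_prep_free releases the nested sub-preps first, then its own arrays, then the struct itself. A generic sketch of that children-first teardown, using hypothetical types rather than the prep struct:

#include <stdlib.h>

typedef struct node node_t;
struct node {
	node_t** children; /* owned, may contain NULL slots, like sub_preps */
	int child_count;
	int* payload;      /* owned, like the per-prep arrays */
};

/* Free the children first (skipping NULL slots), then the node's own
 * resources, then the node itself. */
static void node_free(node_t* const n)
{
	int i;
	for (i = 0; i < n->child_count; i++)
		if (n->children[i])
			node_free(n->children[i]);
	if (n->children)
		free(n->children);
	if (n->payload)
		free(n->payload);
	free(n);
}

int main(void)
{
	node_t* const child = (node_t*)calloc(1, sizeof(node_t));
	node_t* const root = (node_t*)calloc(1, sizeof(node_t));
	root->children = (node_t**)calloc(1, sizeof(node_t*));
	root->children[0] = child;
	root->child_count = 1;
	node_free(root);
	return 0;
}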
3424
3425static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3426{
3427 int i, j;
3428 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
3429 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3430 {
3431 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3432 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3432, __extension__ __PRETTY_FUNCTION__
); }))
;
3433 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3434 for (i = 0; i < node->p_while.input_size; i++)
3435 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3436 {
3437 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3438 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3439 for (j = 0; j < d; j++)
3440 prep = prep->p;
3441 prep->while_count_tensor = 1;
3442 }
3443 }
3444 for (i = 0; i < node->graph_ref_size; i++)
3445 {
3446 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3447 if (graph_ref >= 0)
3448 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3449 }
3450 } ccv_nnc_graph_visit_endfor} }
3451}
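The expanded macros above show the while-count encoding: a symbol is a while-count reference when its low nibble is 0xe, and complementing the value and shifting right by 4 yields how many parent levels to walk. The encoder below is inferred from those two macros, so treat it as an assumption rather than the library's definition:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Check and decode exactly as the expanded macros above do. */
#define IS_WHILE_COUNT_SYMBOL(s)     (((uint32_t)(s) & 0xf) == 0xe)
#define DECODE_WHILE_COUNT_SYMBOL(s) ((~(uint32_t)(s)) >> 4)

/* Inferred inverse: pack the parent depth d so that the complemented value has
 * a low nibble of 1, hence the stored low nibble is 0xe. */
static int32_t encode_while_count_symbol(const uint32_t d)
{
	return (int32_t)~((d << 4) | 1);
}

int main(void)
{
	const int32_t sym = encode_while_count_symbol(2); /* two parent levels up */
	assert(sym < 0);                      /* never collides with a normal symbol index */
	assert(IS_WHILE_COUNT_SYMBOL(sym));
	assert(DECODE_WHILE_COUNT_SYMBOL(sym) == 2);
	printf("symbol=0x%x depth=%u\n", (uint32_t)sym, DECODE_WHILE_COUNT_SYMBOL(sym));
	return 0;
}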
3452
3453static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3454{
3455 if (symbol >= 0)
3456 return graph_prep->tensor_arena->vt_tensors[symbol];
3457 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3458 return 0;
3459 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3459, __extension__ __PRETTY_FUNCTION__
); }))
;
3460 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3461 int i;
3462 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3463 for (i = 0; i < d; i++)
3464 prep = prep->p;
3465 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3465, __extension__ __PRETTY_FUNCTION__
); }))
;
3466 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3467}
3468
3469static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3470{
3471 int i;
3472 int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3473 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3474 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3475 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3476 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3477 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3478 if (graph_execs[i].graph == graph)
3479 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3480 ccfreefree(exec_cvt);
3481}
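The helper above remaps every stored exec index through exec_cvt after ccv_nnc_graph_topsort permutes the node storage. The same translate-through-a-map step, isolated with hypothetical data:

#include <stdio.h>

int main(void)
{
	/* exec_cvt[old_index] == new_index after a (hypothetical) reordering. */
	const int exec_cvt[4] = { 2, 0, 3, 1 };
	int handles[3] = { 0, 3, 1 }; /* stored old indices, e.g. exec .d fields */
	int i;
	for (i = 0; i < 3; i++)
		handles[i] = exec_cvt[handles[i]];
	for (i = 0; i < 3; i++)
		printf("%d ", handles[i]); /* prints: 2 1 0 */
	printf("\n");
	return 0;
}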
3482
3483static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3484{
3485 int i, j, k;
3486 ccv_nnc_graph_t* const graph = graph_prep->graph;
3487 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3488 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
1
Uninitialized value stored to field 'graph'
3489 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3490 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3491 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3492 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3493 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3494 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3495 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3496 for (i = 0; i < exec_symbol_info_size; i++)
2
Assuming 'i' is >= 'exec_symbol_info_size'
3
Loop condition is false. Execution continues on line 3505
3497 {
3498 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3499 max_output_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3500 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3501 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3502 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3503 graph_execs[i].graph = 0;
3504 }
3505 for (i = 0; i < graph_prep->sub_prep_size; i++)
4
Assuming the condition is false
5
Loop condition is false. Execution continues on line 3507
3506 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3507 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
6
'?' condition is true
3508 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
7
'?' condition is true
3509 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
8
'?' condition is true
3510 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3511 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3512 // Create nodes; this is in topological order.
3513 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
9
Assuming the condition is true
10
Loop condition is true. Entering loop body
3514 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
11
The left operand of '==' is a garbage value
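Reading the path notes together: the arena is allocated at line 3488 with its graph_execs entries uninitialized (note 1); the loop that would set graph_execs[i].graph = 0 is bounded by exec_symbol_info_size, which the analyzer assumes to be zero (notes 2 and 3); the visit loop still yields a node (notes 9 and 10), so the read of .graph here touches an uninitialized field (note 11). A stripped-down sketch of the flagged pattern, leaving open whether a zero symbol count with a non-empty visit can actually occur in practice:

#include <stdlib.h>

typedef struct { int d; void* graph; } exec_t;

int main(void)
{
	const int symbol_count = 0; /* the analyzer's assumption at note 2 */
	const int visit_count = 1;  /* ...while the visit still yields a node (note 10) */
	exec_t* const execs = (exec_t*)malloc(sizeof(exec_t) * (symbol_count > 0 ? symbol_count : 1));
	int i;
	for (i = 0; i < symbol_count; i++) /* never runs when symbol_count == 0 */
		execs[i].graph = 0;
	for (i = 0; i < visit_count; i++)
		if (execs[i].graph == 0) /* reads an uninitialized field: the "garbage value" */
		{ /* ... */ }
	free(execs);
	return 0;
}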
3515 {
3516 for (i = 0; i < node->input_size; i++)
3517 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3518 for (i = 0; i < node->output_size; i++)
3519 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3520 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3521 {
3522 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3523 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3523, __extension__ __PRETTY_FUNCTION__
); }))
;
3524 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3525 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3526 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3527 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3528 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3529 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3530 for (i = 0; i < node->p_while.input_size; i++)
3531 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3532 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3533 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3534 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3535 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3536 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3537 for (i = 0; i < node->output_size; i++)
3538 if (max_outputs[i] && max_outputs[i]->alias_ref)
3539 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3540 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3541 // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3542 for (i = 0; i < node->case_of.argument.offset; i++)
3543 {
3544 ccv_nnc_tensor_t* const update = max_inputs[i];
3545 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3546 continue;
3547 int flag = 0;
3548 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3549 flag = (update == max_inputs[j]);
3550 if (!flag)
3551 ccv_nnc_graph_exec_add_update(graph, graph_execs[idx], update);
3552 }
3553 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3554 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3555 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3556 {
3557 // Add another graph for data transfer.
3558 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3559 for (i = 0; i < node->output_size; i++)
3560 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3561 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3562 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3563 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3564 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3565 int exec_cvt;
3566 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3567 }
3568 for (i = 0; i < node->graph_ref_size; i++)
3569 {
3570 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3571 if (graph_ref < 0)
3572 continue;
3573 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3574 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3575 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3576 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3577 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3578 }
3579 } else {
3580 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3581 }
3582 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3583 }
3584 } ccv_nnc_graph_visit_endfor} }
3585 // Then connect them.
3586 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3587 if (node->outgoings)
3588 for (i = 0; i < node->outgoings->rnum; i++)
3589 {
3590 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3591 if (graph_execs[outgoing].graph)
3592 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3593 }
3594 } ccv_nnc_graph_visit_endfor} }
3595 int source_exec_created = 0;
3596 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3597 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3598 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3599 // After the graph is materialized, we need to handle the case that some of these tensors need to be initialized (to zeros or ones) before use.
3600 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3601 {
3602 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
3603 {
3604 int ref = i;
3605 while (tensor_symbol_info[ref].alias_ref)
3606 ref = tensor_symbol_info[ref].alias_ref - 1;
3607 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
3608 ref = tensor_blocks[ref].ref - 1;
3609 // This is not computable. It could be that we marked a const tensor as init zero.
3610 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3611 continue;
3612 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3613 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3614 continue;
3615 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3616 // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3617 ccv_nnc_graph_exec_t set_exec;
3618 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3619 set_exec = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, CMD_BLAS(0)((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={0}}}), 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3620 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3621 set_exec = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, CMD_BLAS(1)((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={1}}}), 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3622 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3623 {
3624 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3625 if (outgoing >= exec_symbol_info_size)
3626 continue;
3627 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3627, __extension__ __PRETTY_FUNCTION__
); }))
;
3628 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3628, __extension__ __PRETTY_FUNCTION__
); }))
;
3629 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3630 }
3631 int flags = 0;
3632 if (alloc_dep[ref])
3633 for (j = 0; j < alloc_dep[ref]->rnum; j++)
3634 {
3635 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3636 // This is from alloc_dep, it should be computable.
3637 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3637, __extension__ __PRETTY_FUNCTION__
); }))
;
3638 if (tensor_blocks[d].tail)
3639 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3640 {
3641 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3642 if (incoming >= exec_symbol_info_size)
3643 continue;
3644 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3644, __extension__ __PRETTY_FUNCTION__
); }))
;
3645 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3645, __extension__ __PRETTY_FUNCTION__
); }))
;
3646 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3647 flags = 1;
3648 }
3649 }
3650 // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3651 if (!flags)
3652 {
3653 if (!source_exec_created)
3654 {
3655 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3656 source_exec_created = 1;
3657 }
3658 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3659 }
3660 }
3661 }
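The loop above wires every INIT_ZEROS / INIT_ONES tensor to a SET exec that runs before the tensor's first consumers (its head execs), falling back to the phony source no-op when no predecessor is found. A toy model of that wiring, with hypothetical node indices rather than the ccv API:

#include <stdio.h>

typedef struct { int from, to; } edge_t;

int main(void)
{
	const int heads[] = { 4, 7 };      /* execs that first touch the tensor */
	const int head_count = 2;
	int node_count = 10;
	const int set_exec = node_count++; /* the newly created SET exec */
	edge_t edges[8];
	int edge_count = 0, i;
	for (i = 0; i < head_count; i++)
	{
		edges[edge_count].from = set_exec; /* SET must run before each consumer */
		edges[edge_count].to = heads[i];
		++edge_count;
	}
	for (i = 0; i < edge_count; i++)
		printf("SET %d -> consumer %d\n", edges[i].from, edges[i].to);
	return 0;
}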
3662 // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views
3663 // (we need that if one is not associated as an input / output of any exec, which is possible if all execs associate
3664 // with its alias).
3665 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3665, __extension__ __PRETTY_FUNCTION__
); }))
;
3666 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3667 {
3668 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3669 // If it is a multiview tensor, inspect all its heads to see whether it is already associated with the node.
3670 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3671 {
3672 const ccv_array_t* const head = tensor_blocks[i].head;
3673 if (head && head->rnum > 0)
3674 for (j = 0; j < head->rnum; j++)
3675 {
3676 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3677 if (idx >= exec_symbol_info_size)
3678 continue;
3679 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3679, __extension__ __PRETTY_FUNCTION__); }))
;
3680 const int d = graph_execs[idx].d;
3681 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3682 int flag = 0;
3683 if (exec_info->tensor_wraps_ref)
3684 {
3685 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3686 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3687 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3688 }
3689 // If none of the tensor wraps references it, it needs to be included in the broadcast.
3690 if (!flag)
3691 ccv_nnc_graph_exec_add_update(graph, graph_execs[idx], mv);
3692 }
3693 }
3694 }
3695 // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3696 // Also, this is needed if you have init-zero execs.
3697 if (source_exec_created || source_size > 1)
3698 {
3699 if (!source_exec_created)
3700 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3701 for (i = 0; i < source_size; i++)
3702 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3703 } else {
3704 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3704, __extension__ __PRETTY_FUNCTION__
); }))
;
3705 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3705, __extension__ __PRETTY_FUNCTION__
); }))
;
3706 graph_exec_arena->source = graph_execs[sources[0].d];
3707 }
3708 if (destination_size == 1)
3709 graph_exec_arena->destination = graph_execs[destinations[0].d];
3710 else {
3711 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3712 for (i = 0; i < destination_size; i++)
3713 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3714 }
3715 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3716 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3717 return graph_exec_arena;
3718}
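The tail of the function wraps multiple real sources (or the extra init no-op) behind a single NOOP source, and does the same for destinations, so the compiled graph can be launched and joined through one handle each. A toy illustration of the source side, with hypothetical indices, not the ccv API:

#include <stdio.h>

int main(void)
{
	const int sources[] = { 3, 5, 8 };
	const int source_count = 3;
	const int noop = 100; /* a freshly created NOOP exec */
	int source_handle;
	int i;
	if (source_count == 1)
		source_handle = sources[0]; /* reuse the lone real source directly */
	else {
		source_handle = noop;
		for (i = 0; i < source_count; i++)
			printf("NOOP %d -> source %d\n", noop, sources[i]);
	}
	printf("graph source handle: %d\n", source_handle);
	return 0;
}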
3719
3720static ccv_nnc_graph_t* _ccv_nnc_graph_find_peer(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const peer)
3721{
3722 if (graph_prep->symbolic_graph == peer)
3723 return graph_prep->graph;
3724 int i;
3725 for (i = 0; i < graph_prep->sub_prep_size; i++)
3726 if (graph_prep->sub_preps[i])
3727 {
3728 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_peer(graph_prep->sub_preps[i], peer);
3729 if (graph)
3730 return graph;
3731 }
3732 return 0;
3733}
3734
3735static void _ccv_nnc_graph_fixup_peer(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3736{
3737 int i;
3738 for (i = 0; i < graph_prep->sub_prep_size; i++)
3739 if (graph_prep->sub_preps[i])
3740 {
3741 if (graph_prep->sub_preps[i]->symbolic_graph->peer)
3742 graph_prep->sub_preps[i]->graph->peer = _ccv_nnc_graph_find_peer(root_prep, graph_prep->sub_preps[i]->symbolic_graph->peer);
3743 }
3744}
3745
3746static void _ccv_nnc_graph_exec_arena_fixup_peer_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3747{
3748 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3748, __extension__ __PRETTY_FUNCTION__
); }))
;
3749 int i;
3750 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
3751 {
3752 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
3753 continue;
3754 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].peer_ref)
3755 {
3756 ccv_nnc_graph_exec_t peer_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
3757 .d = graph_prep->exec_symbol_info[i].peer_ref - 1,
3758 .graph = graph_prep->symbolic_graph->peer ? graph_prep->symbolic_graph->peer : graph_prep->symbolic_graph,
3759 });
3760 if (peer_exec.d >= 0)
3761 ccv_nnc_graph_exec_set_peer(graph_prep->graph, graph_exec_arena->graph_execs[i], peer_exec);
3762 }
3763 }
3764 for (i = 0; i < graph_prep->sub_prep_size; i++)
3765 if (graph_prep->sub_preps[i])
3766 _ccv_nnc_graph_exec_arena_fixup_peer_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
3767}
3768
3769static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3770{
3771 int i;
3772 if (graph_prep->dup_breakpoints)
3773 {
3774 // Stripping the const modifier is only possible because it is a sub-graph.
3775 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
3776 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
3777 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
3778 ccv_array_free(graph_prep->dup_breakpoints);
3779 graph_prep->dup_breakpoints = 0;
3780 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3781 // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
3782 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
3783 // Since exec_symbol_info changed, create a new visit object.
3784 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 3784, __extension__ __PRETTY_FUNCTION__
); }))
;
3785 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 3785, __extension__ __PRETTY_FUNCTION__); }))
;
3786 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
3787 const int source_size = symbolic_graph->sources->rnum;
3788 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
3789 const int destination_size = symbolic_graph->destinations->rnum;
3790 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3791 ccv_nnc_graph_visit_free(graph_prep->visit);
3792 graph_prep->visit = visit;
3793 assert(graph_prep->p);
3794 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
3795 }
3796 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3797 for (i = 0; i < node->graph_ref_size; i++)
3798 {
3799 const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3800 if (graph_ref >= 0)
3801 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
3802 }
3803 } ccv_nnc_graph_visit_endfor
3804}
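
The traversal above leans on two macros: ccv_array_get(), which is plain pointer arithmetic over the array's element stride (rsize), and the ccv_nnc_graph_visit_for()/ccv_nnc_graph_visit_endfor pair, which walks the precomputed topological order stored in the visit object. A minimal sketch, under those assumptions about the struct fields (data, rsize, size, node[].index), of what the two reduce to; the helper names below are hypothetical and not part of the library:

/* Sketch only; assumes the same headers this file already includes (ccv_nnc.h, _ccv_nnc_symbolic_graph.h). */
static inline void* sketch_array_get(const ccv_array_t* const array, const int i)
{
	/* Equivalent to ccv_array_get(array, i): base data pointer plus rsize bytes per element. */
	return (void*)((char*)array->data + (size_t)array->rsize * (size_t)i);
}

static void sketch_visit(const ccv_nnc_graph_visit_t* const visit, ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info)
{
	int i;
	for (i = 0; i < visit->size; i++)
	{
		const int idx = visit->node[i].index; /* node index, already in topological order */
		ccv_nnc_graph_exec_symbol_info_t* const node = exec_symbol_info + idx;
		(void)node; /* the body of ccv_nnc_graph_visit_for() runs here with node / idx in scope */
	}
}
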
3805
3806void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
3807{
3808 assert(graph_ref);
3809 assert(tensor_arena_ref);
3810 assert(graph_exec_arena_ref);
3811 int i;
3812 // Cannot bind a multi-view tensor.
3813 for (i = 0; i < tensor_bind_size; i++)
3814 {
3815 assert(tensor_binds[i].tensor);
3816 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
3817 }
3818 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
3819 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
3820 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, 0, tensor_binds, tensor_bind_size);
3821 _ccv_nnc_tensor_arena_fixup_peer_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
3822 *tensor_arena_ref = tensor_arena;
3823 // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
3824 _ccv_nnc_graph_fixup_peer(graph_prep, graph_prep);
3825 // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
3826 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
3827 *graph_ref = graph_prep->graph;
3828 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
3829 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
3830 _ccv_nnc_graph_exec_arena_fixup_peer_ref(graph_exec_arena, graph_prep, graph_exec_arena);
3831 *graph_exec_arena_ref = graph_exec_arena;
3832 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
3833}
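
ccv_nnc_symbolic_graph_compile() is the top-level entry point here: it runs the symbolic prep, allocates the tensor arena, materializes the concrete graph, and builds the exec arena, handing all three back through the out-parameters. A minimal caller sketch, assuming the symbolic graph and its source/destination symbols were built elsewhere; ccv_nnc_graph_free() is assumed from the concrete-graph API, everything else used below is defined in this file:

/* Hypothetical caller; not part of the library source. */
static void sketch_compile_and_teardown(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_tensor_arena_t* tensor_arena = 0;
	ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
	/* No tensor binds and no outputs to keep in this sketch (arguments 2 through 5 are 0). */
	ccv_nnc_symbolic_graph_compile(symbolic_graph, 0, 0, 0, 0, sources, source_size, destinations, destination_size, &graph, &tensor_arena, &graph_exec_arena);
	/* ... run the concrete graph here ... */
	ccv_nnc_graph_free(graph); /* assumed counterpart for the concrete graph */
	ccv_nnc_tensor_arena_free(tensor_arena);
	ccv_nnc_graph_exec_arena_free(graph_exec_arena);
}
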
3834
3835static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
3836{
3837 // Buffers are inherited from above; no need to dealloc.
3838 int i;
3839 for (i = 0; i < tensor_arena->sub_arena_size; i++)
3840 if (tensor_arena->sub_arenas[i])
3841 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
3842 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
3843 {
3844 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i));
3845 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
3846 ccv_nnc_tensor_multiview_free(*mv);
3847 }
3848 ccv_array_free(tensor_arena->tensor_metadata);
3849 ccv_array_free(tensor_arena->m_tensor_idx);
3850 ccfree(tensor_arena);
3851}
3852
3853void ccv_nnc_tensor_bind_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
3854{
3855 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
3856 assert(symbol.d < tensor_arena->vt_tensor_size);
3857 tensor_arena->vt_tensors[symbol.d]->data.ptr = tensor->data.ptr;
3858}
3859
3860uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
3861{
3862 uint64_t total_size = 0;
3863 int i;
3864 for (i = 0; i < tensor_arena->buffer_size; i++)
3865 total_size += tensor_arena->buffers[i].size;
3866 return total_size;
3867}
3868
3869void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
3870{
3871 int i;
3872 for (i = 0; i < tensor_arena->buffer_size; i++)
3873 {
3874 const int buffer_type = tensor_arena->buffers[i].type;
3875 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
3876#ifdef HAVE_CUDA
3877 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
3878 if (memory_type == CCV_TENSOR_GPU_MEMORY)
3879 cufree(device_id, tensor_arena->buffers[i].ptr);
3880 else {
3881 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
3882 if (tensor_arena->buffers[i].pin_mem)
3883 cuhostfree(tensor_arena->buffers[i].ptr);
3884 else
3885 ccfree(tensor_arena->buffers[i].ptr);
3886 }
3887#else
3888 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
3889 ccfree(tensor_arena->buffers[i].ptr);
3890#endif
3891 }
3892 _ccv_nnc_tensor_arena_free(tensor_arena);
3893}
3894
3895void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3896{
3897 int i;
3898 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
3899 if (graph_exec_arena->sub_arenas[i])
3900 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
3901 ccfree(graph_exec_arena);
3902}