File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3890, column 8: Branch condition evaluates to a garbage value
#include "ccv_nnc.h"
#include "ccv_nnc_internal.h"
#include "ccv_nnc_easy.h"
#include "ccv_internal.h"
#ifdef HAVE_CUDA
#include "gpu/ccv_nnc_compat.h"
#elif defined(HAVE_MPS)
#include "mps/ccv_nnc_mps.h"
#endif
#include "_ccv_nnc_graph.h"
#include "_ccv_nnc_symbolic_graph.h"

// MARK - Level-3 API

typedef struct {
	int flags;
	int type;
	int pin_mem; // This memory needs to be pinned.
	int ref; // Reference to another tensor block. Start with 1.
	int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
	int bypass_ref; // Copy over the bypass_ref from the tensor symbol underneath. Start with 1.
	int companion_ref; // Reference to another block with which this one shares the same memory region. Start with 1. The current crude implementation requires the two to be mutually companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. The allocation algorithm uses the primary throughout.
	int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
	ccv_array_t* r_refs; // If this is referenced by another block, the array points back to these blocks. Start with 1.
	uint64_t size; // The size of the tensor expected.
	int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
	ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
	ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
	ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
} ccv_nnc_tensor_block_t; // Tensor Arena Block

#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))

enum {
	UNASSIGNED = 0x1,
	ALIAS = 0x2,
	READ_ONLY = 0x4,
	WRITE_ONLY = 0x8,
	READ_WRITE = 0xc,
	ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not referencing any specific tensor).
	UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
	UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
};

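// The low two bits of flags track the assignment state (ordinary / UNASSIGNED / ALIAS), bits 0x4 and 0x8
// carry the read / write access, and the higher bits hold the ANONYMOUS and UNFOLDABLE markers. The macros
// below only touch their own bit range, so the states combine freely.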
#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)

#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))

// Holds additional information about the exe nodes.
typedef struct {
	int flags;
} ccv_nnc_graph_exec_flag_t;

enum {
	CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
};

typedef struct {
	int index;
	int oc;
	int type;
	uint64_t size;
} ccv_nnc_tensor_opt_t;

// We first sort the same type together (because they won't be reused at all).
// And then we sort by size; after that, sort by oc.
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
#undef more_than
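// The comparator above orders candidates by descending size, breaking ties by descending overlap count (oc),
// so the allocator below always considers the largest, most-contended blocks first.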
typedef struct {
	int idx;
	int hop;
} ccv_nnc_tensor_hop_t;
#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
#undef less_than
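// Sorting by ascending hop count lets the placement search below try the tightest candidates first and
// stop as soon as the running minimum hop cannot be improved.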

// If b has items overlap with a, a is still after b (inclusive).
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
{
	assert(a);
	assert(b);
	int x, y;
	for (x = 0; x < b->rnum; x++)
	{
		const int p = *(int*)ccv_array_get(b, x);
		int flag = 0;
		// In extreme cases where a is a superset of b, then a is still after b, we are good.
		for (y = 0; !flag && y < a->rnum; y++)
		{
			const int q = *(int*)ccv_array_get(a, y);
			flag = (p == q);
		}
		if (!flag)
			for (y = 0; y < a->rnum; y++)
			{
				ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
				if (!cell.i32 || cell.i32[0] == 0)
					return 0;
			}
	}
	// If b->rnum == 0, a is after b for sure.
	// Otherwise, if a->rnum == 0, we don't check any, but if b->rnum > 0, then we cannot say a is after b.
	// If both a->rnum > 0 and b->rnum > 0, the logic above checked all pairs.
	return (a->rnum > 0 || b->rnum == 0);
}
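// For instance, if exec_dep records that node 3 depends on node 1, then a = { 3 } is after b = { 1, 3 }
// by the inclusive check above (the shared node 3 is accepted as-is), but not by the exclusive check
// below, since there is no dependency cell from node 3 to itself.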

static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
{
	assert(a);
	assert(b);
	int x, y, max_hop = 0;
	for (x = 0; x < a->rnum; x++)
		for (y = 0; y < b->rnum; y++)
		{
			ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
			if (!cell.i32 || cell.i32[0] == 0)
				return 0;
			max_hop = ccv_max(cell.i32[0], max_hop);
		}
	// If we get through this nested for loop, a is verifiably, deterministically after b.
	// The max hop also denotes, in that case, how many hops, maximally speaking, we need to get from a to b.
	return max_hop;
}

// If every a's head is deterministically after b's tail
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
{
	return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
}

typedef struct {
	ccv_array_t** alloc_dep;
	int vt_block_size;
	int buffer_size;
	int block_size;
	int* vt_blocks; // A reference to the block, because blocks only contain available blocks (thus, they don't consider alias etc.). -1 means no block pointed to. Starts at 0.
	struct {
		int type; // The type from tensor blocks.
		int pin_mem; // Whether this is pinned memory.
		int flags; // The flags (currently for READ_ONLY or not).
		uint64_t size; // The size of the buffer allocated.
		int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; I do use two in the code as a temporary placeholder.
		ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From a buffer, it can point to multiple because it can be associated with multiple tensor blocks that point to different outputs (for example, in the 1st unroll, pointing to one block while in the 2nd unroll, pointing to another). Start with 0.
	}* buffers;
	struct {
		int buffer_ref; // A reference for the block to which buffer to use. Starts at 0.
		int block_ref; // A reference to which block in the given tensor_block to use.
		uint64_t offset; // The offset of this block.
	}* blocks;
} ccv_nnc_tensor_alloc_prep_t;
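// A tensor block therefore resolves to buffers[blocks[i].buffer_ref] at blocks[i].offset: buffers
// describes the dis-continuous allocations, and blocks maps every allocated tensor block into one of them.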

typedef struct ccv_nnc_symbolic_graph_prep_s {
	int flags;
	int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
	int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
	int exec_idx;
	int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
	int tensor_symbol_info_size;
	int exec_symbol_info_size;
	int tensor_block_size;
	int sub_prep_size;
	ccv_nnc_tensor_block_t* tensor_blocks;
	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
	ccv_nnc_graph_exec_flag_t* exec_flags;
	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
	int* dup_tensor_block_ref;
	ccv_nnc_graph_visit_t* visit;
	ccv_nnc_tensor_alloc_prep_t* alloc_prep;
	struct ccv_nnc_symbolic_graph_prep_s* p;
	struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
	// Structures that don't need to be freed after deallocation.
	const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
	ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
	ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
	ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs' life-cycle for while expr.
} ccv_nnc_symbolic_graph_prep_t;

typedef struct {
	int oc;
	ccv_array_t* itf;
} ccv_nnc_tensor_block_adjacent_t;
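// Per tensor block: oc is the interference degree (how many other computable blocks of the same type
// overlap with it in time), and itf lists the indices of those interfering blocks.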

static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
{
	// Compute how many dis-continuous buffers are needed.
	// We prefer to have several dis-continuous buffers instead of one big buffer because
	// in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
	// to fully utilize memory.
	int i, j, k;
	ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
	int allocable_tensor_size = 0, available_tensor_size = 0;
	for (i = 0; i < tensor_block_size; i++)
		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
		{
			// Tensors that we need the header info.
			++available_tensor_size;
			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
				// Tensors that we actually need to allocate (exclude the alias).
				++allocable_tensor_size;
		}
	ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
	ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
	ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
	// Overlap count.
	for (i = 0; i < tensor_block_size; i++)
		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
			for (j = i + 1; j < tensor_block_size; j++)
				if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
				{
					// Check to see if they interfere (default to yes).
					// If any of the i's head is deterministically later than j's tail
					// or any of the i's tail is deterministically earlier than j's head, they don't interfere.
					const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
					if (i_hop_j > 0)
					{
						ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
						ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
					}
					const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
					if (j_hop_i > 0)
					{
						ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
						ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
					}
					// It cannot be that both i can hop to j and j can hop to i.
					assert(!(i_hop_j > 0 && j_hop_i > 0));
					if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
					{
						if (!adj[i].itf)
							adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
						ccv_array_push(adj[i].itf, &j);
						++adj[i].oc;
						if (!adj[j].itf)
							adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
						ccv_array_push(adj[j].itf, &i);
						++adj[j].oc;
					}
				}
	const int exec_dep_rows = exec_dep->rows;
	ccv_matrix_free(exec_dep);
	ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
	int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
	uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
	uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
	uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
	int num_assigned = 0;
	// I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
	// Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
	// The first channel denotes the bytes available for allocation,
	// the second channel denotes the offset available for the allocation.
	ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
	ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
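	// Greedy assignment: each iteration collects the largest unassigned computable blocks (ties broken by
	// overlap count) and tries, in that order, to thread one onto an existing edge of the allocation graph
	// with enough remaining capacity; if none fits, a new buffer (assignment group) is started for the
	// first candidate.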
	for (j = 0; j < allocable_tensor_size;)
	{
		// Find the one with the largest overlap (in case the overlap is the same, the larger size), and it is not assigned.
		uint64_t max_size = 0;
		ccv_array_clear(opt);
		int current_type = 0; // Deal with one type at a time.
		for (i = 0; i < tensor_block_size; i++)
			if (tensor_blocks[i].size >= max_size &&
				TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
				IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
				(!current_type || tensor_blocks[i].type == current_type))
			{
				ccv_nnc_tensor_opt_t a = {
					.size = tensor_blocks[i].size,
					.index = i,
					.oc = adj[i].oc,
					.type = tensor_blocks[i].type,
				};
				assert(a.type);
				current_type = a.type; // Now we know the primary type we should deal with.
				if (tensor_blocks[i].companion_ref)
				{
					const int companion_ref = tensor_blocks[i].companion_ref - 1;
					a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
					a.oc += adj[companion_ref].oc;
				}
				// In case we have a tie, take them all in the array.
				if (a.size > max_size)
					ccv_array_clear(opt), max_size = a.size;
				ccv_array_push(opt, &a);
			}
		assert(opt->rnum > 0);
		// Order the opt array by oc, because type and size should be equal at this point.
		_ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
		// Go through the opt array again; this time it is ordered by size, therefore, if we find a place to insert, we are good.
		int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
		uint64_t min_val[2] = {
			0, 0
		};
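		// min_y / min_x name the endpoints of the allocation-graph edge to insert into: min_y == 0 is the
		// source node and min_x == tensor_block_size + 1 is the destination node, so the defaults together
		// with min_i == -1 mean "no suitable edge found yet". min_hop tracks the tightness of the best
		// insertion so far and min_val its remaining { size, offset }.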
		if (j > 0)
		{
			for (i = 0; i < opt->rnum; i++)
			{
				ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
				if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
					continue;
				// Now, determine the order between a and its companion b. After this, we can always check whether y
				// can hop to the earliest one and if the latest one can hop to x.
				// The earliest one will be called p and the latest one will be called q.
				int p = a.index;
				int q = a.index;
				if (tensor_blocks[a.index].companion_ref)
				{
					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
					if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
						continue;
					const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
					if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
						p = companion_ref;
					else {
						const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
						if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
							q = companion_ref;
						else { // Otherwise, b is in between p and q.
							const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
							const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
							assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
						}
					}
				}
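				// Here p is the earlier and q the later of the block and its companion (p == q when there is
				// no companion), so a candidate edge (y, x) must satisfy y -> p and q -> x in tensor_dt / tensor_df.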
				assert(tensor_blocks[q].type == tensor_blocks[p].type);
				const int type = tensor_blocks[p].type;
				// y is always earlier than x, but this is hard to assert now.
				// If this edge satisfies the requirement, now we need to find the ones with the tightest possible bounds.
				// Thus, the hop between y and x (through a) should be the smallest one.
				// We optimized this by first finding all allocated nodes that come into p, and all allocated nodes that
				// go out of q. For these nodes, we try to verify whether they form a connection (by checking against
				// the alloc sparse matrix). If they do, try to see whether we can insert with the tightest bound.
				int y_size = 0;
				ccv_nnc_tensor_hop_t* const y_buf = buf;
#define for_block(y, val) do { \
					if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
						y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
							.idx = y + 1, .hop = ((int*)val)[0] \
						}; \
				} while(0)
				ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
				if (y_vector)
					CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
#undef for_block
				assert(y_size <= tensor_block_size);
				int x_size = 0;
				ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
#define for_block(x, val) do { \
					if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
						x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
							.idx = x + 1, .hop = ((int*)val)[0] \
						}; \
				} while(0)
				ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
				if (x_vector)
					CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
#undef for_block
				assert(y_size + x_size <= tensor_block_size);
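				// Three placements are tried below, tightest hop first: between an assigned block y and the
				// destination, between the source and an assigned block x, and between an assigned pair (y, x)
				// that already has capacity left on its edge in the allocation graph.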
				int x, y;
				_ccv_nnc_sort_by_hops(y_buf, y_size, 0);
				for (y = 0; y < y_size; y++)
				{
					const int hop = exec_dep_rows + y_buf[y].hop;
					if (hop >= min_hop)
						break;
					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
					if (val.u64 && val.u64[0] >= a.size)
					{
						min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
						break;
					}
				}
				_ccv_nnc_sort_by_hops(x_buf, x_size, 0);
				for (x = 0; x < x_size; x++)
				{
					const int hop = exec_dep_rows + x_buf[x].hop;
					if (hop >= min_hop)
						break;
					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
					if (val.u64 && val.u64[0] >= a.size)
					{
						min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
						break;
					}
				}
				const int x_min_hop = x_buf[0].hop;
				for (y = 0; y < y_size; y++)
				{
					const int y_hop_p_v = y_buf[y].hop;
					if (y_hop_p_v + x_min_hop >= min_hop)
						break;
					ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
					if (y_vector)
					{
						for (x = 0; x < x_size; x++)
						{
							const int q_hop_x_v = x_buf[x].hop;
							const int hop = y_hop_p_v + q_hop_x_v;
							if (hop >= min_hop)
								break;
							const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
							if (val.u64 && val.u64[0] >= a.size)
							{
								min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
									min_val[0] = val.u64[0], min_val[1] = val.u64[1];
								break;
							}
						}
					}
				}
				// If I found a place, stop, and exit.
				if (min_y > 0 || min_x < tensor_block_size + 1)
				{
					min_i = i;
					break;
				}
				// There is no space to insert this block, mark it as such.
				tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
				if (tensor_blocks[a.index].companion_ref)
				{
					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
					tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
				}
			}
		}
		// If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
		// and default to the largest size available.
		ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
		if (min_i == -1)
		{
			allocated_size[num_assigned] = a.size;
			++num_assigned;
		}
		int assign_group = num_assigned;
		if (min_y > 0)
		{
			assign_group = assigned[min_y - 1];
			// The y and x should belong to the same assigned group.
			assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
		} else if (min_x < tensor_block_size + 1)
			assign_group = assigned[min_x - 1];
		// If min_y is the source and min_x is the destination, we don't need to do anything; otherwise, decrease the weight on that edge.
		if (min_y != 0 || min_x != tensor_block_size + 1)
		{
			uint64_t val[2] = {
				min_val[0], min_val[1]
			};
			assert(val[0] >= a.size);
			val[0] -= a.size;
			val[1] = val[1] + a.size; // Move the offset to the next one.
			ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
		}
		int strings[3];
		strings[0] = a.index + 1;
		int string_size = 1;
		// Assign out the designated companion if it exists.
		if (tensor_blocks[a.index].companion_ref)
		{
			const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
			assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
			const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
			if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
			{
				for (i = 0; i < string_size; i++)
					strings[i + 1] = strings[i];
				strings[0] = companion_ref + 1;
			} else {
				const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
				if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
					strings[string_size] = companion_ref + 1;
				else {
					// Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
					assert(string_size == 2);
					strings[2] = strings[1];
					strings[1] = companion_ref + 1;
				}
			}
			++string_size;
		}
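		// strings[] now lists the block and (if present) its companion in execution order, encoded as
		// index + 1; the whole string is threaded onto the chosen edge as a single allocation.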
		// Assign out and update oc.
		for (i = 0; i < string_size; i++)
		{
			const int index = strings[i] - 1;
			// Assign out the selected one.
			assigned[index] = assign_group;
			// The offset for this one should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
			allocated_offset[index] = min_val[1];
			if (adj[index].itf)
				for (k = 0; k < adj[index].itf->rnum; k++)
				{
					const int d = *(int*)ccv_array_get(adj[index].itf, k);
					if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
						--adj[d].oc;
				}
		}
		uint64_t val[2] = {
			a.size, min_val[1]
		};
		uint64_t consumed_size = 0;
		// Go over from min_y to string_size (excluding min_x).
		for (i = 0; i < string_size; i++)
		{
			const uint64_t size = tensor_blocks[strings[i] - 1].size;
			assert(size <= a.size);
			// Update consumed size if it is bigger than "size".
			if (size > consumed_size)
			{
				val[0] = size - consumed_size;
				ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
				consumed_size = size;
				val[1] = min_val[1] + consumed_size;
			}
			// If it consumed all the flow, break out.
			if (consumed_size == a.size)
				break;
		}
		for (i = 0; i < string_size; i++)
		{
			const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
			uint64_t val[2] = {
				i_size, min_val[1]
			};
			uint64_t consumed_size = 0;
			for (k = i + 1; k < string_size; k++)
			{
				const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
				// Update consumed size if it is bigger than "size".
				if (size > consumed_size)
				{
					val[0] = size - consumed_size;
					ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
					consumed_size = size;
					val[1] = min_val[1] + consumed_size;
				}
				// If it consumed all the flow, break out.
				if (consumed_size == i_size)
					break;
			}
			val[0] = i_size - consumed_size;
			// Still have residual, flow it to min_x.
			if (val[0] > 0)
				ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
		}
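		// The two loops above wire the string into the allocation graph: edges from min_y into the string,
		// between consecutive members of the string, and from each member out to min_x, each carrying the
		// bytes and offset still reusable by later insertions.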
		if (min_i == -1)
		{
			// If we decide to insert a new edge, simply mark anyone who does not interfere with it to be redone.
			const int p = strings[0] - 1;
			const int q = strings[string_size - 1] - 1;
			const int type = tensor_blocks[p].type;
#define for_block(y, val) do { \
			if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
			{ \
				tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
				if (tensor_blocks[y].companion_ref) \
				{ \
					const int companion_ref = tensor_blocks[y].companion_ref - 1; \
					tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
				} \
			} \
		} while(0)
			ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
			if (y_vector)
				CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
#undef for_block
#define for_block(x, val) do { \
			if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
			{ \
				tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
				if (tensor_blocks[x].companion_ref) \
				{ \
					const int companion_ref = tensor_blocks[x].companion_ref - 1; \
					tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
				} \
			} \
		} while(0)
			ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
			if (x_vector)
				CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
(_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size [(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df )->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector )->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.u8 + (0))); } } } while (0); } } } while (0); | |||
594 | #undef for_block | |||
595 | } | |||
596 | j += string_size; | |||
597 | } | |||
598 | ccfreefree(tensor_block_cannot_insert); | |||
599 | ccfreefree(buf); | |||
600 | ccv_array_free(opt); | |||
601 | ccv_matrix_free(tensor_df); | |||
602 | ccv_matrix_free(tensor_dt); | |||
603 | #define for_block(y, x, val) do { \ | |||
604 | if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \ | |||
605 | { \ | |||
606 | if (!alloc_dep[x - 1]) \ | |||
607 | alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \ | |||
608 | ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \ | |||
609 | } \ | |||
610 | } while (0) | |||
611 | CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch ((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__ ((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF ); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0 ; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->size ; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_-> i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break ; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data. f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_-> i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break ; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data. 
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_-> i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break ; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data. f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_-> i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break ; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data. 
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_-> i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else { switch ((((alloc)->type) & 0xFF000)) { case CCV_32S : { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)-> size; __attribute__((unused)) const size_t _c_ = (((alloc)-> type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR ) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data. i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_-> i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break ; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data. 
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_-> i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break ; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data. i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_-> i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break ; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data. 
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_-> i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break ; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t * const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t * const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_-> size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data. u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc )->type) & 0xFF000) >> 12] * (((alloc)->type) & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_ ++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue ; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ = 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t * const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_-> i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } } } while (0); | |||
612 | #undef for_block | |||
613 | ccv_matrix_free(alloc); | |||
614 | for (i = 0; i < tensor_block_size; i++) | |||
615 | if (adj[i].itf) | |||
616 | ccv_array_free(adj[i].itf); | |||
617 | ccfreefree(adj); | |||
618 | ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size); | |||
619 | alloc_prep->alloc_dep = alloc_dep; | |||
620 | alloc_prep->vt_block_size = tensor_block_size; | |||
621 | alloc_prep->buffer_size = num_assigned; | |||
622 | alloc_prep->block_size = available_tensor_size; | |||
623 | alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones. | |||
624 | alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size); | |||
625 | alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned); | |||
626 | memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned); | |||
627 | for (i = 0; i < num_assigned; i++) | |||
628 | alloc_prep->buffers[i].size = allocated_size[i]; | |||
629 | if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels())) | |||
630 | { | |||
631 | size_t total_size = 0; | |||
632 | for (i = 0; i < num_assigned; i++) | |||
633 | total_size += allocated_size[i]; | |||
634 | PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf ("Total buffer size of %zu to be allocated\n", total_size); fflush (stdout); } } while (0); | |||
635 | } | |||
636 | ccfreefree(allocated_size); | |||
637 | j = 0; | |||
638 | // Assigning out the tensors (in case of sharing tensors / in-place ops). | |||
639 | for (i = 0; i < tensor_block_size; i++) | |||
640 | if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) | |||
641 | { | |||
642 | alloc_prep->blocks[j].block_ref = i; | |||
643 | if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS)) | |||
644 | { | |||
645 | alloc_prep->vt_blocks[i] = j; | |||
646 | // Also, set its allocations. | |||
647 | assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ ( { if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0" , "ccv_nnc_symbolic_graph_compile.c", 647, __extension__ __PRETTY_FUNCTION__ ); })); | |||
648 | const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1; | |||
649 | alloc_prep->blocks[j].offset = allocated_offset[i]; | |||
650 | if (!alloc_prep->buffers[buffer_ref].type) | |||
651 | alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type; | |||
652 | alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem; | |||
653 | alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc); | |||
654 | assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep ->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size" , "ccv_nnc_symbolic_graph_compile.c", 654, __extension__ __PRETTY_FUNCTION__ ); })); | |||
655 | } else { | |||
656 | alloc_prep->vt_blocks[i] = -1; | |||
657 | alloc_prep->blocks[j].buffer_ref = -1; | |||
658 | alloc_prep->blocks[j].offset = 0; | |||
659 | } | |||
660 | ++j; | |||
661 | } else | |||
662 | alloc_prep->vt_blocks[i] = -1; | |||
663 | ccfreefree(allocated_offset); | |||
664 | ccfreefree(assigned); | |||
665 | return alloc_prep; | |||
666 | } | |||
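The function above returns a single allocation that carries the ccv_nnc_tensor_alloc_prep_t header plus its three trailing arrays, carved out from the biggest structs down to plain ints so each carve-out stays aligned. A minimal sketch of the same pattern, using hypothetical demo_* types (not the library's):

#include <stdlib.h>
#include <string.h>

typedef struct { int block_ref; int buffer_ref; } demo_block_t;   /* hypothetical */
typedef struct { double size; int flags; } demo_buffer_t;         /* hypothetical */

typedef struct {
	demo_block_t* blocks;
	demo_buffer_t* buffers;
	int* vt_blocks;
} demo_prep_t;

static demo_prep_t* demo_prep_new(const int block_size, const int buffer_size, const int vt_size)
{
	demo_prep_t* const prep = (demo_prep_t*)malloc(sizeof(demo_prep_t) + sizeof(demo_block_t) * block_size + sizeof(demo_buffer_t) * buffer_size + sizeof(int) * vt_size);
	prep->blocks = (demo_block_t*)(prep + 1); // Trailing arrays start right after the header.
	prep->buffers = (demo_buffer_t*)(prep->blocks + block_size);
	prep->vt_blocks = (int*)(prep->buffers + buffer_size);
	memset(prep->buffers, 0, sizeof(demo_buffer_t) * buffer_size);
	return prep; // A single free(prep) later releases the header and all three arrays.
}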
667 | ||||
668 | static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep) | |||
669 | { | |||
670 | int i; | |||
671 | for (i = 0; i < alloc_prep->vt_block_size; i++) | |||
672 | if (alloc_prep->alloc_dep[i]) | |||
673 | ccv_array_free(alloc_prep->alloc_dep[i]); | |||
674 | for (i = 0; i < alloc_prep->buffer_size; i++) | |||
675 | if (alloc_prep->buffers[i].dup_p_refs) | |||
676 | ccv_array_free(alloc_prep->buffers[i].dup_p_refs); | |||
677 | ccfreefree(alloc_prep->alloc_dep); | |||
678 | ccfreefree(alloc_prep); | |||
679 | } | |||
680 | ||||
681 | // Simple allocator from ccv_array_t. | |||
682 | static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size) | |||
683 | { | |||
684 | int pos = tensor_metadata->rnum; | |||
685 | int rsize = (size + 15) / 16; | |||
686 | ccv_array_resize(tensor_metadata, pos + rsize); | |||
687 | return (pos << 1) + 1; | |||
688 | } | |||
689 | ||||
690 | static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos) | |||
691 | { | |||
692 | assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum ) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata ->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum" , "ccv_nnc_symbolic_graph_compile.c", 692, __extension__ __PRETTY_FUNCTION__ ); })); | |||
693 | return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata )->rsize * (size_t)(pos >> 1))); | |||
694 | } | |||
695 | ||||
696 | #define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1) | |||
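A minimal sketch (assumed demo_* names, not the library's API) of the tagging scheme above: _ccv_nnc_tensor_metadata_pos_new rounds the request up to 16-byte slots and returns (index << 1) + 1, so the value is always odd, while a real tensor pointer is at least 2-byte aligned; the low bit is what CCV_NNC_IS_METADATA_POS tests.

#include <assert.h>
#include <stdint.h>

#define DEMO_IS_POS(p) ((uintptr_t)(p) & 1) // Same low-bit test as CCV_NNC_IS_METADATA_POS.

static int demo_pos_encode(const int index)
{
	return (index << 1) | 1; // Always odd, so it can never look like an aligned pointer.
}

static int demo_pos_decode(const void* const tagged)
{
	assert(DEMO_IS_POS(tagged));
	return (int)((uintptr_t)tagged >> 1); // Mirrors the pos >> 1 lookup above.
}

int main(void)
{
	float slot[4];                                        // Stand-in for a metadata slot.
	void* const as_pos = (void*)(intptr_t)demo_pos_encode(3);
	void* const as_ptr = slot;
	assert(DEMO_IS_POS(as_pos) && !DEMO_IS_POS(as_ptr));  // The low bit separates the two.
	assert(demo_pos_decode(as_pos) == 3);
	return 0;
}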
697 | ||||
698 | static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor) | |||
699 | { | |||
700 | // If the low bit is not 1, this is not a position (but a normal tensor pointer), so just return it directly. | |||
701 | if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1)) | |||
702 | return vt_tensor; | |||
703 | ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor); | |||
704 | if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1)) | |||
705 | { | |||
706 | const int alias_ref = tensor->alias_ref; | |||
707 | tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref); | |||
708 | _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref); | |||
709 | } | |||
710 | if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW)) | |||
711 | { | |||
712 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor; | |||
713 | int i; | |||
714 | const int count = mv->kind + mv->repeat; | |||
715 | for (i = 0; i < count; i++) | |||
716 | { | |||
717 | if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv )->_inline_data)[i]) & 1)) | |||
718 | { | |||
719 | const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]; | |||
720 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]); | |||
721 | _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos); | |||
722 | } | |||
723 | } | |||
724 | // No need to recursively rewire the parent pointer, otherwise we end up in a deep rewire. | |||
725 | if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1)) | |||
726 | mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p); | |||
727 | if (mv->sp) | |||
728 | for (i = 0; i < mv->sp->rnum; i++) | |||
729 | { | |||
730 | ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp )->rsize * (size_t)(i))); | |||
731 | if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1)) | |||
732 | { | |||
733 | const int pos = (int)(intptr_t)*tensor; | |||
734 | *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos); | |||
735 | assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW )) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW )) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)", "ccv_nnc_symbolic_graph_compile.c", 735, __extension__ __PRETTY_FUNCTION__ ); })); | |||
736 | _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos); | |||
737 | } | |||
738 | } | |||
739 | } | |||
740 | return tensor; | |||
741 | } | |||
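The rewiring above is deferred because the metadata array may be reallocated while tensors are still being created; only once it stops growing are the tagged positions swapped for real slot addresses, exactly once per field. A compressed sketch of that idea with hypothetical demo_* types:

#include <stdint.h>

typedef struct {
	void* alias_ref; // Either a resolved pointer or a tagged position (low bit set).
} demo_entry_t;

// Resolve a tagged position into the address of its slot; mirrors the pos >> 1 lookup above.
static void* demo_slot_of(demo_entry_t* const slots, const int tagged)
{
	return slots + (tagged >> 1);
}

static void demo_rewire(demo_entry_t* const slots, demo_entry_t* const entry)
{
	if ((uintptr_t)entry->alias_ref & 1) // Still a position? Swap it for the real address now.
		entry->alias_ref = demo_slot_of(slots, (int)(intptr_t)entry->alias_ref);
}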
742 | ||||
743 | typedef struct { | |||
744 | const uint8_t* ptr; | |||
745 | int pos; | |||
746 | } ccv_nnc_tensor_block_pos_t; | |||
747 | ||||
748 | static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep) | |||
749 | { | |||
750 | int i; | |||
751 | int unref_block_ref = block_ref; | |||
752 | while (prep->tensor_blocks[unref_block_ref].ref) | |||
753 | unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1; | |||
754 | int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref]; | |||
755 | assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 755, __extension__ __PRETTY_FUNCTION__); })); | |||
756 | assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks [vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail ("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref" , "ccv_nnc_symbolic_graph_compile.c", 756, __extension__ __PRETTY_FUNCTION__ ); })); | |||
757 | const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref; | |||
758 | uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset; | |||
759 | int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1; | |||
760 | for (i = idx - 1; i >= 0; i--) | |||
761 | { | |||
762 | assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if (p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 762, __extension__ __PRETTY_FUNCTION__); })); | |||
763 | const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i]; | |||
764 | const int unroll_count = graph_prep->unroll_count; | |||
765 | if (ch[i]) // Prefer the dup side of things. | |||
766 | p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1]; | |||
767 | int unref_p_ref = p_ref; | |||
768 | while (graph_prep->tensor_blocks[unref_p_ref].ref) | |||
769 | unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1; | |||
770 | vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref]; | |||
771 | const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref; | |||
772 | offset += graph_prep->alloc_prep->blocks[vt_ref].offset; | |||
773 | // If the buffer already exists, prefer that. | |||
774 | const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr; | |||
775 | if (ptr) | |||
776 | { | |||
777 | // If I have any remaining path that is not covered from 0, I cannot possibly | |||
778 | // have any pointer from the buffer (that can only happen if it is not a dup). | |||
779 | for (--i; i >= 0; i--) | |||
780 | if (ch[i] != 0) | |||
781 | return 0; | |||
782 | // Try to find the created tensor block pos in the array, just linear scan. | |||
783 | const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
784 | ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos); | |||
785 | *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0); | |||
786 | ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof); | |||
787 | return tv_pos; | |||
788 | } | |||
789 | p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1; | |||
790 | } | |||
791 | return 0; | |||
792 | } | |||
793 | ||||
794 | // Descend from root to the prep level, and compose the multiview from there. | |||
795 | static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref) | |||
796 | { | |||
797 | assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref ) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c" , 797, __extension__ __PRETTY_FUNCTION__); })); | |||
798 | int i; | |||
799 | const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx]; | |||
800 | const int unroll_count = prep->unroll_count; | |||
801 | if (prep == graph_prep) | |||
802 | { | |||
803 | const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep); | |||
804 | if (!data_pos) | |||
805 | return -1; | |||
806 | // Based on ch, go all the way back to find the exact pointer to compose. | |||
807 | if (// !assign_update && // If I plan to receive an assign update, we don't need to have multiple receivers. Just one tensor to receive the update is enough. | |||
808 | prep->dup_tensor_block_ref && | |||
809 | prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 && | |||
810 | prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) | |||
811 | { | |||
812 | int pos[unroll_count + 1]; | |||
813 | pos[0] = data_pos; | |||
814 | for (i = 0; i < unroll_count; i++) | |||
815 | pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep); | |||
816 | const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
817 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos); | |||
818 | ccv_nnc_tensor_t* data[unroll_count + 1]; | |||
819 | for (i = 0; i < unroll_count + 1; i++) | |||
820 | data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]); | |||
821 | ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv); | |||
822 | for (i = 0; i < unroll_count + 1; i++) | |||
823 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i]; | |||
824 | *pos_ref = mv_pos; | |||
825 | } else { | |||
826 | *pos_ref = data_pos; | |||
827 | } | |||
828 | if (preserve) | |||
829 | { | |||
830 | // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv; | |||
831 | // at any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows: | |||
832 | // a mv of K11, so that when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1]. | |||
833 | // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent | |||
834 | // arena is allocated). | |||
835 | // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or | |||
836 | // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap | |||
837 | // it in a K01 structure. | |||
838 | // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing | |||
839 | // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one | |||
840 | // memory region, or is managed by a multi-view tensor, which could point to different memory regions. | |||
841 | int prev_mv_pos = *pos_ref; | |||
842 | if (prev_mv_pos == -1) | |||
843 | { | |||
844 | prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
845 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos); | |||
846 | ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos); | |||
847 | ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){ | |||
848 | tv, | |||
849 | }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv); | |||
850 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos; | |||
851 | } | |||
852 | const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
853 | ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos); | |||
854 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos); | |||
855 | ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){ | |||
856 | CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)), | |||
857 | (ccv_nnc_tensor_t*)prev_mv, | |||
858 | }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv); | |||
859 | prev_mv->p = (void*)(intptr_t)mv_pos; | |||
860 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)); | |||
861 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos; | |||
862 | *pos_ref = mv_pos; | |||
863 | } | |||
864 | return 0; | |||
865 | } | |||
866 | ch[idx] = 0; | |||
867 | int pos[unroll_count + 1]; | |||
868 | pos[0] = 0; | |||
869 | const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos); | |||
870 | assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if ( retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c" , 870, __extension__ __PRETTY_FUNCTION__); })); | |||
871 | for (i = 0; i < unroll_count; i++) | |||
872 | { | |||
873 | ch[idx] = i + 1; | |||
874 | pos[i + 1] = 0; | |||
875 | const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1); | |||
876 | if (dup_retval < 0) | |||
877 | { | |||
878 | assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0 ) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c" , 878, __extension__ __PRETTY_FUNCTION__); })); | |||
879 | break; | |||
880 | } | |||
881 | } | |||
882 | // If current prep has no dup. | |||
883 | if (i == 0) | |||
884 | { | |||
885 | *pos_ref = pos[0]; | |||
886 | return 0; | |||
887 | } | |||
888 | ccv_nnc_tensor_t* data[unroll_count + 1]; | |||
889 | // Compose to a new multiview. | |||
890 | for (i = 0; i < unroll_count + 1; i++) | |||
891 | { assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if (pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c" , 891, __extension__ __PRETTY_FUNCTION__); })); } | |||
892 | const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
893 | for (i = 0; i < unroll_count + 1; i++) | |||
894 | data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]); | |||
895 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos); | |||
896 | ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv); | |||
897 | for (i = 0; i < unroll_count + 1; i++) | |||
898 | if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW)) | |||
899 | ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos; | |||
900 | for (i = 0; i < unroll_count + 1; i++) | |||
901 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i]; | |||
902 | *pos_ref = mv_pos; | |||
903 | return 0; | |||
904 | } | |||
905 | ||||
906 | static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node) | |||
907 | { | |||
908 | int i; | |||
909 | int is_input = 0; | |||
910 | assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ; else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c" , 910, __extension__ __PRETTY_FUNCTION__); })); | |||
911 | for (i = 0; i < node->input_size && !is_input; i++) | |||
912 | if (p_ref == node->inputs[i]) | |||
913 | is_input = 1; | |||
914 | int is_output = 0; | |||
915 | for (i = 0; i < node->output_size && !is_output; i++) | |||
916 | if (p_ref == node->outputs[i]) | |||
917 | is_output = 1; | |||
918 | // Prefer to treat it as an output if it is both an input and an output. | |||
919 | if (is_output) | |||
920 | return 1; | |||
921 | if (is_input) | |||
922 | return -1; | |||
923 | return 0; | |||
924 | } | |||
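A usage sketch of the preference rule above, with a hypothetical demo_exec_t mirroring only the four fields the helper reads (not the real ccv_nnc_graph_exec_symbol_info_t); the point is just that an output match wins over an input match:

#include <assert.h>

typedef struct {
	const int* inputs;
	int input_size;
	const int* outputs;
	int output_size;
} demo_exec_t; // Hypothetical mirror of the few fields the helper reads.

static int demo_input_or_output(const int p_ref, const demo_exec_t* const node)
{
	int i, is_input = 0, is_output = 0;
	for (i = 0; i < node->input_size; i++)
		is_input |= (p_ref == node->inputs[i]);
	for (i = 0; i < node->output_size; i++)
		is_output |= (p_ref == node->outputs[i]);
	return is_output ? 1 : (is_input ? -1 : 0); // An output match wins over an input match.
}

int main(void)
{
	const int ins[] = { 3, 5 }, outs[] = { 5 };
	const demo_exec_t node = { ins, 2, outs, 1 };
	assert(demo_input_or_output(5, &node) == 1);  // Both input and output: treated as output.
	assert(demo_input_or_output(3, &node) == -1); // Input only.
	assert(demo_input_or_output(9, &node) == 0);  // Neither.
	return 0;
}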
925 | ||||
926 | static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref) | |||
927 | { | |||
928 | // No need to check whether to preserve if this is not a while loop. | |||
929 | if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)) | |||
930 | return 0; | |||
931 | assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep ->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if ( block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size ) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size" , "ccv_nnc_symbolic_graph_compile.c", 931, __extension__ __PRETTY_FUNCTION__ ); })); | |||
932 | // If it is unassigned, no need to preserve. | |||
933 | if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED)) | |||
934 | return 0; | |||
935 | const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1; | |||
936 | // If p is not an input, no need to preserve at all. | |||
937 | if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1))) | |||
938 | return 0; | |||
939 | const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref]; | |||
940 | assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 940, __extension__ __PRETTY_FUNCTION__); })); | |||
941 | assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks [vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref" , "ccv_nnc_symbolic_graph_compile.c", 941, __extension__ __PRETTY_FUNCTION__ ); })); | |||
942 | const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref; | |||
943 | // If the buffer is a truly read-only one, no need to preserve. | |||
944 | if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags & 0xc) == READ_ONLY) | |||
945 | return 0; | |||
946 | /* This needs a detailed explanation: what does preserve mean? | |||
947 | * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is | |||
948 | * also used outside of the while loop, we cannot reuse the memory region of x for | |||
949 | * the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming | |||
950 | * y uses the same memory region as x). The way to work around this is to use a different | |||
951 | * memory region for y = x + 1, but for the first iteration, have x point to the | |||
952 | * original. During the allocation process, the way to identify whether x should preserve | |||
953 | * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input | |||
954 | * parent tensor is the same as the memory region it plans to use in the buffer, then we are | |||
955 | * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and | |||
956 | * it is the input tensor whenever that is possible. A tensor block can point to two parent | |||
957 | * tensors: one is the input tensor, one is the output tensor. p_refs[0] should be the input | |||
958 | * tensor whenever that is possible. */ | |||
959 | if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref) | |||
960 | return 0; | |||
961 | // Otherwise, return 1 because we now need to preserve. | |||
962 | return 1; | |||
963 | } | |||
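A worked toy illustration of the comment above, with plain ints standing in for tensors and the (y => x) carry-over ignored: folding y onto x's memory region clobbers the value of x that is still needed after the loop, which is exactly what "preserve" prevents.

#include <assert.h>

int main(void)
{
	int x = 1;                // Tensor x, also read after the loop.
	int* const y_folded = &x; // Folded case: y reuses x's memory region.
	int i;
	for (i = 0; i < 3; i++)
		*y_folded = x + 1;    // y = x + 1 overwrites x in place: x becomes 2, 3, 4.
	assert(x == 4);           // The value of x needed outside the loop is gone.
	x = 1;                    // Preserved case: y gets its own region.
	int y = 0;
	for (i = 0; i < 3; i++)
		y = x + 1;            // x keeps its original value for the outside reader.
	assert(x == 1 && y == 2);
	return 0;
}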
964 | ||||
965 | static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref) | |||
966 | { | |||
967 | assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep ->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if ( block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size ) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size" , "ccv_nnc_symbolic_graph_compile.c", 967, __extension__ __PRETTY_FUNCTION__ ); })); | |||
968 | // If it is unassigned, no need to broadcast. | |||
969 | if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED)) | |||
970 | return 0; | |||
971 | // Only tape vars need to force broadcast; otherwise we already share the same memory region. | |||
972 | if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR)) | |||
973 | return 0; | |||
974 | const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1; | |||
975 | // If p is not an output, no need to broadcast at all. | |||
976 | if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1))) | |||
977 | return 0; | |||
978 | const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref]; | |||
979 | assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 979, __extension__ __PRETTY_FUNCTION__); })); | |||
980 | assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks [vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref" , "ccv_nnc_symbolic_graph_compile.c", 980, __extension__ __PRETTY_FUNCTION__ ); })); | |||
981 | const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref; | |||
982 | // If the buffer is a truly read-only one, no need to broadcast. | |||
983 | if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags & 0xc) == READ_ONLY) | |||
984 | return 0; | |||
985 | // Otherwise, return 1 because we now need to force broadcast for this tape var. | |||
986 | return 1; | |||
987 | } | |||
988 | ||||
989 | static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor) | |||
990 | { | |||
991 | assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ? 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW )) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c" , 991, __extension__ __PRETTY_FUNCTION__); })); | |||
992 | int i; | |||
993 | for (i = 0; i < mv->kind + mv->repeat; i++) | |||
994 | if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10))) | |||
995 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i] = tensor; | |||
996 | else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)-> _inline_data)[i])) & CCV_TENSOR_MULTIVIEW)) | |||
997 | _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i], tensor); | |||
998 | } | |||
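A compressed sketch of the placeholder substitution above, using a hypothetical two-slot demo_view_t in place of the real multiview: every sentinel slot becomes the concrete tensor, and nested views are handled recursively.

#include <stdint.h>

#define DEMO_PLACEHOLDER ((void*)(intptr_t)0x10) // Sentinel, analogous to CCV_NNC_TENSOR_PLACEHOLDER.

typedef struct demo_view_s {
	void* data[2];
	int is_view[2]; // Marks which slots hold a nested demo_view_t rather than a leaf.
} demo_view_t;

static void demo_fill_placeholder(demo_view_t* const view, void* const concrete)
{
	int i;
	for (i = 0; i < 2; i++)
		if (view->data[i] == DEMO_PLACEHOLDER)
			view->data[i] = concrete; // A sentinel slot receives the real tensor.
		else if (view->is_view[i])
			demo_fill_placeholder((demo_view_t*)view->data[i], concrete); // Recurse into nested views.
}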
999 | ||||
1000 | static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv) | |||
1001 | { | |||
1002 | assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ? 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW )) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c" , 1002, __extension__ __PRETTY_FUNCTION__); })); | |||
1003 | int i; | |||
1004 | if (mv->sp) | |||
1005 | for (i = 0; i < mv->sp->rnum; i++) | |||
1006 | { | |||
1007 | ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp )->rsize * (size_t)(i))); | |||
1008 | if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1)) | |||
1009 | { | |||
1010 | const int pos = (int)(intptr_t)*tensor; | |||
1011 | *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos); | |||
1012 | assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW )) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW )) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)", "ccv_nnc_symbolic_graph_compile.c", 1012, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1013 | _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos); | |||
1014 | } | |||
1015 | } | |||
1016 | for (i = 0; i < mv->kind + mv->repeat; i++) | |||
1017 | { | |||
1018 | if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data)[i]) & 1)) | |||
1019 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]); | |||
1020 | if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data)[i]->alias_ref) & 1)) | |||
1021 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]->alias_ref); | |||
1022 | if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)-> _inline_data)[i])) & CCV_TENSOR_MULTIVIEW)) | |||
1023 | _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]); | |||
1024 | } | |||
1025 | } | |||
1026 | ||||
1027 | static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref) | |||
1028 | { | |||
1029 | // Go to the root of the graph. | |||
1030 | const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep; | |||
1031 | int i; | |||
1032 | for (i = 1; prep->p; i++) | |||
1033 | prep = prep->p; | |||
1034 | // Root graph should have no dup tensor blocks. | |||
1035 | assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__ ({ if (!prep->dup_tensor_block_ref) ; else __assert_fail ( "!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c" , 1035, __extension__ __PRETTY_FUNCTION__); })); | |||
1036 | const int c = i; | |||
1037 | const ccv_nnc_symbolic_graph_prep_t* preps[c]; | |||
1038 | prep = graph_prep; | |||
1039 | preps[c - 1] = prep; | |||
1040 | for (i = 0; prep->p; i++) | |||
1041 | preps[c - 2 - i] = prep = prep->p; | |||
1042 | int ch[c]; // Variable-length array to record our selections when recursing from top to bottom. | |||
1043 | memset(ch, 0, sizeof(int) * c); | |||
1044 | int pos = 0; | |||
1045 | _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos); | |||
1046 | assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if (ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c" , 1046, __extension__ __PRETTY_FUNCTION__); })); // This should never be modified. | |||
1047 | assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos > 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c" , 1047, __extension__ __PRETTY_FUNCTION__); })); | |||
1048 | return pos; | |||
1049 | } | |||
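The depth counting and backwards fill above can be read in isolation; a sketch with a hypothetical demo_node_t parent chain, where chain[0] ends up being the root and chain[depth - 1] the node we started from:

#include <assert.h>

typedef struct demo_node_s {
	struct demo_node_s* p; // Parent pointer; 0 at the root.
	int id;
} demo_node_t;

static int demo_collect_chain(demo_node_t* const leaf, demo_node_t** const chain, const int max_depth)
{
	int depth = 1;
	demo_node_t* node = leaf;
	while (node->p) // First pass: count how deep we are.
		++depth, node = node->p;
	assert(depth <= max_depth);
	chain[depth - 1] = node = leaf; // Second pass: fill from the leaf backwards.
	int i;
	for (i = 0; node->p; i++)
		chain[depth - 2 - i] = node = node->p;
	return depth;
}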
1050 | ||||
1051 | static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor) | |||
1052 | { | |||
1053 | const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
1054 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos); | |||
1055 | ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor; | |||
1056 | ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){ | |||
1057 | CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)), | |||
1058 | tv, | |||
1059 | }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv); | |||
1060 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)); | |||
1061 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[1] = tensor; | |||
1062 | return mv_pos; | |||
1063 | } | |||
1064 | ||||
1065 | static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos) | |||
1066 | { | |||
1067 | ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos); | |||
1068 | const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW); | |||
1069 | if (!is_multiview) | |||
1070 | return pos; | |||
1071 | while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW)) | |||
1072 | { | |||
1073 | const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr; | |||
1074 | tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0]); | |||
1075 | } | |||
1076 | const ccv_nnc_tensor_t tensor = *tensor_ptr; | |||
1077 | const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1078 | ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos); | |||
1079 | *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0); | |||
1080 | new_tensor->dataof = tensor.dataof; | |||
1081 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos); | |||
1082 | new_tensor->alias_ref = (uintptr_t)pos; | |||
1083 | ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos); | |||
1084 | return new_pos; | |||
1085 | } | |||
1086 | ||||
1087 | static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors) | |||
1088 | { | |||
1089 | const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1; | |||
1090 | // What it references is itself not an alias. | |||
1091 | assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__ ({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]" , "ccv_nnc_symbolic_graph_compile.c", 1091, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1092 | const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref]; | |||
1093 | const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos); | |||
1094 | assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW )) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr )) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)" , "ccv_nnc_symbolic_graph_compile.c", 1094, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1095 | // Will use that to determine whether to insert a reference or not. | |||
1096 | const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW); | |||
1097 | while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW)) | |||
1098 | { | |||
1099 | const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr; | |||
1100 | alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0]); | |||
1101 | } | |||
1102 | const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr; | |||
1103 | // If there is no ofs, and the stride is packed against dim, we take a shortcut and just init as a normal tensor. | |||
1104 | int pos; | |||
1105 | if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 && | |||
1106 | ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim)) | |||
1107 | { | |||
1108 | pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1109 | ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos); | |||
1110 | *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0); | |||
1111 | tensor->dataof = alias_tensor.dataof; | |||
1112 | } else { | |||
1113 | pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t)); | |||
1114 | ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos); | |||
1115 | // Otherwise initialize a tensor view | |||
1116 | *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride); | |||
1117 | tensor_view->alias_ref = (uintptr_t)alias_pos; | |||
1118 | } | |||
1119 | vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1120 | if (is_multiview) | |||
1121 | { | |||
1122 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos); | |||
1123 | ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos); | |||
1124 | } | |||
1125 | } | |||
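// In short: an alias either reuses the base tensor's storage directly as a plain tensor
// (zero ofs and packed stride), or becomes a tensor view that carries its own ofs/stride and
// an alias_ref back to the base; if the base is a multi-view, the result is also subscribed
// to it so it follows whichever view is currently active.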
1126 | ||||
1127 | static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors) | |||
1128 | { | |||
1129 | // If this block has an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively. | |||
1130 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref]) | |||
1131 | { | |||
1132 | const int ref = tensor_blocks[block_ref].alias_ref - 1; | |||
1133 | if (!vt_tensors[ref]) | |||
1134 | _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors); | |||
1135 | vt_tensors[block_ref] = vt_tensors[ref]; | |||
1136 | return; | |||
1137 | } | |||
1138 | assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref ) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref" , "ccv_nnc_symbolic_graph_compile.c", 1138, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1139 | const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1; | |||
1140 | // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases are assigned. | |||
1141 | if (!vt_tensors[alias_ref]) | |||
1142 | _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors); | |||
1143 | _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors); | |||
1144 | } | |||
1145 | ||||
1146 | // Turn a linear pointer into an object storage (such as MTLBuffer). | |||
1147 | #ifdef HAVE_MPS | |||
1148 | static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata) | |||
1149 | { | |||
1150 | mpobjfree(0, ptr); | |||
1151 | } | |||
1152 | #endif | |||
1153 | ||||
1154 | typedef struct { | |||
1155 | size_t size; | |||
1156 | void* obj; | |||
1157 | } tensor_arena_obj_track_t; | |||
1158 | ||||
1159 | typedef struct { | |||
1160 | void* ptr; | |||
1161 | off_t offset; | |||
1162 | size_t size; | |||
1163 | } obj_ptr_key_t; | |||
1164 | ||||
1165 | static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key) | |||
1166 | { | |||
1167 | return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size; | |||
1168 | } | |||
1169 | ||||
1170 | static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b) | |||
1171 | { | |||
1172 | return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size); | |||
1173 | } | |||
1174 | ||||
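// The obj_ptr hash map below (MPS builds only) deduplicates the object wrappers created for a
// given (base pointer, offset, size) triple, so tensors carved out of the same slice of a
// buffer share one object; each newly created object also registers a disposer on the tensor
// arena so it can be released together with the arena.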
1175 | KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal) | |||
1176 | ||||
1177 | static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena) | |||
1178 | { | |||
1179 | if (params.dim[0] == 0) | |||
1180 | return 0; | |||
1181 | #ifdef HAVE_MPS | |||
1182 | if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY) | |||
1183 | { | |||
1184 | int ret; | |||
1185 | const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >> 12] * ccv_nnc_tensor_count(params); | |||
1186 | const obj_ptr_key_t key = { | |||
1187 | .ptr = ptr, | |||
1188 | .offset = offset, | |||
1189 | .size = size, | |||
1190 | }; | |||
1191 | khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret); | |||
1192 | if (ret != 0) | |||
1193 | { | |||
1194 | void* obj = mpobjcreate(ptr, offset, size); | |||
1195 | if (!tensor_arena->disposers) | |||
1196 | tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0); | |||
1197 | ccv_nnc_arena_disposer_t disposer = { | |||
1198 | .ptr = obj, | |||
1199 | .userdata = 0, | |||
1200 | .dispose = _ccv_nnc_tensor_arena_obj_dispose | |||
1201 | }; | |||
1202 | ccv_array_push(tensor_arena->disposers, &disposer); | |||
1203 | kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj; | |||
1204 | return obj; | |||
1205 | } else | |||
1206 | return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]); | |||
1207 | } | |||
1208 | #endif | |||
1209 | return ptr + offset; | |||
1210 | } | |||
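// A minimal usage sketch (it mirrors the actual call made further down in
// _ccv_nnc_tensor_arena_new): on a CPU-only build the call degenerates to plain pointer
// arithmetic, while on MPS (for GPU memory) it returns one cached object for repeated
// (ptr, offset, size) triples.
//   void* const obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map,
//     tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size,
//     offset, tensor_symbol_info[i].info, tensor_arena);
//   *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);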
1211 | ||||
1212 | static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size) | |||
1213 | { | |||
1214 | // All tensors are assigned out now; num_assigned is the number of discontinuous buffers, | |||
1215 | // and each tensor has its designation in the assigned array and its offset in allocated_offset. | |||
1216 | const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep; | |||
1217 | ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks; | |||
1218 | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info; | |||
1219 | const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size; | |||
1220 | const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p; | |||
1221 | const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0; | |||
1222 | const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref; | |||
1223 | const int unroll_count = graph_prep->unroll_count; | |||
1224 | int i, j; | |||
1225 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1226 | for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++) | |||
1227 | { | |||
1228 | const int dup_ref = dup_tensor_block_ref[i * unroll_count + j]; | |||
1229 | if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED)) | |||
1230 | TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1) ); | |||
1231 | } | |||
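// The loop above clears the UNASSIGNED bit on a tensor block whenever one of its unrolled
// duplicates ended up assigned, so the block is treated as computable when tensors are
// created below.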
1232 | ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size); | |||
1233 | graph_prep->tensor_arena = tensor_arena; | |||
1234 | tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph; | |||
1235 | tensor_arena->buffers = (void*)(tensor_arena + 1); | |||
1236 | tensor_arena->buffer_size = alloc_prep->buffer_size; | |||
1237 | tensor_arena->vt_tensor_size = tensor_symbol_info_size; | |||
1238 | tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size); | |||
1239 | tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size); | |||
1240 | tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size); | |||
1241 | tensor_arena->pb_vt_tensors = 0; | |||
1242 | tensor_arena->vt_alias_r_refs_p = 0; | |||
1243 | tensor_arena->vt_alias_r_refs = 0; | |||
1244 | tensor_arena->vt_sizes = 0; | |||
1245 | tensor_arena->sub_arena_size = graph_prep->sub_prep_size; | |||
1246 | tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0); | |||
1247 | tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0); | |||
1248 | tensor_arena->allocator.context.free = allocator.context.free; | |||
1249 | tensor_arena->allocator.isa = allocator.isa; | |||
1250 | tensor_arena->disposers = 0; | |||
1251 | // Copy alias_ref info back to the tensor arena. | |||
1252 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1253 | tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref; | |||
1254 | // Do the buffer copies. | |||
1255 | for (i = 0; i < alloc_prep->buffer_size; i++) | |||
1256 | tensor_arena->buffers[i].type = alloc_prep->buffers[i].type, | |||
1257 | tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem, | |||
1258 | tensor_arena->buffers[i].size = alloc_prep->buffers[i].size; | |||
1259 | if (graph_prep->while_count_tensor) | |||
1260 | { | |||
1261 | // If we need to have a while count tensor, allocate that first, and set its pointer to point to the while_count variable. | |||
1262 | int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1263 | assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__ ({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos" , "ccv_nnc_symbolic_graph_compile.c", 1263, __extension__ __PRETTY_FUNCTION__ ); })); // pos must be 0 position. | |||
1264 | ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1265 | *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph); | |||
1266 | } | |||
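// Note: a metadata "pos" appears to be a small tagged integer with its low bit set (e.g. the
// first allocation above yields pos == 1); it is stored in vt_tensors in place of a real
// ccv_nnc_tensor_t* for now. CCV_NNC_IS_METADATA_POS only checks that low bit, and the
// positions are converted into stable pointers by _ccv_nnc_tensor_metadata_rewire later in
// this function.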
1267 | assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep )) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)" , "ccv_nnc_symbolic_graph_compile.c", 1267, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1268 | if (p_arena && p_graph_prep) | |||
1269 | { | |||
1270 | // No need to allocate the actual buffer; just use the pointer from the parent arena. | |||
1271 | PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena); fflush(stdout); } } while (0); | |||
1272 | for (i = 0; i < tensor_arena->buffer_size; i++) | |||
1273 | { | |||
1274 | const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1; | |||
1275 | int unref_p_ref = p_ref; | |||
1276 | while (p_graph_prep->tensor_blocks[unref_p_ref].ref) | |||
1277 | unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1; | |||
1278 | assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__ ({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 1278, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1279 | const int p_unroll_count = p_graph_prep->unroll_count; | |||
1280 | if (p_graph_prep->dup_tensor_block_ref && | |||
1281 | p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 && | |||
1282 | p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref) | |||
1283 | { | |||
1284 | // This condition means that in the parent graph, we point to multiple tensor blocks for the same | |||
1285 | // buffer; therefore, we cannot have one single pointer assigned in this case. | |||
1286 | // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure. | |||
1287 | tensor_arena->buffers[i].ptr = 0; | |||
1288 | PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n" , i); fflush(stdout); } } while (0); | |||
1289 | continue; | |||
1290 | } | |||
1291 | // Otherwise, find the actual buffer pointer. | |||
1292 | const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref]; | |||
1293 | assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 1293, __extension__ __PRETTY_FUNCTION__); })); | |||
1294 | const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref; | |||
1295 | if (!p_arena->buffers[buffer_ref].ptr) | |||
1296 | { | |||
1297 | // Pass it down as 0 ptr. | |||
1298 | tensor_arena->buffers[i].ptr = 0; | |||
1299 | PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n" , i); fflush(stdout); } } while (0); | |||
1300 | continue; | |||
1301 | } | |||
1302 | const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset; | |||
1303 | tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset; | |||
1304 | PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n" , vt_ref, i, (unsigned long)offset); fflush(stdout); } } while (0); | |||
1305 | } | |||
1306 | } else { | |||
1307 | // Now, allocate actual buffers. | |||
1308 | PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("Buffer allocation for arena %p\n", tensor_arena); fflush (stdout); } } while (0); | |||
1309 | for (i = 0; i < tensor_arena->buffer_size; i++) | |||
1310 | { | |||
1311 | const int buffer_type = tensor_arena->buffers[i].type; | |||
1312 | const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3); | |||
1313 | #ifdef HAVE_CUDA1 | |||
1314 | if (memory_type == CCV_TENSOR_GPU_MEMORY) | |||
1315 | { | |||
1316 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8); | |||
1317 | if (allocator.isa && allocator.isa->alloc) | |||
1318 | tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc); | |||
1319 | else | |||
1320 | tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size); | |||
1321 | PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena ->buffers[i].ptr, (unsigned long)tensor_arena->buffers[ i].size); fflush(stdout); } } while (0); | |||
1322 | } else { | |||
1323 | assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0 ), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY) ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY" , "ccv_nnc_symbolic_graph_compile.c", 1323, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1324 | if (tensor_arena->buffers[i].pin_mem) | |||
1325 | tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size); | |||
1326 | else | |||
1327 | ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size); | |||
1328 | PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena ->buffers[i].ptr, (unsigned long)tensor_arena->buffers[ i].size); fflush(stdout); } } while (0); | |||
1329 | } | |||
1330 | #elif defined(HAVE_MPS) | |||
1331 | if (memory_type == CCV_TENSOR_GPU_MEMORY) | |||
1332 | { | |||
1333 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8); | |||
1334 | // if (allocator.isa && allocator.isa->alloc) | |||
1335 | // tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc); | |||
1336 | // else | |||
1337 | tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size); | |||
1338 | PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena ->buffers[i].ptr, (unsigned long)tensor_arena->buffers[ i].size); fflush(stdout); } } while (0); | |||
1339 | } else { | |||
1340 | assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0 ), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY) ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY" , "ccv_nnc_symbolic_graph_compile.c", 1340, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1341 | ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size); | |||
1342 | PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena ->buffers[i].ptr, (unsigned long)tensor_arena->buffers[ i].size); fflush(stdout); } } while (0); | |||
1343 | } | |||
1344 | #else | |||
1345 | assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0 ), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY) ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY" , "ccv_nnc_symbolic_graph_compile.c", 1345, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1346 | ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size); | |||
1347 | PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels())) { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena ->buffers[i].ptr, (unsigned long)tensor_arena->buffers[ i].size); fflush(stdout); } } while (0); | |||
1348 | #endif | |||
1349 | assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__ ({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail ("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c" , 1349, __extension__ __PRETTY_FUNCTION__); })); | |||
1350 | } | |||
1351 | } | |||
1352 | // Go over sub_preps and allocate arenas for them. Do this early because | |||
1353 | // we may reference tensors from sub arenas; the reason we need to reference | |||
1354 | // tensors from sub arenas is that, for output tensors, the sub arena's tensors | |||
1355 | // get automatic reference updates. | |||
1356 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
1357 | if (graph_prep->sub_preps[i]) | |||
1358 | tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size); | |||
1359 | else | |||
1360 | tensor_arena->sub_arenas[i] = 0; | |||
1361 | memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size); | |||
1362 | // Now that sub-arenas are all assigned, go over their outputs to assign out tensors from them directly. | |||
1363 | ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0; | |||
1364 | #ifdef HAVE_MPS | |||
1365 | khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr(); | |||
1366 | #else | |||
1367 | khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0; | |||
1368 | #endif | |||
1369 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
1370 | if (tensor_arena->sub_arenas[i]) | |||
1371 | { | |||
1372 | assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__ ({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]" , "ccv_nnc_symbolic_graph_compile.c", 1372, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1373 | const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1; | |||
1374 | const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx; | |||
1375 | if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) | |||
1376 | for (j = 0; j < node->output_size; j++) | |||
1377 | { | |||
1378 | const int idx = node->outputs[j]; | |||
1379 | const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) + (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t) (i))) - 1; | |||
1380 | assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if (s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c" , 1380, __extension__ __PRETTY_FUNCTION__); })); | |||
1381 | ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx]; | |||
1382 | assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__ ({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail ("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c" , 1382, __extension__ __PRETTY_FUNCTION__); })); | |||
1383 | ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref; | |||
1384 | // Only assign if it is a multiview tensor. | |||
1385 | if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) || | |||
1386 | (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW))) | |||
1387 | sub_arena_out_tensors[idx] = sub_tensor; | |||
1388 | } | |||
1389 | } | |||
1390 | // Assigning out the tensors (in case of sharing tensors / in-place ops). | |||
1391 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1392 | if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !( (tensor_blocks[i].flags & 0x3) == UNASSIGNED))) | |||
1393 | { | |||
1394 | const int vt_ref = alloc_prep->vt_blocks[i]; | |||
1395 | const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1; | |||
1396 | // Either we have dup_tensor_block_ref in the current layer, or we have it in the | |||
1397 | // previous layer; therefore, we cannot really find the buffer ptr. | |||
1398 | if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway). | |||
1399 | ((graph_prep->dup_tensor_block_ref && | |||
1400 | graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 && | |||
1401 | graph_prep->dup_tensor_block_ref[i * unroll_count] != i) || | |||
1402 | (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr))) | |||
1403 | { | |||
1404 | assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({ if (graph_prep->p) ; else __assert_fail ("graph_prep->p" , "ccv_nnc_symbolic_graph_compile.c", 1404, __extension__ __PRETTY_FUNCTION__ ); })); // This must be in a sub-graph. | |||
1405 | // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it. | |||
1406 | if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i)) | |||
1407 | continue; | |||
1408 | const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i); | |||
1409 | tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1410 | ccv_array_push(tensor_arena->m_tensor_idx, &pos); | |||
1411 | } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) { | |||
1412 | // When we want to allocate, we don't really need to if it needs force broadcast, because we will handle that later. | |||
1413 | const uint64_t offset = alloc_prep->blocks[vt_ref].offset; | |||
1414 | // If already created, use the same tensor, and continue. | |||
1415 | // We have a buffer ptr at this point. | |||
1416 | int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1417 | ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1418 | // Also, set its allocations. | |||
1419 | // Since a tensor view is bit-compatible with a tensor, we can just cast. | |||
1420 | void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena); | |||
1421 | *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0); | |||
1422 | assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena ->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if ( offset + tensor_blocks[i].size <= tensor_arena->buffers [buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size" , "ccv_nnc_symbolic_graph_compile.c", 1422, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1423 | // If we need to force broadcast, we need to wrap it in a multiview. | |||
1424 | if (graph_prep->tensor_blocks[i].p_refs[0] && | |||
1425 | _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i)) | |||
1426 | { | |||
1427 | const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
1428 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos); | |||
1429 | ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1430 | ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){ | |||
1431 | tv, | |||
1432 | }, 0, 1, graph_prep->graph, mv); | |||
1433 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1434 | pos = mv_pos; | |||
1435 | ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos); | |||
1436 | } | |||
1437 | tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it. | |||
1438 | } | |||
1439 | } | |||
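// To summarize the loop above: blocks that live in an unrolled while sub-graph (or whose
// buffer pointer is not known yet) get a multi-view generated via _ccv_nnc_tensor_multiview_gen
// and are tracked in m_tensor_idx; every other computable block gets a plain tensor carved out
// of its buffer at the allocated offset, wrapped in a one-view multi-view only when a force
// broadcast to the parent graph is required.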
1440 | #ifdef HAVE_MPS | |||
1441 | kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map); | |||
1442 | #endif | |||
1443 | // Handle bound tensors. First handle the cases without aliases. | |||
1444 | for (i = 0; i < tensor_bind_size; i++) | |||
1445 | { | |||
1446 | assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__ ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor" , "ccv_nnc_symbolic_graph_compile.c", 1446, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1447 | const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol); | |||
1448 | if (resolved_symbol.d >= 0) | |||
1449 | { | |||
1450 | int d = resolved_symbol.d; | |||
1451 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS)) | |||
1452 | continue; | |||
1453 | // This check is for in-place ops. Only an in-place op could be unassigned but still have a ref. | |||
1454 | // It has nothing to do with aliases. | |||
1455 | while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref) | |||
1456 | d = tensor_blocks[d].ref - 1; | |||
1457 | // For bound tensors, it shouldn't be assigned yet. | |||
1458 | // If it is assigned, the pointer should match the one from the bound tensor. | |||
1459 | // This can only happen if an enforced in-place tensor is bound twice. If that | |||
1460 | // happens, we need to make sure it is bound to the same location. | |||
1461 | assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena ->vt_tensors[d]->data.u8 == tensor_binds[i].tensor-> data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors [d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds [i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8" , "ccv_nnc_symbolic_graph_compile.c", 1461, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1462 | // See above assertion. | |||
1463 | if (tensor_arena->vt_tensors[d]) | |||
1464 | continue; | |||
1465 | if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW)) | |||
1466 | { | |||
1467 | int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t)); | |||
1468 | ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1469 | ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor; | |||
1470 | if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension. | |||
1471 | for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++) | |||
1472 | { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv-> info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info [d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]" , "ccv_nnc_symbolic_graph_compile.c", 1472, __extension__ __PRETTY_FUNCTION__ ); })); } | |||
1473 | // It is OK for it to be, as a whole, smaller than or equal to the bound one. | |||
1474 | assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count (tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if ( ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count (tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)" , "ccv_nnc_symbolic_graph_compile.c", 1474, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1475 | memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t)); | |||
1476 | memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim)); | |||
1477 | tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1478 | } else { | |||
1479 | int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1480 | ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1481 | *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0); | |||
1482 | tv->info.datatype = tensor_binds[i].tensor->info.datatype; | |||
1483 | tv->info.reserved = tensor_binds[i].tensor->info.reserved; | |||
1484 | tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over. | |||
1485 | tv->dataof = tensor_binds[i].tensor->dataof; | |||
1486 | tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1487 | } | |||
1488 | } | |||
1489 | } | |||
1490 | // Handle bound tensors. We handle aliases here so they can reference bound tensors. | |||
1491 | for (i = 0; i < tensor_bind_size; i++) | |||
1492 | { | |||
1493 | assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__ ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor" , "ccv_nnc_symbolic_graph_compile.c", 1493, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1494 | const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol); | |||
1495 | if (resolved_symbol.d >= 0) | |||
1496 | { | |||
1497 | int d = resolved_symbol.d; | |||
1498 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS)) | |||
1499 | d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original. | |||
1500 | // This check is for in-place ops. Only an in-place op could be unassigned but still have a ref. | |||
1501 | // It has nothing to do with aliases. | |||
1502 | while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref) | |||
1503 | d = tensor_blocks[d].ref - 1; | |||
1504 | if (tensor_arena->vt_tensors[d]) | |||
1505 | continue; | |||
1506 | // Assert that the original alias has no ofs; otherwise our binding will be problematic. | |||
1507 | for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++) | |||
1508 | { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j] == 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol .d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0" , "ccv_nnc_symbolic_graph_compile.c", 1508, __extension__ __PRETTY_FUNCTION__ ); })); } | |||
1509 | if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW)) | |||
1510 | { | |||
1511 | int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t)); | |||
1512 | ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1513 | ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor; | |||
1514 | if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension. | |||
1515 | for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++) | |||
1516 | { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv-> info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info [d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]" , "ccv_nnc_symbolic_graph_compile.c", 1516, __extension__ __PRETTY_FUNCTION__ ); })); } | |||
1517 | // It is OK for it to be, as a whole, smaller than or equal to the bound one. | |||
1518 | assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count (tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if ( ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count (tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)" , "ccv_nnc_symbolic_graph_compile.c", 1518, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1519 | memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t)); | |||
1520 | memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim)); | |||
1521 | tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1522 | } else { | |||
1523 | int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1524 | ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos); | |||
1525 | *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0); | |||
1526 | tv->info.datatype = tensor_binds[i].tensor->info.datatype; | |||
1527 | tv->info.reserved = tensor_binds[i].tensor->info.reserved; | |||
1528 | tv->data = tensor_binds[i].tensor->data; | |||
1529 | tv->dataof = tensor_binds[i].tensor->dataof; | |||
1530 | tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1531 | } | |||
1532 | } | |||
1533 | } | |||
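// The two bind loops above work in two passes on purpose: the first attaches bound storage
// only to non-alias symbols; the second takes binds that were specified on an alias symbol and
// attaches their storage to the original block (via alias_ref), unless that block has already
// been bound.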
1534 | // Assign out refs; refs are the simple ones, so we should handle them first (they point to exactly the same metadata and the same region). | |||
1535 | // Avoid refs that are actually aliases. | |||
1536 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1537 | // It could be a bound tensor (or unused); in that case, it doesn't have a ref. | |||
1538 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref) | |||
1539 | { | |||
1540 | int ref = tensor_blocks[i].ref - 1; | |||
1541 | while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref) | |||
1542 | ref = tensor_blocks[ref].ref - 1; | |||
1543 | assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c" , 1543, __extension__ __PRETTY_FUNCTION__); })); | |||
1544 | tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref]; | |||
1545 | } | |||
1546 | // Now that refs are assigned out, handle the cases I need to preserve because I am a sub-graph of a while loop. | |||
1547 | if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) | |||
1548 | { | |||
1549 | assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({ if (graph_prep->p) ; else __assert_fail ("graph_prep->p" , "ccv_nnc_symbolic_graph_compile.c", 1549, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1550 | const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1); | |||
1551 | const int p_idx = graph_prep->p_idx - 1; | |||
1552 | for (i = 0; i < node->input_size; i++) | |||
1553 | { | |||
1554 | const int idx = node->inputs[i]; | |||
1555 | int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx ].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info [idx].s_ref)->rsize * (size_t)(p_idx))) - 1; | |||
1556 | assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__ ({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail ( "!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c" , 1556, __extension__ __PRETTY_FUNCTION__); })); | |||
1557 | const int vt_ref = alloc_prep->vt_blocks[block_ref]; | |||
1558 | if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref)) | |||
1559 | continue; | |||
1560 | assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 1560, __extension__ __PRETTY_FUNCTION__); })); | |||
1561 | const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref; | |||
1562 | assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks [block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c" , 1562, __extension__ __PRETTY_FUNCTION__); })); | |||
1563 | assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3) == ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref ].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])" , "ccv_nnc_symbolic_graph_compile.c", 1563, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1564 | // Either we have dup_tensor_block_ref in the current layer, or we have it in the | |||
1565 | // previous layer; therefore, we cannot really find the buffer ptr. | |||
1566 | if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway). | |||
1567 | ((graph_prep->dup_tensor_block_ref && | |||
1568 | graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 && | |||
1569 | graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) || | |||
1570 | !tensor_arena->buffers[buffer_ref].ptr)) | |||
1571 | { | |||
1572 | // We haven't allocated anything for this yet. | |||
1573 | assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0) ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref ] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0" , "ccv_nnc_symbolic_graph_compile.c", 1573, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1574 | const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref); | |||
1575 | tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos; | |||
1576 | ccv_array_push(tensor_arena->m_tensor_idx, &pos); | |||
1577 | } else { | |||
1578 | const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]); | |||
1579 | tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire. | |||
1580 | ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos); | |||
1581 | } | |||
1582 | } | |||
1583 | } | |||
1584 | // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input. | |||
1585 | // This creates the multi-view tensor to achieve that. | |||
1586 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1587 | if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i]) | |||
1588 | { | |||
1589 | const int bypass_ref = tensor_blocks[i].bypass_ref - 1; | |||
1590 | // Create phi multi-view. | |||
1591 | const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t)); | |||
1592 | const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]); | |||
1593 | const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]); | |||
1594 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos); | |||
1595 | ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos); | |||
1596 | ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos); | |||
1597 | ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){ | |||
1598 | intv, | |||
1599 | outv, | |||
1600 | }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv); | |||
1601 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos; | |||
1602 | CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos; | |||
1603 | tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; | |||
1604 | ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos); | |||
1605 | } | |||
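// Each phi multi-view created above holds exactly two views: data[0] is the (flattened)
// bypass input and data[1] is the (flattened) output of this block, anchored with the special
// CCV_NNC_MULTIVIEW_PHI marker so that taking the skip branch of case..of selects the original
// input.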
1606 | // Now it is time to handle aliases. | |||
1607 | for (i = 0; i < alloc_prep->block_size; i++) | |||
1608 | if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size) | |||
1609 | { | |||
1610 | const int block_ref = alloc_prep->blocks[i].block_ref; | |||
1611 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref]) | |||
1612 | { | |||
1613 | // Assigning out the tensor aliases. | |||
1614 | assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref ) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref" , "ccv_nnc_symbolic_graph_compile.c", 1614, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1615 | _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors); | |||
1616 | } | |||
1617 | } | |||
1618 | // Now assigning out the rest of alias refs. | |||
1619 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1620 | // It could be a bound tensor (or unused); in that case, it doesn't have a ref. | |||
1621 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i]) | |||
1622 | { | |||
1623 | int ref = tensor_blocks[i].alias_ref - 1; | |||
1624 | assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c" , 1624, __extension__ __PRETTY_FUNCTION__); })); | |||
1625 | tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref]; | |||
1626 | } | |||
1627 | // Replace the tensor placeholder within the sub arena's multi-view with the input tensor. | |||
1628 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
1629 | if (tensor_arena->sub_arenas[i]) | |||
1630 | { | |||
1631 | const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1; | |||
1632 | const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx; | |||
1633 | for (j = 0; j < node->input_size; j++) | |||
1634 | { | |||
1635 | const int idx = node->inputs[j]; | |||
1636 | const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) + (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t) (i))) - 1 : -1; | |||
1637 | if (s_idx < 0) | |||
1638 | continue; | |||
1639 | ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx]; | |||
1640 | // Only do the replacement if it is a multi-view tensor. | |||
1641 | // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair. | |||
1642 | if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED)) | |||
1643 | { | |||
1644 | // It cannot be a bound tensor. | |||
1645 | assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx ]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena ->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])" , "ccv_nnc_symbolic_graph_compile.c", 1645, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1646 | const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx]; | |||
1647 | const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]); | |||
1648 | ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos); | |||
1649 | // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference | |||
1650 | // to this tensor. | |||
1651 | if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW)) | |||
1652 | { | |||
1653 | const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t)); | |||
1654 | ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos); | |||
1655 | ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)); | |||
1656 | ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos; | |||
1657 | ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos); | |||
1658 | ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data : (multiview)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview )->_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview )->_inline_data)[0]); | |||
1659 | while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW)) | |||
1660 | tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t *)tv)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t *)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)-> _inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t *)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)-> _inline_data)[0]); | |||
1661 | *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0); | |||
1662 | ref_tensor->data = tv->data; | |||
1663 | ref_tensor->dataof = tv->dataof; | |||
1664 | _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos); | |||
1665 | } else | |||
1666 | _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos); | |||
1667 | } | |||
1668 | } | |||
1669 | } | |||
1670 | // After aliases are created, for the case..of statement, we now revert back to a flat tensor rather than a multi-view. | |||
1671 | // No worries though, this new tensor is subscribed to the phi multi-view. Moreover, we have logic | |||
1672 | // when initializing the case..of node that will take the phi multi-view again. | |||
1673 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1674 | if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i]) | |||
1675 | { | |||
1676 | assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena ->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])" , "ccv_nnc_symbolic_graph_compile.c", 1676, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1677 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]); | |||
1678 | assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__ ({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail ( "mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c" , 1678, __extension__ __PRETTY_FUNCTION__); })); | |||
1679 | tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]); | |||
1680 | } | |||
1681 | // Rewire the rest. We can rewire multiple times because we can identify whether a tensor is already wired or not. | |||
1682 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1683 | if (tensor_arena->vt_tensors[i]) | |||
1684 | tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]); | |||
1685 | // Associate multiview tensors from sub arena to the parent. | |||
1686 | if (sub_arena_out_tensors) | |||
1687 | { | |||
1688 | for (i = 0; i < alloc_prep->block_size; i++) | |||
1689 | if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size) | |||
1690 | { | |||
1691 | const int block_ref = alloc_prep->blocks[i].block_ref; | |||
1692 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED)) | |||
1693 | continue; | |||
1694 | int sub_arena_ref = block_ref; | |||
1695 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS)) | |||
1696 | { | |||
1697 | // Assigning out the tensor aliases. | |||
1698 | assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref ) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref" , "ccv_nnc_symbolic_graph_compile.c", 1698, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1699 | const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1; | |||
1700 | // The tensor it references is not an alias. | |||
1701 | assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref ]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]" , "ccv_nnc_symbolic_graph_compile.c", 1701, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1702 | sub_arena_ref = alias_ref; | |||
1703 | if (!sub_arena_out_tensors[sub_arena_ref]) | |||
1704 | continue; | |||
1705 | } | |||
1706 | if (!sub_arena_out_tensors[sub_arena_ref]) | |||
1707 | continue; | |||
1708 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW ) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref); | |||
1709 | assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ? 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW )) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c" , 1709, __extension__ __PRETTY_FUNCTION__); })); | |||
1710 | // This is only possible if this vt_tensor is a phi node. | |||
1711 | if (tensor_arena->vt_tensors[block_ref]->alias_ref) | |||
1712 | { | |||
1713 | // For a phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, set that to be the receiver of the broadcast. | |||
1714 | ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref); | |||
1715 | assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__ ({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail ("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c" , 1715, __extension__ __PRETTY_FUNCTION__); })); | |||
1716 | assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)-> _heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW )) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW )) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])" , "ccv_nnc_symbolic_graph_compile.c", 1716, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1717 | CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data )[1]->alias_ref = (uintptr_t)mv; | |||
1718 | ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data )[1]); | |||
1719 | } else { | |||
1720 | tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv; | |||
1721 | ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]); | |||
1722 | } | |||
1723 | } | |||
1724 | } | |||
1725 | // Go over all the tensors that have assign_ref. If the tensor it is assigned from is: | |||
1726 | // 1). From sub_arena_out_tensors: it is possible that it now points to an area this arena doesn't know about. | |||
1727 | // 2). From a phi multi-view: in this case, this arena won't know ahead of time which memory it is going to use. | |||
1728 | // Therefore, for the above two scenarios, the tensor with assign_ref, even if it is a multiview tensor, needs to subscribe | |||
1729 | // to the output of the assign_ref tensor. | |||
1730 | for (i = 0; i < tensor_symbol_info_size; i++) | |||
1731 | if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref) | |||
1732 | { | |||
1733 | const int assign_ref = tensor_symbol_info[i].assign_ref - 1; | |||
1734 | ccv_nnc_tensor_t* assign_tensor; | |||
1735 | if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref]) | |||
1736 | assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW ) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref; | |||
1737 | else | |||
1738 | assign_tensor = tensor_arena->vt_tensors[assign_ref]; | |||
1739 | ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]); | |||
1740 | } | |||
1741 | // After everything is handled, assert again to make sure the tensors and tensor binds point to the right location. This is really just for assertion. | |||
1742 | for (i = 0; i < tensor_bind_size; i++) | |||
1743 | { | |||
1744 | assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__ ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor" , "ccv_nnc_symbolic_graph_compile.c", 1744, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1745 | const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol); | |||
1746 | if (resolved_symbol.d >= 0) | |||
1747 | { | |||
1748 | int d = resolved_symbol.d; | |||
1749 | // This check is for in-place ops. Only an in-place op could be unassigned but still have a ref. | |||
1750 | // It has nothing to do with alias. | |||
1751 | while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref) | |||
1752 | d = tensor_blocks[d].ref - 1; | |||
1753 | // Note we don't trace back on alias. This is intentional. | |||
1754 | assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ ( { if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds [i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8" , "ccv_nnc_symbolic_graph_compile.c", 1754, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1755 | } | |||
1756 | } | |||
1757 | if (sub_arena_out_tensors) | |||
1758 | ccfreefree(sub_arena_out_tensors); | |||
1759 | // Rewire sub arena's tensor references. | |||
1760 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
1761 | if (tensor_arena->sub_arenas[i]) | |||
1762 | { | |||
1763 | const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1; | |||
1764 | const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx; | |||
1765 | for (j = 0; j < node->input_size; j++) | |||
1766 | { | |||
1767 | const int idx = node->inputs[j]; | |||
1768 | const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) + (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t) (i))) - 1 : -1; | |||
1769 | if (s_idx < 0) | |||
1770 | continue; | |||
1771 | ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx]; | |||
1772 | // Only do the replacement if it is a multi-view tensor. | |||
1773 | // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair. | |||
1774 | if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW)) | |||
1775 | { | |||
1776 | // This is a bound tensor, bind it now. | |||
1777 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED)) | |||
1778 | _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]); | |||
1779 | else | |||
1780 | _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor); | |||
1781 | } | |||
1782 | } | |||
1783 | } | |||
1784 | return tensor_arena; | |||
1785 | } | |||
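/* A minimal, hedged sketch (not part of the original file) of the low-bit tagging that
 * CCV_NNC_IS_METADATA_POS and the rewire pass in the function above rely on: while the
 * tensor metadata pool can still grow (so raw pointers may move), entries are referred to
 * by an odd-valued "position". Bit 0 can never be set on a real, aligned tensor pointer,
 * so a single check tells the two apart, and a second rewire pass is harmless because
 * already rewired entries no longer look like positions. The mock_* name is hypothetical. */
#include <stdint.h>

static inline int mock_is_metadata_pos(const void* const p)
{
	/* Mirrors ((uintptr_t)(p) & 1): odd value means encoded position, even value means real pointer. */
	return (int)((uintptr_t)p & 1);
}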
1786 | ||||
1787 | static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref) | |||
1788 | { | |||
1789 | assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph) ; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c" , 1789, __extension__ __PRETTY_FUNCTION__); })); | |||
1790 | if ((intptr_t)graph == tensor_arena->graph_ref) | |||
1791 | { | |||
1792 | assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena ->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size) ; else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size" , "ccv_nnc_symbolic_graph_compile.c", 1792, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1793 | return tensor_arena->vt_tensors[pair_ref]; | |||
1794 | } | |||
1795 | int i; | |||
1796 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
1797 | if (tensor_arena->sub_arenas[i]) | |||
1798 | { | |||
1799 | ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref); | |||
1800 | if (tensor) | |||
1801 | return tensor; | |||
1802 | } | |||
1803 | return 0; | |||
1804 | } | |||
1805 | ||||
1806 | static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor) | |||
1807 | { | |||
1808 | if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW)) | |||
1809 | tensor->type |= CCV_TAPE_ALLOC; | |||
1810 | else { | |||
1811 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor; | |||
1812 | mv->type |= CCV_TAPE_ALLOC; | |||
1813 | int i; | |||
1814 | for (i = 0; i < mv->repeat + mv->kind; i++) | |||
1815 | _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[i]); | |||
1816 | } | |||
1817 | } | |||
1818 | ||||
1819 | static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena) | |||
1820 | { | |||
1821 | assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep ->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena ->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph" , "ccv_nnc_symbolic_graph_compile.c", 1821, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1822 | int i; | |||
1823 | for (i = 0; i < graph_prep->tensor_symbol_info_size; i++) | |||
1824 | { | |||
1825 | if (graph_prep->tensor_symbol_info[i].pair_ref) | |||
1826 | { | |||
1827 | tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1); | |||
1828 | // No need to continue check this if it is from its pair. | |||
1829 | continue; | |||
1830 | } | |||
1831 | if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i]) | |||
1832 | { | |||
1833 | // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var. | |||
1834 | if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW )) | |||
1835 | { | |||
1836 | const int vt_ref = graph_prep->alloc_prep->vt_blocks[i]; | |||
1837 | if (vt_ref >= 0 && | |||
1838 | TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep ->blocks[vt_ref].buffer_ref].flags & 0xc) == READ_ONLY) | |||
1839 | continue; | |||
1840 | } | |||
1841 | _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]); | |||
1842 | } | |||
1843 | } | |||
1844 | for (i = 0; i < graph_prep->sub_prep_size; i++) | |||
1845 | if (graph_prep->sub_preps[i]) | |||
1846 | _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]); | |||
1847 | } | |||
1848 | ||||
1849 | static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks) | |||
1850 | { | |||
1851 | int i, found = 0; | |||
1852 | // Try to insert head. | |||
1853 | ccv_array_t* head = tensor_blocks.head; | |||
1854 | assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ; else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c" , 1854, __extension__ __PRETTY_FUNCTION__); })); | |||
1855 | for (i = 0; i < head->rnum;) | |||
1856 | { | |||
1857 | const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize * (size_t)(i))); | |||
1858 | if (head_idx == idx) | |||
1859 | { | |||
1860 | found = 1; | |||
1861 | break; | |||
1862 | } | |||
1863 | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx); | |||
1864 | if (cell.i32 && cell.i32[0] > 0) | |||
1865 | { | |||
1866 | /* If the current node is the parent of the head node, check if we found it or not. */ | |||
1867 | /* If not found, replace the current one. */ | |||
1868 | if (!found) | |||
1869 | { | |||
1870 | found = 1; | |||
1871 | *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize * (size_t)(i))) = idx; | |||
1872 | } else { | |||
1873 | /* Remove the current one, change the rnum. */ | |||
1874 | if (i < head->rnum - 1) | |||
1875 | *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize * (size_t)(i))) = *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize * (size_t)(head->rnum - 1))); | |||
1876 | --head->rnum; | |||
1877 | continue; | |||
1878 | } | |||
1879 | } else { | |||
1880 | // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head). | |||
1881 | cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx); | |||
1882 | if (cell.i32 && cell.i32[0] > 0) | |||
1883 | { | |||
1884 | found = 1; | |||
1885 | break; | |||
1886 | } | |||
1887 | } | |||
1888 | /* Advancing i. */ | |||
1889 | ++i; | |||
1890 | } | |||
1891 | /* If not found, push this idx to the end of the array. */ | |||
1892 | if (!found) | |||
1893 | ccv_array_push(head, &idx); | |||
1894 | // Try to insert tail. | |||
1895 | found = 0; | |||
1896 | ccv_array_t* tail = tensor_blocks.tail; | |||
1897 | assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ; else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c" , 1897, __extension__ __PRETTY_FUNCTION__); })); | |||
1898 | for (i = 0; i < tail->rnum;) | |||
1899 | { | |||
1900 | const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize * (size_t)(i))); | |||
1901 | if (tail_idx == idx) | |||
1902 | { | |||
1903 | found = 1; | |||
1904 | break; | |||
1905 | } | |||
1906 | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx); | |||
1907 | if (cell.i32 && cell.i32[0] > 0) | |||
1908 | { | |||
1909 | /* If the current node is the child of the tail node, check if we found it or not. */ | |||
1910 | /* If not found, replace the current one. */ | |||
1911 | if (!found) | |||
1912 | { | |||
1913 | found = 1; | |||
1914 | *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize * (size_t)(i))) = idx; | |||
1915 | } else { | |||
1916 | /* Remove the current one, change the rnum. */ | |||
1917 | *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize * (size_t)(i))) = *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize * (size_t)(tail->rnum - 1))); | |||
1918 | --tail->rnum; | |||
1919 | continue; | |||
1920 | } | |||
1921 | } else { | |||
1922 | // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail). | |||
1923 | cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx); | |||
1924 | if (cell.i32 && cell.i32[0] > 0) | |||
1925 | { | |||
1926 | found = 1; | |||
1927 | break; | |||
1928 | } | |||
1929 | } | |||
1930 | /* Advancing i. */ | |||
1931 | ++i; | |||
1932 | } | |||
1933 | /* If not found, push this idx to the end of the array. */ | |||
1934 | if (!found) | |||
1935 | ccv_array_push(tail, &idx); | |||
1936 | } | |||
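/* A hedged, self-contained sketch of the frontier maintenance performed by
 * _ccv_nnc_tensor_block_add_exec above, with a plain int array standing in for the head
 * ccv_array_t and a reaches(a, b) predicate standing in for the exec_dep sparse-matrix
 * query ("a must execute before b"). A new exec replaces every head it precedes and is
 * dropped when an existing head already precedes it, so the set stays minimal. All names
 * here (mock_insert_head, reaches_f) are illustrative assumptions, not APIs of this file. */
typedef int (*reaches_f)(const int a, const int b);

static int mock_insert_head(int* const head, int size, const int idx, const reaches_f reaches)
{
	/* The caller guarantees head[] has room for one more entry. */
	int i = 0, found = 0;
	while (i < size)
	{
		if (head[i] == idx) { found = 1; break; } /* Already tracked. */
		if (reaches(idx, head[i])) {
			if (!found) { head[i] = idx; found = 1; ++i; } /* idx starts earlier: take this slot. */
			else head[i] = head[--size]; /* Redundant head: remove by swapping with the last. */
		} else if (reaches(head[i], idx)) {
			found = 1; break; /* An existing head already precedes idx; nothing to add. */
		} else
			++i; /* Incomparable with this head; keep both. */
	}
	if (!found)
		head[size++] = idx; /* No ordering with any head: idx becomes a new frontier entry. */
	return size; /* The (possibly changed) frontier size. */
}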
1937 | ||||
1938 | ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol) | |||
1939 | { | |||
1940 | if ((intptr_t)symbol.graph == tensor_arena->graph_ref) | |||
1941 | { | |||
1942 | assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena ->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size) ; else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size" , "ccv_nnc_symbolic_graph_compile.c", 1942, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1943 | ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d]; | |||
1944 | if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW)) | |||
1945 | { | |||
1946 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor; | |||
1947 | while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) | |||
1948 | mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data )[0]); | |||
1949 | return (ccv_nnc_tensor_t*)mv; | |||
1950 | } | |||
1951 | return tensor; | |||
1952 | } | |||
1953 | int i; | |||
1954 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
1955 | if (tensor_arena->sub_arenas[i]) | |||
1956 | { | |||
1957 | ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol); | |||
1958 | if (tensor) | |||
1959 | return tensor; | |||
1960 | } | |||
1961 | return 0; | |||
1962 | } | |||
1963 | ||||
1964 | ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol) | |||
1965 | { | |||
1966 | if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref) | |||
1967 | { | |||
1968 | assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena ->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size ) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size" , "ccv_nnc_symbolic_graph_compile.c", 1968, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1969 | return graph_exec_arena->graph_execs[symbol.d]; | |||
1970 | } | |||
1971 | int i; | |||
1972 | for (i = 0; i < graph_exec_arena->sub_arena_size; i++) | |||
1973 | if (graph_exec_arena->sub_arenas[i]) | |||
1974 | { | |||
1975 | ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol); | |||
1976 | if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0)) | |||
1977 | return exec; | |||
1978 | } | |||
1979 | return (ccv_nnc_graph_exec_t){}; // 0. | |||
1980 | } | |||
1981 | ||||
1982 | ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena) | |||
1983 | { | |||
1984 | return graph_exec_arena->source; | |||
1985 | } | |||
1986 | ||||
1987 | ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena) | |||
1988 | { | |||
1989 | return graph_exec_arena->destination; | |||
1990 | } | |||
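/* A hedged usage sketch for the four lookup helpers defined above. It assumes `tensor_arena`,
 * `graph_exec_arena` and the two symbols were produced elsewhere (e.g. when the symbolic
 * graph was compiled); the function name mock_lookup is hypothetical and only shows how a
 * caller typically maps symbols back to the concrete objects the arenas allocated. */
static void mock_lookup(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_tensor_symbol_t tensor_symbol, const ccv_nnc_graph_exec_symbol_t exec_symbol)
{
	/* Symbol to concrete tensor; descends sub-arenas and multi-views to the currently active view. */
	ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(tensor_arena, tensor_symbol);
	/* Symbol to concrete graph exec; a zero'ed exec (graph == 0) signals "not found". */
	const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
	/* The source / destination execs recorded for the compiled graph. */
	const ccv_nnc_graph_exec_t source = ccv_nnc_graph_exec_source(graph_exec_arena);
	const ccv_nnc_graph_exec_t destination = ccv_nnc_graph_exec_destination(graph_exec_arena);
	(void)tensor; (void)exec; (void)source; (void)destination; /* Silence unused warnings in this sketch. */
}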
1991 | ||||
1992 | // Check whether the head is the beginning of this block. | |||
1993 | static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node) | |||
1994 | { | |||
1995 | assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__ ({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head" , "ccv_nnc_symbolic_graph_compile.c", 1995, __extension__ __PRETTY_FUNCTION__ ); })); | |||
1996 | return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t )(tensor_block->head)->rsize * (size_t)(0))) == head_node); | |||
1997 | } | |||
1998 | ||||
1999 | // Check whether the tail is the end of this block. | |||
2000 | static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node) | |||
2001 | { | |||
2002 | assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__ ({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail" , "ccv_nnc_symbolic_graph_compile.c", 2002, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2003 | return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t )(tensor_block->tail)->rsize * (size_t)(0))) == tail_node); | |||
2004 | } | |||
2005 | ||||
2006 | // Make two tensor blocks one. Return 1 if that happened. | |||
2007 | static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1) | |||
2008 | { | |||
2009 | // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output. | |||
2010 | if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) && | |||
2011 | (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) && | |||
2012 | tensor_blocks[p_ref_0].tail->rnum == 1 && | |||
2013 | tensor_blocks[p_ref_1].head->rnum == 1 && | |||
2014 | tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type. | |||
2015 | *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + ( size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0)) ) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + ( size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0)) )) | |||
2016 | { | |||
2017 | // If the two parent refs match (thus, they meet at the same node), we can concatenate them and mark one as a ref. This is very similar to in-place operation combining. | |||
2018 | assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3) == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks [p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks [p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c" , 2018, __extension__ __PRETTY_FUNCTION__); })); | |||
2019 | assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3) == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks [p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks [p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c" , 2019, __extension__ __PRETTY_FUNCTION__); })); | |||
2020 | ccv_array_free(tensor_blocks[p_ref_0].tail); | |||
2021 | tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail; | |||
2022 | if (tensor_blocks[p_ref_1].p_refs[0]) | |||
2023 | { | |||
2024 | assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] == 0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0" , "ccv_nnc_symbolic_graph_compile.c", 2024, __extension__ __PRETTY_FUNCTION__ ); })); // It simply cannot have more than one p_refs, otherwise we cannot merge. | |||
2025 | if (!tensor_blocks[p_ref_0].p_refs[0]) | |||
2026 | tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0]; | |||
2027 | else | |||
2028 | tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0]; | |||
2029 | } | |||
2030 | tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem; | |||
2031 | TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags & ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks [p_ref_1].flags & 0xc))); | |||
2032 | ccv_array_free(tensor_blocks[p_ref_1].head); | |||
2033 | if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT)) | |||
2034 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags | UNFOLDABLE_AS_INPUT)); | |||
2035 | // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now. | |||
2036 | TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags & ~0x3) | UNASSIGNED)); | |||
2037 | tensor_blocks[p_ref_1].ref = p_ref_0 + 1; | |||
2038 | if (!tensor_blocks[p_ref_0].r_refs) | |||
2039 | tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0); | |||
2040 | ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1); | |||
2041 | tensor_blocks[p_ref_1].size = 0; | |||
2042 | tensor_blocks[p_ref_1].head = 0; | |||
2043 | tensor_blocks[p_ref_1].tail = 0; | |||
2044 | return 1; | |||
2045 | } | |||
2046 | return 0; | |||
2047 | } | |||
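/* An illustrative, hedged reduction of _ccv_nnc_tensor_blocks_try_fold above to its core idea:
 * when the input block's lifetime ends at exactly the exec node where the output block's
 * lifetime begins (the real code additionally checks matching types and the UNFOLDABLE flags),
 * the two intervals are merged and the output simply reuses the input's storage.
 * The mock_interval_t type and mock_try_fold function are assumptions for this sketch only. */
typedef struct {
	int birth;  /* exec index where the block becomes live (single head) */
	int death;  /* exec index where the block dies (single tail) */
	int folded; /* non-zero once this block reuses another block's memory */
} mock_interval_t;

static int mock_try_fold(mock_interval_t* const in, mock_interval_t* const out)
{
	if (in->death != out->birth) /* They must meet at the same exec node. */
		return 0;
	in->death = out->death; /* The input block now covers the output's lifetime too. */
	out->folded = 1; /* Mirrors tensor_blocks[p_ref_1].ref pointing back at p_ref_0. */
	return 1;
}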
2048 | ||||
2049 | static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks) | |||
2050 | { | |||
2051 | int i, j, k; | |||
2052 | // Generate exec dependencies (or, in other words, partial ordering of executions). | |||
2053 | ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0); | |||
2054 | int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2); | |||
2055 | int buf_size; | |||
2056 | if (p_node_info) | |||
2057 | { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({ if (output_size == 0) ; else __assert_fail ("output_size == 0" , "ccv_nnc_symbolic_graph_compile.c", 2057, __extension__ __PRETTY_FUNCTION__ ); })); } | |||
2058 | #define for_block(x, val) \ | |||
2059 | do { \ | |||
2060 | if (((int32_t*)val)[0] > 0) \ | |||
2061 | { \ | |||
2062 | buf[buf_size * 2] = x; \ | |||
2063 | buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \ | |||
2064 | ++buf_size; \ | |||
2065 | } \ | |||
2066 | } while (0) | |||
2067 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int term __attribute__((unused)) = (visit)->node[_i_ ].term; typeof ((exec_symbol_info)) const node __attribute__( (unused)) = (exec_symbol_info) + idx; { | |||
2068 | buf_size = 0; /* save all its parent deps to this buffer */ | |||
2069 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx); | |||
2070 | if (vector) | |||
2071 | CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S : { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f64 + (0))); } } } while (0); break; } default: { do { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep )->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR ) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block ((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const size_t _idx_size_ = 
sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size [(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep )->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector )->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.u8 + (0))); } } } while (0); } } } while (0); | |||
2072 | if (!node->outgoings) | |||
2073 | continue; | |||
2074 | for (i = 0; i < node->outgoings->rnum; i++) | |||
2075 | { | |||
2076 | int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)( node->outgoings)->rsize * (size_t)(i))); | |||
2077 | const int32_t one = 1; | |||
2078 | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx); | |||
2079 | /* If not found, set it. If the current node is the destination node, there is no need | |||
2080 | * to set itself as the parent of subsequent nodes because of its terminal nature. */ | |||
2081 | if (!cell.i32 || cell.i32[0] == 0) | |||
2082 | ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one); | |||
2083 | if (buf_size > 0) | |||
2084 | { | |||
2085 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing); | |||
2086 | assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector ) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c" , 2086, __extension__ __PRETTY_FUNCTION__); })); | |||
2087 | for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */ | |||
2088 | { | |||
2089 | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]); | |||
2090 | /* If not found, set */ | |||
2091 | if (!cell.i32 || cell.i32[0] == 0) | |||
2092 | ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]); | |||
2093 | else { | |||
2094 | /* Otherwise, set to the longest one */ | |||
2095 | int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2 + 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; }); | |||
2096 | ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep); | |||
2097 | } | |||
2098 | } | |||
2099 | } | |||
2100 | } | |||
2101 | } ccv_nnc_graph_visit_endfor} } | |||
2102 | #undef for_block | |||
2103 | ccfreefree(buf); | |||
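/* A small worked example (illustrative only, not from the original source) of what the pass
 * above stores: exec_dep cell (a, b) > 0 means exec b must run before exec a, and the value
 * is the longest known hop count from b to a. For a chain s -> m -> t visited in that order:
 *   exec_dep(m, s) = 1   direct edge
 *   exec_dep(t, m) = 1   direct edge
 *   exec_dep(t, s) = 2   inherited from m's dependency on s, incremented by one
 * Later queries mostly just test these cells for being > 0. */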
2104 | // This struct is allocated earlier to collect information about the tensor's expected start / end execs. | |||
2105 | const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum; | |||
2106 | ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t)); | |||
2107 | // The reason is that we need to make every one of them unassigned unless it is used somewhere. It | |||
2108 | // happens that we have to loop through all relevant nodes to find out whether one is used or not. | |||
2109 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2110 | tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref; | |||
2111 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
2112 | for (i = 0; i < node->input_size; i++) | |||
2113 | if (node->inputs[i] >= 0) | |||
2114 | { | |||
2115 | tensor_blocks[node->inputs[i]].flags = 0; | |||
2116 | // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned memory. | |||
2117 | // This will get propagated back to the buffer, and used there to determine the allocation function to use. | |||
2118 | if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY && | |||
2119 | (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD)) | |||
2120 | tensor_blocks[node->inputs[i]].pin_mem = 1; | |||
2121 | } | |||
2122 | for (i = 0; i < node->output_size; i++) | |||
2123 | if (node->outputs[i] >= 0) | |||
2124 | { | |||
2125 | tensor_blocks[node->outputs[i]].flags = 0; | |||
2126 | // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned memory. | |||
2127 | // This will get propagated back to the buffer, and used there to determine the allocation function to use. | |||
2128 | if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY && | |||
2129 | (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD)) | |||
2130 | tensor_blocks[node->outputs[i]].pin_mem = 1; | |||
2131 | } | |||
2132 | } ccv_nnc_graph_visit_endfor} } | |||
2133 | if (p_node_info) | |||
2134 | { | |||
2135 | assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__ ({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info" , "ccv_nnc_symbolic_graph_compile.c", 2135, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2136 | // Mark it as used if it is used in either input or output. | |||
2137 | for (i = 0; i < p_node_info->input_size; i++) | |||
2138 | if (p_node_info->inputs[i] >= 0) | |||
2139 | { | |||
2140 | const int d = p_node_info->inputs[i]; | |||
2141 | if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx) | |||
2142 | { | |||
2143 | const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) + (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t) (symbolic_graph->p_idx - 1))) - 1; | |||
2144 | if (dd >= 0) // If this exists in this sub-graph, great. | |||
2145 | tensor_blocks[dd].flags = 0; | |||
2146 | } | |||
2147 | } | |||
2148 | for (i = 0; i < p_node_info->output_size; i++) | |||
2149 | if (p_node_info->outputs[i] >= 0) | |||
2150 | { | |||
2151 | const int d = p_node_info->outputs[i]; | |||
2152 | if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx) | |||
2153 | { | |||
2154 | const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) + (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t) (symbolic_graph->p_idx - 1))) - 1; | |||
2155 | if (dd >= 0) // If this exists in this sub-graph, great. | |||
2156 | tensor_blocks[dd].flags = 0; | |||
2157 | } | |||
2158 | } | |||
2159 | } | |||
2160 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2161 | if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) | |||
2162 | { | |||
2163 | // Check no tensor info is auto now. | |||
2164 | assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i ].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto (tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)" , "ccv_nnc_symbolic_graph_compile.c", 2164, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2165 | // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter, | |||
2166 | // therefore its life-cycle almost certainly won't concatenate properly with the tensor to | |||
2167 | // fold into). | |||
2168 | if (tensor_symbol_info[i].assign_ref) | |||
2169 | { | |||
2170 | // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]); | |||
2171 | // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input, | |||
2172 | // it keeps its own representation, which is not the case for output). | |||
2173 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT )); | |||
2174 | const int assign_ref = tensor_symbol_info[i].assign_ref - 1; | |||
2175 | // But the tensor it comes from cannot be folded as input, because it cannot be overwritten at any time. | |||
2176 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref] .flags | UNFOLDABLE_AS_INPUT)); | |||
2177 | // It also cannot be folded as output (except i), because we need to keep its own representation. | |||
2178 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref] .flags | UNFOLDABLE_AS_OUTPUT)); | |||
2179 | assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref == 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref ].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0" , "ccv_nnc_symbolic_graph_compile.c", 2179, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2180 | tensor_blocks[assign_ref].unfoldable_except_ref = i + 1; | |||
2181 | for (j = 0; j < unroll_count; j++) | |||
2182 | { | |||
2183 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]] .flags | UNFOLDABLE_AS_INPUT)); | |||
2184 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]] .flags | UNFOLDABLE_AS_OUTPUT)); | |||
2185 | } | |||
2186 | if (tensor_blocks[assign_ref].bypass_ref) | |||
2187 | { | |||
2188 | // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable. | |||
2189 | tensor_blocks[assign_ref].unfoldable_except_ref = 0; | |||
2190 | const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1; | |||
2191 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref] .flags | UNFOLDABLE_AS_INPUT)); | |||
2192 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref] .flags | UNFOLDABLE_AS_OUTPUT)); | |||
2193 | // On the other hand, it can be folded into the except_ref for the bypass_ref. | |||
2194 | tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1; | |||
2195 | if (dup_tensor_from_ref) | |||
2196 | { | |||
2197 | const int bypass_from_ref = dup_tensor_from_ref[bypass_ref]; | |||
2198 | if (bypass_from_ref >= 0) | |||
2199 | { | |||
2200 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref ].flags | UNFOLDABLE_AS_INPUT)); | |||
2201 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref ].flags | UNFOLDABLE_AS_OUTPUT)); | |||
2202 | assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ ( { if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref" , "ccv_nnc_symbolic_graph_compile.c", 2202, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2203 | for (j = 0; j < unroll_count - 1; j++) | |||
2204 | { | |||
2205 | // Mark every incarnation as un-foldable. | |||
2206 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]].flags | UNFOLDABLE_AS_INPUT)); | |||
2207 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT)); | |||
2208 | } | |||
2209 | } | |||
2210 | } | |||
2211 | } | |||
2212 | } | |||
2213 | } | |||
2214 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2215 | { | |||
2216 | // If it has a pair reference, we don't need to allocate this tensor at all, | |||
2217 | // set it to be unassigned. | |||
2218 | if (tensor_symbol_info[i].pair_ref) | |||
2219 | TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3 ) | UNASSIGNED)); | |||
2220 | // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use the tape properly). | |||
2221 | else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) { | |||
2222 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT )); | |||
2223 | TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT )); | |||
2224 | // For this case, there is no exception. | |||
2225 | tensor_blocks[i].unfoldable_except_ref = 0; | |||
2226 | } else if (tensor_symbol_info[i].p_ref) { | |||
2227 | assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if ( p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c" , 2227, __extension__ __PRETTY_FUNCTION__); })); | |||
2228 | const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info); | |||
2229 | // If this is a case..of graph, and this tensor is an input from the parent graph, it cannot be folded as input. | |||
2230 | if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) | |||
2231 | // TODO: This check can be lifted if we can fold in the parent graph. | |||
2232 | if (-1 == p_ref_is_in_or_out) | |||
2233 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT )); | |||
2234 | if (1 == p_ref_is_in_or_out) // If p_ref is an output, it cannot be folded as input. | |||
2235 | TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT )); | |||
2236 | } | |||
2237 | } | |||
2238 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2239 | { | |||
2240 | if (tensor_symbol_info[i].alias_ref) | |||
2241 | { | |||
2242 | const int ref = tensor_symbol_info[i].alias_ref - 1; | |||
2243 | // If the referenced one is unassigned, mark it as assigned only if the current one is assigned. | |||
2244 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) | |||
2245 | tensor_blocks[ref].flags = 0; | |||
2246 | // An alias cannot reference another alias. | |||
2247 | assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0) , __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else __assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c" , 2247, __extension__ __PRETTY_FUNCTION__); })); | |||
2248 | tensor_blocks[i].flags = ALIAS; | |||
2249 | tensor_blocks[i].ref = ref + 1; // Assign the ref. | |||
2250 | if (!tensor_blocks[ref].r_refs) | |||
2251 | tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0); | |||
2252 | ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1); | |||
2253 | } | |||
2254 | } | |||
2255 | // Scan again, and if the ref is not assigned, mark the alias as not assigned. | |||
2256 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2257 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS)) | |||
2258 | { | |||
2259 | const int ref = tensor_blocks[i].ref - 1; | |||
2260 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) | |||
2261 | { | |||
2262 | // Mark this as unassigned. | |||
2263 | tensor_blocks[i].flags = UNASSIGNED; | |||
2264 | tensor_blocks[i].ref = 0; | |||
2265 | } | |||
2266 | } | |||
2267 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2268 | { | |||
2269 | // If this tensor is not expected to be unassigned, allocate its head and tail arrays. | |||
2270 | if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !( (tensor_blocks[i].flags & 0x3) == UNASSIGNED))) | |||
2271 | { | |||
2272 | tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0); | |||
2273 | tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0); | |||
2274 | // Cache tensor size (align to 16 bytes). | |||
2275 | tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info); | |||
2276 | } | |||
2277 | // If there is a p_ref, add it to the p_refs list. | |||
2278 | if (tensor_symbol_info[i].p_ref) | |||
2279 | tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref; | |||
2280 | } | |||
2281 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
2282 | for (i = 0; i < node->input_size; i++) | |||
2283 | { | |||
2284 | int d = node->inputs[i]; | |||
2285 | if (d < 0) | |||
2286 | continue; | |||
2287 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS)) | |||
2288 | d = tensor_symbol_info[d].alias_ref - 1; | |||
2289 | tensor_blocks[d].flags |= READ_ONLY; | |||
2290 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED)) | |||
2291 | continue; | |||
2292 | assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS ) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED ))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags & 0x3) == ALIAS) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])" , "ccv_nnc_symbolic_graph_compile.c", 2292, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2293 | /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph, | |||
2294 | * lives from the very beginning of the graph life-cycle, and ends here). */ | |||
2295 | if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS ) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES ))) | |||
2296 | { | |||
2297 | for (j = 0; j < source_size; j++) | |||
2298 | { | |||
2299 | // If the source is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary). | |||
2300 | const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d); | |||
2301 | if (cell.i32 && cell.i32[0] > 0) | |||
2302 | _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]); | |||
2303 | } | |||
2304 | /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a | |||
2305 | * sub-graph (TODO: this condition can be lifted for a case..of that is never in a while | |||
2306 | * loop; however, in that case, you need to prevent the read-only block from being reused for the | |||
2307 | * output tensor, which is not obvious how to implement correctly), and it is not | |||
2308 | * assign_ref'd from anywhere (not a parameterized loop), then we cannot reuse this region | |||
2309 | * of memory anyway (because on the second loop, we want to read the same value out). | |||
2310 | * Mark it as alive to the end of the graph. */ | |||
2311 | if (p_node_info && !tensor_symbol_info[d].assign_ref) | |||
2312 | for (j = 0; j < destination_size; j++) | |||
2313 | { | |||
2314 | // If the destination is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary). | |||
2315 | const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx); | |||
2316 | if (cell.i32 && cell.i32[0] > 0) | |||
2317 | _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]); | |||
2318 | } | |||
2319 | } | |||
2320 | _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]); | |||
2321 | } | |||
2322 | for (i = 0; i < node->output_size; i++) | |||
2323 | { | |||
2324 | int d = node->outputs[i]; | |||
2325 | if (d < 0) | |||
2326 | continue; | |||
2327 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS)) | |||
2328 | d = tensor_symbol_info[d].alias_ref - 1; | |||
2329 | tensor_blocks[d].flags |= WRITE_ONLY; | |||
2330 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED)) | |||
2331 | continue; | |||
2332 | assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS ) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED ))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags & 0x3) == ALIAS) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])" , "ccv_nnc_symbolic_graph_compile.c", 2332, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2333 | _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]); | |||
2334 | } | |||
2335 | } ccv_nnc_graph_visit_endfor} } | |||
2336 | // For any assign_ref, its life-time is kept until the end so it can wrap over. | |||
2337 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2338 | // If this tensor is not unassigned (or alias) and it is assigned from somewhere else, | |||
2339 | // that "somewhere else" needs to keep its life-time till the end. | |||
2340 | if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !( (tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && | |||
2341 | p_node_info && tensor_symbol_info[i].assign_ref) | |||
2342 | { | |||
2343 | const int assign_ref = tensor_symbol_info[i].assign_ref - 1; | |||
2344 | for (j = 0; j < destination_size; j++) | |||
2345 | { | |||
2346 | // This logic is to be more conservative about which destination we add to. | |||
2347 | // As of now, if we add everything, it is most likely fine. However, it may | |||
2348 | // cause issues in the future to do so naively. Thus, instead, we only add | |||
2349 | // the destination to it iff either the tensor is not used at all, or the | |||
2350 | // destination is on the same stream as the tensor block in some way. | |||
2351 | int flag = !tensor_blocks[assign_ref].tail; | |||
2352 | for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++) | |||
2353 | { | |||
2354 | const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data)) + (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t )(k))); | |||
2355 | const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx); | |||
2356 | flag = (cell.i32 && cell.i32[0] > 0); | |||
2357 | } | |||
2358 | if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream as this tensor block somehow. | |||
2359 | _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]); | |||
2360 | } | |||
2361 | } | |||
2362 | for (i = 0; i < output_size; i++) | |||
2363 | { | |||
2364 | assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0) , __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else __assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c" , 2364, __extension__ __PRETTY_FUNCTION__); })); | |||
2365 | int d = outputs[i].d; | |||
2366 | if (d < 0) | |||
2367 | continue; | |||
2368 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS)) | |||
2369 | d = tensor_symbol_info[d].alias_ref - 1; | |||
2370 | if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED)) | |||
2371 | continue; | |||
2372 | assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS ) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED ))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags & 0x3) == ALIAS) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])" , "ccv_nnc_symbolic_graph_compile.c", 2372, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2373 | for (j = 0; j < destination_size; j++) | |||
2374 | { | |||
2375 | int flag = !tensor_blocks[d].tail; | |||
2376 | for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++) | |||
2377 | { | |||
2378 | const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t )(tensor_blocks[d].tail)->rsize * (size_t)(k))); | |||
2379 | const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx); | |||
2380 | flag = (cell.i32 && cell.i32[0] > 0); | |||
2381 | } | |||
2382 | if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream as this tensor block somehow. | |||
2383 | _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]); | |||
2384 | } | |||
2385 | } | |||
2386 | // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done. | |||
2387 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
2388 | int x, y; | |||
2389 | for (x = 0; x < node->input_size; x++) | |||
2390 | for (y = 0; y < node->output_size; y++) | |||
2391 | /* Some operations enforce some tensors to be the same for inputs / outputs. */ | |||
2392 | if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size)) | |||
2393 | { | |||
2394 | // If both are unassigned, it is fine. | |||
2395 | if (node->inputs[x] < 0 && node->outputs[y] < 0) | |||
2396 | continue; | |||
2397 | int ref = node->inputs[x]; | |||
2398 | assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if ( ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c" , 2398, __extension__ __PRETTY_FUNCTION__); })); | |||
2399 | while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && ! ((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref) | |||
2400 | ref = tensor_blocks[ref].ref - 1; | |||
2401 | const int node_output_y = node->outputs[y]; | |||
2402 | assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__ ({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0" , "ccv_nnc_symbolic_graph_compile.c", 2402, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2403 | // If both are not computable, it is fine, we don't need to enforce. | |||
2404 | if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && ! ((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && | |||
2405 | !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) && !((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED ))) | |||
2406 | continue; | |||
2407 | // Otherwise, enforce and error out if failed. | |||
2408 | if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y)) | |||
2409 | { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors" ) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors" ) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\"" , "ccv_nnc_symbolic_graph_compile.c", 2409, __extension__ __PRETTY_FUNCTION__ ); })); } | |||
2410 | } | |||
2411 | } ccv_nnc_graph_visit_endfor} } | |||
2412 | // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because | |||
2413 | // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor | |||
2414 | // that is not enforced in-place (because the tensor enforced in-place will be different from the | |||
2415 | // bound one). | |||
2416 | for (i = 0; i < tensor_bind_size; i++) | |||
2417 | { | |||
2418 | const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol); | |||
2419 | // If a tensor is bound here, then the block is marked unassigned. | |||
2420 | if (resolved_symbol.d >= 0) | |||
2421 | { | |||
2422 | int d = resolved_symbol.d; | |||
2423 | // I cannot assert too much at this moment. | |||
2424 | if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS)) | |||
2425 | d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original. | |||
2426 | // This check is for in-place ops. Only an in-place op could leave a block unassigned but with a ref. | |||
2427 | // It has nothing to do with aliases. | |||
2428 | while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref) | |||
2429 | d = tensor_blocks[d].ref - 1; | |||
2430 | // Doesn't work if this is a loop carrying variable. | |||
2431 | assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0), __extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else __assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c" , 2431, __extension__ __PRETTY_FUNCTION__); })); | |||
2432 | tensor_blocks[d].flags = UNASSIGNED; | |||
2433 | tensor_blocks[d].ref = 0; // No need to have ref as well. | |||
2434 | } | |||
2435 | } | |||
2436 | // Maximize tensor reuse by collapsing tensors where in-place operation is allowed (and the start / end tensors match). | |||
2437 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
2438 | int x, y; | |||
2439 | for (x = 0; x < node->input_size; x++) | |||
2440 | { | |||
2441 | /* If the input is not assigned, it can be referenced, find the referenced one */ | |||
2442 | int ref = node->inputs[x]; | |||
2443 | if (ref < 0) | |||
2444 | continue; | |||
2445 | const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref]; | |||
2446 | while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && ! ((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref) | |||
2447 | ref = tensor_blocks[ref].ref - 1; | |||
2448 | assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0" , "ccv_nnc_symbolic_graph_compile.c", 2448, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2449 | if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && ! ((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && | |||
2450 | tensor_blocks[ref].tail->rnum == 1) | |||
2451 | { | |||
2452 | for (y = 0; y < node->output_size; y++) | |||
2453 | /* Only proceed if the input symbol is different from the output symbol, */ | |||
2454 | /* and the input symbol meets the output symbol exactly at the same spot. */ | |||
2455 | if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) && | |||
2456 | node->outputs[y] >= 0 && | |||
2457 | ref != node->outputs[y] && | |||
2458 | TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS ) && !((tensor_blocks[node->outputs[y]].flags & 0x3) == UNASSIGNED))) | |||
2459 | { | |||
2460 | const int node_output_y = node->outputs[y]; | |||
2461 | const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y]; | |||
2462 | /* If the dimensions match perfectly, then we can assign y_symbol to x. | |||
2463 | * If both of them are aliases, make sure their origins match in size too. */ | |||
2464 | if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0) | |||
2465 | { | |||
2466 | _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y); | |||
2467 | // This refers to an alias itself; mark it now and it will be processed later. | |||
2468 | if (ref != node->inputs[x]) | |||
2469 | tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1; | |||
2470 | } | |||
2471 | } | |||
2472 | } | |||
2473 | } | |||
2474 | } ccv_nnc_graph_visit_endfor} } | |||
2475 | // Specifically handle the bypass. This needs to be done after the first pass. | |||
2476 | // I need to extend the bypass's life-time to match the one I am going with. | |||
2477 | // It is important that we visit these nodes and assign bypass_ref to their dependents in topological order. | |||
2478 | ccv_nnc_tensor_block_t empty_block = {}; | |||
2479 | empty_block.head = ccv_array_new(sizeof(int), 0, 0); | |||
2480 | empty_block.tail = ccv_array_new(sizeof(int), 0, 0); | |||
2481 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
2482 | if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) | |||
2483 | { | |||
2484 | int can_bypass = 1; | |||
2485 | for (i = 0; can_bypass && i < node->output_size; i++) | |||
2486 | { | |||
2487 | int d = node->outputs[i]; | |||
2488 | if (d < 0) | |||
2489 | continue; | |||
2490 | if (!tensor_blocks[d].bypass_ref) | |||
2491 | continue; | |||
2492 | while (tensor_blocks[d].ref) | |||
2493 | d = tensor_blocks[d].ref - 1; | |||
2494 | int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1; | |||
2495 | while (tensor_blocks[bypass_ref].ref) | |||
2496 | bypass_ref = tensor_blocks[bypass_ref].ref - 1; | |||
2497 | // If this doesn't participate in the while loop, we don't need to check the while loop constraint. | |||
2498 | if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref) | |||
2499 | continue; | |||
2500 | ccv_array_clear(empty_block.head); | |||
2501 | for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++) | |||
2502 | ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data)) + (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t )(j)))); | |||
2503 | ccv_array_clear(empty_block.tail); | |||
2504 | for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++) | |||
2505 | ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data)) + (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t )(j)))); | |||
2506 | for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++) | |||
2507 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t )(tensor_blocks[d].head)->rsize * (size_t)(j))), empty_block); | |||
2508 | for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++) | |||
2509 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t )(tensor_blocks[d].tail)->rsize * (size_t)(j))), empty_block); | |||
2510 | // It can only be unfoldable due to the while constraint. Check whether this satisfies the while loop constraint. | |||
2511 | assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__ ({ if (!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail ("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)" , "ccv_nnc_symbolic_graph_compile.c", 2511, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2512 | int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1; | |||
2513 | while (tensor_blocks[b_ref].ref) | |||
2514 | b_ref = tensor_blocks[b_ref].ref - 1; | |||
2515 | int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]); | |||
2516 | int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block); | |||
2517 | // These two can be assigned to the same region of memory without issue (because their life-times don't interfere) | |||
2518 | // even after we extend the life-time of bypass_ref. Then we are in good shape. | |||
2519 | can_bypass = can_bypass && (a_hop_b || b_hop_a); | |||
2520 | } | |||
2521 | if (can_bypass) | |||
2522 | { | |||
2523 | for (i = 0; i < node->output_size; i++) | |||
2524 | { | |||
2525 | int d = node->outputs[i]; | |||
2526 | if (d < 0) | |||
2527 | continue; | |||
2528 | if (!tensor_blocks[d].bypass_ref) | |||
2529 | continue; | |||
2530 | while (tensor_blocks[d].ref) | |||
2531 | d = tensor_blocks[d].ref - 1; | |||
2532 | int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1; | |||
2533 | while (tensor_blocks[bypass_ref].ref) | |||
2534 | bypass_ref = tensor_blocks[bypass_ref].ref - 1; | |||
2535 | // The bypass_ref can extend its life-time. | |||
2536 | for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++) | |||
2537 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t )(tensor_blocks[d].head)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]); | |||
2538 | for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++) | |||
2539 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t )(tensor_blocks[d].tail)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]); | |||
2540 | } | |||
2541 | } else { | |||
2542 | for (i = 0; i < node->output_size; i++) | |||
2543 | tensor_blocks[node->outputs[i]].bypass_ref = 0; | |||
2544 | const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx; | |||
2545 | // Mark this exec as no bypass IO (thus, I need to insert explicit data transfers). | |||
2546 | exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO; | |||
2547 | } | |||
2548 | } | |||
2549 | } ccv_nnc_graph_visit_endfor} } | |||
2550 | ccv_array_free(empty_block.head); | |||
2551 | ccv_array_free(empty_block.tail); | |||
2552 | *r_exec_dep = exec_dep; | |||
2553 | *r_tensor_blocks = tensor_blocks; | |||
2554 | } | |||
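/* Illustrative sketch only (not part of this file's compile path): the pass above
 * extends a tensor block's life-time by adding exec nodes to its head / tail and
 * decides whether two blocks interfere. Assuming a single linear (topologically
 * sorted) execution order, the same idea reduces to interval bookkeeping; the real
 * code answers the interference question with _ccv_nnc_tensor_block_head_after_tail
 * over the exec_dep sparse matrix, because a block can have multiple heads / tails.
 * All toy_* names below are made up for illustration. */
#include <assert.h>

typedef struct {
	int head; /* earliest exec (topological position) touching the block */
	int tail; /* latest exec (topological position) touching the block */
} toy_block_t;

/* Counterpart of _ccv_nnc_tensor_block_add_exec: widen the life-time to cover exec. */
static void toy_block_add_exec(toy_block_t* const block, const int exec)
{
	if (exec < block->head)
		block->head = exec;
	if (exec > block->tail)
		block->tail = exec;
}

/* Counterpart of the (a_hop_b || b_hop_a) test: memory can be shared iff one
 * block's head comes strictly after the other block's tail. */
static int toy_blocks_can_share(const toy_block_t a, const toy_block_t b)
{
	return a.head > b.tail || b.head > a.tail;
}

int main(void)
{
	toy_block_t a = { .head = 0, .tail = 2 };
	const toy_block_t b = { .head = 5, .tail = 7 };
	assert(toy_blocks_can_share(a, b)); /* disjoint life-times: foldable */
	toy_block_add_exec(&a, 6); /* extending a's life-time (e.g. for a bypass)... */
	assert(!toy_blocks_can_share(a, b)); /* ...makes the two blocks interfere */
	return 0;
}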
2555 | ||||
2556 | static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd) | |||
2557 | { | |||
2558 | if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) | |||
2559 | { | |||
2560 | ccv_nnc_cmd_t retval = cmd; | |||
2561 | retval.cmd = CCV_NNC_NOOP; | |||
2562 | return retval; | |||
2563 | } | |||
2564 | return cmd; | |||
2565 | } | |||
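/* Sketch of the substitution-callback pattern used by the function above: later in
 * this file the callback is handed to ccv_nnc_symbolic_graph_dup so that sub-graphs
 * become noops in the duplicated (unrolled) graph. The toy below maps an analogous
 * callback over an array of integer command ids; the toy command ids are invented
 * for illustration and do not correspond to real CCV_NNC_* values. */
#include <assert.h>

typedef int (*toy_subst_f)(int cmd);

static int toy_subst_sub_graph_with_noop(const int cmd)
{
	/* 1 / 2 stand in for "graph forward / backward", 0 stands in for "noop". */
	return (cmd == 1 || cmd == 2) ? 0 : cmd;
}

static void toy_dup(const int* const cmds, int* const out, const int size, const toy_subst_f subst)
{
	int i;
	for (i = 0; i < size; i++)
		out[i] = subst(cmds[i]);
}

int main(void)
{
	const int cmds[3] = { 3, 1, 2 };
	int out[3];
	toy_dup(cmds, out, 3, toy_subst_sub_graph_with_noop);
	assert(out[0] == 3 && out[1] == 0 && out[2] == 0); /* sub-graph commands became noops */
	return 0;
}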
2566 | ||||
2567 | static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input) | |||
2568 | { | |||
2569 | if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one. | |||
2570 | { | |||
2571 | if (tensor_symbol_info[input].alias_ref) | |||
2572 | { | |||
2573 | const int alias_ref = tensor_symbol_info[input].alias_ref - 1; | |||
2574 | assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0 ) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref ].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0" , "ccv_nnc_symbolic_graph_compile.c", 2574, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2575 | ccv_nnc_tensor_symbol_t tensor_symbol = {}; | |||
2576 | if (dup_tensor_block_ref[alias_ref * unroll_count] < 0) | |||
2577 | { | |||
2578 | tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0); | |||
2579 | if (tensor_symbol_info[alias_ref].pair_ref) | |||
2580 | ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){ | |||
2581 | .d = tensor_symbol_info[alias_ref].pair_ref - 1, | |||
2582 | .graph = dup_graph->pair | |||
2583 | }); | |||
2584 | ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags); | |||
2585 | dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d; | |||
2586 | } else { | |||
2587 | tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count]; | |||
2588 | tensor_symbol.graph = dup_graph; | |||
2589 | } | |||
2590 | ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0); | |||
2591 | if (tensor_symbol_info[input].pair_ref) | |||
2592 | ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){ | |||
2593 | .d = tensor_symbol_info[input].pair_ref - 1, | |||
2594 | .graph = dup_graph->pair | |||
2595 | }); | |||
2596 | ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags); | |||
2597 | dup_tensor_block_ref[input * unroll_count] = alias_symbol.d; | |||
2598 | } else { | |||
2599 | ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0); | |||
2600 | if (tensor_symbol_info[input].pair_ref) | |||
2601 | ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){ | |||
2602 | .d = tensor_symbol_info[input].pair_ref - 1, | |||
2603 | .graph = dup_graph->pair | |||
2604 | }); | |||
2605 | ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags); | |||
2606 | dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d; | |||
2607 | } | |||
2608 | if (tensor_symbol_info[input].bypass_ref) | |||
2609 | { | |||
2610 | const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count]; | |||
2611 | assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__ ({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 2611, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2612 | ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data )) + (size_t)(dup_graph->tensor_symbol_info)->rsize * ( size_t)(dup_tensor_block_ref[input * unroll_count]))); | |||
2613 | symbol_info->bypass_ref = dup_bypass_ref + 1; | |||
2614 | } | |||
2615 | } | |||
2616 | return (ccv_nnc_tensor_symbol_t) { | |||
2617 | .d = dup_tensor_block_ref[input * unroll_count], | |||
2618 | .graph = dup_graph, | |||
2619 | }; | |||
2620 | } | |||
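/* Sketch of the memoization performed by _ccv_nnc_dup_tensor_symbol above: the
 * dup_tensor_block_ref table (strided by unroll_count) holds -1 until a duplicate
 * symbol is created, after which every later request for the same input returns the
 * cached id. The toy "create" below just hands out increasing ids instead of
 * creating tensor / alias symbols in dup_graph; toy_* names are illustrative only. */
#include <assert.h>

static int toy_next_id = 100; /* stand-in for ids of newly created duplicate symbols */

static int toy_dup_symbol(int* const dup_ref, const int unroll_count, const int input)
{
	if (dup_ref[input * unroll_count] < 0) /* no duplicate yet: create one */
		dup_ref[input * unroll_count] = toy_next_id++;
	return dup_ref[input * unroll_count];
}

int main(void)
{
	int dup_ref[4] = { -1, -1, -1, -1 }; /* 4 symbols, unroll_count == 1 */
	const int first = toy_dup_symbol(dup_ref, 1, 2);
	const int again = toy_dup_symbol(dup_ref, 1, 2);
	assert(first == again); /* the duplicate is created exactly once */
	assert(toy_dup_symbol(dup_ref, 1, 3) != first); /* other inputs get their own duplicate */
	return 0;
}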
2621 | ||||
2622 | static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs) | |||
2623 | { | |||
2624 | int i; | |||
2625 | if (dup_exec_ref[idx * unroll_count] < 0) | |||
2626 | { | |||
2627 | // Input has to come before output, because the output could have a bypass reference to the input. | |||
2628 | for (i = 0; i < node->input_size; i++) | |||
2629 | max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph }; | |||
2630 | for (i = 0; i < node->output_size; i++) | |||
2631 | max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph }; | |||
2632 | ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0); | |||
2633 | dup_exec_ref[idx * unroll_count] = exec_symbol.d; | |||
2634 | } | |||
2635 | return (ccv_nnc_graph_exec_symbol_t) { | |||
2636 | .d = dup_exec_ref[idx * unroll_count], | |||
2637 | .graph = dup_graph, | |||
2638 | }; | |||
2639 | } | |||
2640 | ||||
2641 | static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size) | |||
2642 | { | |||
2643 | int i; | |||
2644 | for (i = 0; i < tensor_block_size; i++) | |||
2645 | { | |||
2646 | if (tensor_blocks[i].head) | |||
2647 | ccv_array_free(tensor_blocks[i].head); | |||
2648 | if (tensor_blocks[i].tail) | |||
2649 | ccv_array_free(tensor_blocks[i].tail); | |||
2650 | if (tensor_blocks[i].r_refs) | |||
2651 | ccv_array_free(tensor_blocks[i].r_refs); | |||
2652 | if (tensor_blocks[i].dup_p_refs) | |||
2653 | ccv_array_free(tensor_blocks[i].dup_p_refs); | |||
2654 | } | |||
2655 | ccfreefree(tensor_blocks); | |||
2656 | } | |||
2657 | ||||
2658 | // Find tensors whose conflicts cannot be solved by co-allocating them to the same location. | |||
2659 | static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks) | |||
2660 | { | |||
2661 | int i, j, unroll_count = 0; | |||
2662 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2663 | if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref) | |||
2664 | { | |||
2665 | // This is a parameter; thus, it has to be either an alias or used. | |||
2666 | assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks [i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])" , "ccv_nnc_symbolic_graph_compile.c", 2666, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2667 | const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1. | |||
2668 | // The parameter it assigns to has to be either an alias or used. | |||
2669 | assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks [assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ ( { if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref ].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])" , "ccv_nnc_symbolic_graph_compile.c", 2669, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2670 | // If either of these two (assigner and assignee) is an alias, check to see if they are the same. | |||
2671 | // If they are the same, we are good, no need to extend. | |||
2672 | int a_ref = i; | |||
2673 | while (tensor_blocks[a_ref].ref) | |||
2674 | a_ref = tensor_blocks[a_ref].ref - 1; | |||
2675 | int b_ref = assign_ref; | |||
2676 | while (tensor_blocks[b_ref].ref) | |||
2677 | b_ref = tensor_blocks[b_ref].ref - 1; | |||
2678 | if (a_ref != b_ref) | |||
2679 | { | |||
2680 | // If any of b's heads is deterministically later than a's tail, | |||
2681 | // or any of b's tails is deterministically earlier than a's head, they don't interfere. | |||
2682 | int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]); | |||
2683 | int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]); | |||
2684 | // It cannot be that both i can hop to j and j can hop to i. | |||
2685 | assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0)) ? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a > 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)" , "ccv_nnc_symbolic_graph_compile.c", 2685, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2686 | // Can it be folded? | |||
2687 | // These two can be assigned to the same region of memory without issue (because their life-times don't interfere). | |||
2688 | if (a_hop_b || b_hop_a) | |||
2689 | { | |||
2690 | tensor_blocks[a_ref].companion_ref = b_ref + 1; | |||
2691 | tensor_blocks[b_ref].companion_ref = a_ref + 1; | |||
2692 | continue; | |||
2693 | } | |||
2694 | int c_ref = tensor_symbol_info[b_ref].assign_ref - 1; | |||
2695 | for (j = 0; c_ref >= 0; j++) | |||
2696 | { | |||
2697 | while (tensor_blocks[c_ref].ref) | |||
2698 | c_ref = tensor_blocks[c_ref].ref - 1; | |||
2699 | c_ref = tensor_symbol_info[c_ref].assign_ref - 1; | |||
2700 | } | |||
2701 | unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b = (j + 1); (_a > _b) ? _a : _b; }); | |||
2702 | } | |||
2703 | } | |||
2704 | // Reset companion_ref if we need to unroll. | |||
2705 | if (unroll_count) | |||
2706 | for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++) | |||
2707 | tensor_blocks[j].companion_ref = 0; | |||
2708 | return unroll_count; | |||
2709 | } | |||
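/* Sketch of the chain walk inside _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count
 * above, under the simplifying assumption that no blocks are folded (so the
 * tensor_blocks[].ref resolution can be dropped): follow assign_ref links (1-based,
 * 0 == none) from a starting symbol and count the nodes reached, including the start.
 * The graph is then unrolled to the maximum such count over all conflicting
 * loop-carried variables. toy_* names are illustrative only. */
#include <assert.h>

static int toy_unroll_count_for(const int* const assign_ref, const int start)
{
	int j, c_ref = assign_ref[start] - 1;
	for (j = 0; c_ref >= 0; j++)
		c_ref = assign_ref[c_ref] - 1;
	return j + 1;
}

int main(void)
{
	/* Symbols 2 -> 1 -> 0 form an assign chain; symbol 3 is not loop-carried. */
	const int assign_ref[4] = { 0, 1, 2, 0 };
	assert(toy_unroll_count_for(assign_ref, 2) == 3); /* chain of 3 nodes: 2, 1, 0 */
	assert(toy_unroll_count_for(assign_ref, 1) == 2); /* chain of 2 nodes: 1, 0 */
	return 0;
}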
2710 | ||||
2711 | static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref) | |||
2712 | { | |||
2713 | int i, j, n; | |||
2714 | // The inout exec nodes, these are the nodes we are going to extend. | |||
2715 | uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t)); | |||
2716 | int max_input_size = 0; | |||
2717 | int max_output_size = 0; | |||
2718 | for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++) | |||
2719 | { | |||
2720 | max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info [i].input_size); typeof (max_input_size) _b = (max_input_size ); (_a > _b) ? _a : _b; }); | |||
2721 | max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info [i].output_size); typeof (max_output_size) _b = (max_output_size ); (_a > _b) ? _a : _b; }); | |||
2722 | } | |||
2723 | ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size ); (_a > _b) ? _a : _b; })]; | |||
2724 | ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size ); (_a > _b) ? _a : _b; })]; | |||
2725 | // Doing graph expansion | |||
2726 | // It goes without saying that we must have more than one tensor / exec (otherwise I cannot use 0 as no exec ref). | |||
2727 | assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum > 0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info ->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0" , "ccv_nnc_symbolic_graph_compile.c", 2727, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2728 | assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum > 0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info ->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0" , "ccv_nnc_symbolic_graph_compile.c", 2728, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2729 | #define INCOMING_NODE (1) | |||
2730 | #define OUTGOING_NODE (2) | |||
2731 | // Unroll the graph n times. | |||
2732 | for (n = 0; n < unroll_count; n++) | |||
2733 | { | |||
2734 | int* const dup_exec_ref = r_dup_exec_ref + n; | |||
2735 | const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0; | |||
2736 | int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n; | |||
2737 | for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++) | |||
2738 | dup_exec_ref[i * unroll_count] = -1; | |||
2739 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2740 | { | |||
2741 | // If there is an assign_ref, that means I don't need to dup the tensor. | |||
2742 | if (tensor_symbol_info[i].assign_ref) | |||
2743 | { | |||
2744 | const int assign_ref = tensor_symbol_info[i].assign_ref - 1; | |||
2745 | dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref; | |||
2746 | } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !( (tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY) | |||
2747 | // If this is a read-only tensor block, no need to duplicate because the value never changes | |||
2748 | // (note we handled assign_ref first); therefore, there is no need to generate a duplicate. | |||
2749 | dup_tensor_block_ref[i * unroll_count] = i; | |||
2750 | else | |||
2751 | dup_tensor_block_ref[i * unroll_count] = -1; | |||
2752 | } | |||
2753 | // Go through the original graph, make copies of the node if it is inout. | |||
2754 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
2755 | ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs); | |||
2756 | inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */ | |||
2757 | if (!node->outgoings) | |||
2758 | continue; | |||
2759 | for (i = 0; i < node->outgoings->rnum; i++) | |||
2760 | { | |||
2761 | const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)( node->outgoings)->rsize * (size_t)(i))); | |||
2762 | inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */ | |||
2763 | ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs); | |||
2764 | ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol); | |||
2765 | } | |||
2766 | } ccv_nnc_graph_visit_endfor} } | |||
2767 | // Check that the visited nodes are all marked as either incoming or outgoing. | |||
2768 | const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph); | |||
2769 | const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph); | |||
2770 | for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++) | |||
2771 | { | |||
2772 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
2773 | continue; | |||
2774 | assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail ("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)" , "ccv_nnc_symbolic_graph_compile.c", 2774, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2775 | // If this is a pure incoming node, then I need to concat this one with all original destination nodes. | |||
2776 | if (inout[i] == INCOMING_NODE) | |||
2777 | for (j = 0; j < dup_destination_size; j++) | |||
2778 | { | |||
2779 | ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) { | |||
2780 | .d = dup_destinations[j].d, | |||
2781 | .graph = dup_graph, | |||
2782 | }, (ccv_nnc_graph_exec_symbol_t) { | |||
2783 | .d = dup_exec_ref[i * unroll_count], | |||
2784 | .graph = dup_graph, | |||
2785 | }); | |||
2786 | } | |||
2787 | } | |||
2788 | if (dup_graph->destinations) | |||
2789 | ccv_array_clear(dup_graph->destinations); | |||
2790 | for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++) | |||
2791 | { | |||
2792 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
2793 | continue; | |||
2794 | const int d = dup_exec_ref[i * unroll_count]; | |||
2795 | ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(d))); | |||
2796 | // If this has no outgoing node, add to the destination. | |||
2797 | if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0) | |||
2798 | ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) { | |||
2799 | .graph = dup_graph, | |||
2800 | .d = d, | |||
2801 | }); | |||
2802 | } | |||
2803 | } | |||
2804 | #undef INCOMING_NODE | |||
2805 | #undef OUTGOING_NODE | |||
2806 | ccfreefree(inout); | |||
2807 | } | |||
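/* Sketch of the INCOMING / OUTGOING marking used while unrolling above: every
 * visited node gets the incoming bit, and every node reached through an outgoing
 * edge additionally gets the outgoing bit. Nodes left as pure incoming are the
 * sources of the duplicated copy, and the code above concatenates them after the
 * previous copy's destinations to chain the unrolled copies together. The toy graph
 * and TOY_* flags below are illustrative only. */
#include <assert.h>

#define TOY_INCOMING (1)
#define TOY_OUTGOING (2)

int main(void)
{
	const int edges[2][2] = { { 0, 1 }, { 1, 2 } }; /* toy graph: 0 -> 1 -> 2 */
	unsigned char inout[3] = { 0, 0, 0 };
	int i;
	for (i = 0; i < 3; i++) /* every visited node is marked incoming */
		inout[i] |= TOY_INCOMING;
	for (i = 0; i < 2; i++) /* every edge target is marked outgoing */
		inout[edges[i][1]] |= TOY_OUTGOING;
	assert(inout[0] == TOY_INCOMING); /* pure incoming: a source of this copy */
	assert(inout[1] == (TOY_INCOMING | TOY_OUTGOING));
	assert(inout[2] == (TOY_INCOMING | TOY_OUTGOING));
	return 0;
}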
2808 | ||||
2809 | static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info) | |||
2810 | { | |||
2811 | int i; | |||
2812 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks. | |||
2813 | // Now we can assign them (the dups) as companions. | |||
2814 | // Get to the last one, which we will wrap over. | |||
2815 | if (dup_tensor_symbol_info[i].assign_ref) | |||
2816 | { | |||
2817 | dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0; | |||
2818 | dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1; | |||
2819 | assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0 ), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref) ; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref" , "ccv_nnc_symbolic_graph_compile.c", 2819, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2820 | dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1; | |||
2821 | } | |||
2822 | } | |||
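/* Sketch of the wrap-over redirection done by _ccv_nnc_fixup_assign_ref_after_unroll
 * above: a loop-carried variable's assign_ref must point at the *last* unrolled copy
 * of its assigner (1-based convention preserved), so the value carried across loop
 * iterations comes from the final copy. The toy ids below are invented. */
#include <assert.h>

int main(void)
{
	const int unroll_count = 2;
	/* Copies of symbol 0 got toy ids 10 and 11; copies of symbol 1 got 20 and 21. */
	const int dup_tensor_block_ref[2 * 2] = { 10, 11, 20, 21 };
	int assign_ref = 1; /* currently assigns from symbol 0 (stored 1-based) */
	/* Redirect to the last copy of that symbol, as the fix-up above does. */
	assign_ref = dup_tensor_block_ref[(assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
	assert(assign_ref == 11 + 1); /* now points (1-based) at toy id 11, the last copy */
	return 0;
}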
2823 | ||||
2824 | // If the tensor blocks are the outputs of this graph, their life-times should be extended to the end of this graph. | |||
2825 | // However, it is not that simple if the graph is unrolled. For an unrolled graph, they need to reach the end of | |||
2826 | // the "original" graph and all its duplicated ends (for their duplicated tensor blocks). | |||
2827 | static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref) | |||
2828 | { | |||
2829 | int i, j, k; | |||
2830 | for (i = 0; i < p_node_info->output_size; i++) | |||
2831 | { | |||
2832 | const int d = p_node_info->outputs[i]; | |||
2833 | const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) + (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t) (p_idx))) - 1; | |||
2834 | if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) && !((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED))) | |||
2835 | continue; | |||
2836 | for (k = 0; k < destination_size; k++) | |||
2837 | _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]); | |||
2838 | // Add the duplicated destinations to the tensor_block_ref. | |||
2839 | for (j = 0; j < unroll_count; j++) | |||
2840 | for (k = 0; k < destination_size; k++) | |||
2841 | { | |||
2842 | const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j]; | |||
2843 | const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j]; | |||
2844 | if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0) | |||
2845 | _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]); | |||
2846 | } | |||
2847 | } | |||
2848 | } | |||
2849 | ||||
2850 | static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref) | |||
2851 | { | |||
2852 | int i, j; | |||
2853 | ccv_sparse_matrix_t* exec_dep = *r_exec_dep; | |||
2854 | ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks; | |||
2855 | // Count the blocks that cannot simply be solved by either in-place tensor block folding or by using the same memory region. | |||
2856 | // Unfortunately, I cannot apply this analysis to the block folding done for sub-graphs, because we do sub-graph placement later. | |||
2857 | // No need to change anything, we are good. | |||
2858 | const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks); | |||
2859 | if (!unroll_count) | |||
2860 | return; | |||
2861 | // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region). | |||
2862 | // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop. | |||
2863 | ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop); | |||
2864 | int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count); | |||
2865 | int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count); | |||
2866 | _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref); | |||
2867 | ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum); | |||
2868 | ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum); | |||
2869 | ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc (sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) * ((dup_graph->exec_symbol_info->rnum) - 1)); _visit_-> size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c ; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_ = 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info-> rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t* )((void*)(((char*)((dup_graph->exec_symbol_info)->data) ) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = ( (dup_graph->exec_symbol_info->rnum) + _incoming_edges_ > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * ( dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * ( (dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_ )); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca ( sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info-> rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info-> rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof (ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum )); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph ->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph ->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info ->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph ->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph ->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->sources)->data)) + (size_t )(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ? 
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->sources)->data)) + (size_t )(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(( (char*)((dup_graph->sources)->data)) + (size_t)(dup_graph ->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_ [0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(( dup_graph->sources)->data)) + (size_t)(dup_graph->sources )->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = { (dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1 ; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue ; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int *)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void *)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t )(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[ _idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_ [d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info ->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char *)((dup_graph->sources)->data)) + (size_t)(dup_graph-> sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ? 
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*) ((void*)(((char*)((dup_graph->sources)->data)) + (size_t )(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(( (char*)((dup_graph->sources)->data)) + (size_t)(dup_graph ->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_ [0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(( dup_graph->sources)->data)) + (size_t)(dup_graph->sources )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph ->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1 ; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_ [_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_ ].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int *)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void *)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t )(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[ _idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_ [d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_ [d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_ [d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof (( _exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum )) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph ->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_ ++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((dup_graph->destinations)->data)) + (size_t) (dup_graph->destinations)->rsize * (size_t)(0))))[_i_]. graph == dup_graph) ? 
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0) )))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(( (char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph ->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_ [0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(( dup_graph->destinations)->data)) + (size_t)(dup_graph-> destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_ [0] = (dup_graph->destinations->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_ [_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_ ].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_ [_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_ ].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges - 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d ].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph-> exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_ ++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((dup_graph->destinations)->data)) + (size_t) (dup_graph->destinations)->rsize * (size_t)(0))))[_i_]. graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0) )))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(( (char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph ->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; } for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_ ++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph ->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph ) ? 
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->sources)->data)) + (size_t )(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph ->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_ [1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_ [_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_ ->size].index = ((_idx_)); _visit_->node[_visit_->size ].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_ [_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int *)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void *)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t )(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[ _idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((dup_graph->exec_symbol_info)->data )) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t )(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_ [d].c; if (_incomings_[d].c == 0 && _incomings_[d].r == 6 && _d_ < (dup_graph->destinations->rnum)) { _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)(( dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph ->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings ->rnum; _j_++) { const int d = *(int*)((void*)(((char*)((( (ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph ->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info )->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + ( size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*) ((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph ->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings )->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_ [d].c == 0 && _incomings_[d].r == 6 && _d_ < (dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_ [_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0 ), __extension__ ({ if (_exist_size_[_q_] < (dup_graph-> exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = ( _i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations-> rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0) )))[_i_].graph == dup_graph) ? 
1 : 0), __extension__ ({ if (( (ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph-> destinations)->data)) + (size_t)(dup_graph->destinations )->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((dup_graph->destinations)->data)) + (size_t )(dup_graph->destinations)->rsize * (size_t)(0))))[_i_] .d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_ [((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph-> destinations)->data)) + (size_t)(dup_graph->destinations )->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(( (char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph ->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0) ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t* )((void*)(((char*)((dup_graph->destinations)->data)) + ( size_t)(dup_graph->destinations)->rsize * (size_t)(0))) )[_i_].d].c > 0) continue; _visit_->node[_visit_->size ].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)( (dup_graph->destinations)->data)) + (size_t)(dup_graph-> destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_-> node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0) )))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free( _incomings_); } while (0);; ((void) sizeof ((_visit_->size <= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_visit_->size <= (dup_graph->exec_symbol_info ->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__ ); })); _visit_; }); | |||
2870 | ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t )(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + ( size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info); | |||
2871 | _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info); | |||
2872 | // Free out the old exec_dep | |||
2873 | ccv_matrix_free(exec_dep); | |||
2874 | // and the tensor blocks, prepare for the new. | |||
2875 | _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum); | |||
2876 | // A reverse map to find where the original tensor comes from. | |||
2877 | int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum); | |||
2878 | for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++) | |||
2879 | dup_tensor_from_ref[i] = -1; | |||
2880 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2881 | for (j = 0; j < unroll_count; j++) | |||
2882 | if (dup_tensor_block_ref[i * unroll_count + j] >= 0) | |||
2883 | dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i; | |||
2884 | int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum); | |||
2885 | for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++) | |||
2886 | dup_exec_from_ref[i] = -1; | |||
2887 | for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++) | |||
2888 | { | |||
2889 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
2890 | continue; | |||
2891 | dup_exec_from_ref[i] = i; // Reference back. | |||
2892 | for (j = 0; j < unroll_count; j++) | |||
2893 | if (dup_exec_ref[i * unroll_count + j] >= 0) | |||
2894 | dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i; | |||
2895 | } | |||
2896 | // Reset all attr. | |||
2897 | memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum); | |||
2898 | _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t )(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + ( size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks); | |||
2899 | ccv_nnc_graph_visit_free(dup_visit); | |||
2900 | ccfreefree(dup_exec_symbol_info); | |||
2901 | ccfreefree(dup_exec_from_ref); | |||
2902 | ccfreefree(dup_tensor_from_ref); | |||
2903 | // Assign out dup_p_ref, which will be used to extend the anonymous block's life-time. | |||
2904 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2905 | // Loop over all possible duplications to assign dup_p_ref properly. | |||
2906 | for (j = 0; j < unroll_count; j++) | |||
2907 | { | |||
2908 | const int dup_idx = dup_tensor_block_ref[j + i * unroll_count]; | |||
2909 | if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1])) | |||
2910 | { | |||
2911 | const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1; | |||
2912 | const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info); | |||
2913 | if (p_ref_0_is_in_or_out == 1) // If it is an out tensor, mark dup_p_ref for this. | |||
2914 | { | |||
2915 | if (!tensor_blocks[dup_idx].dup_p_refs) | |||
2916 | tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0); | |||
2917 | ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0); | |||
2918 | } | |||
2919 | if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0) | |||
2920 | continue; | |||
2921 | const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1; | |||
2922 | const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info); | |||
2923 | if (p_ref_1_is_in_or_out == 1) | |||
2924 | { | |||
2925 | if (!tensor_blocks[dup_idx].dup_p_refs) | |||
2926 | tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0); | |||
2927 | ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1); | |||
2928 | } | |||
2929 | } | |||
2930 | } | |||
2931 | // companion_ref | |||
2932 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
2933 | // Now we can assign them (the dup) as companions. | |||
2934 | if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref) | |||
2935 | { | |||
2936 | // Get to the last one, which we will wrap over. | |||
2937 | const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1; | |||
2938 | if (assign_ref >= 0) | |||
2939 | { | |||
2940 | int b_ref = assign_ref; | |||
2941 | while (tensor_blocks[b_ref].ref) | |||
2942 | b_ref = tensor_blocks[b_ref].ref - 1; | |||
2943 | int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]); | |||
2944 | int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]); | |||
2945 | // It cannot be that both i can hop to j and j can hop to i. | |||
2946 | // And it is now possible to hop from one to the other after duplication. | |||
2947 | assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__ ({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail ("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c" , 2947, __extension__ __PRETTY_FUNCTION__); })); | |||
2948 | tensor_blocks[i].companion_ref = b_ref + 1; | |||
2949 | tensor_blocks[b_ref].companion_ref = i + 1; | |||
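// Illustrative example (hypothetical indices): if i == 5 and the unwound assign_ref lands at
// b_ref == 2, then tensor_blocks[5].companion_ref == 3 and tensor_blocks[2].companion_ref == 6,
// since companion_ref is stored 1-based.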
2950 | } | |||
2951 | } | |||
2952 | ccfreefree(dup_tensor_symbol_info); | |||
2953 | // Extend the dup tensor block ref, prepare for future extensions. | |||
2954 | dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count); | |||
2955 | for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++) | |||
2956 | dup_tensor_block_ref[i] = -1; | |||
2957 | // Assign out changed properties. | |||
2958 | *r_exec_dep = exec_dep; | |||
2959 | *r_tensor_blocks = tensor_blocks; | |||
2960 | *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum; | |||
2961 | *r_dup_graph = dup_graph; | |||
2962 | *r_unroll_count = unroll_count; | |||
2963 | *r_dup_exec_ref = dup_exec_ref; | |||
2964 | *r_dup_tensor_block_ref = dup_tensor_block_ref; | |||
2965 | } | |||
2966 | ||||
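/* A minimal sketch (hypothetical helper, not part of this file's API) of the 1-based "ref"
 * chain unwinding that the code above and below repeats inline: follow .ref, which stores
 * index + 1 with 0 meaning "no reference", until the root tensor block is reached. */
static int _example_unref_tensor_block(const ccv_nnc_tensor_block_t* const tensor_blocks, int ref)
{
	while (tensor_blocks[ref].ref) // 0 terminates the chain.
		ref = tensor_blocks[ref].ref - 1; // Stored as index + 1.
	return ref;
}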
2967 | static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs) | |||
2968 | { | |||
2969 | if (!anonymous_block_free_list || !anonymous_block_free_list_cap) | |||
2970 | return tensor_block_size; | |||
2971 | int i; | |||
2972 | const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum); | |||
2973 | int found_idx = tensor_block_size; | |||
2974 | for (i = 0; i < anonymous_block_free_list_cap; i++) | |||
2975 | { | |||
2976 | const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t )(anonymous_block_free_list)->rsize * (size_t)(i))); | |||
2977 | assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__ ({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size" , "ccv_nnc_symbolic_graph_compile.c", 2977, __extension__ __PRETTY_FUNCTION__ ); })); | |||
2978 | // If the type doesn't match, ignore. | |||
2979 | if (tensor_blocks[idx].type != type) | |||
2980 | continue; | |||
2981 | // Heuristic about how to select the best tensor block to move forward. | |||
2982 | // If the size is large enough and there are no dup_p_refs requested, I cannot do better than this, just return directly. | |||
2983 | if (tensor_blocks[idx].size >= size) | |||
2984 | { | |||
2985 | if (no_dup_p_refs) | |||
2986 | return idx; | |||
2987 | // Otherwise, if the current tensor block's dup_p_refs are after (or at) the requested dup_p_refs, | |||
2988 | // then we cannot do better than this; in that case, just return. | |||
2989 | if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum && | |||
2990 | _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs)) | |||
2991 | return idx; | |||
2992 | } | |||
2993 | int64_t found_idx_size_diff; | |||
2994 | int64_t idx_size_diff; | |||
2995 | if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue. | |||
2996 | // Now, compare whether this one or the found_idx one is better. | |||
2997 | // At this point, there is no point in comparing the dup_p_refs; we only care about which one | |||
2998 | // is closer to the size we requested. Only on a tie does dup_p_refs become important again. | |||
2999 | (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size))) | |||
3000 | { | |||
3001 | found_idx = idx; | |||
3002 | continue; | |||
3003 | } | |||
3004 | // No need to update if found_idx is better than idx. | |||
3005 | if (found_idx_size_diff > idx_size_diff) | |||
3006 | continue; | |||
3007 | // We bias towards the bigger one in case of a tie in size difference. | |||
3008 | if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size) | |||
3009 | { | |||
3010 | found_idx = idx; | |||
3011 | continue; | |||
3012 | } | |||
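// Worked example (hypothetical numbers): for a requested size of 100, candidates of size 96 and
// 104 both have a size difference of 4; the tie goes to the 104-byte block, the bigger of the two.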
3013 | assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx ].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size == tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size" , "ccv_nnc_symbolic_graph_compile.c", 3013, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3014 | // On a tie, check which one has tighter life-cycle. | |||
3015 | if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones. | |||
3016 | { | |||
3017 | // Check whether the current tensor block's life-cycle is longer than the previous one. | |||
3018 | if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 && | |||
3019 | (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum || | |||
3020 | _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs))) | |||
3021 | found_idx = idx; | |||
3022 | continue; | |||
3023 | } | |||
3024 | // Now both candidate sizes are smaller than the requested size; in this case, we need to increase the tensor block size. | |||
3025 | // We prefer to choose the one that has life-cycle closer to the expected ones. | |||
3026 | if (no_dup_p_refs) | |||
3027 | { | |||
3028 | // Whoever is shorter wins. | |||
3029 | if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 && | |||
3030 | (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum || | |||
3031 | _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))) | |||
3032 | found_idx = idx; | |||
3033 | continue; | |||
3034 | } | |||
3035 | if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum) | |||
3036 | continue; | |||
3037 | if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum) | |||
3038 | { | |||
3039 | found_idx = idx; | |||
3040 | continue; | |||
3041 | } | |||
3042 | // If both cover the requested dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one. | |||
3043 | const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs); | |||
3044 | const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs); | |||
3045 | if (idx_after_request && found_idx_after_request) | |||
3046 | { | |||
3047 | if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)) | |||
3048 | found_idx = idx; | |||
3049 | continue; | |||
3050 | } else { | |||
3051 | // If we entered this branch, either idx_after_request is false, found_idx_after_request is false, or both. | |||
3052 | // If found_idx_after_request is not false, we are currently doing fine, no need to proceed. | |||
3053 | // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one. | |||
3054 | if (!found_idx_after_request && (idx_after_request || | |||
3055 | _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs))) | |||
3056 | found_idx = idx; | |||
3057 | continue; | |||
3058 | } | |||
3059 | } | |||
3060 | return found_idx; | |||
3061 | } | |||
3062 | ||||
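/* A self-contained sketch (hypothetical helper) of the size-closeness heuristic described in the
 * comments above: pick the candidate whose size is closest to the requested size, biasing towards
 * the bigger one on a tie. The real scan above also weighs the block type and dup_p_refs. */
static int _example_best_fit_by_size(const uint64_t* const sizes, const int count, const uint64_t size)
{
	int found_idx = -1;
	int64_t found_diff = 0;
	int i;
	for (i = 0; i < count; i++)
	{
		const int64_t diff = llabs((int64_t)sizes[i] - (int64_t)size);
		if (found_idx < 0 || diff < found_diff || (diff == found_diff && sizes[i] > sizes[found_idx]))
			found_idx = i, found_diff = diff;
	}
	return found_idx; // -1 mirrors "no reusable block found".
}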
3063 | static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info) | |||
3064 | { | |||
3065 | if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))) | |||
3066 | return 0; | |||
3067 | int i, j, k; | |||
3068 | int input_size = 0; | |||
3069 | for (i = 0; i < p_node_info->p_while.input_size; i++) | |||
3070 | if (p_node_info->p_while.inputs[i] >= 0) | |||
3071 | ++input_size; | |||
3072 | // If it doesn't have tensor inputs (thus, only special inputs), just return. | |||
3073 | if (!input_size) | |||
3074 | return 0; | |||
3075 | ccv_nnc_tensor_symbol_t inputs[input_size]; | |||
3076 | input_size = 0; | |||
3077 | for (i = 0; i < p_node_info->p_while.input_size; i++) | |||
3078 | if (p_node_info->p_while.inputs[i] >= 0) | |||
3079 | inputs[input_size++] = (ccv_nnc_tensor_symbol_t){ | |||
3080 | .d = p_node_info->p_while.inputs[i], | |||
3081 | .graph = symbolic_graph, | |||
3082 | }; | |||
3083 | assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ? 1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size > 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0" , "ccv_nnc_symbolic_graph_compile.c", 3083, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3084 | ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0); | |||
3085 | const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum; | |||
3086 | for (i = 0; i < symbolic_graph->breakpoint_size; i++) | |||
3087 | { | |||
3088 | // Make a noop copy of the breakpoint, but with some tensor inputs. | |||
3089 | ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0); | |||
3090 | ccv_array_push(dup_breakpoints, &noop); | |||
3091 | // Connect this noop to the outgoing nodes of breakpoints. | |||
3092 | const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data )) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(symbolic_graph->breakpoints[i].d))); | |||
3093 | if (symbol_info->outgoings) | |||
3094 | for (j = 0; j < symbol_info->outgoings->rnum; j++) | |||
3095 | { | |||
3096 | const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t )(symbol_info->outgoings)->rsize * (size_t)(j))); | |||
3097 | ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){ | |||
3098 | .d = d, | |||
3099 | .graph = symbolic_graph, | |||
3100 | }); | |||
3101 | } | |||
3102 | } | |||
3103 | for (i = 0; i < exec_symbol_info_size; i++) | |||
3104 | { | |||
3105 | const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data )) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(i))); | |||
3106 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
3107 | continue; | |||
3108 | if (symbol_info->outgoings) | |||
3109 | { | |||
3110 | const int outgoing_size = symbol_info->outgoings->rnum; | |||
3111 | for (j = 0; j < outgoing_size; j++) | |||
3112 | { | |||
3113 | const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t )(symbol_info->outgoings)->rsize * (size_t)(j))); | |||
3114 | for (k = 0; k < symbolic_graph->breakpoint_size; k++) | |||
3115 | if (d == symbolic_graph->breakpoints[k].d) | |||
3116 | { | |||
3117 | ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints )->rsize * (size_t)(k))); | |||
3118 | ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){ | |||
3119 | .d = i, | |||
3120 | .graph = symbolic_graph, | |||
3121 | }, noop); | |||
3122 | // Found, connected, exit. | |||
3123 | break; | |||
3124 | } | |||
3125 | } | |||
3126 | } | |||
3127 | } | |||
3128 | // Add the dup_breakpoints to the sources if necessary. | |||
3129 | assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__ ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources" , "ccv_nnc_symbolic_graph_compile.c", 3129, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3130 | const int source_size = symbolic_graph->sources->rnum; | |||
3131 | for (i = 0; i < source_size; i++) | |||
3132 | { | |||
3133 | const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + ( size_t)(symbolic_graph->sources)->rsize * (size_t)(i))))->d; | |||
3134 | for (j = 0; j < symbolic_graph->breakpoint_size; j++) | |||
3135 | if (d == symbolic_graph->breakpoints[j].d) | |||
3136 | { | |||
3137 | ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints )->rsize * (size_t)(j))); | |||
3138 | ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop); | |||
3139 | // Found, made, exit. | |||
3140 | break; | |||
3141 | } | |||
3142 | } | |||
3143 | // Add the dup_breakpoints to the destinations if necessary. | |||
3144 | assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__ ({ if (symbolic_graph->destinations) ; else __assert_fail ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c" , 3144, __extension__ __PRETTY_FUNCTION__); })); | |||
3145 | const int destination_size = symbolic_graph->destinations->rnum; | |||
3146 | for (i = 0; i < destination_size; i++) | |||
3147 | { | |||
3148 | const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data) ) + (size_t)(symbolic_graph->destinations)->rsize * (size_t )(i))))->d; | |||
3149 | for (j = 0; j < symbolic_graph->breakpoint_size; j++) | |||
3150 | if (d == symbolic_graph->breakpoints[j].d) | |||
3151 | { | |||
3152 | ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints )->rsize * (size_t)(j))); | |||
3153 | ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop); | |||
3154 | // Found, made, exit. | |||
3155 | break; | |||
3156 | } | |||
3157 | } | |||
3158 | return dup_breakpoints; | |||
3159 | } | |||
3160 | ||||
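/* A minimal usage sketch (hypothetical helper) of the pattern used above: create a NOOP exec
 * symbol that carries the given tensor inputs and make it a predecessor of an existing node,
 * mirroring how each breakpoint's noop shadow is wired to the breakpoint's outgoing targets. */
static ccv_nnc_graph_exec_symbol_t _example_shadow_with_noop(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_graph_exec_symbol_t outgoing)
{
	const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
	ccv_nnc_graph_exec_symbol_concat(graph, noop, outgoing); // Add a noop -> outgoing dependency edge.
	return noop;
}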
3161 | // Plan out how we allocate tensors (should I do optimizations on the graph here or not at all?). | |||
3162 | static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size) | |||
3163 | { | |||
3164 | assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ ( { if (source_size > 0) ; else __assert_fail ("source_size > 0" , "ccv_nnc_symbolic_graph_compile.c", 3164, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3165 | assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__ ({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0" , "ccv_nnc_symbolic_graph_compile.c", 3165, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3166 | // First, fill all the "auto" holes. | |||
3167 | // This is the symbol table that with "auto" info filled up. | |||
3168 | ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum); | |||
3169 | ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum); | |||
3170 | ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t)); | |||
3171 | ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc (sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) * ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_ ->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_ = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info ->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_ > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t ) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t ) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_ )); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca ( sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info ->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info ->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof (ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info-> rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph ->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph ->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info ->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph ->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size ); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph ) ? 
1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph ) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_] = (sources)[_i_].d; } int _exist_size_[2] = { (source_size), 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_ [_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if ( _incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2 ; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)(( symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph ->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings ) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*) ((void*)(((char*)((symbolic_graph->exec_symbol_info)->data )) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_) )); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_ [d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph ->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info ->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof (( (sources)[_i_].graph == symbolic_graph) ? 
1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c" , 3171, __extension__ __PRETTY_FUNCTION__); })); _incomings_[ (sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d; } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_ [_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if ( _incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4 ; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)(( symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph ->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings ) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*) ((void*)(((char*)((symbolic_graph->exec_symbol_info)->data )) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_) )); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_ ; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_ [_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_ [d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph-> exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c" , 3171, __extension__ __PRETTY_FUNCTION__); })); _incomings_[ (destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations )[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_ [1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_ ]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_ [_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_ [_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_ ].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges - 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d ].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph ->exec_symbol_info->rnum)) ? 
1 : 0), __extension__ ({ if (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info ->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c" , 3171, __extension__ __PRETTY_FUNCTION__); })); _incomings_[ (destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size ); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph ) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph ) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_ ] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size ].index = ((_idx_)); _visit_->node[_visit_->size].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_ [_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0))) ; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_ [d].r == 6 && _d_ < (destination_size)) { _exists_ [_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t *)((void*)(((char*)((symbolic_graph->exec_symbol_info)-> data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_) )); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_ [d].r == 6 && _d_ < (destination_size)) { ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info ->rnum)) ? 
1 : 0), __extension__ ({ if (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[ _q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = ( _i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { ( (void) sizeof (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph ) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue ; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_ ].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations )[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); } else if (_incomings_[(destinations)[_i_].d].c > 0 ) continue; _visit_->node[_visit_->size].index = (((destinations )[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_ [(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_ ) free(_incomings_); } while (0);; ((void) sizeof ((_visit_-> size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph ->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)" , "ccv_nnc_symbolic_graph_compile.c", 3171, __extension__ __PRETTY_FUNCTION__ ); })); _visit_; }); | |||
3172 | ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info); | |||
3173 | int i, j, k, p, q; | |||
3174 | const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0; | |||
3175 | ccv_sparse_matrix_t* exec_dep; | |||
3176 | ccv_nnc_tensor_block_t* tensor_blocks; | |||
3177 | _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks); | |||
3178 | int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum; | |||
3179 | // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints | |||
3180 | // are automatically filled in, and all the sub-graphs are processed. | |||
3181 | // There is a last step though, for a while loop, it is parameterized: | |||
3182 | // while (x > 5) { | |||
3183 | // y = x + 1; | |||
3184 | // } (y => x) // This means after this loop is done, y's value will be copied over to x. | |||
3185 | // we will do our best to avoid the actual data copy; what we do here is check whether y can be x's alias. | |||
3186 | // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because | |||
3187 | // it is an inplace operation. | |||
3188 | // But if y cannot be x's alias, for example, this while loop looks like this: | |||
3189 | // while (x > 5) { | |||
3190 | // y = x + a | |||
3191 | // b = x + y | |||
3192 | // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a. | |||
3193 | // For this example, y cannot be x's alias because x is used later to compute b (and that computation | |||
3194 | // has dependency on y as well). | |||
3195 | // For this case, we need to modify the computation graph. Previously, the graph looks like this: | |||
3196 | // y = x + a -> b = x + y | |||
3197 | // This graph will be extended to look like this: | |||
3198 | // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or: | |||
3199 | // while (x0 > 5) { | |||
3200 | // y0 = x0 + a0 | |||
3201 | // b0 = x0 + y0 | |||
3202 | // if (y0 > 5) break | |||
3203 | // y1 = y0 + b0 | |||
3204 | // b1 = y0 + y1 | |||
3205 | // } (y1 => x0, b1 => a0) | |||
3206 | // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere | |||
3207 | // with each other now). | |||
3208 | // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers, | |||
3209 | // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0)). | |||
3210 | ccv_nnc_symbolic_graph_t* dup_graph = 0; | |||
3211 | int* dup_exec_ref = 0; | |||
3212 | int* dup_tensor_block_ref = 0; | |||
3213 | int unroll_count = 0; | |||
3214 | // In true recursive fashion, I need to recurse into all the sub-graphs and do the pre-compilation for them one by one. | |||
3215 | ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t)); | |||
3216 | prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now. | |||
3217 | prep->flags = 0; | |||
3218 | // Cannot handle duplicating a node that is a graph as well. | |||
3219 | if (p_exec_symbol_info) | |||
3220 | { | |||
3221 | prep->flags = p_node_info->flags; | |||
3222 | if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) | |||
3223 | { | |||
3224 | _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref); | |||
3225 | _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data) ) + (size_t)(symbolic_graph->destinations)->rsize * (size_t )(0))), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref); | |||
3226 | } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) { | |||
3227 | // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group. | |||
3228 | } | |||
3229 | } | |||
3230 | ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0; | |||
3231 | ccv_array_t* anonymous_block_free_list = 0; | |||
3232 | const int tensor_fold_size = (tensor_block_size + 31) >> 5; | |||
3233 | // Record whether this tensor is folded in this round. | |||
3234 | uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size); | |||
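// tensor_fold is a bitmap with one bit per tensor block: block x maps to word x >> 5 and bit
// x & 0x1f (see the fold marking and the check against it further below).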
3235 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
3236 | for (p = 0; p < node->graph_ref_size; p++) | |||
3237 | { | |||
3238 | assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__ ({ if (symbolic_graph->sub_graphs) ; else __assert_fail ( "symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c" , 3238, __extension__ __PRETTY_FUNCTION__); })); | |||
3239 | ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t) (((node)->_heap_graph_ref ? (node)->_heap_graph_ref : ( node)->_inline_graph_ref)[p] - 1))); | |||
3240 | ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node); | |||
3241 | ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t )(sub_graph->sources)->rsize * (size_t)(0))), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + ( size_t)(sub_graph->destinations)->rsize * (size_t)(0))), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum); | |||
3242 | sub_prep->dup_breakpoints = dup_breakpoints; | |||
3243 | sub_prep->p = prep; | |||
3244 | sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node )->_inline_graph_ref)[p] - 1] = sub_prep; | |||
3245 | const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep; | |||
3246 | const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks; | |||
3247 | for (i = 0; i < s_alloc_prep->block_size; i++) | |||
3248 | { | |||
3249 | const int block_ref = s_alloc_prep->blocks[i].block_ref; | |||
3250 | const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref; | |||
3251 | if (block_ref < sub_prep->tensor_symbol_info_size) | |||
3252 | { | |||
3253 | // If this block has a bypass, and its bypass has different p_refs, then it doesn't matter. | |||
3254 | // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous. | |||
3255 | if (s_tensor_blocks[block_ref].bypass_ref) | |||
3256 | { | |||
3257 | int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1; | |||
3258 | while (s_tensor_blocks[bypass_ref].ref) | |||
3259 | bypass_ref = s_tensor_blocks[bypass_ref].ref - 1; | |||
3260 | if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] || | |||
3261 | s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1]) | |||
3262 | continue; | |||
3263 | } | |||
3264 | if (s_tensor_blocks[block_ref].p_refs[0]) | |||
3265 | { | |||
3266 | /* If it is already properly assigned, next. */ | |||
3267 | if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] && | |||
3268 | s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0]) | |||
3269 | { | |||
3270 | if (!s_alloc_prep->buffers[buffer_ref].p_refs[0]) | |||
3271 | s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0]; | |||
3272 | else { | |||
3273 | assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs [1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers [buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]" , "ccv_nnc_symbolic_graph_compile.c", 3273, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3274 | s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0]; | |||
3275 | } | |||
3276 | } | |||
3277 | /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */ | |||
3278 | if (s_tensor_blocks[block_ref].p_refs[1] && | |||
3279 | s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] && | |||
3280 | s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1]) | |||
3281 | { | |||
3282 | assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[ 0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref ].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]" , "ccv_nnc_symbolic_graph_compile.c", 3282, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3283 | assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs [1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers [buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]" , "ccv_nnc_symbolic_graph_compile.c", 3283, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3284 | s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1]; | |||
3285 | } | |||
3286 | } | |||
3287 | } else if (s_tensor_blocks[block_ref].dup_p_refs) { | |||
3288 | /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of an anonymous block, | |||
3289 | * which by default only has a life-cycle shared with this sub-graph node. The reason to extend it is that | |||
3290 | * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref | |||
3291 | * always points to an output tensor of this sub-graph node) therefore, the memory region must extend | |||
3292 | * its life-time to the end of the output tensor. */ | |||
3293 | if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs) | |||
3294 | s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0); | |||
3295 | for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++) | |||
3296 | ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)-> data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)-> rsize * (size_t)(j)))); | |||
3297 | } | |||
3298 | } | |||
3299 | } | |||
3300 | const int init_tensor_block_size = tensor_block_size; | |||
3301 | int rw_anonymous_buffer_size_cap = 0; | |||
3302 | int ro_anonymous_buffer_size_cap = 0; | |||
3303 | if (anonymous_block_free_list) | |||
3304 | ccv_array_clear(anonymous_block_free_list); | |||
3305 | memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size); | |||
3306 | for (p = 0; p < node->graph_ref_size; p++) | |||
3307 | { | |||
3308 | ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node )->_inline_graph_ref)[p] - 1]; | |||
3309 | const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep; | |||
3310 | int rw_anonymous_buffer_size = 0; | |||
3311 | int ro_anonymous_buffer_size = 0; | |||
3312 | for (i = 0; i < s_alloc_prep->buffer_size; i++) | |||
3313 | if (s_alloc_prep->buffers[i].p_refs[0]) | |||
3314 | { | |||
3315 | /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */ | |||
3316 | int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1; | |||
3317 | /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */ | |||
3318 | int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node); | |||
3319 | assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0" , "ccv_nnc_symbolic_graph_compile.c", 3319, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3320 | int unref_p_ref_0 = p_ref_0; | |||
3321 | while (tensor_blocks[unref_p_ref_0].ref) | |||
3322 | unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1; | |||
3323 | /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */ | |||
3324 | assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3 ) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks [unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c" , 3324, __extension__ __PRETTY_FUNCTION__); })); | |||
3325 | if (s_alloc_prep->buffers[i].p_refs[1]) | |||
3326 | { | |||
3327 | int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1; | |||
3328 | const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node); | |||
3329 | assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__ ({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0" , "ccv_nnc_symbolic_graph_compile.c", 3329, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3330 | int unref_p_ref_1 = p_ref_1; | |||
3331 | while (tensor_blocks[unref_p_ref_1].ref) | |||
3332 | unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1; | |||
3333 | /* See above comment for the similar p_ref_0 check. */ | |||
3334 | assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3 ) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks [unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c" , 3334, __extension__ __PRETTY_FUNCTION__); })); | |||
3335 | assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out ) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out ) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out" , "ccv_nnc_symbolic_graph_compile.c", 3335, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3336 | int p_ref_t; | |||
3337 | if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */ | |||
3338 | { | |||
3339 | CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t )); | |||
3340 | CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1 ), (unref_p_ref_1) = (p_ref_t)); | |||
3341 | } | |||
3342 | p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */ | |||
3343 | /* If the dimensions match, we can fold. TODO: should the dimensions match perfectly here? */ | |||
3344 | if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0) | |||
3345 | { | |||
3346 | const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0); | |||
3347 | if (folded) | |||
3348 | { | |||
3349 | p_ref_0 = p_ref_1; | |||
3350 | unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now. | |||
3351 | tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f)); | |||
3352 | for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */ | |||
3353 | { | |||
3354 | const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]); | |||
3355 | assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too." ) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too." ) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\"" , "ccv_nnc_symbolic_graph_compile.c", 3355, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3356 | } | |||
3357 | } | |||
3358 | } | |||
3359 | } | |||
3360 | /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem). | |||
3361 | * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over | |||
3362 | * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its | |||
3363 | * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content | |||
3364 | * within its memory region)). Unless this buffer is used as read-only, and we don't have any output | |||
3365 | * associated with it, then we are good. */ | |||
3366 | if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) || | |||
3367 | (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) || | |||
3368 | (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) || | |||
3369 | TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) | |||
3370 | { | |||
3371 | if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) | |||
3372 | { assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs [1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0" , "ccv_nnc_symbolic_graph_compile.c", 3372, __extension__ __PRETTY_FUNCTION__ ); })); } | |||
3373 | /* p_ref_0 is either the only one, or the output tensor; we always prefer the output tensor (there | |||
3374 | * is a long argument for why that is the case; the gist is, it is much easier to control your output | |||
3375 | * than your input). */ | |||
3376 | s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1; | |||
3377 | s_alloc_prep->buffers[i].p_refs[1] = 0; | |||
3378 | /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */ | |||
3379 | assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3 ) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks [unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c" , 3379, __extension__ __PRETTY_FUNCTION__); })); | |||
3380 | tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep ->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size ) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a : _b; }); | |||
3381 | for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */ | |||
3382 | tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size = | |||
3383 | tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size = | |||
3384 | tensor_blocks[unref_p_ref_0].size; | |||
3385 | } else { | |||
3386 | s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0; | |||
3387 | if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) | |||
3388 | ++ro_anonymous_buffer_size; | |||
3389 | else | |||
3390 | rw_anonymous_buffer_size += unroll_count + 1; | |||
3391 | } | |||
3392 | } else { | |||
3393 | if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) | |||
3394 | ++ro_anonymous_buffer_size; | |||
3395 | else | |||
3396 | rw_anonymous_buffer_size += unroll_count + 1; | |||
3397 | } | |||
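/* Counting sketch (derived from the branches above): a read-write anonymous buffer reserves
 * unroll_count + 1 block slots -- the block itself plus one per unrolled copy created further
 * below -- while a read-only anonymous buffer reserves a single slot, since its unrolled
 * duplicates in dup_tensor_block_ref simply point back to itself. For example, with
 * unroll_count == 2, each read-write anonymous buffer adds 3 to rw_anonymous_buffer_size and
 * each read-only one adds 1 to ro_anonymous_buffer_size. */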
3398 | if (ro_anonymous_buffer_size || rw_anonymous_buffer_size) | |||
3399 | { | |||
3400 | const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0; | |||
3401 | // All read-write buffers can (potentially) be reused between case..of branches. | |||
3402 | rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size; | |||
3403 | // Read-only buffers cannot be reused between case..of branches. | |||
3404 | ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size; | |||
3405 | /* Anonymous block, allocate additional tensor blocks for this. */ | |||
3406 | /* This is either because this is an internal tensor (it doesn't have a p_ref) */ | |||
3407 | /* or it is an anonymous block itself within the sub-graphs of this while graph. */ | |||
3408 | tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)); | |||
3409 | memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size)); | |||
3410 | if (dup_tensor_block_ref) | |||
3411 | dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)); | |||
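/* Capacity sketch for the reallocations above: tensor_blocks now has room for the original
 * init_tensor_block_size blocks plus up to (unroll_count + 1) * rw_anonymous_buffer_size_cap
 * read-write anonymous blocks and ro_anonymous_buffer_size_cap read-only ones, and only the
 * newly appended tail starting at tensor_block_size is zeroed. dup_tensor_block_ref keeps
 * unroll_count entries per block, so it is grown in lockstep when it exists. */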
3412 | for (i = 0; i < s_alloc_prep->buffer_size; i++) | |||
3413 | if (!s_alloc_prep->buffers[i].p_refs[0]) | |||
3414 | { | |||
3415 | if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */ | |||
3416 | { | |||
3417 | assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)((void) sizeof ((tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap ) ? 1 : 0), __extension__ ({ if (tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap ) ; else __assert_fail ("tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap" , "ccv_nnc_symbolic_graph_compile.c", 3417, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3418 | TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size ].flags & ~0x10) | ANONYMOUS)); | |||
3419 | TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size ].flags & ~0xc) | (s_alloc_prep->buffers[i].flags & 0xc))); | |||
3420 | tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type; | |||
3421 | tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem; | |||
3422 | tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size; | |||
3423 | s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1; | |||
3424 | tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0); | |||
3425 | ccv_array_push(tensor_blocks[tensor_block_size].head, &idx); | |||
3426 | ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs; | |||
3427 | if (dup_p_refs && dup_p_refs->rnum > 0) | |||
3428 | { | |||
3429 | for (j = 0; j < dup_p_refs->rnum; j++) | |||
3430 | { | |||
3431 | const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs )->rsize * (size_t)(j))); | |||
3432 | assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ ( { if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3432, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3433 | assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info ->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph ->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum" , "ccv_nnc_symbolic_graph_compile.c", 3433, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3434 | assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__ ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail ( "tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c" , 3434, __extension__ __PRETTY_FUNCTION__); })); | |||
3435 | // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to | |||
3436 | // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object. | |||
3437 | if (tensor_symbol_info[dup_p_ref].p_ref) | |||
3438 | { | |||
3439 | const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1; | |||
3440 | assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if ( p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c" , 3440, __extension__ __PRETTY_FUNCTION__); })); | |||
3441 | const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info); | |||
3442 | if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this. | |||
3443 | { | |||
3444 | if (!tensor_blocks[tensor_block_size].dup_p_refs) | |||
3445 | tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0); | |||
3446 | ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0); | |||
3447 | } | |||
3448 | } | |||
3449 | if (!tensor_blocks[tensor_block_size].tail) | |||
3450 | tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0); | |||
3451 | for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++) | |||
3452 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) + (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t) (k))), tensor_blocks[tensor_block_size]); | |||
3453 | } | |||
3454 | } else { | |||
3455 | tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0); | |||
3456 | ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx); | |||
3457 | } | |||
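/* Lifetime sketch for this read-only anonymous block (a reader's summary of the branch above):
 * its head is pinned at the current node idx, and its tail is either inherited from the tails of
 * the referenced dup_p_ref blocks (when the sub-graph's buffer escapes upward) or pinned at idx
 * as well. The loops below then stretch that interval over the graph's sources and, for a
 * sub-graph, its destinations, so the read-only contents stay valid for the whole run. */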
3458 | for (j = 0; j < source_size; j++) | |||
3459 | _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]); | |||
3460 | /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a | |||
3461 | * sub-graph, mark it to the end of the graph. */ | |||
3462 | if (p_exec_symbol_info) | |||
3463 | for (j = 0; j < destination_size; j++) | |||
3464 | _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]); | |||
3465 | /* If it is read-only, it is self-reflecting. */ | |||
3466 | for (k = 0; k < unroll_count; k++) | |||
3467 | { | |||
3468 | for (j = 0; j < destination_size; j++) | |||
3469 | if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0) | |||
3470 | _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]); | |||
3471 | /* No need to extend the life-time, because this is a sub-graph and we already extended read-only to the end of the destinations. */ | |||
3472 | assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__ ({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p" , "ccv_nnc_symbolic_graph_compile.c", 3472, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3473 | dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size; | |||
3474 | } | |||
3475 | ++tensor_block_size; | |||
3476 | } else { | |||
3477 | ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs; | |||
3478 | const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs); | |||
3479 | const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size); | |||
3480 | // Find suitable tensor block from the free list. | |||
3481 | TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx ].flags & ~0x10) | ANONYMOUS)); | |||
3482 | TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx ].flags & ~0xc) | (s_alloc_prep->buffers[i].flags & 0xc))); | |||
3483 | s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1; | |||
3484 | if (new_anonymous_tensor_block) | |||
3485 | { | |||
3486 | tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type; | |||
3487 | tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem; | |||
3488 | tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size; | |||
3489 | tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0); | |||
3490 | ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx); | |||
3491 | } else { | |||
3492 | tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem; | |||
3493 | tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks [tensor_block_idx].size); typeof (s_alloc_prep->buffers[i] .size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ? _a : _b; }); | |||
3494 | } | |||
3495 | if (dup_p_refs && dup_p_refs->rnum > 0) | |||
3496 | { | |||
3497 | for (j = 0; j < dup_p_refs->rnum; j++) | |||
3498 | { | |||
3499 | const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs )->rsize * (size_t)(j))); | |||
3500 | assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ ( { if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3500, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3501 | assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info ->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph ->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum" , "ccv_nnc_symbolic_graph_compile.c", 3501, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3502 | // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to | |||
3503 | // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object. | |||
3504 | if (tensor_symbol_info[dup_p_ref].p_ref) | |||
3505 | { | |||
3506 | const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1; | |||
3507 | assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if ( p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c" , 3507, __extension__ __PRETTY_FUNCTION__); })); | |||
3508 | const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info); | |||
3509 | if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this. | |||
3510 | { | |||
3511 | if (!tensor_blocks[tensor_block_idx].dup_p_refs) | |||
3512 | tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0); | |||
3513 | ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0); | |||
3514 | } | |||
3515 | } | |||
3516 | assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__ ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail ( "tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c" , 3516, __extension__ __PRETTY_FUNCTION__); })); | |||
3517 | if (!tensor_blocks[tensor_block_idx].tail) | |||
3518 | tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0); | |||
3519 | for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++) | |||
3520 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) + (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t) (k))), tensor_blocks[tensor_block_idx]); | |||
3521 | // We have to add it to the wrap around companion_ref as well. | |||
3522 | // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still | |||
3523 | // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's | |||
3524 | // definition is too free-form, and if we enforce a stronger guarantee on it (such as requiring it to wrap around), that | |||
3525 | // guarantee may break down the line. | |||
3526 | if (tensor_blocks[dup_p_ref].companion_ref) | |||
3527 | { | |||
3528 | const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1; | |||
3529 | for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++) | |||
3530 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data )) + (size_t)(tensor_blocks[companion_ref].head)->rsize * ( size_t)(q))), tensor_blocks[tensor_block_idx]); | |||
3531 | for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++) | |||
3532 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data )) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * ( size_t)(q))), tensor_blocks[tensor_block_idx]); | |||
3533 | } | |||
3534 | } | |||
3535 | } else if (new_anonymous_tensor_block) { | |||
3536 | tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0); | |||
3537 | ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx); | |||
3538 | } | |||
3539 | const int prev_tensor_block_idx = tensor_block_idx; | |||
3540 | if (new_anonymous_tensor_block) | |||
3541 | { | |||
3542 | if (!anonymous_block_free_list) | |||
3543 | anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0); | |||
3544 | ccv_array_push(anonymous_block_free_list, &tensor_block_size); | |||
3545 | ++tensor_block_size; | |||
3546 | } | |||
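/* Reuse sketch: a freshly created read-write anonymous block is registered in
 * anonymous_block_free_list so a later case..of branch can pick it up through
 * _ccv_nnc_anonymous_tensor_block_from_free_list instead of appending yet another block. Only
 * entries recorded before this node (up to anonymous_block_free_list_cap) are eligible, which
 * keeps blocks created for the current node from being handed back to the same node. */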
3547 | for (k = 0; k < unroll_count; k++) | |||
3548 | { | |||
3549 | const int tensor_block_idx = new_anonymous_tensor_block ? | |||
3550 | (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) : | |||
3551 | dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]; | |||
3552 | TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx ].flags & ~0x10) | ANONYMOUS)); | |||
3553 | TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx ].flags & ~0xc) | (s_alloc_prep->buffers[i].flags & 0xc))); | |||
3554 | if (new_anonymous_tensor_block) | |||
3555 | { | |||
3556 | tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type; | |||
3557 | tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem; | |||
3558 | tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size; | |||
3559 | tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0); | |||
3560 | /* Attach to duplicated exec for this tensor block. */ | |||
3561 | ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]); | |||
3562 | } else { | |||
3563 | tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem; | |||
3564 | tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks [tensor_block_idx].size); typeof (s_alloc_prep->buffers[i] .size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ? _a : _b; }); | |||
3565 | _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]); | |||
3566 | ||||
3567 | } | |||
3568 | if (dup_p_refs && dup_p_refs->rnum > 0) | |||
3569 | { | |||
3570 | /* Not nil, not self-reflecting. */ | |||
3571 | for (j = 0; j < dup_p_refs->rnum; j++) | |||
3572 | { | |||
3573 | const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs )->rsize * (size_t)(j))); | |||
3574 | assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ ( { if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3574, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3575 | assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info ->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph ->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum" , "ccv_nnc_symbolic_graph_compile.c", 3575, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3576 | // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to | |||
3577 | // this block's dup_p_refs. It propagates back all the way to the upper layer's buffer object. | |||
3578 | if (tensor_symbol_info[dup_p_ref].p_ref) | |||
3579 | { | |||
3580 | const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1; | |||
3581 | assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if ( p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c" , 3581, __extension__ __PRETTY_FUNCTION__); })); | |||
3582 | const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info); | |||
3583 | if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this. | |||
3584 | { | |||
3585 | if (!tensor_blocks[tensor_block_idx].dup_p_refs) | |||
3586 | tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0); | |||
3587 | ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0); | |||
3588 | } | |||
3589 | } | |||
3590 | assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref [dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref [dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail ("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref" , "ccv_nnc_symbolic_graph_compile.c", 3590, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3591 | const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k]; | |||
3592 | assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0), __extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else __assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c" , 3592, __extension__ __PRETTY_FUNCTION__); })); | |||
3593 | if (!tensor_blocks[tensor_block_idx].tail) | |||
3594 | tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0); | |||
3595 | for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++) | |||
3596 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data )) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * ( size_t)(q))), tensor_blocks[tensor_block_idx]); | |||
3597 | // We have to add it to the wrap around companion_ref as well. | |||
3598 | if (tensor_blocks[dup_dup_p_ref].companion_ref) | |||
3599 | { | |||
3600 | const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1; | |||
3601 | for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++) | |||
3602 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data )) + (size_t)(tensor_blocks[companion_ref].head)->rsize * ( size_t)(q))), tensor_blocks[tensor_block_idx]); | |||
3603 | for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++) | |||
3604 | _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data )) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * ( size_t)(q))), tensor_blocks[tensor_block_idx]); | |||
3605 | } | |||
3606 | } | |||
3607 | } else if (new_anonymous_tensor_block) { | |||
3608 | tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0); | |||
3609 | ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]); | |||
3610 | } | |||
3611 | if (new_anonymous_tensor_block) | |||
3612 | ++tensor_block_size; | |||
3613 | } | |||
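/* Unroll sketch: the loop above mirrors the anonymous block once per unrolled copy. For a new
 * block, the k-th duplicate is appended at tensor_block_size and remembered in
 * dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]; for a reused block, the
 * previously recorded duplicate is looked up there and only grown (size, pin_mem) and given an
 * extra head at the duplicated exec. Head / tail wiring then follows the same dup_p_refs and
 * companion_ref rules as the original block. */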
3614 | } | |||
3615 | } | |||
3616 | } | |||
3617 | } | |||
3618 | } ccv_nnc_graph_visit_endfor} } | |||
3619 | if (anonymous_block_free_list) | |||
3620 | ccv_array_free(anonymous_block_free_list); | |||
3621 | ccfreefree(tensor_fold); | |||
3622 | // It is time to guess the best tensor placement and create the opaque tensor arena. The alloc_dep will return | |||
3623 | // the allocation dependencies, i.e., which tensor reuses which existing tensor's memory. | |||
3624 | ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size); | |||
3625 | prep->while_count_tensor = 0; | |||
3626 | prep->dup_breakpoints = 0; | |||
3627 | prep->p = 0; | |||
3628 | prep->symbolic_graph = symbolic_graph; | |||
3629 | prep->p_idx = symbolic_graph->p_idx; | |||
3630 | prep->exec_idx = symbolic_graph->exec_idx; | |||
3631 | prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0; | |||
3632 | prep->sub_preps = sub_preps; | |||
3633 | prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum; | |||
3634 | prep->exec_symbol_info = exec_symbol_info; | |||
3635 | prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum; | |||
3636 | prep->tensor_symbol_info = tensor_symbol_info; | |||
3637 | prep->unroll_count = unroll_count; | |||
3638 | prep->dup_tensor_block_ref = dup_tensor_block_ref; | |||
3639 | prep->tensor_block_size = tensor_block_size; | |||
3640 | prep->tensor_blocks = tensor_blocks; | |||
3641 | prep->exec_flags = exec_flags; | |||
3642 | prep->visit = visit; | |||
3643 | prep->alloc_prep = alloc_prep; | |||
3644 | if (dup_graph) | |||
3645 | ccv_nnc_symbolic_graph_free(dup_graph); | |||
3646 | if (dup_exec_ref) | |||
3647 | ccfreefree(dup_exec_ref); | |||
3648 | return prep; | |||
3649 | } | |||
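/* Ownership note (matching _ccv_nnc_symbolic_graph_prep_free below): the returned prep keeps
 * tensor_blocks, exec_flags, exec_symbol_info, tensor_symbol_info, dup_tensor_block_ref, the
 * visit and the alloc_prep alive. The temporary dup_graph and dup_exec_ref used for unrolling
 * are released here, and exec_dep has already been consumed by
 * _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep. */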
3650 | ||||
3651 | static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep) | |||
3652 | { | |||
3653 | int i; | |||
3654 | _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size); | |||
3655 | ccfreefree(prep->exec_flags); | |||
3656 | for (i = 0; i < prep->sub_prep_size; i++) | |||
3657 | if (prep->sub_preps[i]) | |||
3658 | _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]); | |||
3659 | if (prep->sub_preps) | |||
3660 | ccfreefree(prep->sub_preps); | |||
3661 | ccfreefree(prep->tensor_symbol_info); | |||
3662 | ccfreefree(prep->exec_symbol_info); | |||
3663 | if (prep->dup_tensor_block_ref) | |||
3664 | ccfreefree(prep->dup_tensor_block_ref); | |||
3665 | _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep); | |||
3666 | ccv_nnc_graph_visit_free(prep->visit); | |||
3667 | ccfreefree(prep); | |||
3668 | } | |||
3669 | ||||
3670 | static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep) | |||
3671 | { | |||
3672 | int i, j; | |||
3673 | ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)-> size; _i_++) { const int idx __attribute__((unused)) = (graph_prep ->visit)->node[_i_].index; const int _node_unused_ __attribute__ ((unused)) = (graph_prep->visit)->node[_i_].term; typeof ((graph_prep->exec_symbol_info)) const node __attribute__ ((unused)) = (graph_prep->exec_symbol_info) + idx; { | |||
3674 | if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) | |||
3675 | { | |||
3676 | const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node )->_inline_graph_ref)[0] - 1; | |||
3677 | assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ ( { if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3677, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3678 | ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref]; | |||
3679 | for (i = 0; i < node->p_while.input_size; i++) | |||
3680 | if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe)) | |||
3681 | { | |||
3682 | ccv_nnc_symbolic_graph_prep_t* prep = sub_prep; | |||
3683 | const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4); | |||
3684 | for (j = 0; j < d; j++) | |||
3685 | prep = prep->p; | |||
3686 | prep->while_count_tensor = 1; | |||
3687 | } | |||
3688 | } | |||
3689 | for (i = 0; i < node->graph_ref_size; i++) | |||
3690 | { | |||
3691 | const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node )->_inline_graph_ref)[i] - 1; | |||
3692 | if (graph_ref >= 0) | |||
3693 | _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]); | |||
3694 | } | |||
3695 | } ccv_nnc_graph_visit_endfor} } | |||
3696 | } | |||
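/* Marking sketch: for every while node, each input flagged as a while-count tensor symbol encodes
 * how many levels up its owning graph sits; the walk above starts at the sub-graph's prep,
 * follows prep->p that many times, and sets while_count_tensor on the prep it lands on. The
 * recursion at the bottom then repeats this for every referenced sub-graph. */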
3697 | ||||
3698 | static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol) | |||
3699 | { | |||
3700 | if (symbol >= 0) | |||
3701 | return graph_prep->tensor_arena->vt_tensors[symbol]; | |||
3702 | if (symbol == CCV_NNC_NO_TENSOR_SYMBOL) | |||
3703 | return 0; | |||
3704 | assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1 : 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) == 0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)" , "ccv_nnc_symbolic_graph_compile.c", 3704, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3705 | const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep; | |||
3706 | int i; | |||
3707 | const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4); | |||
3708 | for (i = 0; i < d; i++) | |||
3709 | prep = prep->p; | |||
3710 | assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__ ({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor" , "ccv_nnc_symbolic_graph_compile.c", 3710, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3711 | return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1); | |||
3712 | } | |||
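/* Worked example of the encoding consumed above (derived from the two macros used here,
 * CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL and CCV_NNC_DECODE_WHILE_COUNT_SYMBOL): a while-count
 * symbol always carries 0xe in its low nibble and d = (~symbol) >> 4 selects the graph. So
 * (int)0xfffffffe decodes to d == 0 (this graph's own while count) and (int)0xffffffee decodes
 * to d == 1 (the parent's); prep->p is then followed d times before the tensor is fetched from
 * that prep's metadata pool. */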
3713 | ||||
3714 | static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena) | |||
3715 | { | |||
3716 | int i; | |||
3717 | int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum); | |||
3718 | ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum); | |||
3719 | graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d]; | |||
3720 | graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d]; | |||
3721 | ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs; | |||
3722 | for (i = 0; i < graph_exec_arena->graph_exec_size; i++) | |||
3723 | if (graph_execs[i].graph == graph) | |||
3724 | graph_execs[i].d = exec_cvt[graph_execs[i].d]; | |||
3725 | ccfreefree(exec_cvt); | |||
3726 | } | |||
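/* Remap sketch: ccv_nnc_graph_topsort topologically sorts the concrete graph and reports the
 * old-index -> new-index mapping in exec_cvt; the arena's cached source, destination and every
 * graph_exec handle belonging to this graph (sub-arena handles carry their own graph pointer)
 * are translated through that table so they stay valid after the sort. */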
3727 | ||||
3728 | static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena) | |||
3729 | { | |||
3730 | int i, j, k; | |||
3731 | ccv_nnc_graph_t* const graph = graph_prep->graph; | |||
3732 | const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum; | |||
3733 | ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1)); | |||
3734 | graph_exec_arena->graph_ref = (intptr_t)symbolic_graph; | |||
3735 | graph_exec_arena->graph_exec_size = exec_symbol_info_size; | |||
3736 | graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size; | |||
3737 | graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size); | |||
3738 | memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size); | |||
3739 | ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs; | |||
3740 | int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0; | |||
3741 | for (i = 0; i < exec_symbol_info_size; i++) | |||
3742 | { | |||
3743 | max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep ->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info [i].input_size); (_a > _b) ? _a : _b; }); | |||
3744 | max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep ->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info [i].output_size); (_a > _b) ? _a : _b; }); | |||
3745 | if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE) | |||
3746 | max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep ->exec_symbol_info[i].p_while.input_size) _b = (graph_prep ->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a : _b; }); | |||
3747 | graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL; | |||
3748 | graph_execs[i].graph = 0; | |||
3749 | } | |||
3750 | for (i = 0; i < graph_prep->sub_prep_size; i++) | |||
3751 | max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof ((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph ->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs )->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t **)((void*)(((char*)((symbolic_graph->sub_graphs)->data )) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t )(i))))->breakpoint_size); (_a > _b) ? _a : _b; }); | |||
3752 | ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size ); (_a > _b) ? _a : _b; })]; | |||
3753 | ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size ); (_a > _b) ? _a : _b; })]; | |||
3754 | ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size ); (_a > _b) ? _a : _b; })]; | |||
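/* Scratch sketch: max_inputs, max_outputs and max_breakpoints are stack VLAs sized by the largest
 * input, output and breakpoint counts seen across all nodes (and never smaller than 1), so the
 * visit below can refill the same scratch arrays for every node instead of allocating per exec. */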
3755 | const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info; | |||
3756 | const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags; | |||
3757 | // Create nodes; this is in topological order. | |||
3758 | ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)-> size; _i_++) { const int idx __attribute__((unused)) = (graph_prep ->visit)->node[_i_].index; const int _node_unused_ __attribute__ ((unused)) = (graph_prep->visit)->node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info ) + idx; { | |||
3759 | if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0)) | |||
3760 | { | |||
3761 | for (i = 0; i < node->input_size; i++) | |||
3762 | max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]); | |||
3763 | for (i = 0; i < node->output_size; i++) | |||
3764 | max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0; | |||
3765 | if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) | |||
3766 | { | |||
3767 | const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node )->_inline_graph_ref)[0] - 1; | |||
3768 | assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ ( { if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3768, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3769 | ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref]; | |||
3770 | ccv_nnc_graph_t* const sub_graph = sub_prep->graph; | |||
3771 | graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph); | |||
3772 | const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t) (graph_ref))); | |||
3773 | ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]); | |||
3774 | ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size); | |||
3775 | for (i = 0; i < node->p_while.input_size; i++) | |||
3776 | max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]); | |||
3777 | for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++) | |||
3778 | max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]); | |||
3779 | ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size); | |||
3780 | _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena); | |||
3781 | } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) { | |||
3782 | for (i = 0; i < node->output_size; i++) | |||
3783 | if (max_outputs[i] && max_outputs[i]->alias_ref) | |||
3784 | max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref; | |||
3785 | graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size); | |||
3786 | // Check whether this is already covered in the inputs; if not, it needs to be covered in the update. | |||
3787 | for (i = 0; i < node->case_of.argument.offset; i++) | |||
3788 | { | |||
3789 | ccv_nnc_tensor_t* const update = max_inputs[i]; | |||
3790 | if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor. | |||
3791 | continue; | |||
3792 | int flag = 0; | |||
3793 | for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++) | |||
3794 | flag = (update == max_inputs[j]); | |||
3795 | if (!flag) | |||
3796 | ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update); | |||
3797 | } | |||
3798 | const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0; | |||
3799 | ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset); | |||
3800 | if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) | |||
3801 | { | |||
3802 | // Add another graph for data transfer. | |||
3803 | ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new(); | |||
3804 | for (i = 0; i < node->output_size; i++) | |||
3805 | max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0; | |||
3806 | ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof (node->output_size) _b = (node->output_size); (_a < _b) ? _a : _b; }), max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof (node->output_size) _b = (node->output_size); (_a < _b) ? _a : _b; })); | |||
3807 | ccv_nnc_graph_set_sources(sub_graph, &io, 1); | |||
3808 | ccv_nnc_graph_set_destinations(sub_graph, &io, 1); | |||
3809 | ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0); | |||
3810 | int exec_cvt; | |||
3811 | ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1); | |||
3812 | } | |||
3813 | for (i = 0; i < node->graph_ref_size; i++) | |||
3814 | { | |||
3815 | const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node )->_inline_graph_ref)[i] - 1; | |||
3816 | if (graph_ref < 0) | |||
3817 | continue; | |||
3818 | ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph; | |||
3819 | const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t) (graph_ref))); | |||
3820 | ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]); | |||
3821 | ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset); | |||
3822 | _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena); | |||
3823 | } | |||
3824 | } else { | |||
3825 | graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size); | |||
3826 | } | |||
3827 | ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0); | |||
3828 | } | |||
3829 | } ccv_nnc_graph_visit_endfor} } | |||
3830 | // Then connect them. | |||
3831 | ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)-> size; _i_++) { const int idx __attribute__((unused)) = (graph_prep ->visit)->node[_i_].index; const int _node_unused_ __attribute__ ((unused)) = (graph_prep->visit)->node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info ) + idx; { | |||
3832 | if (node->outgoings) | |||
3833 | for (i = 0; i < node->outgoings->rnum; i++) | |||
3834 | { | |||
3835 | const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)( node->outgoings)->rsize * (size_t)(i))); | |||
3836 | if (graph_execs[outgoing].graph) | |||
3837 | ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]); | |||
3838 | } | |||
3839 | } ccv_nnc_graph_visit_endfor} } | |||
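/* Edge sketch: an outgoing symbol is only concatenated when its exec was actually materialized
 * above (graph_execs[outgoing].graph stays 0 for symbols the visit never created), so dangling
 * outgoings are silently skipped. */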
3840 | int source_exec_created = 0; | |||
3841 | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info; | |||
3842 | const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks; | |||
3843 | ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep; | |||
3844 | // After the graph is materialized, we need to handle the case that some of these tensors need to be initialized to zero before use. | |||
3845 | for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) | |||
3846 | { | |||
3847 | if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS ) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES ))) | |||
3848 | { | |||
3849 | int ref = i; | |||
3850 | while (tensor_symbol_info[ref].alias_ref) | |||
3851 | ref = tensor_symbol_info[ref].alias_ref - 1; | |||
3852 | while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && ! ((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref) | |||
3853 | ref = tensor_blocks[ref].ref - 1; | |||
3854 | // This is not computable. It could be that we marked a const tensor as init zero. | |||
3855 | if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && ! ((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))) | |||
3856 | continue; | |||
3857 | // If this tensor is not used by any exec, we don't need to init at all. Skip. | |||
3858 | if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0) | |||
3859 | continue; | |||
3860 | ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref]; | |||
3861 | // Now that we have the original tensor, we can get the actual tensor and construct the set command. | |||
3862 | ccv_nnc_graph_exec_t set_exec; | |||
3863 | if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) | |||
3864 | set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size ={.dim={1,1,1}},.blas={.a={0,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1); | |||
3865 | else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES) | |||
3866 | set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size ={.dim={1,1,1}},.blas={.a={1,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1); | |||
3867 | for (j = 0; j < tensor_blocks[ref].head->rnum; j++) | |||
3868 | { | |||
3869 | const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t )(tensor_blocks[ref].head)->rsize * (size_t)(j))); | |||
3870 | if (outgoing >= exec_symbol_info_size) | |||
3871 | continue; | |||
3872 | assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({ if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3872, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3873 | assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__ ({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph" , "ccv_nnc_symbolic_graph_compile.c", 3873, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3874 | ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]); | |||
3875 | } | |||
3876 | int flags = 0; | |||
3877 | if (alloc_dep[ref]) | |||
3878 | for (j = 0; j < alloc_dep[ref]->rnum; j++) | |||
3879 | { | |||
3880 | const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep [ref])->rsize * (size_t)(j))); | |||
3881 | // This is from alloc_dep; it should be computable. | |||
3882 | assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS ) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED ))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags & 0x3) == ALIAS) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])" , "ccv_nnc_symbolic_graph_compile.c", 3882, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3883 | if (tensor_blocks[d].tail) | |||
3884 | for (k = 0; k < tensor_blocks[d].tail->rnum; k++) | |||
3885 | { | |||
3886 | const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t )(tensor_blocks[d].tail)->rsize * (size_t)(k))); | |||
3887 | if (incoming >= exec_symbol_info_size) | |||
3888 | continue; | |||
3889 | assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({ if (incoming >= 0) ; else __assert_fail ("incoming >= 0" , "ccv_nnc_symbolic_graph_compile.c", 3889, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3890 | assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__ ({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph" , "ccv_nnc_symbolic_graph_compile.c", 3890, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3891 | ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec); | |||
3892 | flags = 1; | |||
3893 | } | |||
3894 | } | |||
3895 | // If we cannot find a start node for this exec, we need to append it to the no-op at the start. | |||
3896 | if (!flags) | |||
3897 | { | |||
3898 | if (!source_exec_created) | |||
3899 | { | |||
3900 | graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0); | |||
3901 | source_exec_created = 1; | |||
3902 | } | |||
3903 | ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec); | |||
3904 | } | |||
3905 | } | |||
3906 | } | |||
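/* Init wiring sketch (summary of the loop above): every tensor flagged INIT_ZEROS / INIT_ONES
 * that resolves to a computable, actually-used block gets one SET_FORWARD exec writing 0 or 1
 * into it. That exec is ordered before every head exec of the block, and after the tail execs of
 * any block whose memory this one reuses (taken from alloc_dep), so the fill cannot clobber a
 * still-live predecessor; when no such predecessor exists it hangs off the phony source no-op. */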
3907 | // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views | |||
3908 | // (we need that if one is not associated as an input / output of any exec; this is possible if all execs associate | |||
3909 | // with its alias). | |||
3910 | assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep ->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if ( tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size ) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size" , "ccv_nnc_symbolic_graph_compile.c", 3910, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3911 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
3912 | { | |||
3913 | ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i]; | |||
3914 | // If it is a multi-view tensor, inspect all its head nodes to see whether we already associated it with the node. | |||
3915 | if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) | |||
3916 | { | |||
3917 | const ccv_array_t* const head = tensor_blocks[i].head; | |||
3918 | if (head && head->rnum > 0) | |||
3919 | for (j = 0; j < head->rnum; j++) | |||
3920 | { | |||
3921 | const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize * (size_t)(j))); | |||
3922 | if (idx >= exec_symbol_info_size) | |||
3923 | continue; | |||
3924 | assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if ( idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c" , 3924, __extension__ __PRETTY_FUNCTION__); })); | |||
3925 | const int d = graph_execs[idx].d; | |||
3926 | ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t) (graph->exec_info)->rsize * (size_t)(d))); | |||
3927 | int flag = 0; | |||
3928 | if (exec_info->tensor_wraps_ref) | |||
3929 | { | |||
3930 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t )(graph->tensor_wraps)->rsize * (size_t)(exec_info-> tensor_wraps_ref - 1))); | |||
3931 | for (k = 0; k < tensor_wrap_array->size && !flag; k++) | |||
3932 | flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv); | |||
3933 | } | |||
3934 | // If it is not found in any of the tensor wraps, it needs to be included in the broadcast. | |||
3935 | if (!flag) | |||
3936 | ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv); | |||
3937 | } | |||
3938 | } | |||
3939 | } | |||
3940 | // Create source / destination phony nodes. This is to facilitate use of the compiled graph. | |||
3941 | // Also, this is needed if you have init-zero execs. | |||
3942 | if (source_exec_created || source_size > 1) | |||
3943 | { | |||
3944 | if (!source_exec_created) | |||
3945 | graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0); | |||
3946 | for (i = 0; i < source_size; i++) | |||
3947 | ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]); | |||
3948 | } else { | |||
3949 | assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__ ({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created" , "ccv_nnc_symbolic_graph_compile.c", 3949, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3950 | assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({ if (source_size == 1) ; else __assert_fail ("source_size == 1" , "ccv_nnc_symbolic_graph_compile.c", 3950, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3951 | graph_exec_arena->source = graph_execs[sources[0].d]; | |||
3952 | } | |||
3953 | if (destination_size == 1) | |||
3954 | graph_exec_arena->destination = graph_execs[destinations[0].d]; | |||
3955 | else { | |||
3956 | graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0); | |||
3957 | for (i = 0; i < destination_size; i++) | |||
3958 | ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination); | |||
3959 | } | |||
3960 | ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1); | |||
3961 | ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1); | |||
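/* Entry / exit sketch: the phony NOOP source and destination created above give the materialized
 * graph exactly one entry and one exit exec, even when the symbolic graph has several sources or
 * destinations or when init execs were added, and those two handles are what
 * ccv_nnc_graph_set_sources / ccv_nnc_graph_set_destinations were just fed. */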
3962 | return graph_exec_arena; | |||
3963 | } | |||
3964 | ||||
3965 | static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair) | |||
3966 | { | |||
3967 | if (graph_prep->symbolic_graph == pair) | |||
3968 | return graph_prep->graph; | |||
3969 | int i; | |||
3970 | for (i = 0; i < graph_prep->sub_prep_size; i++) | |||
3971 | if (graph_prep->sub_preps[i]) | |||
3972 | { | |||
3973 | ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair); | |||
3974 | if (graph) | |||
3975 | return graph; | |||
3976 | } | |||
3977 | return 0; | |||
3978 | } | |||
3979 | ||||
3980 | static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep) | |||
3981 | { | |||
3982 | int i; | |||
3983 | for (i = 0; i < graph_prep->sub_prep_size; i++) | |||
3984 | if (graph_prep->sub_preps[i]) | |||
3985 | { | |||
3986 | if (graph_prep->sub_preps[i]->symbolic_graph->pair) | |||
3987 | graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair); | |||
3988 | } | |||
3989 | } | |||
3990 | ||||
3991 | static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena) | |||
3992 | { | |||
3993 | assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t) graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if ( graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph ) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph" , "ccv_nnc_symbolic_graph_compile.c", 3993, __extension__ __PRETTY_FUNCTION__ ); })); | |||
3994 | int i; | |||
3995 | for (i = 0; i < graph_prep->exec_symbol_info_size; i++) | |||
3996 | { | |||
3997 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | |||
3998 | continue; | |||
3999 | if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref) | |||
4000 | { | |||
4001 | ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){ | |||
4002 | .d = graph_prep->exec_symbol_info[i].pair_ref - 1, | |||
4003 | .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph, | |||
4004 | }); | |||
4005 | if (pair_exec.d >= 0) | |||
4006 | ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec); | |||
4007 | } | |||
4008 | } | |||
4009 | for (i = 0; i < graph_prep->sub_prep_size; i++) | |||
4010 | if (graph_prep->sub_preps[i]) | |||
4011 | _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]); | |||
4012 | } | |||
4013 | ||||
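// Remove the duplicated breakpoint symbols inserted during preparation, then regenerate exec_symbol_info, rebuild the visit order and re-run symbol inference; recurses into the sub-preps reachable from the visit.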
4014 | static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep) | |||
4015 | { | |||
4016 | int i; | |||
4017 | if (graph_prep->dup_breakpoints) | |||
4018 | { | |||
4019 | // Stripping the const modifier is only possible because this is a sub-graph. | |||
4020 | ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph; | |||
4021 | for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++) | |||
4022 | ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)); | |||
4023 | ccv_array_free(graph_prep->dup_breakpoints); | |||
4024 | graph_prep->dup_breakpoints = 0; | |||
4025 | graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum; | |||
4026 | // Afterwards, we have to regenerate the exec_symbol_info and fill in the information (through symbol_infer). | |||
4027 | memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size); | |||
4028 | // Since exec_symbol_info changed, create a new visit object. | |||
4029 | assert(symbolic_graph->sources); | |||
4030 | assert(symbolic_graph->destinations); | |||
4031 | ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0); | |||
4032 | const int source_size = symbolic_graph->sources->rnum; | |||
4033 | ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0); | |||
4034 | const int destination_size = symbolic_graph->destinations->rnum; | |||
4035 | ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0); | |||
4036 | ccv_nnc_graph_visit_free(graph_prep->visit); | |||
4037 | graph_prep->visit = visit; | |||
4038 | assert(graph_prep->p); | |||
4039 | ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info); | |||
4040 | } | |||
4041 | ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) { | |||
4042 | for (i = 0; i < node->graph_ref_size; i++) | |||
4043 | { | |||
4044 | const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1; | |||
4045 | if (graph_ref >= 0) | |||
4046 | _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]); | |||
4047 | } | |||
4048 | } ccv_nnc_graph_visit_endfor | |||
4049 | } | |||
4050 | ||||
4051 | const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {}; | |||
4052 | ||||
4053 | void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref) | |||
4054 | { | |||
4055 | assert(graph_ref); | |||
4056 | assert(tensor_arena_ref); | |||
4057 | assert(graph_exec_arena_ref); | |||
4058 | int i; | |||
4059 | // Cannot bind the multi-view. | |||
4060 | for (i = 0; i < tensor_bind_size; i++) | |||
4061 | { | |||
4062 | assert(tensor_binds[i].tensor); | |||
4063 | assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)); | |||
4064 | } | |||
4065 | ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0); | |||
4066 | _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep); | |||
4067 | ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size); | |||
4068 | _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena); | |||
4069 | *tensor_arena_ref = tensor_arena; | |||
4070 | // The above handled tensor allocation; now we need to materialize the graph from symbolic to real. | |||
4071 | _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep); | |||
4072 | // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up. | |||
4073 | _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep); | |||
4074 | *graph_ref = graph_prep->graph; | |||
4075 | ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena); | |||
4076 | _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena); | |||
4077 | _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena); | |||
4078 | *graph_exec_arena_ref = graph_exec_arena; | |||
4079 | _ccv_nnc_symbolic_graph_prep_free(graph_prep); | |||
4080 | } | |||
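/* Editorial note: a minimal usage sketch of the compile entry point above, assuming a
 * populated `symbolic_graph` whose sources and destinations were already generated (e.g. by
 * ccv_nnc_graph_exec_symbol_autogen), and assuming the SYMBOLIC_GRAPH_SOURCES /
 * SYMBOLIC_GRAPH_DESTINATIONS and TRAVERSE_FULL convenience macros from ccv_nnc_easy.h:
 *
 *   ccv_nnc_graph_t* graph = 0;
 *   ccv_nnc_tensor_arena_t* tensor_arena = 0;
 *   ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
 *   ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
 *     0, 0, // no pre-bound tensors
 *     0, 0, // no extra outputs to keep alive
 *     SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph),
 *     &graph, &tensor_arena, &graph_exec_arena);
 *   ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0); // single source / destination set above
 *   ccv_nnc_graph_free(graph);
 *   ccv_nnc_tensor_arena_free(tensor_arena);
 *   ccv_nnc_graph_exec_arena_free(graph_exec_arena);
 */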
4081 | ||||
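// Free the arena bookkeeping: sub-arenas, multiview tensors, metadata and the on-demand binding tables. The underlying buffers are released separately (see ccv_nnc_tensor_arena_buffer_free below).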
4082 | static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena) | |||
4083 | { | |||
4084 | // Buffers are inherited from above, no need to dealloc. | |||
4085 | int i; | |||
4086 | for (i = 0; i < tensor_arena->sub_arena_size; i++) | |||
4087 | if (tensor_arena->sub_arenas[i]) | |||
4088 | _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]); | |||
4089 | for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++) | |||
4090 | { | |||
4091 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)); | |||
4092 | assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv)); | |||
4093 | ccv_nnc_tensor_multiview_free(*mv); | |||
4094 | } | |||
4095 | ccv_array_free(tensor_arena->tensor_metadata); | |||
4096 | ccv_array_free(tensor_arena->m_tensor_idx); | |||
4097 | if (tensor_arena->pb_vt_tensors) | |||
4098 | ccfree(tensor_arena->pb_vt_tensors); | |||
4099 | if (tensor_arena->vt_alias_r_refs_p) | |||
4100 | ccfree(tensor_arena->vt_alias_r_refs_p); | |||
4101 | if (tensor_arena->vt_sizes) | |||
4102 | ccfree(tensor_arena->vt_sizes); | |||
4103 | ccfree(tensor_arena); | |||
4104 | } | |||
4105 | ||||
4106 | void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor) | |||
4107 | { | |||
4108 | assert(tensor_arena->graph_ref == (intptr_t)symbol.graph); | |||
4109 | assert(symbol.d < tensor_arena->vt_tensor_size); | |||
4110 | assert(symbol.d >= 0); | |||
4111 | // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method. | |||
4112 | int i; | |||
4113 | if (!tensor_arena->pb_vt_tensors) | |||
4114 | { | |||
4115 | tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t)); | |||
4116 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
4117 | if (tensor_arena->vt_tensors[i]) | |||
4118 | tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data; | |||
4119 | } | |||
4120 | if (!tensor_arena->vt_alias_r_refs_p) | |||
4121 | { | |||
4122 | tensor_arena->vt_alias_r_refs_p = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int)); | |||
4123 | tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size; | |||
4124 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
4125 | if (tensor_arena->vt_alias_refs[i]) | |||
4126 | { | |||
4127 | const int alias_ref = tensor_arena->vt_alias_refs[i] - 1; | |||
4128 | assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size); | |||
4129 | ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many aliases there are. | |||
4130 | } | |||
4131 | int refp = 0; | |||
4132 | for (i = 1; i < tensor_arena->vt_tensor_size; i++) // Allocate, for each tensor that has aliases, its position on vt_alias_r_refs. It points to the end of its range. | |||
4133 | if (tensor_arena->vt_alias_r_refs_p[i]) | |||
4134 | refp = (tensor_arena->vt_alias_r_refs_p[i] += refp); | |||
4135 | else | |||
4136 | tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs. | |||
4137 | for (i = refp; i < tensor_arena->vt_tensor_size; i++) | |||
4138 | tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated. | |||
4139 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
4140 | if (tensor_arena->vt_alias_refs[i]) | |||
4141 | { | |||
4142 | const int alias_ref = tensor_arena->vt_alias_refs[i] - 1; | |||
4143 | assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size); | |||
4144 | const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref]; | |||
4145 | assert(pos >= 0); | |||
4146 | tensor_arena->vt_alias_r_refs[pos] = i; | |||
4147 | } | |||
4148 | } | |||
4149 | const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d; | |||
4150 | if (CCV_IS_TENSOR_VIEW(tensor)) | |||
4151 | { | |||
4152 | assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications. | |||
4153 | assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && | |||
4154 | ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || | |||
4155 | (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); | |||
4156 | } else | |||
4157 | { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); } | |||
4158 | if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d])) | |||
4159 | { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); } | |||
4160 | tensor_arena->vt_tensors[symbol_d]->data = tensor->data; | |||
4161 | if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0) | |||
4162 | for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++) | |||
4163 | { | |||
4164 | const int d = tensor_arena->vt_alias_r_refs[i]; | |||
4165 | if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it. | |||
4166 | break; | |||
4167 | ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d]; | |||
4168 | d_tensor->info.datatype = tensor->info.datatype; | |||
4169 | d_tensor->info.reserved = tensor->info.reserved; | |||
4170 | if (CCV_IS_TENSOR_VIEW(d_tensor)) | |||
4171 | ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof); | |||
4172 | else { | |||
4173 | d_tensor->data.u8 = tensor->data.u8; | |||
4174 | d_tensor->dataof = tensor->dataof; | |||
4175 | } | |||
4176 | } | |||
4177 | } | |||
4178 | ||||
4179 | void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena) | |||
4180 | { | |||
4181 | if (!tensor_arena->pb_vt_tensors) | |||
4182 | return; | |||
4183 | int i; | |||
4184 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
4185 | if (tensor_arena->vt_tensors[i]) | |||
4186 | tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i]; | |||
4187 | } | |||
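/* Editorial note: a hedged sketch of rebinding with the two functions above. It assumes the
 * compile sketch earlier, a tensor symbol `input_symbol` on the same symbolic graph, and the
 * CPU_TENSOR_NHWC helper from ccv_nnc_easy.h; names and shapes are illustrative only:
 *
 *   ccv_nnc_tensor_t* const replacement = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128), 0);
 *   ccv_nnc_tensor_bind_symbol(tensor_arena, input_symbol, replacement); // swap in new storage
 *   ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
 *   ccv_nnc_tensor_arena_clear_bindings(tensor_arena); // restore pointers captured at first bind
 *   ccv_nnc_tensor_free(replacement);
 */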
4188 | ||||
4189 | uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena) | |||
4190 | { | |||
4191 | uint64_t total_size = 0; | |||
4192 | int i; | |||
4193 | for (i = 0; i < tensor_arena->buffer_size; i++) | |||
4194 | total_size += tensor_arena->buffers[i].size; | |||
4195 | return total_size; | |||
4196 | } | |||
4197 | ||||
4198 | static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params) | |||
4199 | { | |||
4200 | int i; | |||
4201 | if (mv->it) | |||
4202 | mv->it->info = params; | |||
4203 | for (i = 0; i < mv->repeat + mv->kind; i++) | |||
4204 | { | |||
4205 | ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i]; | |||
4206 | if (CCV_IS_TENSOR_MULTIVIEW(tensor)) | |||
4207 | _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params); | |||
4208 | else | |||
4209 | tensor->info = params; | |||
4210 | } | |||
4211 | } | |||
4212 | ||||
4213 | int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph) | |||
4214 | { | |||
4215 | int i; | |||
4216 | assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size); | |||
4217 | if (!tensor_arena->vt_sizes) // Keep the original size so we can check against it to see if we will overflow. | |||
4218 | { | |||
4219 | tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size); | |||
4220 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
4221 | if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i]) | |||
4222 | { | |||
4223 | ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i]; | |||
4224 | if (CCV_IS_TENSOR_MULTIVIEW(tensor)) | |||
4225 | { | |||
4226 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor; | |||
4227 | while (CCV_IS_TENSOR_MULTIVIEW(mv)) | |||
4228 | mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]); | |||
4229 | tensor = (ccv_nnc_tensor_t*)mv; | |||
4230 | } | |||
4231 | tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info); | |||
4232 | } | |||
4233 | } | |||
4234 | int flag = 0; | |||
4235 | for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++) | |||
4236 | if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i]) | |||
4237 | { | |||
4238 | ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i); | |||
4239 | ccv_nnc_tensor_param_t params = symbol_info->info; | |||
4240 | params.datatype = tensor_arena->vt_tensors[i]->info.datatype; | |||
4241 | params.reserved = tensor_arena->vt_tensors[i]->info.reserved; | |||
4242 | flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params)); | |||
4243 | } | |||
4244 | if (flag) | |||
4245 | return -1; | |||
4246 | for (i = 0; i < tensor_arena->vt_tensor_size; i++) | |||
4247 | if (tensor_arena->vt_tensors[i]) | |||
4248 | { | |||
4249 | ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i); | |||
4250 | ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i]; | |||
4251 | if (CCV_IS_TENSOR_MULTIVIEW(tensor)) | |||
4252 | { | |||
4253 | assert(!tensor_arena->vt_alias_refs[i]); | |||
4254 | _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info); | |||
4255 | } else if (!tensor_arena->vt_alias_refs[i]) { | |||
4256 | ccv_nnc_tensor_param_t params = symbol_info->info; | |||
4257 | params.datatype = tensor->info.datatype; | |||
4258 | params.reserved = tensor->info.reserved; | |||
4259 | tensor->info = params; | |||
4260 | } else { | |||
4261 | off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs); | |||
4262 | ccv_nnc_tensor_param_t params = symbol_info->info; | |||
4263 | params.datatype = tensor->info.datatype; | |||
4264 | params.reserved = tensor->info.reserved; | |||
4265 | tensor->info = params; | |||
4266 | const int alias_ref = tensor_arena->vt_alias_refs[i] - 1; | |||
4267 | ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof); | |||
4268 | if (CCV_IS_TENSOR_VIEW(tensor)) | |||
4269 | ((ccv_nnc_tensor_view_t*)tensor)->off = off; | |||
4270 | } | |||
4271 | } | |||
4272 | // Should handle sub_tensor_arena; we don't do that at the moment. | |||
4273 | assert(!graph->sub_graphs); | |||
4274 | return 0; | |||
4275 | } | |||
4276 | ||||
4277 | void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph) | |||
4278 | { | |||
4279 | assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size); | |||
4280 | int i; | |||
4281 | for (i = 0; i < graph_exec_arena->graph_exec_size; i++) | |||
4282 | { | |||
4283 | const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i]; | |||
4284 | if (graph_exec.d < 0) | |||
4285 | continue; | |||
4286 | const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec); | |||
4287 | const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i); | |||
4288 | ccv_nnc_cmd_t new_cmd = symbol_info->cmd; | |||
4289 | if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, carry over the backend and algorithm from the existing one, which has presumably been autotuned. | |||
4290 | { | |||
4291 | new_cmd.backend = existing_cmd.backend; | |||
4292 | new_cmd.algorithm = existing_cmd.algorithm; | |||
4293 | } | |||
4294 | ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd); | |||
4295 | } | |||
4296 | } | |||
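/* Editorial note: a hedged sketch of re-initialization with the two functions above, for the
 * case where tensor shapes on `symbolic_graph` were updated in place and no tensor grew past
 * its originally allocated size; all names carry over from the earlier sketches:
 *
 *   if (ccv_nnc_tensor_arena_reinit(tensor_arena, symbolic_graph) == 0)
 *   {
 *     ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph);
 *     ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
 *   }
 *   // A non-zero return means some tensor outgrew its allocation; recompile instead.
 */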
4297 | ||||
4298 | void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena) | |||
4299 | { | |||
4300 | int i; | |||
4301 | for (i = 0; i < tensor_arena->buffer_size; i++) | |||
4302 | { | |||
4303 | if (!tensor_arena->buffers[i].ptr) | |||
4304 | continue; | |||
4305 | const int buffer_type = tensor_arena->buffers[i].type; | |||
4306 | const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type); | |||
4307 | #ifdef HAVE_CUDA | |||
4308 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type); | |||
4309 | if (memory_type == CCV_TENSOR_GPU_MEMORY) | |||
4310 | { | |||
4311 | if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free) | |||
4312 | tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free); | |||
4313 | else | |||
4314 | cufree(device_id, tensor_arena->buffers[i].ptr); | |||
4315 | } else { | |||
4316 | assert(memory_type == CCV_TENSOR_CPU_MEMORY); | |||
4317 | if (tensor_arena->buffers[i].pin_mem) | |||
4318 | cuhostfree(tensor_arena->buffers[i].ptr); | |||
4319 | else | |||
4320 | ccfree(tensor_arena->buffers[i].ptr); | |||
4321 | } | |||
4322 | #elif defined(HAVE_MPS) | |||
4323 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type); | |||
4324 | if (memory_type == CCV_TENSOR_GPU_MEMORY) | |||
4325 | { | |||
4326 | // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free) | |||
4327 | // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free); | |||
4328 | // else | |||
4329 | mpheapfree(device_id, tensor_arena->buffers[i].ptr); | |||
4330 | } else { | |||
4331 | assert(memory_type == CCV_TENSOR_CPU_MEMORY); | |||
4332 | ccfree(tensor_arena->buffers[i].ptr); | |||
4333 | } | |||
4334 | #else | |||
4335 | assert(memory_type == CCV_TENSOR_CPU_MEMORY); | |||
4336 | ccfree(tensor_arena->buffers[i].ptr); | |||
4337 | #endif | |||
4338 | tensor_arena->buffers[i].ptr = 0; | |||
4339 | } | |||
4340 | // For now, the life-cycle of the disposers lives with the buffer. It may end before the tensor arena deallocates. | |||
4341 | if (tensor_arena->disposers) | |||
4342 | { | |||
4343 | for (i = 0; i < tensor_arena->disposers->rnum; i++) | |||
4344 | { | |||
4345 | ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i); | |||
4346 | disposer->dispose(disposer->ptr, disposer->userdata); | |||
4347 | } | |||
4348 | ccv_array_free(tensor_arena->disposers); | |||
4349 | tensor_arena->disposers = 0; | |||
4350 | } | |||
4351 | } | |||
4352 | ||||
4353 | void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena) | |||
4354 | { | |||
4355 | ccv_nnc_tensor_arena_buffer_free(tensor_arena); | |||
4356 | _ccv_nnc_tensor_arena_free(tensor_arena); | |||
4357 | } | |||
4358 | ||||
4359 | void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena) | |||
4360 | { | |||
4361 | int i; | |||
4362 | for (i = 0; i < graph_exec_arena->sub_arena_size; i++) | |||
4363 | if (graph_exec_arena->sub_arenas[i]) | |||
4364 | ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]); | |||
4365 | ccfree(graph_exec_arena); | |||
4366 | } |