Bug Summary

File: nnc/ccv_nnc_cmd.c
Warning: line 580, column 27
Potential memory leak
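
The path the analyzer reports (steps 19 through 34 in the annotated source below) follows the buffer allocated at source line 387 inside ccv_nnc_cmd_autotune_key_new: key.inputs is returned to ccv_nnc_cmd_autotune, handed to kh_put at line 571, and freed only on the ret == 0 branch at line 574, so when execution reaches line 580 with ret != 0 the checker considers the allocation potentially lost. One reading is that the failure outcome of kh_put (ret == -1, when the hash table cannot grow) leaves the caller owning key.inputs with no free; whether that path matters in practice is not established by the report itself. The following sketch is a minimal, self-contained illustration of that ownership pattern, using hypothetical stand-ins (toy_key_t, toy_cache_put, toy_autotune) rather than ccv code:

/* Hypothetical, self-contained sketch of the flagged ownership pattern (not ccv code). */
#include <stdlib.h>

typedef struct {
	int* inputs; /* stand-in for ccv_nnc_cmd_autotune_key_t.inputs */
} toy_key_t;

/* Stand-in for kh_put: returns 1 when the cache takes ownership of key.inputs,
 * and -1 when insertion fails and the caller still owns the buffer. */
static int toy_cache_put(toy_key_t key)
{
	return key.inputs ? 1 : -1;
}

static int toy_autotune(int input_size)
{
	/* Allocate the key buffer, like line 387 in the report. */
	toy_key_t key = { .inputs = (int*)malloc(sizeof(int) * (size_t)input_size) };
	if (!key.inputs)
		return -1;
	const int ret = toy_cache_put(key); /* like kh_put at line 571 */
	if (ret < 0)
	{
		free(key.inputs); /* without a free on every non-owning path, the buffer leaks */
		return -1;
	}
	return 0; /* on success the cache frees the buffer later */
}

int main(void)
{
	return toy_autotune(4) == 0 ? 0 : 1;
}

When insertion succeeds, the pointer is retained inside the cache and later released by ccv_nnc_drain_autotune_cache, so whether line 580 represents a real leak depends on which kh_put outcome is actually reachable.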

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_cmd.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2025-03-17-111301-58955-1 -x c ccv_nnc_cmd.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "3rdparty/khash/khash.h"
4#include "ccv_nnc_easy.h"
5#ifdef HAVE_CUDA
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include <time.h>
11#include <sys/time.h>
12
13typedef struct {
14 const uint32_t cmd;
15 const char* name;
16 ccv_nnc_cmd_registry_t registry;
17 ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT];
18} ccv_nnc_cmd_init_t;
19
20typedef struct {
21 const uint32_t backend;
22 const char* name;
23} ccv_nnc_cmd_backend_init_t;
24
25// The generated code configures command and its mapping.
26#include "cmd/ccv_nnc_cmd.inc"
27
28void ccv_nnc_init(void)
29{
30 _ccv_nnc_cmd_init();
31}
32
33static uint64_t _ccv_nnc_flags = 0;
34
35uint64_t ccv_nnc_flags(void)
36{
37 return _ccv_nnc_flags;
38}
39
40void ccv_nnc_enable_flag(uint64_t flag)
41{
42 _ccv_nnc_flags |= flag;
43}
44
45void ccv_nnc_disable_flag(uint64_t flag)
46{
47 _ccv_nnc_flags &= ~flag;
48}
49
50const char* ccv_nnc_cmd_name(const uint32_t cmd)
51{
52 switch (cmd)
53 {
54 case CCV_NNC_NOOP:
55 return "CCV_NNC_NOOP";
56 case CCV_NNC_CUSTOM_FORWARD:
57 return "CCV_NNC_CUSTOM_FORWARD";
58 case CCV_NNC_CUSTOM_BACKWARD:
59 return "CCV_NNC_CUSTOM_BACKWARD";
60 case CCV_NNC_GRAPH_FORWARD:
61 return "CCV_NNC_GRAPH_FORWARD";
62 case CCV_NNC_GRAPH_BACKWARD:
63 return "CCV_NNC_GRAPH_BACKWARD";
64 }
65 const int idx = _ccv_nnc_cmd_ph(cmd);
66 assert(idx >= 0);
67 assert(idx < sizeof(init_map) / sizeof(init_map[0]));
68 return init_map[idx].name;
69}
70
71const char* ccv_nnc_cmd_backend_name(const uint32_t backend)
72{
73 if (backend == CCV_NNC_NO_BACKEND)
74 return "CCV_NNC_NO_BACKEND";
75 const int idx = _ccv_nnc_cmd_backend_ph(backend);
76 assert(idx >= 0);
77 assert(idx < CCV_NNC_BACKEND_COUNT);
78 return backend_init_map[idx].name;
79}
80
81const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {};
82
83int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params)
84{
85 return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
86}
87
88int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd)
89{
90 switch (cmd.cmd)
91 {
92 case CCV_NNC_NOOP:
93 return 0;
94 case CCV_NNC_CUSTOM_FORWARD:
95 case CCV_NNC_CUSTOM_BACKWARD:
96 case CCV_NNC_GRAPH_FORWARD:
97 case CCV_NNC_GRAPH_BACKWARD:
98 default:
99 return !(cmd.cmd & 0x1); // If it is even, it is forward
100 }
101}
102
103int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd)
104{
105 switch (cmd.cmd)
106 {
107 case CCV_NNC_NOOP:
108 return 0;
109 case CCV_NNC_CUSTOM_FORWARD:
110 case CCV_NNC_CUSTOM_BACKWARD:
111 case CCV_NNC_GRAPH_FORWARD:
112 case CCV_NNC_GRAPH_BACKWARD:
113 default:
114 return !!(cmd.cmd & 0x1); // If it is odd, it is backward
115 }
116}
117
118int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend)
119{
120 // If it is a custom command, a no op, or a graph op, there is no backend to check.
121 if (cmd == CCV_NNC_NOOP ||
122 cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD ||
123 cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD)
124 return 1;
125 const int cmd_idx = _ccv_nnc_cmd_ph(cmd);
126 const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
127 assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
128 assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
129 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
130 // Check if the execution function exists or not.
131 return !!api_registry.exec;
132}
133
134ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags)
135{
136 ccv_nnc_cmd_t cmd;
137 cmd.info = params;
138 cmd.backend = CCV_NNC_NO_BACKEND;
139 assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa));
140 cmd.cmd = _cmd;
141 cmd.algorithm = -1; // This is default.
142 cmd.isa = isa;
143 cmd.data = 0;
144 return cmd;
145}
146
147const ccv_nnc_hint_t ccv_nnc_no_hint = {};
148
149int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint)
150{
151 return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0);
152}
153
154int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
155{
156 int i;
157 assert(a.format == b.format);
158 const int nd = ccv_nnc_tensor_nd(a.dim);
159 const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1);
160 assert(size_nd == 2 || size_nd == 3); // Support 3D convolution.
161 assert(nd == size_nd + 1 || nd == size_nd + 2);
162 int hw;
163 if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
164 (a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 1))
165 hw = 0;
166 else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 2) ||
167 (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 1))
168 hw = 1;
169 else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 2)
170 hw = 2;
171 else
172 assert(0 && "unknown format")((void) sizeof ((0 && "unknown format") ? 1 : 0), __extension__
({ if (0 && "unknown format") ; else __assert_fail (
"0 && \"unknown format\"", "ccv_nnc_cmd.c", 172, __extension__
__PRETTY_FUNCTION__); }))
;
173 for (i = 0; i < size_nd; i++)
174 {
175 if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) % hint.stride.dim[i] != 0)
176 return -1;
177 int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1;
178 if (expected != b.dim[i + hw])
179 return -1;
180 }
181 return 0;
182}
183
184ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
185{
186 int i;
187 if (a.format != b.format)
188 return ccv_nnc_no_hint;
189 assert(a.format == b.format);
190 const int a_nd = ccv_nnc_tensor_nd(a.dim);
191 const int b_nd = ccv_nnc_tensor_nd(b.dim);
192 const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1);
193 assert(size_nd == 2 || size_nd == 3); // Support 3D convolution.
194 // Is not auto hint deducible dimensions.
195 if (a_nd != b_nd || (a_nd != size_nd + 1 && a_nd != size_nd + 2))
196 return ccv_nnc_no_hint;
197 int hw;
198 if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
199 (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 1))
200 hw = 0;
201 else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 2) ||
202 (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 1))
203 hw = 1;
204 else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 2)
205 hw = 2;
206 else
207 assert(0 && "unknown format")((void) sizeof ((0 && "unknown format") ? 1 : 0), __extension__
({ if (0 && "unknown format") ; else __assert_fail (
"0 && \"unknown format\"", "ccv_nnc_cmd.c", 207, __extension__
__PRETTY_FUNCTION__); }))
;
208 ccv_nnc_hint_t hint_auto = {};
209 // 0-dim is reserved for channels
210 for (i = 0; i < size_nd; i++)
211 {
212 // Cannot have one of the dim is zero, we cannot auto the hint, return no hint.
213 assert(a.dim[i + hw] && b.dim[i + hw]);
214 // This is guessed by having a stride that will approximately match the scale.
215 int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw];
216 hint_auto.stride.dim[i] = stride;
217 int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i];
218 hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior.
219 hint_auto.border.end[i] = border - hint_auto.border.begin[i];
220 }
221 return hint_auto;
222}
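
The stride and border guesses computed in ccv_nnc_hint_auto above, together with the check in ccv_nnc_hint_verify, can be sanity-checked with a small standalone computation. The sketch below is illustrative only; the concrete sizes (one spatial dimension going from 224 to 112 with a 3-wide kernel) are assumed for the example and are not taken from the report:

/* Standalone arithmetic check (not ccv code): reproduces the stride/border
 * guess from ccv_nnc_hint_auto for one spatial dimension and verifies it
 * against the output-extent rule used by ccv_nnc_hint_verify. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	const int a = 224, b = 112, size = 3;            /* input extent, output extent, kernel size */
	const int stride = (a + b / 2) / b;              /* (224 + 56) / 112 = 2, approx. the scale */
	const int border = (b - 1) * stride - a + size;  /* 111 * 2 - 224 + 3 = 1 */
	const int begin = (border + 1) / 2;              /* more padding at the beginning, like CUDNN */
	const int end = border - begin;
	/* The same rule ccv_nnc_hint_verify applies: the expected output extent must equal b. */
	assert((begin + end + a - size) % stride == 0);
	const int expected = (begin + end + a - size) / stride + 1;
	printf("stride=%d begin=%d end=%d expected=%d\n", stride, begin, end, expected); /* 2 1 0 112 */
	assert(expected == b);
	return 0;
}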
223
224void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
225{
226 int i;
227 assert(output_size <= input_size);
228 for (i = 0; i < output_size; i++)
229 outputs[i] = inputs[i];
230}
231
232void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
233{
234 int i;
235 for (i = 0; i < output_size; i++)
236 outputs[i] = inputs[0];
237}
238
239void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
240{
241 int i;
242 assert(output_size < input_size);
243 for (i = 0; i < output_size; i++)
244 outputs[i] = inputs[i + 1];
245}
246
247void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
248{
249 int i;
250 outputs[0] = inputs[0];
251 assert(output_size < input_size);
252 for (i = 1; i < output_size; i++)
253 outputs[i] = inputs[i + 1];
254}
255
256void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
257{
258 // zero out the parameters
259 const ccv_nnc_tensor_param_t z = {};
260 int i;
261 for (i = 0; i < output_size; i++)
262 outputs[i] = z; // Reset the outputs.
263 // Cannot handle these situations.
264 if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
265 return;
266 if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD)
267 {
268 if (cmd.isa->tensor_auto)
269 cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size);
270 return;
271 }
272 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
273 const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
274 if (registry.tensor_auto)
275 registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size);
276 else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs
277 ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
278 else // For backward, the default auto is backward_from_inputs
279 ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
280}
281
282int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
283{
284 if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
285 return 0;
286 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
287 const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
288 if (registry.allow_inplace)
289 return registry.allow_inplace(cmd.info, input_idx, input_size, output_idx, output_size);
290 return 0;
291}
292
293int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
294{
295 if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
296 return 0;
297 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
298 const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
299 if (registry.enforce_inplace)
300 return registry.enforce_inplace(cmd.info, input_idx, input_size, output_idx, output_size);
301 return 0;
302}
303
304// This returns absolute time.
305uint64_t ccv_nnc_cmd_mono_time(void)
306{
307 struct timespec ts;
308 clock_gettime(CLOCK_MONOTONIC, &ts);
309 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
310}
311
312uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes)
313{
314 if (cmd.cmd == CCV_NNC_NOOP ||
315 cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
316 cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
317 return cmd.backend;
318 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
319 assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
320 assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0);
321 int i;
322 for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
323 {
324 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
325 // We have the exec kernel, and support all the tensor memory types.
326 if (api_registry.exec &&
327 (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
328 (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
329 (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
330 return backend_init_map[i].backend;
331 }
332 return cmd.backend;
333}
334
335#define AUTO_TUNE_TRIAL_SIZE (3)
336
337static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
338{
339#ifdef HAVE_CUDA
340 if (!stream_context)
341 {
342 int device_id;
343 if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0)
344 cudevice(device_id);
345 }
346#endif
347}
348
349typedef struct {
350 int format;
351 int datatype;
352 int nd;
353 off_t dataof;
354 int dim[CCV_NNC_MAX_DIM_ALLOC];
355 int stride[CCV_NNC_MAX_DIM_ALLOC];
356} ccv_nnc_cmd_autotune_tensor_shape_t;
357
358typedef struct {
359 uint32_t cmd;
360 ccv_nnc_cmd_param_t params;
361 ccv_nnc_hint_t hint;
362 int flags;
363 int input_size;
364 int output_size;
365 size_t workspace_size;
366 ccv_nnc_cmd_autotune_tensor_shape_t* inputs;
367 ccv_nnc_cmd_autotune_tensor_shape_t* outputs;
368} ccv_nnc_cmd_autotune_key_t;
369
370static CCV_WARN_UNUSED(ccv_nnc_cmd_autotune_key_t) ccv_nnc_cmd_autotune_key_new(const ccv_nnc_cmd_t cmd, const size_t workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
371{
372 ccv_nnc_cmd_autotune_key_t key = {
373 .cmd = cmd.cmd,
374 .params = cmd.info,
375 .hint = hint,
376 .workspace_size = workspace_size,
377 .inputs = 0,
378 .input_size = 0,
379 .outputs = 0,
380 .output_size = 0
381 };
382 if (input_size == 0 && output_size == 0)
20
Assuming 'input_size' is equal to 0
20.1
'output_size' is not equal to 0
21
Taking false branch
383 return key;
384 assert(input_size >= 0 && output_size >= 0);
22
Taking true branch
385 key.input_size = input_size;
386 key.output_size = output_size;
387 key.inputs = (ccv_nnc_cmd_autotune_tensor_shape_t*)ccmalloc(sizeof(ccv_nnc_cmd_autotune_tensor_shape_t) * (input_size + output_size));
23
Memory is allocated
388 key.outputs = key.inputs + input_size;
389 int i, j;
390 for (i = 0; i < input_size; i++)
24
Loop condition is false. Execution continues on line 412
391 {
392 memset(key.inputs[i].dim, 0, sizeof(key.inputs[i].dim));
393 memset(key.inputs[i].stride, 0, sizeof(key.inputs[i].stride));
394 if (!inputs[i])
395 {
396 key.inputs[i].format = 0;
397 key.inputs[i].datatype = 0;
398 key.inputs[i].dataof = 0;
399 key.inputs[i].nd = 0;
400 continue;
401 }
402 key.inputs[i].format = inputs[i]->info.format;
403 key.inputs[i].datatype = inputs[i]->info.datatype;
404 key.inputs[i].dataof = inputs[i]->dataof;
405 const int nd = key.inputs[i].nd = ccv_nnc_tensor_nd(inputs[i]->info.dim);
406 for (j = 0; j < nd; j++)
407 key.inputs[i].dim[j] = inputs[i]->info.dim[j];
408 if (CCV_IS_TENSOR_VIEW(inputs[i]))
409 for (j = 0; j < nd; j++)
410 key.inputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)inputs[i])->stride[j];
411 }
412 for (i = 0; i < output_size; i++)
25
Loop condition is true. Entering loop body
31
Loop condition is false. Execution continues on line 434
413 {
414 memset(key.outputs[i].dim, 0, sizeof(key.outputs[i].dim));
415 memset(key.outputs[i].stride, 0, sizeof(key.outputs[i].stride));
416 if (!outputs[i])
26
Taking false branch
417 {
418 key.outputs[i].format = 0;
419 key.outputs[i].datatype = 0;
420 key.outputs[i].dataof = 0;
421 key.outputs[i].nd = 0;
422 continue;
423 }
424 key.outputs[i].format = outputs[i]->info.format;
425 key.outputs[i].datatype = outputs[i]->info.datatype;
426 key.outputs[i].dataof = outputs[i]->dataof;
427 const int nd = key.outputs[i].nd = ccv_nnc_tensor_nd(outputs[i]->info.dim);
428 for (j = 0; j < nd; j++)
27
Assuming 'j' is >= 'nd'
28
Loop condition is false. Execution continues on line 430
429 key.outputs[i].dim[j] = outputs[i]->info.dim[j];
430 if (CCV_IS_TENSOR_VIEW(outputs[i]))
29
Assuming the condition is false
30
Taking false branch
431 for (j = 0; j < nd; j++)
432 key.outputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)outputs[i])->stride[j];
433 }
434 return key;
435}
436
437// autotune cache.
438static inline uint32_t twang_32from64(uint64_t key)
439{
440 key = (~key) + (key << 18);
441 key = key ^ (key >> 31);
442 key = key * 21;
443 key = key ^ (key >> 11);
444 key = key + (key << 6);
445 key = key ^ (key >> 22);
446 return (uint32_t)(key);
447}
448
449static inline khint32_t _kh_autotune_key_executable_hash_func(const ccv_nnc_cmd_autotune_key_t key)
450{
451 uint32_t h = key.cmd;
452 int i, j;
453 uint32_t* data = (uint32_t*)&key.params;
454 for (i = 0; i < sizeof(key.params) / sizeof(uint32_t); i++)
455 h = twang_32from64(((uint64_t)h << 32) | data[i]);
456 data = (uint32_t*)&key.hint;
457 for (i = 0; i < sizeof(key.hint) / sizeof(uint32_t); i++)
458 h = twang_32from64(((uint64_t)h << 32) | data[i]);
459 h = twang_32from64(((uint64_t)h << 32) | key.workspace_size);
460 h = twang_32from64(((uint64_t)h << 32) | key.input_size);
461 h = twang_32from64(((uint64_t)h << 32) | key.output_size);
462 for (i = 0; i < key.input_size; i++)
463 {
464 h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].format);
465 h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].datatype);
466 h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dataof);
467 h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].nd);
468 for (j = 0; j < key.inputs[i].nd; j++)
469 {
470 h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dim[j]);
471 h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].stride[j]);
472 }
473 }
474 for (i = 0; i < key.output_size; i++)
475 {
476 h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].format);
477 h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].datatype);
478 h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dataof);
479 h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].nd);
480 for (j = 0; j < key.outputs[i].nd; j++)
481 {
482 h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dim[j]);
483 h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].stride[j]);
484 }
485 }
486 return (khint32_t)h;
487}
488
489static inline int _kh_autotune_key_executable_hash_equal(const ccv_nnc_cmd_autotune_key_t a, const ccv_nnc_cmd_autotune_key_t b)
490{
491 if (a.cmd != b.cmd || a.flags != b.flags || a.workspace_size != b.workspace_size || a.input_size != b.input_size || a.output_size != b.output_size)
492 return 0;
493 if (memcmp(&a.params, &b.params, sizeof(a.params)) != 0)
494 return 0;
495 if (memcmp(&a.hint, &b.hint, sizeof(a.hint)) != 0)
496 return 0;
497 int i, j;
498 for (i = 0; i < a.input_size; i++)
499 {
500 if (a.inputs[i].format != b.inputs[i].format || a.inputs[i].datatype != b.inputs[i].datatype || a.inputs[i].nd != b.inputs[i].nd || a.inputs[i].dataof != b.inputs[i].dataof)
501 return 0;
502 for (j = 0; j < a.inputs[i].nd; j++)
503 if (a.inputs[i].dim[j] != b.inputs[i].dim[j] || a.inputs[i].stride[j] != b.inputs[i].stride[j])
504 return 0;
505 }
506 for (i = 0; i < a.output_size; i++)
507 {
508 if (a.outputs[i].format != b.outputs[i].format || a.outputs[i].datatype != b.outputs[i].datatype || a.outputs[i].nd != b.outputs[i].nd || a.outputs[i].dataof != b.outputs[i].dataof)
509 return 0;
510 for (j = 0; j < a.outputs[i].nd; j++)
511 if (a.outputs[i].dim[j] != b.outputs[i].dim[j] || a.outputs[i].stride[j] != b.outputs[i].stride[j])
512 return 0;
513 }
514 return 1;
515}
516
517typedef struct {
518 int backend;
519 int algorithm;
520} ccv_nnc_cmd_autotune_val_t;
521
522KHASH_INIT(autotune_executable_cache, ccv_nnc_cmd_autotune_key_t, ccv_nnc_cmd_autotune_val_t, 1, _kh_autotune_key_executable_hash_func, _kh_autotune_key_executable_hash_equal)
523
524static khash_t(autotune_executable_cache)* g_autotune_executable_cache = 0;
525
526static inline void ccv_nnc_cmd_autotune_key_free(ccv_nnc_cmd_autotune_key_t key)
527{
528 if (key.inputs)
529 ccfree(key.inputs);
530}
531
532void ccv_nnc_drain_autotune_cache(void)
533{
534 if (!g_autotune_executable_cache)
535 return;
536 khiter_t k;
537 for (k = kh_begin(g_autotune_executable_cache); k < kh_end(g_autotune_executable_cache); k++)
538 {
539 if (!kh_exist(g_autotune_executable_cache, k))
540 continue;
541 ccv_nnc_cmd_autotune_key_free(kh_key(g_autotune_executable_cache, k));
542 kh_del(autotune_executable_cache, g_autotune_executable_cache, k);
543 }
544}
545
546ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
547{
548 // This is a custom cmd kernel, no need to autotune.
549 if (cmd.cmd == CCV_NNC_NOOP ||
1
Assuming field 'cmd' is not equal to CCV_NNC_NOOP
6
Taking false branch
550 cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
2
Assuming field 'cmd' is not equal to CCV_NNC_GRAPH_FORWARD
3
Assuming field 'cmd' is not equal to CCV_NNC_GRAPH_BACKWARD
551 cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
4
Assuming field 'cmd' is not equal to CCV_NNC_CUSTOM_FORWARD
5
Assuming field 'cmd' is not equal to CCV_NNC_CUSTOM_BACKWARD
552 return cmd;
553 int i, j, k;
554 // Go through all the backends that supports the same type of memory input / output tensors support.
555 int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
556 for (i = 0; i < input_size; i++)
7
Assuming 'i' is >= 'input_size'
8
Loop condition is false. Execution continues on line 559
557 if (inputs[i])
558 tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype);
559 for (i = 0; i < output_size; i++)
9
Assuming 'i' is < 'output_size'
10
Loop condition is true. Entering loop body
13
Assuming 'i' is >= 'output_size'
14
Loop condition is false. Execution continues on line 563
560 if (outputs[i])
11
Assuming the condition is true
12
Taking true branch
561 tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype);
562 // In this case, we cannot determine the type of the tensor, skip auto-tune.
563 if (!tensor_memory)
15
Assuming 'tensor_memory' is not equal to 0
16
Taking false branch
564 return cmd;
565 // Otherwise, we are good to go.
566 ccv_nnc_cmd_t tuned_cmd = cmd;
567 if (!g_autotune_executable_cache)
17
Assuming 'g_autotune_executable_cache' is non-null
18
Taking false branch
568 g_autotune_executable_cache = kh_init(autotune_executable_cache);
569 int ret = 0;
570 ccv_nnc_cmd_autotune_key_t key = ccv_nnc_cmd_autotune_key_new(cmd, max_workspace_size, hint, flags, inputs, input_size, outputs, output_size);
19
Calling 'ccv_nnc_cmd_autotune_key_new'
32
Returned allocated memory
571 khiter_t kiter = kh_put(autotune_executable_cache, g_autotune_executable_cache, key, &ret);
572 if (ret == 0)
32.1
'ret' is not equal to 0
33
Taking false branch
573 {
574 ccv_nnc_cmd_autotune_key_free(key);
575 const ccv_nnc_cmd_autotune_val_t val = kh_val(g_autotune_executable_cache, kiter);
576 tuned_cmd.backend = val.backend;
577 tuned_cmd.algorithm = val.algorithm;
578 return tuned_cmd;
579 }
580 int64_t best_measured = -1;
34
Potential memory leak
581 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
582 assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
583 int flag = 0, autotune_available_1 = 0; // This is only applicable if we have only one backend.
584 for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
585 {
586 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
587 // We have the exec kernel, and support all the tensor memory types.
588 if (api_registry.exec &&
589 (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
590 (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
591 (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
592 {
593 if (api_registry.autotune)
594 autotune_available_1 = 1;
595 if ((++flag) >= 2) // If we have more than 2 suitable backend, we can do this now.
596 break;
597 }
598 }
599 if (flag == 0)
600 return cmd;
601 _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
602 // Allocate inputs / outputs and fill them in.
603 ccv_nnc_tensor_t** copy_inputs;
604 ccv_nnc_tensor_t** copy_outputs;
605 ccv_nnc_tensor_t** allocated_inputs;
606 ccv_nnc_tensor_t** allocated_outputs;
607 ccv_nnc_tensor_view_t** allocated_input_views;
608 ccv_nnc_tensor_view_t** allocated_output_views;
609 if (flag > 1 || autotune_available_1)
610 {
611 copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*));
612 copy_outputs = copy_inputs + input_size;
613 allocated_inputs = copy_outputs + output_size;
614 allocated_outputs = allocated_inputs + input_size;
615 allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size);
616 allocated_output_views = allocated_input_views + input_size;
617 int stride[CCV_NNC_MAX_DIM_ALLOC];
618 for (i = 0; i < output_size; i++)
619 if (outputs[i])
620 {
621 for (j = 0; j < input_size; j++)
622 if (inputs[j])
623 {
624 if (outputs[i] == inputs[j])
625 {
626 if (!copy_inputs[j])
627 {
628 allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
629 if (CCV_IS_TENSOR_VIEW(inputs[j]))
630 {
631 ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride);
632 copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride));
633 } else
634 copy_inputs[j] = allocated_inputs[j];
635 }
636 copy_outputs[i] = copy_inputs[j];
637 break;
638 } else if (outputs[i]->data.u8 == inputs[j]->data.u8 &&
639 ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
640 if (!copy_inputs[j])
641 {
642 allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
643 if (CCV_IS_TENSOR_VIEW(inputs[j]))
644 {
645 ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride);
646 copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride));
647 } else
648 copy_inputs[j] = allocated_inputs[j];
649 }
650 allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0);
651 if (CCV_IS_TENSOR_VIEW(outputs[i]))
652 {
653 ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride);
654 copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride));
655 } else
656 copy_outputs[i] = allocated_outputs[i];
657 break;
658 }
659 }
660 if (!copy_outputs[i])
661 {
662 allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0);
663 if (CCV_IS_TENSOR_VIEW(outputs[i]))
664 {
665 ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride);
666 copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride));
667 } else
668 copy_outputs[i] = allocated_outputs[i];
669 }
670 }
671 for (i = 0; i < input_size; i++)
672 if (inputs[i] && !copy_inputs[i])
673 copy_inputs[i] = inputs[i];
674 }
675 if (flag == 1)
676 {
677 for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
678 {
679 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
680 // We have the exec kernel, and support all the tensor memory types.
681 if (api_registry.exec &&
682 (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
683 (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
684 (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
685 {
686 tuned_cmd.backend = backend_init_map[i].backend;
687 // If a given API exist an autotune function, use that to pick the top algorithm.
688 if (api_registry.autotune)
689 {
690 ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
691 _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
692 tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
693 // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
694 ccv_nnc_stream_context_drain(stream_context);
695 }
696 break;
697 }
698 }
699 if (autotune_available_1)
700 {
701 for (i = 0; i < input_size; i++)
702 {
703 if (allocated_inputs[i])
704 ccv_nnc_tensor_free(allocated_inputs[i]);
705 if (allocated_input_views[i])
706 ccv_nnc_tensor_view_free(allocated_input_views[i]);
707 }
708 for (i = 0; i < output_size; i++)
709 {
710 if (allocated_outputs[i])
711 ccv_nnc_tensor_free(allocated_outputs[i]);
712 if (allocated_output_views[i])
713 ccv_nnc_tensor_view_free(allocated_output_views[i]);
714 }
715 ccfree(copy_inputs);
716 }
717 const ccv_nnc_cmd_autotune_val_t val = {
718 .backend = tuned_cmd.backend,
719 .algorithm = tuned_cmd.algorithm
720 };
721 kh_val(g_autotune_executable_cache, kiter) = val;
722 return tuned_cmd;
723 }
724 // We need to have trial loop through all the data.
725 for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
726 {
727 for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
728 {
729 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
730 // We have the exec kernel, and support all the tensor memory types.
731 if (api_registry.exec &&
732 (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
733 (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
734 (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
735 {
736 ccv_nnc_cmd_t candid_cmd = cmd;
737 candid_cmd.backend = backend_init_map[i].backend;
738 // If a given API exist an autotune function, use that to pick the top algorithm.
739 if (api_registry.autotune)
740 {
741 // Assuming k == 0 is sufficient, and we can skip.
742 if (k > 0)
743 continue;
744 ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
745 _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
746 candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
747 // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
748 ccv_nnc_stream_context_drain(stream_context);
749 uint64_t elapsed = ccv_nnc_cmd_mono_time();
750 // Ready to run.
751 int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
752 ccv_nnc_stream_context_wait(stream_context);
753 elapsed = ccv_nnc_cmd_mono_time() - elapsed;
754 if (status == CCV_NNC_EXEC_SUCCESS &&
755 (best_measured == -1 || elapsed < best_measured))
756 {
757 best_measured = elapsed;
758 tuned_cmd = candid_cmd;
759 }
760 } else {
761 // Otherwise loop over the existing algorithms and pick the top one.
762 for (j = 0; j < api_registry.algorithms; j++)
763 {
764 candid_cmd.algorithm = j;
765 ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
766 _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
767 uint64_t elapsed = ccv_nnc_cmd_mono_time();
768 // Ready to run.
769 int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
770 elapsed = ccv_nnc_cmd_mono_time() - elapsed;
771 if (status == CCV_NNC_EXEC_SUCCESS &&
772 (best_measured == -1 || elapsed < best_measured))
773 {
774 best_measured = elapsed;
775 tuned_cmd = candid_cmd;
776 }
777 }
778 }
779 }
780 }
781 }
782 for (i = 0; i < input_size; i++)
783 {
784 if (allocated_inputs[i])
785 ccv_nnc_tensor_free(allocated_inputs[i]);
786 if (allocated_input_views[i])
787 ccv_nnc_tensor_view_free(allocated_input_views[i]);
788 }
789 for (i = 0; i < output_size; i++)
790 {
791 if (allocated_outputs[i])
792 ccv_nnc_tensor_free(allocated_outputs[i]);
793 if (allocated_output_views[i])
794 ccv_nnc_tensor_view_free(allocated_output_views[i]);
795 }
796 ccfree(copy_inputs);
797 const ccv_nnc_cmd_autotune_val_t val = {
798 .backend = tuned_cmd.backend,
799 .algorithm = tuned_cmd.algorithm
800 };
801 kh_val(g_autotune_executable_cache, kiter) = val;
802 return tuned_cmd;
803}
804
805int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
806{
807 // If it is no-op, return true, it can deal with any number of parameters.
808 if (cmd.cmd == CCV_NNC_NOOP)
809 return 1;
810 // If it is a custom command, I cannot check it at all, return false.
811 if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
812 return 0;
813 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
814 const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
815 if (cmd_registry.bitmask)
816 return cmd_registry.bitmask(cmd.info, input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size);
817 // If there is not checking, none can pass.
818 return 0;
819}
820
821int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size)
822{
823 int i, j;
824 int device_id_size = 0;
825 if (max_device_id_size <= device_id_size)
826 return device_id_size;
827 // The device id of the exec is determined by its outputs.
828 for (i = 0; i < output_size; i++)
829 if (outputs[i] &&
830 CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type &&
831 CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
832 {
833 const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type);
834 int flag = 0;
835 for (j = 0; !flag && j < device_id_size; j++)
836 flag = (device_ids[j] == device_id);
837 if (flag)
838 continue;
839 device_ids[device_id_size++] = device_id;
840 if (device_id_size >= max_device_id_size)
841 return device_id_size;
842 }
843 if (device_id_size == 0)
844 {
845 int device_id = -1;
846 for (i = 0; i < input_size; i++)
847 if (inputs[i] &&
848 CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type &&
849 CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
850 (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
851 device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type);
852 if (device_id >= 0)
853 {
854 device_ids[0] = device_id;
855 return 1;
856 }
857 }
858 return device_id_size;
859}
860
861void* ccv_nnc_cmd_aux(const ccv_nnc_cmd_t cmd)
862{
863 if (cmd.cmd == CCV_NNC_NOOP ||
864 cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
865 cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
866 return 0;
867 assert(cmd.backend != CCV_NNC_NO_BACKEND);
868 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
869 assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
870 const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend);
871 assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
872 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
873 return api_registry.aux;
874}
875
876int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
877{
878 // If it is no-op, return as if succeed already.
879 if (cmd.cmd == CCV_NNC_NOOP)
880 return 0;
881 _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
882 // If it is a custom command, just apply it directly.
883 if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
884 {
885 int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
886 if (!stream_context)
887 ccv_nnc_stream_context_drain(stream_context);
888 return ret;
889 }
890 assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
891 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
892 assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
893 int i;
894 uint32_t backend = cmd.backend;
895 if (backend == CCV_NNC_NO_BACKEND)
896 {
897 // Find a suitable backend.
898 int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
899 for (i = 0; i < input_size; i++)
900 if (inputs[i])
901 tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype);
902 for (i = 0; i < output_size; i++)
903 if (outputs[i])
904 tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype);
905 backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes);
906 }
907 assert(backend != CCV_NNC_NO_BACKEND);
908 const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
909 assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
910 const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
911 if (!api_registry.exec)
912 return CCV_NNC_EXEC_NO_KERNEL;
913 // Everything is out, call the underlying implementation.
914 int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
915 if (!stream_context)
916 ccv_nnc_stream_context_drain(stream_context);
917 return ret;
918}
919
920int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags)
921{
922 // No additional attr for noop.
923 if (cmd.cmd == CCV_NNC_NOOP ||
924 // If it is a custom command, just apply it directly.
925 cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
926 // If it is sub-graph, there is no additional attr as well.
927 cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
928 return 0;
929 const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
930 assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
931 const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
932 return !!(cmd_registry.flags & flags);
933}
934
935void ccv_nnc_set_profiler(int state)
936{
937#ifdef HAVE_CUDA
938 cusetprofiler(state);
939#endif
940}
941
942int ccv_nnc_queue_watermark(void)
943{
944#ifdef HAVE_MPS
945 return ccv_nnc_mps_queue_watermark();
946#else
947 return 0;
948#endif
949}
950
951void ccv_nnc_set_queue_watermark(int watermark)
952{
953#ifdef HAVE_MPS
954 // If we need to be memory efficient, we need to bound how many in-flight command buffers there are.
955 ccv_nnc_mps_set_queue_watermark(watermark);
956#endif
957}
958
959void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size)
960{
961 if (type != CCV_STREAM_CONTEXT_GPU)
962 return;
963#ifdef HAVE_CUDA
964 cusetdevicemap(device_map, size);
965#endif
966}