/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_cmd.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_internal.h" |
3 | | #include "ccv_nnc_easy.h" |
4 | | #ifdef HAVE_CUDA |
5 | | #include "gpu/ccv_nnc_compat.h" |
6 | | #elif defined(HAVE_MPS) |
7 | | #include "mps/ccv_nnc_mps.h" |
8 | | #endif |
9 | | #include <time.h> |
10 | | #include <sys/time.h> |
11 | | |
12 | | typedef struct { |
13 | | const uint32_t cmd; |
14 | | const char* name; |
15 | | ccv_nnc_cmd_registry_t registry; |
16 | | ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT]; |
17 | | } ccv_nnc_cmd_init_t; |
18 | | |
19 | | typedef struct { |
20 | | const uint32_t backend; |
21 | | const char* name; |
22 | | } ccv_nnc_cmd_backend_init_t; |
23 | | |
24 | | // The generated code configures commands and their mappings.
25 | | #include "cmd/ccv_nnc_cmd.inc" |
26 | | |
27 | | void ccv_nnc_init(void) |
28 | 1 | { |
29 | 1 | _ccv_nnc_cmd_init(); |
30 | 1 | } |
31 | | |
32 | | static uint64_t _ccv_nnc_flags = 0; |
33 | | |
34 | | uint64_t ccv_nnc_flags(void) |
35 | 0 | { |
36 | 0 | return _ccv_nnc_flags; |
37 | 0 | } |
38 | | |
39 | | void ccv_nnc_enable_flag(uint64_t flag) |
40 | 0 | { |
41 | 0 | _ccv_nnc_flags |= flag; |
42 | 0 | } |
43 | | |
44 | | void ccv_nnc_disable_flag(uint64_t flag) |
45 | 0 | { |
46 | 0 | _ccv_nnc_flags &= ~flag; |
47 | 0 | } |
48 | | |
49 | | const char* ccv_nnc_cmd_name(const uint32_t cmd) |
50 | 2.28k | { |
51 | 2.28k | switch (cmd) |
52 | 2.28k | { |
53 | 86 | case CCV_NNC_NOOP: |
54 | 86 | return "CCV_NNC_NOOP"; |
55 | 3 | case CCV_NNC_CUSTOM_FORWARD: |
56 | 3 | return "CCV_NNC_CUSTOM_FORWARD"; |
57 | 0 | case CCV_NNC_CUSTOM_BACKWARD: |
58 | 0 | return "CCV_NNC_CUSTOM_BACKWARD"; |
59 | 64 | case CCV_NNC_GRAPH_FORWARD: |
60 | 64 | return "CCV_NNC_GRAPH_FORWARD"; |
61 | 5 | case CCV_NNC_GRAPH_BACKWARD: |
62 | 5 | return "CCV_NNC_GRAPH_BACKWARD"; |
63 | 2.28k | } |
64 | 2.12k | const int idx = _ccv_nnc_cmd_ph(cmd); |
65 | 2.12k | assert(idx >= 0); |
66 | 2.12k | assert(idx < sizeof(init_map) / sizeof(init_map[0])); |
67 | 2.12k | return init_map[idx].name; |
68 | 2.12k | } |
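A minimal usage sketch for the lookup above, assuming the headers included at the top of this file and that the public include path may differ per install:

#include "ccv_nnc.h"
#include <stdio.h>

int main(void)
{
	ccv_nnc_init(); // Populates the generated command / backend registries.
	// Pseudo commands resolve through the switch above; registered commands
	// resolve through the generated perfect hash into init_map.
	printf("%s\n", ccv_nnc_cmd_name(CCV_NNC_NOOP));
	printf("%s\n", ccv_nnc_cmd_name(CCV_NNC_GRAPH_FORWARD));
	return 0;
}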
69 | | |
70 | | const char* ccv_nnc_cmd_backend_name(const uint32_t backend) |
71 | 0 | { |
72 | 0 | if (backend == CCV_NNC_NO_BACKEND) |
73 | 0 | return "CCV_NNC_NO_BACKEND"; |
74 | 0 | const int idx = _ccv_nnc_cmd_backend_ph(backend); |
75 | 0 | assert(idx >= 0); |
76 | 0 | assert(idx < CCV_NNC_BACKEND_COUNT); |
77 | 0 | return backend_init_map[idx].name; |
78 | 0 | } |
79 | | |
80 | | const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {}; |
81 | | |
82 | | int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params) |
83 | 0 | { |
84 | 0 | return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
85 | 0 | } |
86 | | |
87 | | int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd) |
88 | 26.8k | { |
89 | 26.8k | switch (cmd.cmd) |
90 | 26.8k | { |
91 | 2 | case CCV_NNC_NOOP: |
92 | 2 | return 0; |
93 | 2.40k | case CCV_NNC_CUSTOM_FORWARD: |
94 | 2.40k | case CCV_NNC_CUSTOM_BACKWARD: |
95 | 2.40k | case CCV_NNC_GRAPH_FORWARD: |
96 | 2.40k | case CCV_NNC_GRAPH_BACKWARD: |
97 | 26.8k | default: |
98 | 26.8k | return !(cmd.cmd & 0x1); // If it is even, it is forward |
99 | 26.8k | } |
100 | 26.8k | } |
101 | | |
102 | | int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd) |
103 | 38.4k | { |
104 | 38.4k | switch (cmd.cmd) |
105 | 38.4k | { |
106 | 2 | case CCV_NNC_NOOP: |
107 | 2 | return 0; |
108 | 0 | case CCV_NNC_CUSTOM_FORWARD: |
109 | 4.80k | case CCV_NNC_CUSTOM_BACKWARD: |
110 | 4.80k | case CCV_NNC_GRAPH_FORWARD: |
111 | 4.81k | case CCV_NNC_GRAPH_BACKWARD: |
112 | 38.4k | default: |
113 | 38.4k | return !!(cmd.cmd & 0x1); // If it is odd, it is backward |
114 | 38.4k | } |
115 | 38.4k | } |
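A hedged fragment illustrating the parity convention encoded above, assuming the stock convolution command pair: forward command identifiers are even and their backward counterparts are the next odd value, so the two predicates are complements for everything except the no-op.

ccv_nnc_cmd_t fwd = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_FORWARD, 0, ccv_nnc_cmd_auto, 0);
ccv_nnc_cmd_t bwd = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_BACKWARD, 0, ccv_nnc_cmd_auto, 0);
assert(ccv_nnc_cmd_is_forward(fwd) && !ccv_nnc_cmd_is_backward(fwd)); // even command code
assert(ccv_nnc_cmd_is_backward(bwd) && !ccv_nnc_cmd_is_forward(bwd)); // odd command code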
116 | | |
117 | | int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend) |
118 | 789 | { |
119 | | // If it is a custom command, a no op, or a graph op, there is no backend to check. |
120 | 789 | if (cmd == CCV_NNC_NOOP || |
121 | 789 | cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD || |
122 | 789 | cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD) |
123 | 0 | return 1; |
124 | 789 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd); |
125 | 789 | const int backend_idx = _ccv_nnc_cmd_backend_ph(backend); |
126 | 789 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
127 | 789 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); |
128 | 789 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; |
129 | | // Check if the execution function exists or not. |
130 | 789 | return !!api_registry.exec; |
131 | 789 | } |
132 | | |
133 | | ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags) |
134 | 50.9k | { |
135 | 50.9k | ccv_nnc_cmd_t cmd; |
136 | 50.9k | cmd.info = params; |
137 | 50.9k | cmd.backend = CCV_NNC_NO_BACKEND; |
138 | 50.9k | assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa)); |
139 | 50.9k | cmd.cmd = _cmd; |
140 | 50.9k | cmd.algorithm = -1; // This is default. |
141 | 50.9k | cmd.isa = isa; |
142 | 50.9k | cmd.data = 0; |
143 | 50.9k | return cmd; |
144 | 50.9k | } |
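A sketch of constructing a command with the function above; the ccv_nnc_cmd_param_t field names (size.dim, convolution.count, convolution.groups) are assumed from ccv_nnc.h. Backend and algorithm are deliberately left unresolved so exec or autotune can pick them later.

ccv_nnc_cmd_param_t params = {
	.size = {.dim = {3, 3, 3}}, // 3x3 filter over 3 input channels
	.convolution = {.count = 32, .groups = 1},
};
ccv_nnc_cmd_t conv = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_FORWARD, 0, params, 0);
assert(conv.backend == CCV_NNC_NO_BACKEND && conv.algorithm == -1);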
145 | | |
146 | | const ccv_nnc_hint_t ccv_nnc_no_hint = {}; |
147 | | |
148 | | int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint) |
149 | 143k | { |
150 | 143k | return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0); |
151 | 143k | } |
152 | | |
153 | | int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b) |
154 | 11 | { |
155 | 11 | int i; |
156 | 11 | assert(a.format == b.format); |
157 | 11 | const int nd = ccv_nnc_tensor_nd(a.dim); |
158 | 11 | const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1); |
159 | 11 | assert(size_nd == 2 || size_nd == 3); // Support 3D convolution. |
160 | 11 | assert(nd == size_nd + 1 || nd == size_nd + 2); |
161 | 11 | int hw; |
162 | 11 | if ((a.format == CCV_TENSOR_FORMAT_CHWN) || |
163 | 11 | (a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 1))
164 | 0 | hw = 0; |
165 | 11 | else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 2) ||
166 | 11 | (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 1))
167 | 9 | hw = 1; |
168 | 2 | else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 2) |
169 | 2 | hw = 2; |
170 | 0 | else |
171 | 2 | assert(0 && "unknown format"); |
172 | 35 | for (i = 0; i < size_nd; i++)
173 | 24 | { |
174 | 24 | if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) % hint.stride.dim[i] != 0) |
175 | 0 | return -1; |
176 | 24 | int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1; |
177 | 24 | if (expected != b.dim[i + hw]) |
178 | 0 | return -1; |
179 | 24 | } |
180 | 11 | return 0; |
181 | 11 | } |
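A worked example of the check above, with hypothetical numbers: a 32-wide input, a 3-wide filter, stride 1 and one pixel of padding on each side verify cleanly against a 32-wide output.

int begin = 1, end = 1, a_dim = 32, filter = 3, stride = 1;
// (begin + end + a_dim - filter) must divide evenly by stride, and
// the quotient plus one must equal the output dimension.
int expected = (begin + end + a_dim - filter) / stride + 1; // == 32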
182 | | |
183 | | ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b) |
184 | 112k | { |
185 | 112k | int i; |
186 | 112k | if (a.format != b.format) |
187 | 0 | return ccv_nnc_no_hint; |
188 | 112k | assert(a.format == b.format); |
189 | 112k | const int a_nd = ccv_nnc_tensor_nd(a.dim); |
190 | 112k | const int b_nd = ccv_nnc_tensor_nd(b.dim); |
191 | 112k | const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1); |
192 | 112k | assert(size_nd == 2 || size_nd == 3); // Support 3D convolution. |
193 | | // The dimensions here do not allow the hint to be deduced automatically.
194 | 112k | if (a_nd != b_nd || (a_nd != size_nd + 1 && a_nd != size_nd + 2))
195 | 110k | return ccv_nnc_no_hint; |
196 | 1.59k | int hw; |
197 | 1.59k | if ((a.format == CCV_TENSOR_FORMAT_CHWN) || |
198 | 1.59k | (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 1))
199 | 140 | hw = 0; |
200 | 1.45k | else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 2) ||
201 | 1.45k | (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 1))
202 | 669 | hw = 1; |
203 | 783 | else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 2) |
204 | 783 | hw = 2; |
205 | 0 | else |
206 | 783 | assert(0 && "unknown format"); |
207 | 1.59k | ccv_nnc_hint_t hint_auto = {}; |
208 | | // 0-dim is reserved for channels |
209 | 4.77k | for (i = 0; i < size_nd; i++)
210 | 3.18k | { |
211 | | // None of the dims can be zero; otherwise the hint cannot be deduced automatically.
212 | 3.18k | assert(a.dim[i + hw] && b.dim[i + hw]); |
213 | | // This is guessed by having a stride that will approximately match the scale. |
214 | 3.18k | int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw]; |
215 | 3.18k | hint_auto.stride.dim[i] = stride; |
216 | 3.18k | int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i]; |
217 | 3.18k | hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior. |
218 | 3.18k | hint_auto.border.end[i] = border - hint_auto.border.begin[i]; |
219 | 3.18k | } |
220 | 1.59k | return hint_auto; |
221 | 1.59k | } |
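A worked example of the guess above, with hypothetical numbers (32-wide input, 16-wide output, 3-wide filter):

int a_dim = 32, b_dim = 16, filter = 3;
int stride = (a_dim + b_dim / 2) / b_dim;           // (32 + 8) / 16 = 2
int border = (b_dim - 1) * stride - a_dim + filter; // 15 * 2 - 32 + 3 = 1
int begin = (border + 1) / 2;                       // 1, the extra padding goes first
int end = border - begin;                           // 0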
222 | | |
223 | | void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
224 | 27.9k | { |
225 | 27.9k | int i; |
226 | 27.9k | assert(output_size <= input_size); |
227 | 57.7k | for (i = 0; i < output_size; i++)
228 | 29.7k | outputs[i] = inputs[i]; |
229 | 27.9k | } |
230 | | |
231 | | void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
232 | 7.73k | { |
233 | 7.73k | int i; |
234 | 18.5k | for (i = 0; i < output_size; i++)
235 | 10.7k | outputs[i] = inputs[0]; |
236 | 7.73k | } |
237 | | |
238 | | void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
239 | 20.3k | { |
240 | 20.3k | int i; |
241 | 20.3k | assert(output_size < input_size); |
242 | 65.8k | for (i = 0; i < output_size; i++)
243 | 45.5k | outputs[i] = inputs[i + 1]; |
244 | 20.3k | } |
245 | | |
246 | | void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
247 | 148 | { |
248 | 148 | int i; |
249 | 148 | outputs[0] = inputs[0]; |
250 | 148 | assert(output_size < input_size); |
251 | 296 | for (i = 1; i < output_size; i++)
252 | 148 | outputs[i] = inputs[i + 1]; |
253 | 148 | } |
254 | | |
255 | | void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
256 | 136k | { |
257 | | // zero out the parameters |
258 | 136k | const ccv_nnc_tensor_param_t z = {}; |
259 | 136k | int i; |
260 | 340k | for (i = 0; i < output_size; i++)
261 | 204k | outputs[i] = z; // Reset the outputs. |
262 | | // Cannot handle these situations. |
263 | 136k | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
264 | 3.43k | return; |
265 | 132k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD) |
266 | 4.42k | { |
267 | 4.42k | if (cmd.isa->tensor_auto) |
268 | 4.41k | cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size); |
269 | 4.42k | return; |
270 | 4.42k | } |
271 | 128k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
272 | 128k | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; |
273 | 128k | if (registry.tensor_auto) |
274 | 128k | registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size); |
275 | 0 | else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs |
276 | 0 | ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size); |
277 | 0 | else // For backward, the default auto is backward_from_inputs |
278 | 0 | ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size); |
279 | 128k | } |
280 | | |
281 | | int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) |
282 | 53.6k | { |
283 | 53.6k | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
284 | 2.54k | return 0; |
285 | 51.0k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
286 | 51.0k | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; |
287 | 51.0k | if (registry.allow_inplace) |
288 | 19.4k | return registry.allow_inplace(cmd.info, input_idx, input_size, output_idx, output_size); |
289 | 31.6k | return 0; |
290 | 51.0k | } |
291 | | |
292 | | int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) |
293 | 198k | { |
294 | 198k | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
295 | 10.1k | return 0; |
296 | 188k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
297 | 188k | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; |
298 | 188k | if (registry.enforce_inplace) |
299 | 2.27k | return registry.enforce_inplace(cmd.info, input_idx, input_size, output_idx, output_size); |
300 | 186k | return 0; |
301 | 188k | } |
302 | | |
303 | | // This returns monotonic time in nanoseconds.
304 | | uint64_t ccv_nnc_cmd_mono_time(void) |
305 | 3.98k | { |
306 | 3.98k | struct timespec ts; |
307 | 3.98k | clock_gettime(CLOCK_MONOTONIC, &ts); |
308 | 3.98k | return ts.tv_sec * 1000000000ULL + ts.tv_nsec; |
309 | 3.98k | } |
310 | | |
311 | | uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes) |
312 | 269k | { |
313 | 269k | if (cmd.cmd == CCV_NNC_NOOP || |
314 | 269k | cmd.cmd == CCV_NNC_GRAPH_FORWARD269k || cmd.cmd == CCV_NNC_GRAPH_BACKWARD269k || |
315 | 269k | cmd.cmd == CCV_NNC_CUSTOM_FORWARD269k || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD267k ) |
316 | 7.77k | return cmd.backend; |
317 | 262k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
318 | 262k | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
319 | 262k | assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0); |
320 | 262k | int i; |
321 | 863k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
322 | 863k | { |
323 | 863k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
324 | | // We have the exec kernel, and support all the tensor memory types. |
325 | 863k | if (api_registry.exec && |
326 | 863k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
327 | 863k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
328 | 863k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
329 | 262k | return backend_init_map[i].backend; |
330 | 863k | } |
331 | 0 | return cmd.backend; |
332 | 262k | } |
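A usage sketch for the search above, assuming the stock CCV_TENSOR_CPU_MEMORY, CCV_TENSOR_FORMAT_NHWC and CCV_32F constants: ask for any backend able to run the command on 32-bit float NHWC tensors in CPU memory.

ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_FORWARD, 0, ccv_nnc_cmd_auto, 0);
const uint32_t backend = ccv_nnc_cmd_find_backend(cmd, CCV_TENSOR_CPU_MEMORY, CCV_TENSOR_FORMAT_NHWC, CCV_32F);
if (backend != CCV_NNC_NO_BACKEND)
	cmd.backend = backend; // Pin the command to the backend that was found.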
333 | | |
334 | 736 | #define AUTO_TUNE_TRIAL_SIZE (3) |
335 | | |
336 | | static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
337 | 414k | { |
338 | 414k | #ifdef HAVE_CUDA |
339 | 414k | if (!stream_context) |
340 | 114k | { |
341 | 114k | int device_id; |
342 | 114k | if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0) |
343 | 5.92k | cudevice(device_id); |
344 | 114k | } |
345 | 414k | #endif |
346 | 414k | } |
347 | | |
348 | | ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
349 | 2.60k | { |
350 | | // This is a custom cmd kernel, no need to autotune. |
351 | 2.60k | if (cmd.cmd == CCV_NNC_NOOP || |
352 | 2.60k | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD || |
353 | 2.60k | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
354 | 1 | return cmd; |
355 | 2.60k | int i, j, k; |
356 | | // Go through all the backends that supports the same type of memory input / output tensors support. |
357 | 2.60k | int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0; |
358 | 10.9k | for (i = 0; i < input_size; i++)
359 | 8.33k | if (inputs[i]) |
360 | 6.12k | tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype); |
361 | 7.11k | for (i = 0; i < output_size; i++)
362 | 4.50k | if (outputs[i]) |
363 | 4.30k | tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype); |
364 | | // In this case, we cannot determine the type of the tensor, skip auto-tune. |
365 | 2.60k | if (!tensor_memory) |
366 | 0 | return cmd; |
367 | | // Otherwise, we are good to go. |
368 | 2.60k | ccv_nnc_cmd_t tuned_cmd = cmd; |
369 | 2.60k | int64_t best_measured = -1; |
370 | 2.60k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
371 | 2.60k | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
372 | 2.60k | int flag = 0, autotune_available_1 = 0; // This is only applicable if we have only one backend. |
373 | 20.2k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
374 | 17.8k | { |
375 | 17.8k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
376 | | // We have the exec kernel, and support all the tensor memory types. |
377 | 17.8k | if (api_registry.exec && |
378 | 17.8k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
379 | 17.8k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
380 | 17.8k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
381 | 2.79k | { |
382 | 2.79k | if (api_registry.autotune) |
383 | 159 | autotune_available_1 = 1; |
384 | 2.79k | if ((++flag) >= 2) // If we have 2 or more suitable backends, we can stop scanning now.
385 | 184 | break; |
386 | 2.79k | } |
387 | 17.8k | } |
388 | 2.60k | if (flag == 0) |
389 | 0 | return cmd; |
390 | 2.60k | _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context); |
391 | | // Allocate inputs / outputs and fill them in. |
392 | 2.60k | ccv_nnc_tensor_t** copy_inputs; |
393 | 2.60k | ccv_nnc_tensor_t** copy_outputs; |
394 | 2.60k | ccv_nnc_tensor_t** allocated_inputs; |
395 | 2.60k | ccv_nnc_tensor_t** allocated_outputs; |
396 | 2.60k | ccv_nnc_tensor_view_t** allocated_input_views; |
397 | 2.60k | ccv_nnc_tensor_view_t** allocated_output_views; |
398 | 2.60k | if (flag > 1 || autotune_available_1)
399 | 343 | { |
400 | 343 | copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*)); |
401 | 343 | copy_outputs = copy_inputs + input_size; |
402 | 343 | allocated_inputs = copy_outputs + output_size; |
403 | 343 | allocated_outputs = allocated_inputs + input_size; |
404 | 343 | allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size); |
405 | 343 | allocated_output_views = allocated_input_views + input_size; |
406 | 343 | int stride[CCV_NNC_MAX_DIM_ALLOC]; |
407 | 890 | for (i = 0; i < output_size; i++)
408 | 547 | if (outputs[i]) |
409 | 529 | { |
410 | 2.60k | for (j = 0; j < input_size; j++)
411 | 2.07k | if (inputs[j]) |
412 | 1.50k | { |
413 | 1.50k | if (outputs[i] == inputs[j]) |
414 | 0 | { |
415 | 0 | if (!copy_inputs[j]) |
416 | 0 | { |
417 | 0 | allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0); |
418 | 0 | if (CCV_IS_TENSOR_VIEW(inputs[j])) |
419 | 0 | { |
420 | 0 | ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride); |
421 | 0 | copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride)); |
422 | 0 | } else |
423 | 0 | copy_inputs[j] = allocated_inputs[j]; |
424 | 0 | } |
425 | 0 | copy_outputs[i] = copy_inputs[j]; |
426 | 0 | break; |
427 | 1.50k | } else if (outputs[i]->data.u8 == inputs[j]->data.u8 && |
428 | 1.50k | ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
429 | 0 | if (!copy_inputs[j]) |
430 | 0 | { |
431 | 0 | allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0); |
432 | 0 | if (CCV_IS_TENSOR_VIEW(inputs[j])) |
433 | 0 | { |
434 | 0 | ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride); |
435 | 0 | copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride)); |
436 | 0 | } else |
437 | 0 | copy_inputs[j] = allocated_inputs[j]; |
438 | 0 | } |
439 | 0 | allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0); |
440 | 0 | if (CCV_IS_TENSOR_VIEW(outputs[i])) |
441 | 0 | { |
442 | 0 | ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride); |
443 | 0 | copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride)); |
444 | 0 | } else |
445 | 0 | copy_outputs[i] = allocated_outputs[i]; |
446 | 0 | break; |
447 | 0 | } |
448 | 1.50k | } |
449 | 529 | if (!copy_outputs[i]) |
450 | 529 | { |
451 | 529 | allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0); |
452 | 529 | if (CCV_IS_TENSOR_VIEW(outputs[i])) |
453 | 3 | { |
454 | 3 | ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride); |
455 | 3 | copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride)); |
456 | 3 | } else |
457 | 526 | copy_outputs[i] = allocated_outputs[i]; |
458 | 529 | } |
459 | 529 | } |
460 | 1.51k | for (i = 0; i < input_size; i++)
461 | 1.17k | if (inputs[i] && !copy_inputs[i])
462 | 958 | copy_inputs[i] = inputs[i]; |
463 | 343 | } |
464 | 2.60k | if (flag == 1) |
465 | 2.42k | { |
466 | 8.71k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
467 | 8.71k | { |
468 | 8.71k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
469 | | // We have the exec kernel, and support all the tensor memory types. |
470 | 8.71k | if (api_registry.exec && |
471 | 8.71k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
472 | 8.71k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
473 | 8.71k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
474 | 2.42k | { |
475 | 2.42k | tuned_cmd.backend = backend_init_map[i].backend; |
476 | | // If a given API has an autotune function, use that to pick the top algorithm.
477 | 2.42k | if (api_registry.autotune) |
478 | 159 | { |
479 | 159 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); |
480 | 159 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); |
481 | 159 | tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); |
482 | | // Drain the context now; autotune can use an excessive amount of memory.
483 | 159 | ccv_nnc_stream_context_drain(stream_context); |
484 | 159 | } |
485 | 2.42k | break; |
486 | 2.42k | } |
487 | 8.71k | } |
488 | 2.42k | if (autotune_available_1) |
489 | 159 | { |
490 | 780 | for (i = 0; i < input_size; i++)
491 | 621 | { |
492 | 621 | if (allocated_inputs[i]) |
493 | 0 | ccv_nnc_tensor_free(allocated_inputs[i]); |
494 | 621 | if (allocated_input_views[i]) |
495 | 0 | ccv_nnc_tensor_view_free(allocated_input_views[i]); |
496 | 621 | } |
497 | 470 | for (i = 0; i < output_size; i++)
498 | 311 | { |
499 | 311 | if (allocated_outputs[i]) |
500 | 303 | ccv_nnc_tensor_free(allocated_outputs[i]); |
501 | 311 | if (allocated_output_views[i]) |
502 | 0 | ccv_nnc_tensor_view_free(allocated_output_views[i]); |
503 | 311 | } |
504 | 159 | ccfree(copy_inputs); |
505 | 159 | } |
506 | 2.42k | return tuned_cmd; |
507 | 2.42k | } |
508 | | // We need to have trial loop through all the data. |
509 | 736 | for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
510 | 552 | { |
511 | 4.41k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
512 | 3.86k | { |
513 | 3.86k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
514 | | // We have the exec kernel, and support all the tensor memory types. |
515 | 3.86k | if (api_registry.exec && |
516 | 3.86k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
517 | 3.86k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
518 | 3.86k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
519 | 1.10k | { |
520 | 1.10k | ccv_nnc_cmd_t candid_cmd = cmd; |
521 | 1.10k | candid_cmd.backend = backend_init_map[i].backend; |
522 | | // If a given API has an autotune function, use that to pick the top algorithm.
523 | 1.10k | if (api_registry.autotune) |
524 | 0 | { |
525 | | // Assuming running it at k == 0 is sufficient, we can skip the remaining trials.
526 | 0 | if (k > 0) |
527 | 0 | continue; |
528 | 0 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); |
529 | 0 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); |
530 | 0 | candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); |
531 | | // Drain the context now; autotune can use an excessive amount of memory.
532 | 0 | ccv_nnc_stream_context_drain(stream_context); |
533 | 0 | uint64_t elapsed = ccv_nnc_cmd_mono_time(); |
534 | | // Ready to run. |
535 | 0 | int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
536 | 0 | ccv_nnc_stream_context_wait(stream_context); |
537 | 0 | elapsed = ccv_nnc_cmd_mono_time() - elapsed; |
538 | 0 | if (status == CCV_NNC_EXEC_SUCCESS && |
539 | 0 | (best_measured == -1 || elapsed < best_measured)) |
540 | 0 | { |
541 | 0 | best_measured = elapsed; |
542 | 0 | tuned_cmd = candid_cmd; |
543 | 0 | } |
544 | 1.10k | } else { |
545 | | // Otherwise loop over the existing algorithms and pick the top one. |
546 | 3.09k | for (j = 0; j < api_registry.algorithms; j++)
547 | 1.99k | { |
548 | 1.99k | candid_cmd.algorithm = j; |
549 | 1.99k | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); |
550 | 1.99k | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); |
551 | 1.99k | uint64_t elapsed = ccv_nnc_cmd_mono_time(); |
552 | | // Ready to run. |
553 | 1.99k | int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); |
554 | 1.99k | elapsed = ccv_nnc_cmd_mono_time() - elapsed; |
555 | 1.99k | if (status == CCV_NNC_EXEC_SUCCESS && |
556 | 1.99k | (best_measured == -1 || elapsed < best_measured))
557 | 647 | { |
558 | 647 | best_measured = elapsed; |
559 | 647 | tuned_cmd = candid_cmd; |
560 | 647 | } |
561 | 1.99k | } |
562 | 1.10k | } |
563 | 1.10k | } |
564 | 3.86k | } |
565 | 552 | } |
566 | 735 | for (i = 0; i < input_size; i++)
567 | 551 | { |
568 | 551 | if (allocated_inputs[i]) |
569 | 0 | ccv_nnc_tensor_free(allocated_inputs[i]); |
570 | 551 | if (allocated_input_views[i]) |
571 | 0 | ccv_nnc_tensor_view_free(allocated_input_views[i]); |
572 | 551 | } |
573 | 420 | for (i = 0; i < output_size; i++)
574 | 236 | { |
575 | 236 | if (allocated_outputs[i]) |
576 | 226 | ccv_nnc_tensor_free(allocated_outputs[i]); |
577 | 236 | if (allocated_output_views[i]) |
578 | 3 | ccv_nnc_tensor_view_free(allocated_output_views[i]); |
579 | 236 | } |
580 | 184 | ccfree(copy_inputs); |
581 | 184 | return tuned_cmd; |
582 | 2.60k | } |
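A hedged sketch of driving the autotuner above, assuming the TENSOR_LIST convenience macro from ccv_nnc_easy.h and hypothetical tensors a, w, bias and b that are already allocated and filled:

ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_FORWARD, 0, params, 0);
// Cap the workspace at 512MiB, then run with the tuned backend / algorithm.
ccv_nnc_cmd_t tuned = ccv_nnc_cmd_autotune(cmd, 512 * 1024 * 1024, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
ccv_nnc_cmd_exec(tuned, ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);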
583 | | |
584 | | int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
585 | 146k | { |
586 | | // If it is no-op, return true, it can deal with any number of parameters. |
587 | 146k | if (cmd.cmd == CCV_NNC_NOOP) |
588 | 112 | return 1; |
589 | | // If it is a custom command, we cannot check it at all; return false.
590 | 146k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD) |
591 | 2.40k | return 0; |
592 | 144k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
593 | 144k | const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry; |
594 | 144k | if (cmd_registry.bitmask) |
595 | 144k | return cmd_registry.bitmask(cmd.info, input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size); |
596 | | // If there is no bitmask check, nothing can pass.
597 | 0 | return 0; |
598 | 144k | } |
599 | | |
600 | | int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size) |
601 | 127k | { |
602 | 127k | int i, j; |
603 | 127k | int device_id_size = 0; |
604 | 127k | if (max_device_id_size <= device_id_size) |
605 | 0 | return device_id_size; |
606 | | // The device id of the exec is determined by its outputs. |
607 | 299k | for (i = 0; i < output_size; i++)
608 | 177k | if (outputs[i] && |
609 | 177k | CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type &&
610 | 177k | CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
611 | 24.5k | { |
612 | 24.5k | const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type); |
613 | 24.5k | int flag = 0; |
614 | 35.3k | for (j = 0; !flag && j < device_id_size; j++)
615 | 10.7k | flag = (device_ids[j] == device_id); |
616 | 24.5k | if (flag) |
617 | 7.76k | continue; |
618 | 16.8k | device_ids[device_id_size++] = device_id; |
619 | 16.8k | if (device_id_size >= max_device_id_size) |
620 | 5.33k | return device_id_size; |
621 | 16.8k | } |
622 | 122k | if (device_id_size == 0) |
623 | 112k | { |
624 | 112k | int device_id = -1; |
625 | 363k | for (i = 0; i < input_size; i++)
626 | 251k | if (inputs[i] && |
627 | 251k | CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type &&
628 | 251k | CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
629 | 251k | (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
630 | 996 | device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type); |
631 | 112k | if (device_id >= 0) |
632 | 996 | { |
633 | 996 | device_ids[0] = device_id; |
634 | 996 | return 1; |
635 | 996 | } |
636 | 112k | } |
637 | 121k | return device_id_size; |
638 | 122k | } |
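A small sketch of the lookup above, assuming hypothetical GPU tensor arrays ins / outs with counts in_count / out_count: the device id is taken from the outputs first, falling back to the lowest input device id.

int device_id = -1;
if (ccv_nnc_device_ids_for_io(ins, in_count, outs, out_count, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0)
	printf("execute on GPU device %d\n", device_id);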
639 | | |
640 | | void* ccv_nnc_cmd_aux(const ccv_nnc_cmd_t cmd) |
641 | 11 | { |
642 | 11 | if (cmd.cmd == CCV_NNC_NOOP || |
643 | 11 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || |
644 | 11 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) |
645 | 0 | return 0; |
646 | 11 | assert(cmd.backend != CCV_NNC_NO_BACKEND); |
647 | 11 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
648 | 11 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
649 | 11 | const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend); |
650 | 11 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); |
651 | 11 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; |
652 | 11 | return api_registry.aux; |
653 | 11 | } |
654 | | |
655 | | int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
656 | 440k | { |
657 | | // If it is a no-op, return as if it already succeeded.
658 | 440k | if (cmd.cmd == CCV_NNC_NOOP) |
659 | 31.2k | return 0; |
660 | 409k | _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context); |
661 | | // If it is a custom command, just apply it directly. |
662 | 409k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
663 | 4.82k | { |
664 | 4.82k | int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
665 | 4.82k | if (!stream_context) |
666 | 4.43k | ccv_nnc_stream_context_drain(stream_context); |
667 | 4.82k | return ret; |
668 | 4.82k | } |
669 | 409k | assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
670 | 404k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
671 | 404k | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
672 | 404k | int i; |
673 | 404k | uint32_t backend = cmd.backend; |
674 | 404k | if (backend == CCV_NNC_NO_BACKEND) |
675 | 180k | { |
676 | | // Find a suitable backend. |
677 | 180k | int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0; |
678 | 451k | for (i = 0; i < input_size; i++)
679 | 271k | if (inputs[i]) |
680 | 269k | tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype); |
681 | 408k | for (i = 0; i < output_size; i++)
682 | 227k | if (outputs[i]) |
683 | 226k | tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype); |
684 | 180k | backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes); |
685 | 180k | } |
686 | 404k | assert(backend != CCV_NNC_NO_BACKEND); |
687 | 404k | const int backend_idx = _ccv_nnc_cmd_backend_ph(backend); |
688 | 404k | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); |
689 | 404k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; |
690 | 404k | if (!api_registry.exec) |
691 | 0 | return CCV_NNC_EXEC_NO_KERNEL; |
692 | | // Everything checks out; call the underlying implementation.
693 | 404k | int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
694 | 404k | if (!stream_context) |
695 | 105k | ccv_nnc_stream_context_drain(stream_context); |
696 | 404k | return ret; |
697 | 404k | } |
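A minimal end-to-end sketch of the dispatcher above, assuming the CPU_TENSOR_NHWC and TENSOR_LIST convenience macros from ccv_nnc_easy.h: copy one CPU tensor into another with the data-transfer command; the backend is resolved automatically because cmd.backend starts out as CCV_NNC_NO_BACKEND.

ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x), TENSOR_LIST(y), 0);
ccv_nnc_tensor_free(x);
ccv_nnc_tensor_free(y);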
698 | | |
699 | | int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags) |
700 | 0 | { |
701 | | // No additional attr for noop. |
702 | 0 | if (cmd.cmd == CCV_NNC_NOOP || |
703 | | // If it is a custom command, just apply it directly. |
704 | 0 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || |
705 | | // If it is sub-graph, there is no additional attr as well. |
706 | 0 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) |
707 | 0 | return 0; |
708 | 0 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
709 | 0 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
710 | 0 | const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry; |
711 | 0 | return !!(cmd_registry.flags & flags); |
712 | 0 | } |
713 | | |
714 | | void ccv_nnc_set_profiler(int state) |
715 | 0 | { |
716 | 0 | #ifdef HAVE_CUDA |
717 | 0 | cusetprofiler(state); |
718 | 0 | #endif |
719 | 0 | } |
720 | | |
721 | | int ccv_nnc_queue_watermark(void) |
722 | 0 | { |
723 | | #ifdef HAVE_MPS |
724 | | return ccv_nnc_mps_queue_watermark(); |
725 | | #else |
726 | 0 | return 0; |
727 | 0 | #endif |
728 | 0 | } |
729 | | |
730 | | void ccv_nnc_set_queue_watermark(int watermark) |
731 | 0 | { |
732 | | #ifdef HAVE_MPS |
733 | | // If we need to be memory efficient, we need to bound how many in-flight command buffers there are. |
734 | | ccv_nnc_mps_set_queue_watermark(watermark); |
735 | | #endif |
736 | 0 | } |
737 | | |
738 | | void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size) |
739 | 2 | { |
740 | 2 | if (type != CCV_STREAM_CONTEXT_GPU) |
741 | 0 | return; |
742 | 2 | #ifdef HAVE_CUDA |
743 | 2 | cusetdevicemap(device_map, size); |
744 | 2 | #endif |
745 | 2 | } |