/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_cmd.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_internal.h" |
3 | | #include "3rdparty/khash/khash.h" |
4 | | #include "ccv_nnc_easy.h" |
5 | | #ifdef HAVE_CUDA |
6 | | #include "gpu/ccv_nnc_compat.h" |
7 | | #elif defined(HAVE_MPS) |
8 | | #include "mps/ccv_nnc_mps.h" |
9 | | #endif |
10 | | #include <time.h> |
11 | | #include <sys/time.h> |
12 | | |
13 | | typedef struct { |
14 | | const uint32_t cmd; |
15 | | const char* name; |
16 | | ccv_nnc_cmd_registry_t registry; |
17 | | ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT]; |
18 | | } ccv_nnc_cmd_init_t; |
19 | | |
20 | | typedef struct { |
21 | | const uint32_t backend; |
22 | | const char* name; |
23 | | } ccv_nnc_cmd_backend_init_t; |
24 | | |
25 | | // The generated code configures the commands and their backend mappings.
26 | | #include "cmd/ccv_nnc_cmd.inc" |
27 | | |
28 | | void ccv_nnc_init(void) |
29 | 1 | { |
30 | 1 | _ccv_nnc_cmd_init(); |
31 | 1 | } |
32 | | |
33 | | static uint64_t _ccv_nnc_flags = 0; |
34 | | |
35 | | uint64_t ccv_nnc_flags(void) |
36 | 0 | { |
37 | 0 | return _ccv_nnc_flags; |
38 | 0 | } |
39 | | |
40 | | void ccv_nnc_enable_flag(uint64_t flag) |
41 | 0 | { |
42 | 0 | _ccv_nnc_flags |= flag; |
43 | 0 | } |
44 | | |
45 | | void ccv_nnc_disable_flag(uint64_t flag) |
46 | 0 | { |
47 | 0 | _ccv_nnc_flags &= ~flag; |
48 | 0 | } |
49 | | |
50 | | const char* ccv_nnc_cmd_name(const uint32_t cmd) |
51 | 2.29k | { |
52 | 2.29k | switch (cmd) |
53 | 2.29k | { |
54 | 86 | case CCV_NNC_NOOP: |
55 | 86 | return "CCV_NNC_NOOP"; |
56 | 3 | case CCV_NNC_CUSTOM_FORWARD: |
57 | 3 | return "CCV_NNC_CUSTOM_FORWARD"; |
58 | 0 | case CCV_NNC_CUSTOM_BACKWARD: |
59 | 0 | return "CCV_NNC_CUSTOM_BACKWARD"; |
60 | 64 | case CCV_NNC_GRAPH_FORWARD: |
61 | 64 | return "CCV_NNC_GRAPH_FORWARD"; |
62 | 5 | case CCV_NNC_GRAPH_BACKWARD: |
63 | 5 | return "CCV_NNC_GRAPH_BACKWARD"; |
64 | 2.29k | } |
65 | 2.14k | const int idx = _ccv_nnc_cmd_ph(cmd); |
66 | 2.14k | assert(idx >= 0); |
67 | 2.14k | assert(idx < sizeof(init_map) / sizeof(init_map[0])); |
68 | 2.14k | return init_map[idx].name; |
69 | 2.14k | } |
70 | | |
71 | | const char* ccv_nnc_cmd_backend_name(const uint32_t backend) |
72 | 0 | { |
73 | 0 | if (backend == CCV_NNC_NO_BACKEND) |
74 | 0 | return "CCV_NNC_NO_BACKEND"; |
75 | 0 | const int idx = _ccv_nnc_cmd_backend_ph(backend); |
76 | 0 | assert(idx >= 0); |
77 | 0 | assert(idx < CCV_NNC_BACKEND_COUNT); |
78 | 0 | return backend_init_map[idx].name; |
79 | 0 | } |
80 | | |
81 | | const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {}; |
82 | | |
83 | | int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params) |
84 | 0 | { |
85 | 0 | return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
86 | 0 | } |
87 | | |
88 | | int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd) |
89 | 26.9k | { |
90 | 26.9k | switch (cmd.cmd) |
91 | 26.9k | { |
92 | 2 | case CCV_NNC_NOOP: |
93 | 2 | return 0; |
94 | 2.40k | case CCV_NNC_CUSTOM_FORWARD: |
95 | 2.40k | case CCV_NNC_CUSTOM_BACKWARD: |
96 | 2.40k | case CCV_NNC_GRAPH_FORWARD: |
97 | 2.40k | case CCV_NNC_GRAPH_BACKWARD: |
98 | 26.9k | default: |
99 | 26.9k | return !(cmd.cmd & 0x1); // If it is even, it is forward |
100 | 26.9k | } |
101 | 26.9k | } |
102 | | |
103 | | int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd) |
104 | 38.4k | { |
105 | 38.4k | switch (cmd.cmd) |
106 | 38.4k | { |
107 | 2 | case CCV_NNC_NOOP: |
108 | 2 | return 0; |
109 | 0 | case CCV_NNC_CUSTOM_FORWARD: |
110 | 4.80k | case CCV_NNC_CUSTOM_BACKWARD: |
111 | 4.80k | case CCV_NNC_GRAPH_FORWARD: |
112 | 4.81k | case CCV_NNC_GRAPH_BACKWARD: |
113 | 38.4k | default: |
114 | 38.4k | return !!(cmd.cmd & 0x1); // If it is odd, it is backward |
115 | 38.4k | } |
116 | 38.4k | } |
117 | | |
118 | | int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend) |
119 | 846 | { |
120 | | // If it is a custom command, a no op, or a graph op, there is no backend to check. |
121 | 846 | if (cmd == CCV_NNC_NOOP || |
122 | 846 | cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD || |
123 | 846 | cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD) |
124 | 0 | return 1; |
125 | 846 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd); |
126 | 846 | const int backend_idx = _ccv_nnc_cmd_backend_ph(backend); |
127 | 846 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
128 | 846 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); |
129 | 846 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; |
130 | | // Check if the execution function exists or not. |
131 | 846 | return !!api_registry.exec; |
132 | 846 | } |
133 | | |
134 | | ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags) |
135 | 50.8k | { |
136 | 50.8k | ccv_nnc_cmd_t cmd; |
137 | 50.8k | cmd.info = params; |
138 | 50.8k | cmd.backend = CCV_NNC_NO_BACKEND; |
139 | 50.8k | assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa)); |
140 | 50.8k | cmd.cmd = _cmd; |
141 | 50.8k | cmd.algorithm = -1; // This is default. |
142 | 50.8k | cmd.isa = isa; |
143 | 50.8k | cmd.data = 0; |
144 | 50.8k | return cmd; |
145 | 50.8k | } |
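/* Editor's note: a minimal usage sketch (not part of the coverage run). A command is constructed
 * with default parameters and no custom isa; the backend stays CCV_NNC_NO_BACKEND and the
 * algorithm stays -1 until it is resolved or autotuned later.
 *
 *   ccv_nnc_cmd_t noop = ccv_nnc_cmd(CCV_NNC_NOOP, 0, ccv_nnc_cmd_auto, 0);
 */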
146 | | |
147 | | const ccv_nnc_hint_t ccv_nnc_no_hint = {}; |
148 | | |
149 | | int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint) |
150 | 143k | { |
151 | 143k | return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0); |
152 | 143k | } |
153 | | |
154 | | int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b) |
155 | 11 | { |
156 | 11 | int i; |
157 | 11 | assert(a.format == b.format); |
158 | 11 | const int nd = ccv_nnc_tensor_nd(a.dim); |
159 | 11 | const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1); |
160 | 11 | assert(size_nd == 2 || size_nd == 3); // Support 3D convolution. |
161 | 11 | assert(nd == size_nd + 1 || nd == size_nd + 2); |
162 | 11 | int hw; |
163 | 11 | if ((a.format == CCV_TENSOR_FORMAT_CHWN) || |
164 | 11 | (a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 1))
165 | 0 | hw = 0; |
166 | 11 | else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 2) ||
167 | 11 | (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 1))
168 | 9 | hw = 1; |
169 | 2 | else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 2) |
170 | 2 | hw = 2; |
171 | 0 | else |
172 | 2 | assert(0 && "unknown format"); |
173 | 35 | for (i = 0; i < size_nd; i++)
174 | 24 | { |
175 | 24 | if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) % hint.stride.dim[i] != 0) |
176 | 0 | return -1; |
177 | 24 | int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1; |
178 | 24 | if (expected != b.dim[i + hw]) |
179 | 0 | return -1; |
180 | 24 | } |
181 | 11 | return 0; |
182 | 11 | } |
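/* Editor's note: a worked example of the check above. For one spatial dimension with
 * a.dim = 8, cmd.size.dim = 3, stride = 2, border.begin = 1 and border.end = 0:
 * (1 + 0 + 8 - 3) % 2 == 0, and expected = (1 + 0 + 8 - 3) / 2 + 1 = 4, so the hint only
 * verifies if the matching b.dim is 4. */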
183 | | |
184 | | ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b) |
185 | 112k | { |
186 | 112k | int i; |
187 | 112k | if (a.format != b.format) |
188 | 0 | return ccv_nnc_no_hint; |
189 | 112k | assert(a.format == b.format); |
190 | 112k | const int a_nd = ccv_nnc_tensor_nd(a.dim); |
191 | 112k | const int b_nd = ccv_nnc_tensor_nd(b.dim); |
192 | 112k | const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1); |
193 | 112k | assert(size_nd == 2 || size_nd == 3); // Support 3D convolution. |
194 | | // The dimensions are not deducible for an auto hint.
195 | 112k | if (a_nd != b_nd || (a_nd != size_nd + 1 && a_nd != size_nd + 2))
196 | 110k | return ccv_nnc_no_hint; |
197 | 1.59k | int hw; |
198 | 1.59k | if ((a.format == CCV_TENSOR_FORMAT_CHWN) || |
199 | 1.59k | (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 1))
200 | 140 | hw = 0; |
201 | 1.45k | else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 2) ||
202 | 1.45k | (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 1))
203 | 669 | hw = 1; |
204 | 783 | else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 2) |
205 | 783 | hw = 2; |
206 | 0 | else |
207 | 783 | assert(0 && "unknown format"); |
208 | 1.59k | ccv_nnc_hint_t hint_auto = {}; |
209 | | // 0-dim is reserved for channels |
210 | 4.77k | for (i = 0; i < size_nd; i++)
211 | 3.18k | { |
212 | | // None of the dims can be zero, otherwise we cannot derive the hint automatically.
213 | 3.18k | assert(a.dim[i + hw] && b.dim[i + hw]); |
214 | | // The stride is guessed so that it approximately matches the scale between input and output.
215 | 3.18k | int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw]; |
216 | 3.18k | hint_auto.stride.dim[i] = stride; |
217 | 3.18k | int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i]; |
218 | 3.18k | hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior. |
219 | 3.18k | hint_auto.border.end[i] = border - hint_auto.border.begin[i]; |
220 | 3.18k | } |
221 | 1.59k | return hint_auto; |
222 | 1.59k | } |
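/* Editor's note: a worked example of the guess above. Shrinking one spatial dim from a.dim = 8
 * to b.dim = 4 with cmd.size.dim = 3 gives stride = (8 + 4 / 2) / 4 = 2,
 * border = (4 - 1) * 2 - 8 + 3 = 1, border.begin = (1 + 1) / 2 = 1 and border.end = 1 - 1 = 0,
 * which passes ccv_nnc_hint_verify above. */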
223 | | |
224 | | void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
225 | 28.0k | { |
226 | 28.0k | int i; |
227 | 28.0k | assert(output_size <= input_size); |
228 | 57.8k | for (i = 0; i < output_size; i++)
229 | 29.8k | outputs[i] = inputs[i]; |
230 | 28.0k | } |
231 | | |
232 | | void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
233 | 7.74k | { |
234 | 7.74k | int i; |
235 | 18.5k | for (i = 0; i < output_size; i++)
236 | 10.8k | outputs[i] = inputs[0]; |
237 | 7.74k | } |
238 | | |
239 | | void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
240 | 20.3k | { |
241 | 20.3k | int i; |
242 | 20.3k | assert(output_size < input_size); |
243 | 65.8k | for (i = 0; i < output_size; i++)
244 | 45.5k | outputs[i] = inputs[i + 1]; |
245 | 20.3k | } |
246 | | |
247 | | void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
248 | 148 | { |
249 | 148 | int i; |
250 | 148 | outputs[0] = inputs[0]; |
251 | 148 | assert(output_size < input_size); |
252 | 296 | for (i = 1; i < output_size; i++)
253 | 148 | outputs[i] = inputs[i + 1]; |
254 | 148 | } |
255 | | |
256 | | void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
257 | 136k | { |
258 | | // zero out the parameters |
259 | 136k | const ccv_nnc_tensor_param_t z = {}; |
260 | 136k | int i; |
261 | 341k | for (i = 0; i < output_size; i++)
262 | 204k | outputs[i] = z; // Reset the outputs. |
263 | | // Cannot handle these situations. |
264 | 136k | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
265 | 3.43k | return; |
266 | 132k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD) |
267 | 4.42k | { |
268 | 4.42k | if (cmd.isa->tensor_auto) |
269 | 4.41k | cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size); |
270 | 4.42k | return; |
271 | 4.42k | } |
272 | 128k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
273 | 128k | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; |
274 | 128k | if (registry.tensor_auto) |
275 | 128k | registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size); |
276 | 0 | else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs |
277 | 0 | ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size); |
278 | 0 | else // For backward, the default auto is backward_from_inputs |
279 | 0 | ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size); |
280 | 128k | } |
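/* Editor's note: a usage sketch (assumes a registered command with a tensor_auto function).
 * The deduced output parameters can be used to allocate the output tensor before execution:
 *
 *   ccv_nnc_tensor_param_t b;
 *   ccv_nnc_hint_tensor_auto(cmd, &a, 1, ccv_nnc_no_hint, &b, 1);
 *   ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_new(0, b, 0);
 */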
281 | | |
282 | | int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) |
283 | 53.7k | { |
284 | 53.7k | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
285 | 2.54k | return 0; |
286 | 51.1k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
287 | 51.1k | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; |
288 | 51.1k | if (registry.allow_inplace) |
289 | 19.4k | return registry.allow_inplace(cmd.info, input_idx, input_size, output_idx, output_size); |
290 | 31.7k | return 0; |
291 | 51.1k | } |
292 | | |
293 | | int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) |
294 | 198k | { |
295 | 198k | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
296 | 10.1k | return 0; |
297 | 188k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
298 | 188k | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; |
299 | 188k | if (registry.enforce_inplace) |
300 | 2.27k | return registry.enforce_inplace(cmd.info, input_idx, input_size, output_idx, output_size); |
301 | 186k | return 0; |
302 | 188k | } |
303 | | |
304 | | // This returns monotonic time in nanoseconds.
305 | | uint64_t ccv_nnc_cmd_mono_time(void) |
306 | 3.29k | { |
307 | 3.29k | struct timespec ts; |
308 | 3.29k | clock_gettime(CLOCK_MONOTONIC, &ts); |
309 | 3.29k | return ts.tv_sec * 1000000000ULL + ts.tv_nsec; |
310 | 3.29k | } |
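/* Editor's note: intended use, mirroring the autotune trials below; the difference of two
 * readings is the elapsed time in nanoseconds:
 *
 *   uint64_t elapsed = ccv_nnc_cmd_mono_time();
 *   ccv_nnc_cmd_exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
 *   elapsed = ccv_nnc_cmd_mono_time() - elapsed;
 */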
311 | | |
312 | | uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes) |
313 | 269k | { |
314 | 269k | if (cmd.cmd == CCV_NNC_NOOP || |
315 | 269k | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
316 | 269k | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
317 | 7.77k | return cmd.backend; |
318 | 261k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
319 | 261k | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
320 | 261k | assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0); |
321 | 261k | int i; |
322 | 862k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
323 | 862k | { |
324 | 862k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
325 | | // We have the exec kernel, and support all the tensor memory types. |
326 | 862k | if (api_registry.exec && |
327 | 862k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
328 | 862k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
329 | 862k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
330 | 261k | return backend_init_map[i].backend; |
331 | 862k | } |
332 | 0 | return cmd.backend; |
333 | 261k | } |
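/* Editor's note: a usage sketch. Resolve a backend that can execute on CPU tensors in NHWC
 * format with 32-bit float data before calling ccv_nnc_cmd_exec:
 *
 *   cmd.backend = ccv_nnc_cmd_find_backend(cmd, CCV_TENSOR_CPU_MEMORY, CCV_TENSOR_FORMAT_NHWC, CCV_32F);
 */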
334 | | |
335 | 604 | #define AUTO_TUNE_TRIAL_SIZE (3) |
336 | | |
337 | | static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
338 | 411k | { |
339 | 411k | #ifdef HAVE_CUDA |
340 | 411k | if (!stream_context) |
341 | 112k | { |
342 | 112k | int device_id; |
343 | 112k | if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0) |
344 | 4.21k | cudevice(device_id); |
345 | 112k | } |
346 | 411k | #endif |
347 | 411k | } |
348 | | |
349 | | typedef struct { |
350 | | int format; |
351 | | int datatype; |
352 | | int nd; |
353 | | off_t dataof; |
354 | | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
355 | | int stride[CCV_NNC_MAX_DIM_ALLOC]; |
356 | | } ccv_nnc_cmd_autotune_tensor_shape_t; |
357 | | |
358 | | typedef struct { |
359 | | uint32_t cmd; |
360 | | ccv_nnc_cmd_param_t params; |
361 | | ccv_nnc_hint_t hint; |
362 | | int flags; |
363 | | int input_size; |
364 | | int output_size; |
365 | | size_t workspace_size; |
366 | | ccv_nnc_cmd_autotune_tensor_shape_t* inputs; |
367 | | ccv_nnc_cmd_autotune_tensor_shape_t* outputs; |
368 | | } ccv_nnc_cmd_autotune_key_t; |
369 | | |
370 | | static CCV_WARN_UNUSED(ccv_nnc_cmd_autotune_key_t) ccv_nnc_cmd_autotune_key_new(const ccv_nnc_cmd_t cmd, const size_t workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
371 | 2.65k | { |
372 | 2.65k | ccv_nnc_cmd_autotune_key_t key = { |
373 | 2.65k | .cmd = cmd.cmd, |
374 | 2.65k | .params = cmd.info, |
375 | 2.65k | .hint = hint, |
376 | 2.65k | .workspace_size = workspace_size, |
377 | 2.65k | .inputs = 0, |
378 | 2.65k | .input_size = 0, |
379 | 2.65k | .outputs = 0, |
380 | 2.65k | .output_size = 0 |
381 | 2.65k | }; |
382 | 2.65k | if (input_size == 0 && output_size == 0)
383 | 0 | return key; |
384 | 2.65k | assert(input_size >= 0 && output_size >= 0); |
385 | 2.65k | key.input_size = input_size; |
386 | 2.65k | key.output_size = output_size; |
387 | 2.65k | key.inputs = (ccv_nnc_cmd_autotune_tensor_shape_t*)ccmalloc(sizeof(ccv_nnc_cmd_autotune_tensor_shape_t) * (input_size + output_size)); |
388 | 2.65k | key.outputs = key.inputs + input_size; |
389 | 2.65k | int i, j; |
390 | 11.1k | for (i = 0; i < input_size; i++)
391 | 8.45k | { |
392 | 8.45k | memset(key.inputs[i].dim, 0, sizeof(key.inputs[i].dim)); |
393 | 8.45k | memset(key.inputs[i].stride, 0, sizeof(key.inputs[i].stride)); |
394 | 8.45k | if (!inputs[i]) |
395 | 2.24k | { |
396 | 2.24k | key.inputs[i].format = 0; |
397 | 2.24k | key.inputs[i].datatype = 0; |
398 | 2.24k | key.inputs[i].dataof = 0; |
399 | 2.24k | key.inputs[i].nd = 0; |
400 | 2.24k | continue; |
401 | 2.24k | } |
402 | 6.21k | key.inputs[i].format = inputs[i]->info.format; |
403 | 6.21k | key.inputs[i].datatype = inputs[i]->info.datatype; |
404 | 6.21k | key.inputs[i].dataof = inputs[i]->dataof; |
405 | 6.21k | const int nd = key.inputs[i].nd = ccv_nnc_tensor_nd(inputs[i]->info.dim); |
406 | 20.4k | for (j = 0; j < nd; j++)
407 | 14.1k | key.inputs[i].dim[j] = inputs[i]->info.dim[j]; |
408 | 6.21k | if (CCV_IS_TENSOR_VIEW(inputs[i])) |
409 | 18 | for (j = 0; j < nd; j++)
410 | 12 | key.inputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)inputs[i])->stride[j]; |
411 | 6.21k | } |
412 | 7.22k | for (i = 0; i < output_size; i++)
413 | 4.57k | { |
414 | 4.57k | memset(key.outputs[i].dim, 0, sizeof(key.outputs[i].dim)); |
415 | 4.57k | memset(key.outputs[i].stride, 0, sizeof(key.outputs[i].stride)); |
416 | 4.57k | if (!outputs[i]) |
417 | 205 | { |
418 | 205 | key.outputs[i].format = 0; |
419 | 205 | key.outputs[i].datatype = 0; |
420 | 205 | key.outputs[i].dataof = 0; |
421 | 205 | key.outputs[i].nd = 0; |
422 | 205 | continue; |
423 | 205 | } |
424 | 4.36k | key.outputs[i].format = outputs[i]->info.format; |
425 | 4.36k | key.outputs[i].datatype = outputs[i]->info.datatype; |
426 | 4.36k | key.outputs[i].dataof = outputs[i]->dataof; |
427 | 4.36k | const int nd = key.outputs[i].nd = ccv_nnc_tensor_nd(outputs[i]->info.dim); |
428 | 14.1k | for (j = 0; j < nd; j++)
429 | 9.78k | key.outputs[i].dim[j] = outputs[i]->info.dim[j]; |
430 | 4.36k | if (CCV_IS_TENSOR_VIEW(outputs[i])) |
431 | 16 | for (j = 0; j < nd; j++)
432 | 9 | key.outputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)outputs[i])->stride[j]; |
433 | 4.36k | } |
434 | 2.65k | return key; |
435 | 2.65k | } |
436 | | |
437 | | // autotune cache. |
438 | | static inline uint32_t twang_32from64(uint64_t key) |
439 | 301k | { |
440 | 301k | key = (~key) + (key << 18); |
441 | 301k | key = key ^ (key >> 31); |
442 | 301k | key = key * 21; |
443 | 301k | key = key ^ (key >> 11); |
444 | 301k | key = key + (key << 6); |
445 | 301k | key = key ^ (key >> 22); |
446 | 301k | return (uint32_t)(key); |
447 | 301k | } |
448 | | |
449 | | static inline khint32_t _kh_autotune_key_executable_hash_func(const ccv_nnc_cmd_autotune_key_t key) |
450 | 2.82k | { |
451 | 2.82k | uint32_t h = key.cmd; |
452 | 2.82k | int i, j; |
453 | 2.82k | uint32_t* data = (uint32_t*)&key.params; |
454 | 87.4k | for (i = 0; i < sizeof(key.params) / sizeof(uint32_t); i++)
455 | 84.6k | h = twang_32from64(((uint64_t)h << 32) | data[i]); |
456 | 2.82k | data = (uint32_t*)&key.hint; |
457 | 104k | for (i = 0; i < sizeof(key.hint) / sizeof(uint32_t); i++)
458 | 101k | h = twang_32from64(((uint64_t)h << 32) | data[i]); |
459 | 2.82k | h = twang_32from64(((uint64_t)h << 32) | key.workspace_size); |
460 | 2.82k | h = twang_32from64(((uint64_t)h << 32) | key.input_size); |
461 | 2.82k | h = twang_32from64(((uint64_t)h << 32) | key.output_size); |
462 | 11.7k | for (i = 0; i < key.input_size; i++)
463 | 8.97k | { |
464 | 8.97k | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].format); |
465 | 8.97k | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].datatype); |
466 | 8.97k | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dataof); |
467 | 8.97k | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].nd); |
468 | 24.2k | for (j = 0; j < key.inputs[i].nd; j++)
469 | 15.3k | { |
470 | 15.3k | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dim[j]); |
471 | 15.3k | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].stride[j]); |
472 | 15.3k | } |
473 | 8.97k | } |
474 | 7.72k | for (i = 0; i < key.output_size; i++)
475 | 4.90k | { |
476 | 4.90k | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].format); |
477 | 4.90k | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].datatype); |
478 | 4.90k | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dataof); |
479 | 4.90k | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].nd); |
480 | 15.4k | for (j = 0; j < key.outputs[i].nd; j++)
481 | 10.5k | { |
482 | 10.5k | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dim[j]); |
483 | 10.5k | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].stride[j]); |
484 | 10.5k | } |
485 | 4.90k | } |
486 | 2.82k | return (khint32_t)h; |
487 | 2.82k | } |
488 | | |
489 | | static inline int _kh_autotune_key_executable_hash_equal(const ccv_nnc_cmd_autotune_key_t a, const ccv_nnc_cmd_autotune_key_t b) |
490 | 3.42k | { |
491 | 3.42k | if (a.cmd != b.cmd || a.flags != b.flags || a.workspace_size != b.workspace_size || a.input_size != b.input_size || a.output_size != b.output_size)
492 | 1.56k | return 0; |
493 | 1.86k | if (memcmp(&a.params, &b.params, sizeof(a.params)) != 0) |
494 | 56 | return 0; |
495 | 1.81k | if (memcmp(&a.hint, &b.hint, sizeof(a.hint)) != 0) |
496 | 11 | return 0; |
497 | 1.80k | int i, j; |
498 | 7.63k | for (i = 0; i < a.input_size; i++)
499 | 5.89k | { |
500 | 5.89k | if (a.inputs[i].format != b.inputs[i].format || a.inputs[i].datatype != b.inputs[i].datatype || a.inputs[i].nd != b.inputs[i].nd || a.inputs[i].dataof != b.inputs[i].dataof) |
501 | 0 | return 0; |
502 | 15.6k | for (j = 0; j < a.inputs[i].nd; j++)
503 | 9.78k | if (a.inputs[i].dim[j] != b.inputs[i].dim[j] || a.inputs[i].stride[j] != b.inputs[i].stride[j])
504 | 56 | return 0; |
505 | 5.89k | } |
506 | 4.87k | for (i = 0; i < a.output_size; i++)
507 | 3.12k | { |
508 | 3.12k | if (a.outputs[i].format != b.outputs[i].format || a.outputs[i].datatype != b.outputs[i].datatype || a.outputs[i].nd != b.outputs[i].nd || a.outputs[i].dataof != b.outputs[i].dataof) |
509 | 0 | return 0; |
510 | 9.90k | for (j = 0; j < a.outputs[i].nd; j++)
511 | 6.77k | if (a.outputs[i].dim[j] != b.outputs[i].dim[j] || a.outputs[i].stride[j] != b.outputs[i].stride[j]) |
512 | 0 | return 0; |
513 | 3.12k | } |
514 | 1.74k | return 1; |
515 | 1.74k | } |
516 | | |
517 | | typedef struct { |
518 | | int backend; |
519 | | int algorithm; |
520 | | } ccv_nnc_cmd_autotune_val_t; |
521 | | |
522 | | KHASH_INIT(autotune_executable_cache, ccv_nnc_cmd_autotune_key_t, ccv_nnc_cmd_autotune_val_t, 1, _kh_autotune_key_executable_hash_func, _kh_autotune_key_executable_hash_equal) |
523 | | |
524 | | static khash_t(autotune_executable_cache)* g_autotune_executable_cache = 0; |
525 | | |
526 | | static inline void ccv_nnc_cmd_autotune_key_free(ccv_nnc_cmd_autotune_key_t key) |
527 | 2.65k | { |
528 | 2.65k | if (key.inputs) |
529 | 2.65k | ccfree(key.inputs); |
530 | 2.65k | } |
531 | | |
532 | | void ccv_nnc_drain_autotune_cache(void) |
533 | 262 | { |
534 | 262 | if (!g_autotune_executable_cache) |
535 | 1 | return; |
536 | 261 | khiter_t k; |
537 | 33.6k | for (k = kh_begin(g_autotune_executable_cache); k < kh_end(g_autotune_executable_cache); k++)
538 | 33.4k | { |
539 | 33.4k | if (!kh_exist(g_autotune_executable_cache, k)) |
540 | 32.5k | continue; |
541 | 907 | ccv_nnc_cmd_autotune_key_free(kh_key(g_autotune_executable_cache, k)); |
542 | 907 | kh_del(autotune_executable_cache, g_autotune_executable_cache, k); |
543 | 907 | } |
544 | 261 | } |
545 | | |
546 | | ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
547 | 2.65k | { |
548 | | // If it is a noop, graph, or custom command, there is nothing to autotune.
549 | 2.65k | if (cmd.cmd == CCV_NNC_NOOP || |
550 | 2.65k | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD || |
551 | 2.65k | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
552 | 1 | return cmd; |
553 | 2.65k | int i, j, k; |
554 | | // Go through all the backends that supports the same type of memory input / output tensors support. |
555 | 2.65k | int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0; |
556 | 11.1k | for (i = 0; i < input_size; i++)
557 | 8.45k | if (inputs[i]) |
558 | 6.21k | tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype); |
559 | 7.22k | for (i = 0; i < output_size; i++)
560 | 4.57k | if (outputs[i]) |
561 | 4.36k | tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype); |
562 | | // In this case, we cannot determine the type of the tensor, skip auto-tune. |
563 | 2.65k | if (!tensor_memory) |
564 | 0 | return cmd; |
565 | | // Otherwise, we are good to go. |
566 | 2.65k | ccv_nnc_cmd_t tuned_cmd = cmd; |
567 | 2.65k | if (!g_autotune_executable_cache) |
568 | 1 | g_autotune_executable_cache = kh_init(autotune_executable_cache); |
569 | 2.65k | int ret = 0; |
570 | 2.65k | ccv_nnc_cmd_autotune_key_t key = ccv_nnc_cmd_autotune_key_new(cmd, max_workspace_size, hint, flags, inputs, input_size, outputs, output_size); |
571 | 2.65k | khiter_t kiter = kh_put(autotune_executable_cache, g_autotune_executable_cache, key, &ret); |
572 | 2.65k | if (ret == 0) |
573 | 1.74k | { |
574 | 1.74k | ccv_nnc_cmd_autotune_key_free(key); |
575 | 1.74k | const ccv_nnc_cmd_autotune_val_t val = kh_val(g_autotune_executable_cache, kiter); |
576 | 1.74k | tuned_cmd.backend = val.backend; |
577 | 1.74k | tuned_cmd.algorithm = val.algorithm; |
578 | 1.74k | return tuned_cmd; |
579 | 1.74k | } |
580 | 907 | int64_t best_measured = -1; |
581 | 907 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
582 | 907 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
583 | 907 | int flag = 0, autotune_available_1 = 0; // This is only applicable if we have only one backend. |
584 | 6.80k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
585 | 6.04k | { |
586 | 6.04k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
587 | | // We have the exec kernel, and support all the tensor memory types. |
588 | 6.04k | if (api_registry.exec && |
589 | 6.04k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
590 | 6.04k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
591 | 6.04k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
592 | 1.05k | { |
593 | 1.05k | if (api_registry.autotune) |
594 | 51 | autotune_available_1 = 1; |
595 | 1.05k | if ((++flag) >= 2) // If we have at least 2 suitable backends, we can stop the scan now.
596 | 151 | break; |
597 | 1.05k | } |
598 | 6.04k | } |
599 | 907 | if (flag == 0) |
600 | 0 | return cmd; |
601 | 907 | _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context); |
602 | | // Allocate inputs / outputs and fill them in. |
603 | 907 | ccv_nnc_tensor_t** copy_inputs; |
604 | 907 | ccv_nnc_tensor_t** copy_outputs; |
605 | 907 | ccv_nnc_tensor_t** allocated_inputs; |
606 | 907 | ccv_nnc_tensor_t** allocated_outputs; |
607 | 907 | ccv_nnc_tensor_view_t** allocated_input_views; |
608 | 907 | ccv_nnc_tensor_view_t** allocated_output_views; |
609 | 907 | if (flag > 1 || autotune_available_1)
610 | 202 | { |
611 | 202 | copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*)); |
612 | 202 | copy_outputs = copy_inputs + input_size; |
613 | 202 | allocated_inputs = copy_outputs + output_size; |
614 | 202 | allocated_outputs = allocated_inputs + input_size; |
615 | 202 | allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size); |
616 | 202 | allocated_output_views = allocated_input_views + input_size; |
617 | 202 | int stride[CCV_NNC_MAX_DIM_ALLOC]; |
618 | 490 | for (i = 0; i < output_size; i++)
619 | 288 | if (outputs[i]) |
620 | 279 | { |
621 | 1.28k | for (j = 0; j < input_size; j++)
622 | 1.00k | if (inputs[j]) |
623 | 785 | { |
624 | 785 | if (outputs[i] == inputs[j]) |
625 | 0 | { |
626 | 0 | if (!copy_inputs[j]) |
627 | 0 | { |
628 | 0 | allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0); |
629 | 0 | if (CCV_IS_TENSOR_VIEW(inputs[j])) |
630 | 0 | { |
631 | 0 | ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride); |
632 | 0 | copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride)); |
633 | 0 | } else |
634 | 0 | copy_inputs[j] = allocated_inputs[j]; |
635 | 0 | } |
636 | 0 | copy_outputs[i] = copy_inputs[j]; |
637 | 0 | break; |
638 | 785 | } else if (outputs[i]->data.u8 == inputs[j]->data.u8 && |
639 | 785 | ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
640 | 0 | if (!copy_inputs[j]) |
641 | 0 | { |
642 | 0 | allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0); |
643 | 0 | if (CCV_IS_TENSOR_VIEW(inputs[j])) |
644 | 0 | { |
645 | 0 | ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride); |
646 | 0 | copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride)); |
647 | 0 | } else |
648 | 0 | copy_inputs[j] = allocated_inputs[j]; |
649 | 0 | } |
650 | 0 | allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0); |
651 | 0 | if (CCV_IS_TENSOR_VIEW(outputs[i])) |
652 | 0 | { |
653 | 0 | ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride); |
654 | 0 | copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride)); |
655 | 0 | } else |
656 | 0 | copy_outputs[i] = allocated_outputs[i]; |
657 | 0 | break; |
658 | 0 | } |
659 | 785 | } |
660 | 279 | if (!copy_outputs[i]) |
661 | 279 | { |
662 | 279 | allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0); |
663 | 279 | if (CCV_IS_TENSOR_VIEW(outputs[i])) |
664 | 3 | { |
665 | 3 | ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride); |
666 | 3 | copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride)); |
667 | 3 | } else |
668 | 276 | copy_outputs[i] = allocated_outputs[i]; |
669 | 279 | } |
670 | 279 | } |
671 | 848 | for (i = 0; i < input_size; i++)
672 | 646 | if (inputs[i] && !copy_inputs[i])
673 | 559 | copy_inputs[i] = inputs[i]; |
674 | 202 | } |
675 | 907 | if (flag == 1) |
676 | 756 | { |
677 | 2.53k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
678 | 2.53k | { |
679 | 2.53k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
680 | | // We have the exec kernel, and support all the tensor memory types. |
681 | 2.53k | if (api_registry.exec && |
682 | 2.53k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
683 | 2.53k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
684 | 2.53k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
685 | 756 | { |
686 | 756 | tuned_cmd.backend = backend_init_map[i].backend; |
687 | | // If a given API exist an autotune function, use that to pick the top algorithm. |
688 | 756 | if (api_registry.autotune) |
689 | 51 | { |
690 | 51 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); |
691 | 51 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); |
692 | 51 | tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); |
693 | | // Drain the context, autotune can use excessive amount of memory. Need to drain it now. |
694 | 51 | ccv_nnc_stream_context_drain(stream_context); |
695 | 51 | } |
696 | 756 | break; |
697 | 756 | } |
698 | 2.53k | } |
699 | 756 | if (autotune_available_1) |
700 | 51 | { |
701 | 240 | for (i = 0; i < input_size; i++)
702 | 189 | { |
703 | 189 | if (allocated_inputs[i]) |
704 | 0 | ccv_nnc_tensor_free(allocated_inputs[i]); |
705 | 189 | if (allocated_input_views[i]) |
706 | 0 | ccv_nnc_tensor_view_free(allocated_input_views[i]); |
707 | 189 | } |
708 | 146 | for (i = 0; i < output_size; i++)
709 | 95 | { |
710 | 95 | if (allocated_outputs[i]) |
711 | 93 | ccv_nnc_tensor_free(allocated_outputs[i]); |
712 | 95 | if (allocated_output_views[i]) |
713 | 0 | ccv_nnc_tensor_view_free(allocated_output_views[i]); |
714 | 95 | } |
715 | 51 | ccfree(copy_inputs); |
716 | 51 | } |
717 | 756 | const ccv_nnc_cmd_autotune_val_t val = { |
718 | 756 | .backend = tuned_cmd.backend, |
719 | 756 | .algorithm = tuned_cmd.algorithm |
720 | 756 | }; |
721 | 756 | kh_val(g_autotune_executable_cache, kiter) = val; |
722 | 756 | return tuned_cmd; |
723 | 756 | } |
724 | | // We need to have trial loop through all the data. |
725 | 604 | for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
726 | 453 | { |
727 | 3.62k | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
728 | 3.17k | { |
729 | 3.17k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; |
730 | | // We have the exec kernel, and support all the tensor memory types. |
731 | 3.17k | if (api_registry.exec && |
732 | 3.17k | (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
733 | 3.17k | (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
734 | 3.17k | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
735 | 906 | { |
736 | 906 | ccv_nnc_cmd_t candid_cmd = cmd; |
737 | 906 | candid_cmd.backend = backend_init_map[i].backend; |
738 | | // If a given API exist an autotune function, use that to pick the top algorithm. |
739 | 906 | if (api_registry.autotune) |
740 | 0 | { |
741 | | // Assuming k == 0 is sufficient, and we can skip. |
742 | 0 | if (k > 0) |
743 | 0 | continue; |
744 | 0 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); |
745 | 0 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); |
746 | 0 | candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); |
747 | | // Drain the context, autotune can use excessive amount of memory. Need to drain it now. |
748 | 0 | ccv_nnc_stream_context_drain(stream_context); |
749 | 0 | uint64_t elapsed = ccv_nnc_cmd_mono_time(); |
750 | | // Ready to run. |
751 | 0 | int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
752 | 0 | ccv_nnc_stream_context_wait(stream_context); |
753 | 0 | elapsed = ccv_nnc_cmd_mono_time() - elapsed; |
754 | 0 | if (status == CCV_NNC_EXEC_SUCCESS && |
755 | 0 | (best_measured == -1 || elapsed < best_measured)) |
756 | 0 | { |
757 | 0 | best_measured = elapsed; |
758 | 0 | tuned_cmd = candid_cmd; |
759 | 0 | } |
760 | 906 | } else { |
761 | | // Otherwise loop over the existing algorithms and pick the top one. |
762 | 2.55k | for (j = 0; j < api_registry.algorithms; j++)
763 | 1.64k | { |
764 | 1.64k | candid_cmd.algorithm = j; |
765 | 1.64k | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); |
766 | 1.64k | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); |
767 | 1.64k | uint64_t elapsed = ccv_nnc_cmd_mono_time(); |
768 | | // Ready to run. |
769 | 1.64k | int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); |
770 | 1.64k | elapsed = ccv_nnc_cmd_mono_time() - elapsed; |
771 | 1.64k | if (status == CCV_NNC_EXEC_SUCCESS && |
772 | 1.64k | (best_measured == -1 || elapsed < best_measured))
773 | 525 | { |
774 | 525 | best_measured = elapsed; |
775 | 525 | tuned_cmd = candid_cmd; |
776 | 525 | } |
777 | 1.64k | } |
778 | 906 | } |
779 | 906 | } |
780 | 3.17k | } |
781 | 453 | } |
782 | 608 | for (i = 0; i < input_size; i++)
783 | 457 | { |
784 | 457 | if (allocated_inputs[i]) |
785 | 0 | ccv_nnc_tensor_free(allocated_inputs[i]); |
786 | 457 | if (allocated_input_views[i]) |
787 | 0 | ccv_nnc_tensor_view_free(allocated_input_views[i]); |
788 | 457 | } |
789 | 344 | for (i = 0; i < output_size; i++)
790 | 193 | { |
791 | 193 | if (allocated_outputs[i]) |
792 | 186 | ccv_nnc_tensor_free(allocated_outputs[i]); |
793 | 193 | if (allocated_output_views[i]) |
794 | 3 | ccv_nnc_tensor_view_free(allocated_output_views[i]); |
795 | 193 | } |
796 | 151 | ccfree(copy_inputs); |
797 | 151 | const ccv_nnc_cmd_autotune_val_t val = { |
798 | 151 | .backend = tuned_cmd.backend, |
799 | 151 | .algorithm = tuned_cmd.algorithm |
800 | 151 | }; |
801 | 151 | kh_val(g_autotune_executable_cache, kiter) = val; |
802 | 151 | return tuned_cmd; |
803 | 907 | } |
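/* Editor's note: a usage sketch. Tune once against representative tensors; the returned command
 * carries the selected backend and algorithm, and the result is memoized in the cache keyed by
 * command, hint, workspace size and tensor shapes:
 *
 *   const ccv_nnc_cmd_t tuned = ccv_nnc_cmd_autotune(cmd, max_workspace_size, hint, 0, inputs, input_size, outputs, output_size, stream_context);
 *   ccv_nnc_cmd_exec(tuned, hint, 0, inputs, input_size, outputs, output_size, stream_context);
 */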
804 | | |
805 | | int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
806 | 147k | { |
807 | | // If it is no-op, return true, it can deal with any number of parameters. |
808 | 147k | if (cmd.cmd == CCV_NNC_NOOP) |
809 | 118 | return 1; |
810 | | // If it is a custom command, we cannot check it at all; return false.
811 | 147k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD) |
812 | 2.40k | return 0; |
813 | 144k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
814 | 144k | const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry; |
815 | 144k | if (cmd_registry.bitmask) |
816 | 144k | return cmd_registry.bitmask(cmd.info, input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size); |
817 | | // If there is no bitmask check registered, nothing can pass.
818 | 0 | return 0; |
819 | 144k | } |
820 | | |
821 | | int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size) |
822 | 125k | { |
823 | 125k | int i, j; |
824 | 125k | int device_id_size = 0; |
825 | 125k | if (max_device_id_size <= device_id_size) |
826 | 0 | return device_id_size; |
827 | | // The device id of the exec is determined by its outputs. |
828 | 295k | for (i = 0; i < output_size; i++)
829 | 173k | if (outputs[i] &&
830 | 173k | CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type &&
831 | 173k | CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
832 | 22.8k | { |
833 | 22.8k | const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type); |
834 | 22.8k | int flag = 0; |
835 | 33.5k | for (j = 0; !flag && j < device_id_size; j++)
836 | 10.7k | flag = (device_ids[j] == device_id); |
837 | 22.8k | if (flag) |
838 | 7.76k | continue; |
839 | 15.0k | device_ids[device_id_size++] = device_id; |
840 | 15.0k | if (device_id_size >= max_device_id_size) |
841 | 3.57k | return device_id_size; |
842 | 15.0k | } |
843 | 121k | if (device_id_size == 0) |
844 | 111k | { |
845 | 111k | int device_id = -1; |
846 | 360k | for (i = 0; i < input_size; i++)
847 | 249k | if (inputs[i] &&
848 | 249k | CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type &&
849 | 249k | CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
850 | 249k | (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
851 | 1.04k | device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type); |
852 | 111k | if (device_id >= 0) |
853 | 1.04k | { |
854 | 1.04k | device_ids[0] = device_id; |
855 | 1.04k | return 1; |
856 | 1.04k | } |
857 | 111k | } |
858 | 120k | return device_id_size; |
859 | 121k | } |
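/* Editor's note: a worked example. With two GPU outputs on devices 1 and 0 and a
 * max_device_id_size of 2, the call returns 2 and fills device_ids with {1, 0} in output order;
 * inputs are only consulted when no output lives on the requested tensor_type. */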
860 | | |
861 | | void* ccv_nnc_cmd_aux(const ccv_nnc_cmd_t cmd) |
862 | 11 | { |
863 | 11 | if (cmd.cmd == CCV_NNC_NOOP || |
864 | 11 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || |
865 | 11 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) |
866 | 0 | return 0; |
867 | 11 | assert(cmd.backend != CCV_NNC_NO_BACKEND); |
868 | 11 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
869 | 11 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
870 | 11 | const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend); |
871 | 11 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); |
872 | 11 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; |
873 | 11 | return api_registry.aux; |
874 | 11 | } |
875 | | |
876 | | int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
877 | 440k | { |
878 | | // If it is no-op, return as if it had already succeeded.
879 | 440k | if (cmd.cmd == CCV_NNC_NOOP) |
880 | 31.3k | return 0; |
881 | 409k | _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context); |
882 | | // If it is a custom command, just apply it directly. |
883 | 409k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
884 | 4.82k | { |
885 | 4.82k | int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
886 | 4.82k | if (!stream_context) |
887 | 4.43k | ccv_nnc_stream_context_drain(stream_context); |
888 | 4.82k | return ret; |
889 | 4.82k | } |
890 | 409k | assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
891 | 404k | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
892 | 404k | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); |
893 | 404k | int i; |
894 | 404k | uint32_t backend = cmd.backend; |
895 | 404k | if (backend == CCV_NNC_NO_BACKEND) |
896 | 180k | { |
897 | | // Find a suitable backend. |
898 | 180k | int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0; |
899 | 450k | for (i = 0; i < input_size; i++)
900 | 270k | if (inputs[i]) |
901 | 268k | tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype); |
902 | 406k | for (i = 0; i < output_size; i++)
903 | 226k | if (outputs[i]) |
904 | 225k | tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype); |
905 | 180k | backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes); |
906 | 180k | } |
907 | 404k | assert(backend != CCV_NNC_NO_BACKEND); |
908 | 404k | const int backend_idx = _ccv_nnc_cmd_backend_ph(backend); |
909 | 404k | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); |
910 | 404k | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; |
911 | 404k | if (!api_registry.exec) |
912 | 0 | return CCV_NNC_EXEC_NO_KERNEL; |
913 | | // Everything is out, call the underlying implementation. |
914 | 404k | int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); |
915 | 404k | if (!stream_context) |
916 | 104k | ccv_nnc_stream_context_drain(stream_context); |
917 | 404k | return ret; |
918 | 404k | } |
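/* Editor's note: a usage sketch using the TENSOR_LIST convenience macro from ccv_nnc_easy.h to
 * copy tensor a into tensor b; with a null stream context the call drains after execution as in
 * the code above:
 *
 *   ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
 */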
919 | | |
920 | | int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags) |
921 | 0 | { |
922 | | // No additional attr for noop. |
923 | 0 | if (cmd.cmd == CCV_NNC_NOOP || |
924 | | // If it is a custom command, just apply it directly. |
925 | 0 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || |
926 | | // If it is sub-graph, there is no additional attr as well. |
927 | 0 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) |
928 | 0 | return 0; |
929 | 0 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); |
930 | 0 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
931 | 0 | const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry; |
932 | 0 | return !!(cmd_registry.flags & flags); |
933 | 0 | } |
934 | | |
935 | | void ccv_nnc_set_profiler(int state) |
936 | 0 | { |
937 | 0 | #ifdef HAVE_CUDA |
938 | 0 | cusetprofiler(state); |
939 | 0 | #endif |
940 | 0 | } |
941 | | |
942 | | int ccv_nnc_queue_watermark(void) |
943 | 0 | { |
944 | | #ifdef HAVE_MPS |
945 | | return ccv_nnc_mps_queue_watermark(); |
946 | | #else |
947 | 0 | return 0; |
948 | 0 | #endif |
949 | 0 | } |
950 | | |
951 | | void ccv_nnc_set_queue_watermark(int watermark) |
952 | 0 | { |
953 | | #ifdef HAVE_MPS |
954 | | // If we need to be memory efficient, we need to bound how many in-flight command buffers there are. |
955 | | ccv_nnc_mps_set_queue_watermark(watermark); |
956 | | #endif |
957 | 0 | } |
958 | | |
959 | | void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size) |
960 | 2 | { |
961 | 2 | if (type != CCV_STREAM_CONTEXT_GPU) |
962 | 0 | return; |
963 | 2 | #ifdef HAVE_CUDA |
964 | 2 | cusetdevicemap(device_map, size); |
965 | 2 | #endif |
966 | 2 | } |