| File: | nnc/ccv_nnc_cmd.c |
| Warning: | line 580, column 27: Potential memory leak |
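A note on how to read the warning (an interpretation of the report, not part of the source): ccv_nnc_cmd_autotune_key_new() allocates key.inputs with ccmalloc at line 387, and ccv_nnc_cmd_autotune_key_free() is only invoked on the cache-hit path at line 574. When kh_put() inserts a new key, ownership of that buffer effectively transfers to g_autotune_executable_cache and is reclaimed later by ccv_nnc_drain_autotune_cache(); the analyzer does not follow that transfer, so it reports the buffer as potentially leaked where the local key goes out of scope near line 580. Whether this is a real leak (for example, on the early return cmd path at line 600 when no suitable backend exists, if the cache is never drained) or a false positive should be verified against the cache lifecycle.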
| 1 | #include "ccv_nnc.h" | |||
| 2 | #include "ccv_nnc_internal.h" | |||
| 3 | #include "3rdparty/khash/khash.h" | |||
| 4 | #include "ccv_nnc_easy.h" | |||
| 5 | #ifdef HAVE_CUDA | |||
| 6 | #include "gpu/ccv_nnc_compat.h" | |||
| 7 | #elif defined(HAVE_MPS) | |||
| 8 | #include "mps/ccv_nnc_mps.h" | |||
| 9 | #endif | |||
| 10 | #include <time.h> | |||
| 11 | #include <sys/time.h> | |||
| 12 | ||||
| 13 | typedef struct { | |||
| 14 | const uint32_t cmd; | |||
| 15 | const char* name; | |||
| 16 | ccv_nnc_cmd_registry_t registry; | |||
| 17 | ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT]; | |||
| 18 | } ccv_nnc_cmd_init_t; | |||
| 19 | ||||
| 20 | typedef struct { | |||
| 21 | const uint32_t backend; | |||
| 22 | const char* name; | |||
| 23 | } ccv_nnc_cmd_backend_init_t; | |||
| 24 | ||||
| 25 | // The generated code configures command and its mapping. | |||
| 26 | #include "cmd/ccv_nnc_cmd.inc" | |||
| 27 | ||||
| 28 | void ccv_nnc_init(void) | |||
| 29 | { | |||
| 30 | _ccv_nnc_cmd_init(); | |||
| 31 | } | |||
| 32 | ||||
| 33 | static uint64_t _ccv_nnc_flags = 0; | |||
| 34 | ||||
| 35 | uint64_t ccv_nnc_flags(void) | |||
| 36 | { | |||
| 37 | return _ccv_nnc_flags; | |||
| 38 | } | |||
| 39 | ||||
| 40 | void ccv_nnc_enable_flag(uint64_t flag) | |||
| 41 | { | |||
| 42 | _ccv_nnc_flags |= flag; | |||
| 43 | } | |||
| 44 | ||||
| 45 | void ccv_nnc_disable_flag(uint64_t flag) | |||
| 46 | { | |||
| 47 | _ccv_nnc_flags &= ~flag; | |||
| 48 | } | |||
| 49 | ||||
| 50 | const char* ccv_nnc_cmd_name(const uint32_t cmd) | |||
| 51 | { | |||
| 52 | switch (cmd) | |||
| 53 | { | |||
| 54 | case CCV_NNC_NOOP: | |||
| 55 | return "CCV_NNC_NOOP"; | |||
| 56 | case CCV_NNC_CUSTOM_FORWARD: | |||
| 57 | return "CCV_NNC_CUSTOM_FORWARD"; | |||
| 58 | case CCV_NNC_CUSTOM_BACKWARD: | |||
| 59 | return "CCV_NNC_CUSTOM_BACKWARD"; | |||
| 60 | case CCV_NNC_GRAPH_FORWARD: | |||
| 61 | return "CCV_NNC_GRAPH_FORWARD"; | |||
| 62 | case CCV_NNC_GRAPH_BACKWARD: | |||
| 63 | return "CCV_NNC_GRAPH_BACKWARD"; | |||
| 64 | } | |||
| 65 | const int idx = _ccv_nnc_cmd_ph(cmd); | |||
| 66 | assert(idx >= 0); | |||
| 67 | assert(idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 68 | return init_map[idx].name; | |||
| 69 | } | |||
| 70 | ||||
| 71 | const char* ccv_nnc_cmd_backend_name(const uint32_t backend) | |||
| 72 | { | |||
| 73 | if (backend == CCV_NNC_NO_BACKEND) | |||
| 74 | return "CCV_NNC_NO_BACKEND"; | |||
| 75 | const int idx = _ccv_nnc_cmd_backend_ph(backend); | |||
| 76 | assert(idx >= 0); | |||
| 77 | assert(idx < CCV_NNC_BACKEND_COUNT); | |||
| 78 | return backend_init_map[idx].name; | |||
| 79 | } | |||
| 80 | ||||
| 81 | const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {}; | |||
| 82 | ||||
| 83 | int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params) | |||
| 84 | { | |||
| 85 | return (memcmp(¶ms, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0); | |||
| 86 | } | |||
| 87 | ||||
| 88 | int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd) | |||
| 89 | { | |||
| 90 | switch (cmd.cmd) | |||
| 91 | { | |||
| 92 | case CCV_NNC_NOOP: | |||
| 93 | return 0; | |||
| 94 | case CCV_NNC_CUSTOM_FORWARD: | |||
| 95 | case CCV_NNC_CUSTOM_BACKWARD: | |||
| 96 | case CCV_NNC_GRAPH_FORWARD: | |||
| 97 | case CCV_NNC_GRAPH_BACKWARD: | |||
| 98 | default: | |||
| 99 | return !(cmd.cmd & 0x1); // If it is even, it is forward | |||
| 100 | } | |||
| 101 | } | |||
| 102 | ||||
| 103 | int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd) | |||
| 104 | { | |||
| 105 | switch (cmd.cmd) | |||
| 106 | { | |||
| 107 | case CCV_NNC_NOOP: | |||
| 108 | return 0; | |||
| 109 | case CCV_NNC_CUSTOM_FORWARD: | |||
| 110 | case CCV_NNC_CUSTOM_BACKWARD: | |||
| 111 | case CCV_NNC_GRAPH_FORWARD: | |||
| 112 | case CCV_NNC_GRAPH_BACKWARD: | |||
| 113 | default: | |||
| 114 | return !!(cmd.cmd & 0x1); // If it is odd, it is backward | |||
| 115 | } | |||
| 116 | } | |||
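The parity checks above rest on NNC's convention that a backward command code is its forward counterpart plus one, so the low bit alone distinguishes the two. A minimal sketch of how that surfaces through this API (an illustrative program, not from the source; it assumes the generated convolution command codes follow the even/odd pairing these helpers rely on and that the library is linked in):

    #include <assert.h>
    #include "ccv_nnc.h"

    int main(void)
    {
        ccv_nnc_init();
        // Even command codes are treated as forward, odd ones as backward.
        const ccv_nnc_cmd_t fwd = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_FORWARD, 0, ccv_nnc_cmd_auto, 0);
        const ccv_nnc_cmd_t bwd = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_BACKWARD, 0, ccv_nnc_cmd_auto, 0);
        assert(ccv_nnc_cmd_is_forward(fwd) && !ccv_nnc_cmd_is_backward(fwd));
        assert(ccv_nnc_cmd_is_backward(bwd) && !ccv_nnc_cmd_is_forward(bwd));
        return 0;
    }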
| 117 | ||||
| 118 | int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend) | |||
| 119 | { | |||
| 120 | // If it is a custom command, a no op, or a graph op, there is no backend to check. | |||
| 121 | if (cmd == CCV_NNC_NOOP || | |||
| 122 | cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD || | |||
| 123 | cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD) | |||
| 124 | return 1; | |||
| 125 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd); | |||
| 126 | const int backend_idx = _ccv_nnc_cmd_backend_ph(backend); | |||
| 127 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 128 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); | |||
| 129 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; | |||
| 130 | // Check if the execution function exists or not. | |||
| 131 | return !!api_registry.exec; | |||
| 132 | } | |||
| 133 | ||||
| 134 | ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags) | |||
| 135 | { | |||
| 136 | ccv_nnc_cmd_t cmd; | |||
| 137 | cmd.info = params; | |||
| 138 | cmd.backend = CCV_NNC_NO_BACKEND; | |||
| 139 | assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa)); | |||
| 140 | cmd.cmd = _cmd; | |||
| 141 | cmd.algorithm = -1; // This is default. | |||
| 142 | cmd.isa = isa; | |||
| 143 | cmd.data = 0; | |||
| 144 | return cmd; | |||
| 145 | } | |||
| 146 | ||||
| 147 | const ccv_nnc_hint_t ccv_nnc_no_hint = {}; | |||
| 148 | ||||
| 149 | int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint) | |||
| 150 | { | |||
| 151 | return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0); | |||
| 152 | } | |||
| 153 | ||||
| 154 | int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b) | |||
| 155 | { | |||
| 156 | int i; | |||
| 157 | assert(a.format == b.format); | |||
| 158 | const int nd = ccv_nnc_tensor_nd(a.dim); | |||
| 159 | const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1); | |||
| 160 | assert(size_nd == 2 || size_nd == 3); // Support 3D convolution. | |||
| 161 | assert(nd == size_nd + 1 || nd == size_nd + 2); | |||
| 162 | int hw; | |||
| 163 | if ((a.format == CCV_TENSOR_FORMAT_CHWN) || | |||
| 164 | (a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 1)) | |||
| 165 | hw = 0; | |||
| 166 | else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 2) || | |||
| 167 | (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 1)) | |||
| 168 | hw = 1; | |||
| 169 | else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 2) | |||
| 170 | hw = 2; | |||
| 171 | else | |||
| 172 | assert(0 && "unknown format"); | |||
| 173 | for (i = 0; i < size_nd; i++) | |||
| 174 | { | |||
| 175 | if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) % hint.stride.dim[i] != 0) | |||
| 176 | return -1; | |||
| 177 | int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1; | |||
| 178 | if (expected != b.dim[i + hw]) | |||
| 179 | return -1; | |||
| 180 | } | |||
| 181 | return 0; | |||
| 182 | } | |||
| 183 | ||||
| 184 | ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b) | |||
| 185 | { | |||
| 186 | int i; | |||
| 187 | if (a.format != b.format) | |||
| 188 | return ccv_nnc_no_hint; | |||
| 189 | assert(a.format == b.format); | |||
| 190 | const int a_nd = ccv_nnc_tensor_nd(a.dim); | |||
| 191 | const int b_nd = ccv_nnc_tensor_nd(b.dim); | |||
| 192 | const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1); | |||
| 193 | assert(size_nd == 2 || size_nd == 3); // Support 3D convolution. | |||
| 194 | // Is not auto hint deducible dimensions. | |||
| 195 | if (a_nd != b_nd || (a_nd != size_nd + 1 && a_nd != size_nd + 2)) | |||
| 196 | return ccv_nnc_no_hint; | |||
| 197 | int hw; | |||
| 198 | if ((a.format == CCV_TENSOR_FORMAT_CHWN) || | |||
| 199 | (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 1)) | |||
| 200 | hw = 0; | |||
| 201 | else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 2) || | |||
| 202 | (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 1)) | |||
| 203 | hw = 1; | |||
| 204 | else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 2) | |||
| 205 | hw = 2; | |||
| 206 | else | |||
| 207 | assert(0 && "unknown format"); | |||
| 208 | ccv_nnc_hint_t hint_auto = {}; | |||
| 209 | // 0-dim is reserved for channels | |||
| 210 | for (i = 0; i < size_nd; i++) | |||
| 211 | { | |||
| 212 | // Cannot have one of the dim is zero, we cannot auto the hint, return no hint. | |||
| 213 | assert(a.dim[i + hw] && b.dim[i + hw]); | |||
| 214 | // This is guessed by having a stride that will approximately match the scale. | |||
| 215 | int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw]; | |||
| 216 | hint_auto.stride.dim[i] = stride; | |||
| 217 | int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i]; | |||
| 218 | hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior. | |||
| 219 | hint_auto.border.end[i] = border - hint_auto.border.begin[i]; | |||
| 220 | } | |||
| 221 | return hint_auto; | |||
| 222 | } | |||
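A worked instance of the guess above (illustrative numbers, not from the source): with a 3x3 convolution (size_nd = 2) mapping a 32x32 spatial extent to 16x16, each spatial dimension gets stride = (32 + 16/2) / 16 = 2 and border = (16 - 1) * 2 - 32 + 3 = 1, which is then split as border.begin = 1 and border.end = 0, matching the stated preference for padding more at the beginning as CUDNN does.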
| 223 | ||||
| 224 | void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) | |||
| 225 | { | |||
| 226 | int i; | |||
| 227 | assert(output_size <= input_size); | |||
| 228 | for (i = 0; i < output_size; i++) | |||
| 229 | outputs[i] = inputs[i]; | |||
| 230 | } | |||
| 231 | ||||
| 232 | void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) | |||
| 233 | { | |||
| 234 | int i; | |||
| 235 | for (i = 0; i < output_size; i++) | |||
| 236 | outputs[i] = inputs[0]; | |||
| 237 | } | |||
| 238 | ||||
| 239 | void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) | |||
| 240 | { | |||
| 241 | int i; | |||
| 242 | assert(output_size < input_size); | |||
| 243 | for (i = 0; i < output_size; i++) | |||
| 244 | outputs[i] = inputs[i + 1]; | |||
| 245 | } | |||
| 246 | ||||
| 247 | void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) | |||
| 248 | { | |||
| 249 | int i; | |||
| 250 | outputs[0] = inputs[0]; | |||
| 251 | assert(output_size < input_size); | |||
| 252 | for (i = 1; i < output_size; i++) | |||
| 253 | outputs[i] = inputs[i + 1]; | |||
| 254 | } | |||
| 255 | ||||
| 256 | void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) | |||
| 257 | { | |||
| 258 | // zero out the parameters | |||
| 259 | const ccv_nnc_tensor_param_t z = {}; | |||
| 260 | int i; | |||
| 261 | for (i = 0; i < output_size; i++) | |||
| 262 | outputs[i] = z; // Reset the outputs. | |||
| 263 | // Cannot handle these situations. | |||
| 264 | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) | |||
| 265 | return; | |||
| 266 | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD) | |||
| 267 | { | |||
| 268 | if (cmd.isa->tensor_auto) | |||
| 269 | cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size); | |||
| 270 | return; | |||
| 271 | } | |||
| 272 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 273 | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; | |||
| 274 | if (registry.tensor_auto) | |||
| 275 | registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size); | |||
| 276 | else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs | |||
| 277 | ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size); | |||
| 278 | else // For backward, the default auto is backward_from_inputs | |||
| 279 | ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size); | |||
| 280 | } | |||
| 281 | ||||
| 282 | int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) | |||
| 283 | { | |||
| 284 | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) | |||
| 285 | return 0; | |||
| 286 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 287 | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; | |||
| 288 | if (registry.allow_inplace) | |||
| 289 | return registry.allow_inplace(cmd.info, input_idx, input_size, output_idx, output_size); | |||
| 290 | return 0; | |||
| 291 | } | |||
| 292 | ||||
| 293 | int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) | |||
| 294 | { | |||
| 295 | if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) | |||
| 296 | return 0; | |||
| 297 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 298 | const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry; | |||
| 299 | if (registry.enforce_inplace) | |||
| 300 | return registry.enforce_inplace(cmd.info, input_idx, input_size, output_idx, output_size); | |||
| 301 | return 0; | |||
| 302 | } | |||
| 303 | ||||
| 304 | // This returns a monotonic timestamp in nanoseconds (not wall-clock time). | |||
| 305 | uint64_t ccv_nnc_cmd_mono_time(void) | |||
| 306 | { | |||
| 307 | struct timespec ts; | |||
| 308 | clock_gettime(CLOCK_MONOTONIC, &ts); | |||
| 309 | return ts.tv_sec * 1000000000ULL + ts.tv_nsec; | |||
| 310 | } | |||
| 311 | ||||
| 312 | uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes) | |||
| 313 | { | |||
| 314 | if (cmd.cmd == CCV_NNC_NOOP || | |||
| 315 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD || | |||
| 316 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD) | |||
| 317 | return cmd.backend; | |||
| 318 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 319 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 320 | assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0); | |||
| 321 | int i; | |||
| 322 | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++) | |||
| 323 | { | |||
| 324 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; | |||
| 325 | // We have the exec kernel, and support all the tensor memory types. | |||
| 326 | if (api_registry.exec && | |||
| 327 | (api_registry.tensor_memory & tensor_memory) == tensor_memory && | |||
| 328 | (api_registry.tensor_formats & tensor_formats) == tensor_formats && | |||
| 329 | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes) | |||
| 330 | return backend_init_map[i].backend; | |||
| 331 | } | |||
| 332 | return cmd.backend; | |||
| 333 | } | |||
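For orientation, a minimal usage sketch of the lookup above (a hedged example, not from the source; it assumes ccv_nnc.h exposes CCV_TENSOR_CPU_MEMORY, CCV_TENSOR_FORMAT_NHWC and CCV_32F as the way to describe plain float NHWC CPU tensors, and that a matching CPU backend may or may not be compiled in; if none matches, the call simply returns the command's CCV_NNC_NO_BACKEND):

    #include <stdio.h>
    #include <stdint.h>
    #include "ccv_nnc.h"

    int main(void)
    {
        ccv_nnc_init();
        // Describe the command, then ask which registered backend can run it on CPU float NHWC tensors.
        const ccv_nnc_cmd_t conv = ccv_nnc_cmd(CCV_NNC_CONVOLUTION_FORWARD, 0, ccv_nnc_cmd_auto, 0);
        const uint32_t backend = ccv_nnc_cmd_find_backend(conv, CCV_TENSOR_CPU_MEMORY, CCV_TENSOR_FORMAT_NHWC, CCV_32F);
        printf("%s\n", ccv_nnc_cmd_backend_name(backend));
        return 0;
    }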
| 334 | ||||
| 335 | #define AUTO_TUNE_TRIAL_SIZE (3) | |||
| 336 | ||||
| 337 | static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) | |||
| 338 | { | |||
| 339 | #ifdef HAVE_CUDA | |||
| 340 | if (!stream_context) | |||
| 341 | { | |||
| 342 | int device_id; | |||
| 343 | if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0) | |||
| 344 | cudevice(device_id); | |||
| 345 | } | |||
| 346 | #endif | |||
| 347 | } | |||
| 348 | ||||
| 349 | typedef struct { | |||
| 350 | int format; | |||
| 351 | int datatype; | |||
| 352 | int nd; | |||
| 353 | off_t dataof; | |||
| 354 | int dim[CCV_NNC_MAX_DIM_ALLOC]; | |||
| 355 | int stride[CCV_NNC_MAX_DIM_ALLOC]; | |||
| 356 | } ccv_nnc_cmd_autotune_tensor_shape_t; | |||
| 357 | ||||
| 358 | typedef struct { | |||
| 359 | uint32_t cmd; | |||
| 360 | ccv_nnc_cmd_param_t params; | |||
| 361 | ccv_nnc_hint_t hint; | |||
| 362 | int flags; | |||
| 363 | int input_size; | |||
| 364 | int output_size; | |||
| 365 | size_t workspace_size; | |||
| 366 | ccv_nnc_cmd_autotune_tensor_shape_t* inputs; | |||
| 367 | ccv_nnc_cmd_autotune_tensor_shape_t* outputs; | |||
| 368 | } ccv_nnc_cmd_autotune_key_t; | |||
| 369 | ||||
| 370 | static CCV_WARN_UNUSED(ccv_nnc_cmd_autotune_key_t) ccv_nnc_cmd_autotune_key_new(const ccv_nnc_cmd_t cmd, const size_t workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) | |||
| 371 | { | |||
| 372 | ccv_nnc_cmd_autotune_key_t key = { | |||
| 373 | .cmd = cmd.cmd, | |||
| 374 | .params = cmd.info, | |||
| 375 | .hint = hint, | |||
| 376 | .workspace_size = workspace_size, | |||
| 377 | .inputs = 0, | |||
| 378 | .input_size = 0, | |||
| 379 | .outputs = 0, | |||
| 380 | .output_size = 0 | |||
| 381 | }; | |||
| 382 | if (input_size == 0 && output_size == 0) | |||
| 383 | return key; | |||
| 384 | assert(input_size >= 0 && output_size >= 0); | |||
| 385 | key.input_size = input_size; | |||
| 386 | key.output_size = output_size; | |||
| 387 | key.inputs = (ccv_nnc_cmd_autotune_tensor_shape_t*)ccmalloc(sizeof(ccv_nnc_cmd_autotune_tensor_shape_t) * (input_size + output_size)); | |||
| 388 | key.outputs = key.inputs + input_size; | |||
| 389 | int i, j; | |||
| 390 | for (i = 0; i < input_size; i++) | |||
| 391 | { | |||
| 392 | memset(key.inputs[i].dim, 0, sizeof(key.inputs[i].dim)); | |||
| 393 | memset(key.inputs[i].stride, 0, sizeof(key.inputs[i].stride)); | |||
| 394 | if (!inputs[i]) | |||
| 395 | { | |||
| 396 | key.inputs[i].format = 0; | |||
| 397 | key.inputs[i].datatype = 0; | |||
| 398 | key.inputs[i].dataof = 0; | |||
| 399 | key.inputs[i].nd = 0; | |||
| 400 | continue; | |||
| 401 | } | |||
| 402 | key.inputs[i].format = inputs[i]->info.format; | |||
| 403 | key.inputs[i].datatype = inputs[i]->info.datatype; | |||
| 404 | key.inputs[i].dataof = inputs[i]->dataof; | |||
| 405 | const int nd = key.inputs[i].nd = ccv_nnc_tensor_nd(inputs[i]->info.dim); | |||
| 406 | for (j = 0; j < nd; j++) | |||
| 407 | key.inputs[i].dim[j] = inputs[i]->info.dim[j]; | |||
| 408 | if (CCV_IS_TENSOR_VIEW(inputs[i])) | |||
| 409 | for (j = 0; j < nd; j++) | |||
| 410 | key.inputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)inputs[i])->stride[j]; | |||
| 411 | } | |||
| 412 | for (i = 0; i < output_size; i++) | |||
| 413 | { | |||
| 414 | memset(key.outputs[i].dim, 0, sizeof(key.outputs[i].dim)); | |||
| 415 | memset(key.outputs[i].stride, 0, sizeof(key.outputs[i].stride)); | |||
| 416 | if (!outputs[i]) | |||
| 417 | { | |||
| 418 | key.outputs[i].format = 0; | |||
| 419 | key.outputs[i].datatype = 0; | |||
| 420 | key.outputs[i].dataof = 0; | |||
| 421 | key.outputs[i].nd = 0; | |||
| 422 | continue; | |||
| 423 | } | |||
| 424 | key.outputs[i].format = outputs[i]->info.format; | |||
| 425 | key.outputs[i].datatype = outputs[i]->info.datatype; | |||
| 426 | key.outputs[i].dataof = outputs[i]->dataof; | |||
| 427 | const int nd = key.outputs[i].nd = ccv_nnc_tensor_nd(outputs[i]->info.dim); | |||
| 428 | for (j = 0; j < nd; j++) | |||
| 429 | key.outputs[i].dim[j] = outputs[i]->info.dim[j]; | |||
| 430 | if (CCV_IS_TENSOR_VIEW(outputs[i])) | |||
| 431 | for (j = 0; j < nd; j++) | |||
| 432 | key.outputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)outputs[i])->stride[j]; | |||
| 433 | } | |||
| 434 | return key; | |||
| 435 | } | |||
| 436 | ||||
| 437 | // autotune cache. | |||
| 438 | static inline uint32_t twang_32from64(uint64_t key) | |||
| 439 | { | |||
| 440 | key = (~key) + (key << 18); | |||
| 441 | key = key ^ (key >> 31); | |||
| 442 | key = key * 21; | |||
| 443 | key = key ^ (key >> 11); | |||
| 444 | key = key + (key << 6); | |||
| 445 | key = key ^ (key >> 22); | |||
| 446 | return (uint32_t)(key); | |||
| 447 | } | |||
| 448 | ||||
| 449 | static inline khint32_t _kh_autotune_key_executable_hash_func(const ccv_nnc_cmd_autotune_key_t key) | |||
| 450 | { | |||
| 451 | uint32_t h = key.cmd; | |||
| 452 | int i, j; | |||
| 453 | uint32_t* data = (uint32_t*)&key.params; | |||
| 454 | for (i = 0; i < sizeof(key.params) / sizeof(uint32_t); i++) | |||
| 455 | h = twang_32from64(((uint64_t)h << 32) | data[i]); | |||
| 456 | data = (uint32_t*)&key.hint; | |||
| 457 | for (i = 0; i < sizeof(key.hint) / sizeof(uint32_t); i++) | |||
| 458 | h = twang_32from64(((uint64_t)h << 32) | data[i]); | |||
| 459 | h = twang_32from64(((uint64_t)h << 32) | key.workspace_size); | |||
| 460 | h = twang_32from64(((uint64_t)h << 32) | key.input_size); | |||
| 461 | h = twang_32from64(((uint64_t)h << 32) | key.output_size); | |||
| 462 | for (i = 0; i < key.input_size; i++) | |||
| 463 | { | |||
| 464 | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].format); | |||
| 465 | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].datatype); | |||
| 466 | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dataof); | |||
| 467 | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].nd); | |||
| 468 | for (j = 0; j < key.inputs[i].nd; j++) | |||
| 469 | { | |||
| 470 | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dim[j]); | |||
| 471 | h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].stride[j]); | |||
| 472 | } | |||
| 473 | } | |||
| 474 | for (i = 0; i < key.output_size; i++) | |||
| 475 | { | |||
| 476 | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].format); | |||
| 477 | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].datatype); | |||
| 478 | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dataof); | |||
| 479 | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].nd); | |||
| 480 | for (j = 0; j < key.outputs[i].nd; j++) | |||
| 481 | { | |||
| 482 | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dim[j]); | |||
| 483 | h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].stride[j]); | |||
| 484 | } | |||
| 485 | } | |||
| 486 | return (khint32_t)h; | |||
| 487 | } | |||
| 488 | ||||
| 489 | static inline int _kh_autotune_key_executable_hash_equal(const ccv_nnc_cmd_autotune_key_t a, const ccv_nnc_cmd_autotune_key_t b) | |||
| 490 | { | |||
| 491 | if (a.cmd != b.cmd || a.flags != b.flags || a.workspace_size != b.workspace_size || a.input_size != b.input_size || a.output_size != b.output_size) | |||
| 492 | return 0; | |||
| 493 | if (memcmp(&a.params, &b.params, sizeof(a.params)) != 0) | |||
| 494 | return 0; | |||
| 495 | if (memcmp(&a.hint, &b.hint, sizeof(a.hint)) != 0) | |||
| 496 | return 0; | |||
| 497 | int i, j; | |||
| 498 | for (i = 0; i < a.input_size; i++) | |||
| 499 | { | |||
| 500 | if (a.inputs[i].format != b.inputs[i].format || a.inputs[i].datatype != b.inputs[i].datatype || a.inputs[i].nd != b.inputs[i].nd || a.inputs[i].dataof != b.inputs[i].dataof) | |||
| 501 | return 0; | |||
| 502 | for (j = 0; j < a.inputs[i].nd; j++) | |||
| 503 | if (a.inputs[i].dim[j] != b.inputs[i].dim[j] || a.inputs[i].stride[j] != b.inputs[i].stride[j]) | |||
| 504 | return 0; | |||
| 505 | } | |||
| 506 | for (i = 0; i < a.output_size; i++) | |||
| 507 | { | |||
| 508 | if (a.outputs[i].format != b.outputs[i].format || a.outputs[i].datatype != b.outputs[i].datatype || a.outputs[i].nd != b.outputs[i].nd || a.outputs[i].dataof != b.outputs[i].dataof) | |||
| 509 | return 0; | |||
| 510 | for (j = 0; j < a.outputs[i].nd; j++) | |||
| 511 | if (a.outputs[i].dim[j] != b.outputs[i].dim[j] || a.outputs[i].stride[j] != b.outputs[i].stride[j]) | |||
| 512 | return 0; | |||
| 513 | } | |||
| 514 | return 1; | |||
| 515 | } | |||
| 516 | ||||
| 517 | typedef struct { | |||
| 518 | int backend; | |||
| 519 | int algorithm; | |||
| 520 | } ccv_nnc_cmd_autotune_val_t; | |||
| 521 | ||||
| 522 | KHASH_INIT(autotune_executable_cache, ccv_nnc_cmd_autotune_key_t, ccv_nnc_cmd_autotune_val_t, 1, _kh_autotune_key_executable_hash_func, _kh_autotune_key_executable_hash_equal) | |||
| 523 | ||||
| 524 | static khash_t(autotune_executable_cache)* g_autotune_executable_cache = 0; | |||
| 525 | ||||
| 526 | static inline void ccv_nnc_cmd_autotune_key_free(ccv_nnc_cmd_autotune_key_t key) | |||
| 527 | { | |||
| 528 | if (key.inputs) | |||
| 529 | ccfree(key.inputs); | |||
| 530 | } | |||
| 531 | ||||
| 532 | void ccv_nnc_drain_autotune_cache(void) | |||
| 533 | { | |||
| 534 | if (!g_autotune_executable_cache) | |||
| 535 | return; | |||
| 536 | khiter_t k; | |||
| 537 | for (k = kh_begin(g_autotune_executable_cache); k < kh_end(g_autotune_executable_cache); k++) | |||
| 538 | { | |||
| 539 | if (!kh_exist(g_autotune_executable_cache, k)) | |||
| 540 | continue; | |||
| 541 | ccv_nnc_cmd_autotune_key_free(kh_key(g_autotune_executable_cache, k)); | |||
| 542 | kh_del(autotune_executable_cache, g_autotune_executable_cache, k); | |||
| 543 | } | |||
| 544 | } | |||
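Usage note (not from the source): each cached key owns the shape buffer allocated in ccv_nnc_cmd_autotune_key_new(), so a long-running process that autotunes many distinct tensor shapes can call ccv_nnc_drain_autotune_cache() at a convenient point, for example at teardown, to release those buffers; entries are simply re-added the next time ccv_nnc_cmd_autotune() sees a new shape.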
| 545 | ||||
| 546 | ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) | |||
| 547 | { | |||
| 548 | // This is a custom cmd kernel, no need to autotune. | |||
| 549 | if (cmd.cmd == CCV_NNC_NOOP || | |||
| 550 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD || | |||
| 551 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD) | |||
| 552 | return cmd; | |||
| 553 | int i, j, k; | |||
| 554 | // Go through all the backends that supports the same type of memory input / output tensors support. | |||
| 555 | int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0; | |||
| 556 | for (i = 0; i < input_size; i++) | |||
| 557 | if (inputs[i]) | |||
| 558 | tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype); | |||
| 559 | for (i = 0; i < output_size; i++) | |||
| 560 | if (outputs[i]) | |||
| 561 | tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype); | |||
| 562 | // In this case, we cannot determine the type of the tensor, skip auto-tune. | |||
| 563 | if (!tensor_memory) | |||
| 564 | return cmd; | |||
| 565 | // Otherwise, we are good to go. | |||
| 566 | ccv_nnc_cmd_t tuned_cmd = cmd; | |||
| 567 | if (!g_autotune_executable_cache) | |||
| 568 | g_autotune_executable_cache = kh_init(autotune_executable_cache); | |||
| 569 | int ret = 0; | |||
| 570 | ccv_nnc_cmd_autotune_key_t key = ccv_nnc_cmd_autotune_key_new(cmd, max_workspace_size, hint, flags, inputs, input_size, outputs, output_size); | |||
| 571 | khiter_t kiter = kh_put(autotune_executable_cache, g_autotune_executable_cache, key, &ret); | |||
| 572 | if (ret == 0) | |||
| 573 | { | |||
| 574 | ccv_nnc_cmd_autotune_key_free(key); | |||
| 575 | const ccv_nnc_cmd_autotune_val_t val = kh_val(g_autotune_executable_cache, kiter); | |||
| 576 | tuned_cmd.backend = val.backend; | |||
| 577 | tuned_cmd.algorithm = val.algorithm; | |||
| 578 | return tuned_cmd; | |||
| 579 | } | |||
| 580 | int64_t best_measured = -1; | |||
| 581 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 582 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 583 | int flag = 0, autotune_available_1 = 0; // This is only applicable if we have only one backend. | |||
| 584 | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++) | |||
| 585 | { | |||
| 586 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; | |||
| 587 | // We have the exec kernel, and support all the tensor memory types. | |||
| 588 | if (api_registry.exec && | |||
| 589 | (api_registry.tensor_memory & tensor_memory) == tensor_memory && | |||
| 590 | (api_registry.tensor_formats & tensor_formats) == tensor_formats && | |||
| 591 | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes) | |||
| 592 | { | |||
| 593 | if (api_registry.autotune) | |||
| 594 | autotune_available_1 = 1; | |||
| 595 | if ((++flag) >= 2) // If we have more than 2 suitable backend, we can do this now. | |||
| 596 | break; | |||
| 597 | } | |||
| 598 | } | |||
| 599 | if (flag == 0) | |||
| 600 | return cmd; | |||
| 601 | _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context); | |||
| 602 | // Allocate inputs / outputs and fill them in. | |||
| 603 | ccv_nnc_tensor_t** copy_inputs; | |||
| 604 | ccv_nnc_tensor_t** copy_outputs; | |||
| 605 | ccv_nnc_tensor_t** allocated_inputs; | |||
| 606 | ccv_nnc_tensor_t** allocated_outputs; | |||
| 607 | ccv_nnc_tensor_view_t** allocated_input_views; | |||
| 608 | ccv_nnc_tensor_view_t** allocated_output_views; | |||
| 609 | if (flag > 1 || autotune_available_1) | |||
| 610 | { | |||
| 611 | copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*)); | |||
| 612 | copy_outputs = copy_inputs + input_size; | |||
| 613 | allocated_inputs = copy_outputs + output_size; | |||
| 614 | allocated_outputs = allocated_inputs + input_size; | |||
| 615 | allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size); | |||
| 616 | allocated_output_views = allocated_input_views + input_size; | |||
| 617 | int stride[CCV_NNC_MAX_DIM_ALLOC]; | |||
| 618 | for (i = 0; i < output_size; i++) | |||
| 619 | if (outputs[i]) | |||
| 620 | { | |||
| 621 | for (j = 0; j < input_size; j++) | |||
| 622 | if (inputs[j]) | |||
| 623 | { | |||
| 624 | if (outputs[i] == inputs[j]) | |||
| 625 | { | |||
| 626 | if (!copy_inputs[j]) | |||
| 627 | { | |||
| 628 | allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0); | |||
| 629 | if (CCV_IS_TENSOR_VIEW(inputs[j])) | |||
| 630 | { | |||
| 631 | ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride); | |||
| 632 | copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride)); | |||
| 633 | } else | |||
| 634 | copy_inputs[j] = allocated_inputs[j]; | |||
| 635 | } | |||
| 636 | copy_outputs[i] = copy_inputs[j]; | |||
| 637 | break; | |||
| 638 | } else if (outputs[i]->data.u8 == inputs[j]->data.u8 && | |||
| 639 | ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) { | |||
| 640 | if (!copy_inputs[j]) | |||
| 641 | { | |||
| 642 | allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0); | |||
| 643 | if (CCV_IS_TENSOR_VIEW(inputs[j])) | |||
| 644 | { | |||
| 645 | ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride); | |||
| 646 | copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride)); | |||
| 647 | } else | |||
| 648 | copy_inputs[j] = allocated_inputs[j]; | |||
| 649 | } | |||
| 650 | allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0); | |||
| 651 | if (CCV_IS_TENSOR_VIEW(outputs[i])) | |||
| 652 | { | |||
| 653 | ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride); | |||
| 654 | copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride)); | |||
| 655 | } else | |||
| 656 | copy_outputs[i] = allocated_outputs[i]; | |||
| 657 | break; | |||
| 658 | } | |||
| 659 | } | |||
| 660 | if (!copy_outputs[i]) | |||
| 661 | { | |||
| 662 | allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0); | |||
| 664 | if (CCV_IS_TENSOR_VIEW(outputs[i])) | |||
| 664 | { | |||
| 665 | ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride); | |||
| 666 | copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride)); | |||
| 667 | } else | |||
| 668 | copy_outputs[i] = allocated_outputs[i]; | |||
| 669 | } | |||
| 670 | } | |||
| 671 | for (i = 0; i < input_size; i++) | |||
| 672 | if (inputs[i] && !copy_inputs[i]) | |||
| 673 | copy_inputs[i] = inputs[i]; | |||
| 674 | } | |||
| 675 | if (flag == 1) | |||
| 676 | { | |||
| 677 | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++) | |||
| 678 | { | |||
| 679 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; | |||
| 680 | // We have the exec kernel, and support all the tensor memory types. | |||
| 681 | if (api_registry.exec && | |||
| 682 | (api_registry.tensor_memory & tensor_memory) == tensor_memory && | |||
| 683 | (api_registry.tensor_formats & tensor_formats) == tensor_formats && | |||
| 684 | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes) | |||
| 685 | { | |||
| 686 | tuned_cmd.backend = backend_init_map[i].backend; | |||
| 687 | // If a given API exist an autotune function, use that to pick the top algorithm. | |||
| 688 | if (api_registry.autotune) | |||
| 689 | { | |||
| 690 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); | |||
| 691 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); | |||
| 692 | tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); | |||
| 693 | // Drain the context, autotune can use excessive amount of memory. Need to drain it now. | |||
| 694 | ccv_nnc_stream_context_drain(stream_context); | |||
| 695 | } | |||
| 696 | break; | |||
| 697 | } | |||
| 698 | } | |||
| 699 | if (autotune_available_1) | |||
| 700 | { | |||
| 701 | for (i = 0; i < input_size; i++) | |||
| 702 | { | |||
| 703 | if (allocated_inputs[i]) | |||
| 704 | ccv_nnc_tensor_free(allocated_inputs[i]); | |||
| 705 | if (allocated_input_views[i]) | |||
| 706 | ccv_nnc_tensor_view_free(allocated_input_views[i]); | |||
| 707 | } | |||
| 708 | for (i = 0; i < output_size; i++) | |||
| 709 | { | |||
| 710 | if (allocated_outputs[i]) | |||
| 711 | ccv_nnc_tensor_free(allocated_outputs[i]); | |||
| 712 | if (allocated_output_views[i]) | |||
| 713 | ccv_nnc_tensor_view_free(allocated_output_views[i]); | |||
| 714 | } | |||
| 715 | ccfree(copy_inputs); | |||
| 716 | } | |||
| 717 | const ccv_nnc_cmd_autotune_val_t val = { | |||
| 718 | .backend = tuned_cmd.backend, | |||
| 719 | .algorithm = tuned_cmd.algorithm | |||
| 720 | }; | |||
| 721 | kh_val(g_autotune_executable_cache, kiter) = val; | |||
| 722 | return tuned_cmd; | |||
| 723 | } | |||
| 724 | // We need to have trial loop through all the data. | |||
| 725 | for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++) | |||
| 726 | { | |||
| 727 | for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++) | |||
| 728 | { | |||
| 729 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i]; | |||
| 730 | // We have the exec kernel, and support all the tensor memory types. | |||
| 731 | if (api_registry.exec && | |||
| 732 | (api_registry.tensor_memory & tensor_memory) == tensor_memory && | |||
| 733 | (api_registry.tensor_formats & tensor_formats) == tensor_formats && | |||
| 734 | (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes) | |||
| 735 | { | |||
| 736 | ccv_nnc_cmd_t candid_cmd = cmd; | |||
| 737 | candid_cmd.backend = backend_init_map[i].backend; | |||
| 738 | // If a given API exist an autotune function, use that to pick the top algorithm. | |||
| 739 | if (api_registry.autotune) | |||
| 740 | { | |||
| 741 | // Assuming k == 0 is sufficient, and we can skip. | |||
| 742 | if (k > 0) | |||
| 743 | continue; | |||
| 744 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); | |||
| 745 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); | |||
| 746 | candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); | |||
| 747 | // Drain the context, autotune can use excessive amount of memory. Need to drain it now. | |||
| 748 | ccv_nnc_stream_context_drain(stream_context); | |||
| 749 | uint64_t elapsed = ccv_nnc_cmd_mono_time(); | |||
| 750 | // Ready to run. | |||
| 751 | int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); | |||
| 752 | ccv_nnc_stream_context_wait(stream_context); | |||
| 753 | elapsed = ccv_nnc_cmd_mono_time() - elapsed; | |||
| 754 | if (status == CCV_NNC_EXEC_SUCCESS && | |||
| 755 | (best_measured == -1 || elapsed < best_measured)) | |||
| 756 | { | |||
| 757 | best_measured = elapsed; | |||
| 758 | tuned_cmd = candid_cmd; | |||
| 759 | } | |||
| 760 | } else { | |||
| 761 | // Otherwise loop over the existing algorithms and pick the top one. | |||
| 762 | for (j = 0; j < api_registry.algorithms; j++) | |||
| 763 | { | |||
| 764 | candid_cmd.algorithm = j; | |||
| 765 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context); | |||
| 766 | _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context); | |||
| 767 | uint64_t elapsed = ccv_nnc_cmd_mono_time(); | |||
| 768 | // Ready to run. | |||
| 769 | int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context); | |||
| 770 | elapsed = ccv_nnc_cmd_mono_time() - elapsed; | |||
| 771 | if (status == CCV_NNC_EXEC_SUCCESS && | |||
| 772 | (best_measured == -1 || elapsed < best_measured)) | |||
| 773 | { | |||
| 774 | best_measured = elapsed; | |||
| 775 | tuned_cmd = candid_cmd; | |||
| 776 | } | |||
| 777 | } | |||
| 778 | } | |||
| 779 | } | |||
| 780 | } | |||
| 781 | } | |||
| 782 | for (i = 0; i < input_size; i++) | |||
| 783 | { | |||
| 784 | if (allocated_inputs[i]) | |||
| 785 | ccv_nnc_tensor_free(allocated_inputs[i]); | |||
| 786 | if (allocated_input_views[i]) | |||
| 787 | ccv_nnc_tensor_view_free(allocated_input_views[i]); | |||
| 788 | } | |||
| 789 | for (i = 0; i < output_size; i++) | |||
| 790 | { | |||
| 791 | if (allocated_outputs[i]) | |||
| 792 | ccv_nnc_tensor_free(allocated_outputs[i]); | |||
| 793 | if (allocated_output_views[i]) | |||
| 794 | ccv_nnc_tensor_view_free(allocated_output_views[i]); | |||
| 795 | } | |||
| 796 | ccfree(copy_inputs); | |||
| 797 | const ccv_nnc_cmd_autotune_val_t val = { | |||
| 798 | .backend = tuned_cmd.backend, | |||
| 799 | .algorithm = tuned_cmd.algorithm | |||
| 800 | }; | |||
| 801 | kh_val(g_autotune_executable_cache, kiter) = val; | |||
| 802 | return tuned_cmd; | |||
| 803 | } | |||
| 804 | ||||
| 805 | int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) | |||
| 806 | { | |||
| 807 | // If it is no-op, return true, it can deal with any number of parameters. | |||
| 808 | if (cmd.cmd == CCV_NNC_NOOP) | |||
| 809 | return 1; | |||
| 810 | // If it is a custom command, I cannot check it at all, return false. | |||
| 811 | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD) | |||
| 812 | return 0; | |||
| 813 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 814 | const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry; | |||
| 815 | if (cmd_registry.bitmask) | |||
| 816 | return cmd_registry.bitmask(cmd.info, input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size); | |||
| 817 | // If there is not checking, none can pass. | |||
| 818 | return 0; | |||
| 819 | } | |||
| 820 | ||||
| 821 | int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size) | |||
| 822 | { | |||
| 823 | int i, j; | |||
| 824 | int device_id_size = 0; | |||
| 825 | if (max_device_id_size <= device_id_size) | |||
| 826 | return device_id_size; | |||
| 827 | // The device id of the exec is determined by its outputs. | |||
| 828 | for (i = 0; i < output_size; i++) | |||
| 829 | if (outputs[i] && | |||
| 830 | CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type && | |||
| 831 | CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY) | |||
| 832 | { | |||
| 833 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type); | |||
| 834 | int flag = 0; | |||
| 835 | for (j = 0; !flag && j < device_id_size; j++) | |||
| 836 | flag = (device_ids[j] == device_id); | |||
| 837 | if (flag) | |||
| 838 | continue; | |||
| 839 | device_ids[device_id_size++] = device_id; | |||
| 840 | if (device_id_size >= max_device_id_size) | |||
| 841 | return device_id_size; | |||
| 842 | } | |||
| 843 | if (device_id_size == 0) | |||
| 844 | { | |||
| 845 | int device_id = -1; | |||
| 846 | for (i = 0; i < input_size; i++) | |||
| 847 | if (inputs[i] && | |||
| 848 | CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type && | |||
| 849 | CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY && | |||
| 850 | (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id)) | |||
| 851 | device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type); | |||
| 852 | if (device_id >= 0) | |||
| 853 | { | |||
| 854 | device_ids[0] = device_id; | |||
| 855 | return 1; | |||
| 856 | } | |||
| 857 | } | |||
| 858 | return device_id_size; | |||
| 859 | } | |||
| 860 | ||||
| 861 | void* ccv_nnc_cmd_aux(const ccv_nnc_cmd_t cmd) | |||
| 862 | { | |||
| 863 | if (cmd.cmd == CCV_NNC_NOOP || | |||
| 864 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || | |||
| 865 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) | |||
| 866 | return 0; | |||
| 867 | assert(cmd.backend != CCV_NNC_NO_BACKEND); | |||
| 868 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 869 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 870 | const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend); | |||
| 871 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); | |||
| 872 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; | |||
| 873 | return api_registry.aux; | |||
| 874 | } | |||
| 875 | ||||
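As a quick illustration of how ccv_nnc_cmd_aux() is typically reached: the assert above rejects CCV_NNC_NO_BACKEND, so a concrete backend is usually resolved with ccv_nnc_cmd_find_backend() first. This is only a sketch; the EWSUM command and the CPU memory/format/datatype constants stand in for whatever the caller actually uses.

```c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Hypothetical sketch: pin a backend on the command, then fetch that backend's
// aux pointer (NULL if the backend registered no auxiliary data).
static void* example_cmd_aux(void)
{
	ccv_nnc_cmd_t cmd = CMD_EWSUM_FORWARD();
	cmd.backend = ccv_nnc_cmd_find_backend(cmd, CCV_TENSOR_CPU_MEMORY, CCV_TENSOR_FORMAT_NHWC, CCV_32F);
	return ccv_nnc_cmd_aux(cmd);
}
```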
| 876 | int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) | |||
| 877 | { | |||
| 878 | // If it is no-op, return as if succeed already. | |||
| 879 | if (cmd.cmd == CCV_NNC_NOOP) | |||
| 880 | return 0; | |||
| 881 | _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context); | |||
| 882 | // If it is a custom command, just apply it directly. | |||
| 883 | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD) | |||
| 884 | { | |||
| 885 | int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); | |||
| 886 | if (!stream_context) | |||
| 887 | ccv_nnc_stream_context_drain(stream_context); | |||
| 888 | return ret; | |||
| 889 | } | |||
| 890 | assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD); | |||
| 891 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 892 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 893 | int i; | |||
| 894 | uint32_t backend = cmd.backend; | |||
| 895 | if (backend == CCV_NNC_NO_BACKEND) | |||
| 896 | { | |||
| 897 | // Find a suitable backend. | |||
| 898 | int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0; | |||
| 899 | for (i = 0; i < input_size; i++) | |||
| 900 | if (inputs[i]) | |||
| 901 | tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype); | |||
| 902 | for (i = 0; i < output_size; i++) | |||
| 903 | if (outputs[i]) | |||
| 904 | tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype); | |||
| 905 | backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes); | |||
| 906 | } | |||
| 907 | assert(backend != CCV_NNC_NO_BACKEND); | |||
| 908 | const int backend_idx = _ccv_nnc_cmd_backend_ph(backend); | |||
| 909 | assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT); | |||
| 910 | const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx]; | |||
| 911 | if (!api_registry.exec) | |||
| 912 | return CCV_NNC_EXEC_NO_KERNEL; | |||
| 913 | // Everything checks out; call the underlying implementation. | |||
| 914 | int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context); | |||
| 915 | if (!stream_context) | |||
| 916 | ccv_nnc_stream_context_drain(stream_context); | |||
| 917 | return ret; | |||
| 918 | } | |||
| 919 | ||||
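For context, a minimal, hedged example of driving ccv_nnc_cmd_exec() directly on CPU tensors. The command is left with CCV_NNC_NO_BACKEND so the backend-selection path above is exercised; CMD_EWSUM_FORWARD(), the tensor shapes, and the assumption that ccv_nnc_init() has already run are illustrative, and error handling is omitted.

```c
#include <assert.h>
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Illustrative sketch: c = a + b via the element-wise sum command, run
// synchronously (no stream context), with the backend chosen automatically.
static void example_cmd_exec(void)
{
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
	a->data.f32[0] = 1, a->data.f32[1] = 2;
	b->data.f32[0] = 3, b->data.f32[1] = 4;
	// No backend is pinned, so ccv_nnc_cmd_exec() finds one supporting CPU / NHWC / 32F tensors.
	const int ret = ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	assert(ret == CCV_NNC_EXEC_SUCCESS);
	// c->data.f32 now holds {4, 6}.
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
}
```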
| 920 | int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags) | |||
| 921 | { | |||
| 922 | // No additional attr for noop. | |||
| 923 | if (cmd.cmd == CCV_NNC_NOOP || | |||
| 924 | // If it is a custom command, just apply it directly. | |||
| 925 | cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || | |||
| 926 | // If it is sub-graph, there is no additional attr as well. | |||
| 927 | cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) | |||
| 928 | return 0; | |||
| 929 | const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd); | |||
| 930 | assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0])); | |||
| 931 | const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry; | |||
| 932 | return !!(cmd_registry.flags & flags); | |||
| 933 | } | |||
| 934 | ||||
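A small, hedged usage note for ccv_nnc_cmd_attr(): callers probe registry-level attribute bits of a command. CCV_NNC_CMD_ATTR_PASSTHROUGH below is assumed to be one such bit (marking commands whose outputs merely forward their inputs), and the EWSUM command is again only a placeholder.

```c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Hedged sketch: query a registry attribute bit of a command.
static int example_cmd_attr(void)
{
	const ccv_nnc_cmd_t cmd = CMD_EWSUM_FORWARD();
	// Non-zero means the registry flags the command as pass-through, which
	// graph-level passes can exploit to elide or rewire the op.
	return ccv_nnc_cmd_attr(cmd, CCV_NNC_CMD_ATTR_PASSTHROUGH);
}
```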
| 935 | void ccv_nnc_set_profiler(int state) | |||
| 936 | { | |||
| 937 | #ifdef HAVE_CUDA | |||
| 938 | cusetprofiler(state); | |||
| 939 | #endif | |||
| 940 | } | |||
| 941 | ||||
| 942 | int ccv_nnc_queue_watermark(void) | |||
| 943 | { | |||
| 944 | #ifdef HAVE_MPS | |||
| 945 | return ccv_nnc_mps_queue_watermark(); | |||
| 946 | #else | |||
| 947 | return 0; | |||
| 948 | #endif | |||
| 949 | } | |||
| 950 | ||||
| 951 | void ccv_nnc_set_queue_watermark(int watermark) | |||
| 952 | { | |||
| 953 | #ifdef HAVE_MPS | |||
| 954 | // To stay memory efficient, bound the number of in-flight command buffers. | |||
| 955 | ccv_nnc_mps_set_queue_watermark(watermark); | |||
| 956 | #endif | |||
| 957 | } | |||
| 958 | ||||
| 959 | void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size) | |||
| 960 | { | |||
| 961 | if (type != CCV_STREAM_CONTEXT_GPU) | |||
| 962 | return; | |||
| 963 | #ifdef HAVE_CUDA | |||
| 964 | cusetdevicemap(device_map, size); | |||
| 965 | #endif | |||
| 966 | } | |||
| 967 | ||||
| 968 | void ccv_nnc_set_binary_artifacts(const char** const paths_to_read, const int paths_to_read_size, const char* const path_to_write) | |||
| 969 | { | |||
| 970 | #ifdef HAVE_MPS | |||
| 971 | ccv_nnc_mps_set_binary_artifacts(paths_to_read, paths_to_read_size, path_to_write); | |||
| 972 | #endif | |||
| 973 | } |