/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/ew/ccv_nnc_ew.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "nnc/ccv_nnc.h" |
3 | | #include "nnc/ccv_nnc_internal.h" |
4 | | |
5 | | static int _ccv_nnc_arbitary_inplace(const ccv_nnc_cmd_param_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) |
6 | 11.6k | { |
7 | 11.6k | return 1; |
8 | 11.6k | } |
9 | | |
10 | | static int _ccv_nnc_ewsum_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
11 | 126 | { |
12 | 126 | if (output_size == 1 && output_bitmasks[0] == 1) |
13 | 126 | { |
14 | 126 | int i, j, flag = 0; |
15 | 126 | int input_bitcount = 0; |
16 | 252 | for (i = 0; i < input_bitmask_size; i++126 ) |
17 | 126 | { |
18 | 382 | for (j = 0; j < 64; j++256 ) |
19 | 382 | if (input_bitmasks[i] & (uint64_t)1 << j) |
20 | 256 | { |
21 | 256 | if (flag) |
22 | 0 | return 0; |
23 | 256 | } else |
24 | 126 | break; |
25 | 126 | input_bitcount += j; |
26 | | // Trailing zero even if it is not the end of input_bitmask_size, mark flag, |
27 | | // if we encounter additional 1, return invalid. |
28 | 126 | if (j < 64) |
29 | 126 | flag = 1; |
30 | | // Always like 1111100000, no 1110010101 |
31 | 7.93k | for (; j < 64; j++7.80k ) |
32 | 7.80k | if (input_bitmasks[i] & (uint64_t)1 << j) |
33 | 0 | return 0; |
34 | 126 | } |
35 | 126 | return input_size == input_bitcount; |
36 | 126 | } |
37 | 0 | return 0; |
38 | 126 | } |
39 | | |
40 | | static int _ccv_nnc_ewsum_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
41 | 351 | { |
42 | 351 | if (input_size >= 1 && (input_bitmasks[0] & 1u) == 1u) |
43 | 271 | { |
44 | 271 | int i, j, flag = 0; |
45 | 271 | int output_bitcount = 0; |
46 | 530 | for (i = 0; i < output_bitmask_size; i++259 ) |
47 | 271 | { |
48 | 760 | for (j = 0; j < 64; j++489 ) |
49 | 760 | if (output_bitmasks[i] & (uint64_t)1 << j) |
50 | 489 | { |
51 | 489 | if (flag) |
52 | 0 | return 0; |
53 | 489 | } else |
54 | 271 | break; |
55 | 271 | output_bitcount += j; |
56 | | // Trailing zero even if it is not the end of input_bitmask_size, mark flag, |
57 | | // if we encounter additional 1, return invalid. |
58 | 271 | if (j < 64) |
59 | 271 | flag = 1; |
60 | | // Always like 1111100000, no 1110010101 |
61 | 16.3k | for (; j < 64; j++16.0k ) |
62 | 16.1k | if (output_bitmasks[i] & (uint64_t)1 << j) |
63 | 12 | return 0; |
64 | 271 | } |
65 | 259 | return output_size == output_bitcount; |
66 | 271 | } |
67 | 80 | return 0; |
68 | 351 | } |
69 | | |
70 | | REGISTER_COMMAND(CCV_NNC_EWSUM_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
71 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_cudnn.cu, mps/ccv_nnc_ew_mps.m) |
72 | 1 | { |
73 | 1 | registry->bitmask = _ccv_nnc_ewsum_forw_bitmask; |
74 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
75 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
76 | 1 | } |
77 | | |
78 | | REGISTER_COMMAND(CCV_NNC_EWSUM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
79 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_cudnn.cu, mps/ccv_nnc_ew_mps.m) |
80 | 1 | { |
81 | 1 | registry->flags = CCV_NNC_CMD_ATTR_PASSTHROUGH | CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
82 | 1 | registry->bitmask = _ccv_nnc_ewsum_back_bitmask; |
83 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
84 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
85 | 1 | } |
86 | | |
87 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWSUM_FORWARD) |
88 | | #define CMD_EWSUM_FORWARD() ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
89 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWSUM_BACKWARD) |
90 | | #define CMD_EWSUM_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWSUM_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
91 | | |
92 | | static int _ccv_nnc_ewprod_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
93 | 59 | { |
94 | 59 | if (output_size == 1 && output_bitmasks[0] == 1) |
95 | 59 | { |
96 | 59 | int i, j, flag = 0; |
97 | 59 | int input_bitcount = 0; |
98 | 118 | for (i = 0; i < input_bitmask_size; i++59 ) |
99 | 59 | { |
100 | 177 | for (j = 0; j < 64; j++118 ) |
101 | 177 | if (input_bitmasks[i] & (uint64_t)1 << j) |
102 | 118 | { |
103 | 118 | if (flag) |
104 | 0 | return 0; |
105 | 118 | } else |
106 | 59 | break; |
107 | 59 | input_bitcount += j; |
108 | | // Trailing zero even if it is not the end of input_bitmask_size, mark flag, |
109 | | // if we encounter additional 1, return invalid. |
110 | 59 | if (j < 64) |
111 | 59 | flag = 1; |
112 | | // Always like 1111100000, no 1110010101 |
113 | 3.71k | for (; j < 64; j++3.65k ) |
114 | 3.65k | if (input_bitmasks[i] & (uint64_t)1 << j) |
115 | 0 | return 0; |
116 | 59 | } |
117 | 59 | return input_size == input_bitcount; |
118 | 59 | } |
119 | 0 | return 0; |
120 | 59 | } |
121 | | |
122 | | static int _ccv_nnc_ewprod_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
123 | 11.3k | { |
124 | 11.3k | int i, j; |
125 | 11.3k | int input_flag = 0; |
126 | 11.3k | int input_bitcount = 0; |
127 | 15.8k | for (i = 0; i < input_bitmask_size; i++4.54k ) |
128 | 11.3k | { |
129 | 34.0k | for (j = 0; j < 64; j++22.6k ) |
130 | 34.0k | if (input_bitmasks[i] & (uint64_t)1 << j) |
131 | 22.6k | { |
132 | 22.6k | if (input_flag) |
133 | 0 | return 0; |
134 | 22.6k | } else |
135 | 11.3k | break; |
136 | 11.3k | input_bitcount += j; |
137 | 11.3k | if (j < 64) |
138 | 11.3k | input_flag = 1; |
139 | | // Always like 1111100000, no 1110010101 |
140 | 292k | for (; j < 64; j++281k ) |
141 | 288k | if (input_bitmasks[i] & (uint64_t)1 << j) |
142 | 6.78k | return 0; |
143 | 11.3k | } |
144 | 4.54k | int output_flag = 0; |
145 | 4.54k | int output_bitcount = 0; |
146 | 9.07k | for (i = 0; i < output_bitmask_size; i++4.53k ) |
147 | 4.54k | { |
148 | 13.6k | for (j = 0; j < 64; j++9.06k ) |
149 | 13.6k | if ((output_bitmasks[i] & (uint64_t)1 << j)) |
150 | 9.06k | { |
151 | 9.06k | if (output_flag) |
152 | 0 | return 0; |
153 | 9.06k | } else |
154 | 4.54k | break; |
155 | 4.54k | output_bitcount += j; |
156 | 4.54k | if (j < 64) |
157 | 4.54k | output_flag = 1; |
158 | 285k | for (; j < 64; j++281k ) |
159 | 281k | if (output_bitmasks[i] & (uint64_t)1 << j) |
160 | 2 | return 0; |
161 | 4.54k | } |
162 | 4.53k | if (output_bitcount != output_size) |
163 | 10 | return 0; |
164 | 4.52k | return output_bitcount + 2 /* Gradient + Original output */ == input_bitcount; |
165 | 4.53k | } |
166 | | |
167 | | REGISTER_COMMAND(CCV_NNC_EWPROD_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
168 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c) |
169 | 1 | { |
170 | 1 | registry->bitmask = _ccv_nnc_ewprod_forw_bitmask; |
171 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
172 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
173 | 1 | } |
174 | | |
175 | | REGISTER_COMMAND(CCV_NNC_EWPROD_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
176 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c) |
177 | 1 | { |
178 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
179 | 1 | registry->bitmask = _ccv_nnc_ewprod_back_bitmask; |
180 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
181 | 1 | } |
182 | | |
183 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWPROD_FORWARD) |
184 | | #define CMD_EWPROD_FORWARD() ccv_nnc_cmd(CCV_NNC_EWPROD_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
185 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWPROD_BACKWARD) |
186 | | #define CMD_EWPROD_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWPROD_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
187 | | |
188 | | static int _ccv_nnc_ewdiv_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
189 | 4 | { |
190 | 4 | if ((input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1)) && output_bitmasks[0] == 1u2 ) |
191 | 2 | return 1; |
192 | | // Nominator can be null (meaning 1). |
193 | 2 | if ((input_bitmasks[0] & 3u) == ((0u << 0) | (1u << 1)) && output_bitmasks[0] == 1u) |
194 | 2 | return 1; |
195 | 0 | return 0; |
196 | 2 | } |
197 | | |
198 | | static int _ccv_nnc_ewdiv_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
199 | 72 | { |
200 | 72 | if ((input_bitmasks[0] & (15u & ~((uint64_t)1u << 1))) == ((1u << 0) | (0u << 1) | (1u << 2) | (1u << 3)) && output_bitmasks[0] == ((1u << 0) | (1u << 1))21 ) |
201 | 5 | return 1; |
202 | | // We don't need to know the original output. |
203 | 67 | if ((input_bitmasks[0] & (15u & ~((uint64_t)1u << 1))) == ((1u << 0) | (0u << 1) | (1u << 2) | (0u << 3)) && output_bitmasks[0] == ((1u << 0) | (0u << 1))18 ) |
204 | 0 | return 1; |
205 | 67 | if ((input_bitmasks[0] & (15u & ~((uint64_t)1u << 1))) == ((1u << 0) | (0u << 1) | (1u << 2) | (1u << 3)) && output_bitmasks[0] == ((0u << 0) | (1u << 1))16 ) |
206 | 16 | return 1; |
207 | 51 | return 0; |
208 | 67 | } |
209 | | |
210 | | static void _ccv_nnc_ewdiv_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
211 | 49 | { |
212 | 49 | assert(output_size >= 1); |
213 | 49 | assert(input_size >= 2); |
214 | 49 | int i; |
215 | 98 | for (i = 0; i < output_size; i++49 ) |
216 | 49 | outputs[i] = inputs[1]; |
217 | 49 | } |
218 | | |
219 | | REGISTER_COMMAND(CCV_NNC_EWDIV_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
220 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
221 | 1 | { |
222 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
223 | 1 | registry->bitmask = _ccv_nnc_ewdiv_forw_bitmask; |
224 | 1 | registry->tensor_auto = _ccv_nnc_ewdiv_tensor_auto_forw; |
225 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
226 | 1 | } |
227 | | |
228 | | REGISTER_COMMAND(CCV_NNC_EWDIV_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
229 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
230 | 1 | { |
231 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
232 | 1 | registry->bitmask = _ccv_nnc_ewdiv_back_bitmask; |
233 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
234 | 1 | } |
235 | | |
236 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWDIV_FORWARD) |
237 | | #define CMD_EWDIV_FORWARD() ccv_nnc_cmd(CCV_NNC_EWDIV_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
238 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWDIV_BACKWARD) |
239 | | #define CMD_EWDIV_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWDIV_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
240 | | |
241 | | static int _ccv_nnc_ewexp_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
242 | 0 | { |
243 | 0 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
244 | 0 | return 1; |
245 | 0 | return 0; |
246 | 0 | } |
247 | | |
248 | | static int _ccv_nnc_ewexp_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
249 | 54 | { |
250 | | // We don't care about the original input. |
251 | 54 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 1))) == ((1u << 0) | (0u << 1) | (1u << 2)) && output_bitmasks[0] == 1u18 ) |
252 | 18 | return 1; |
253 | 36 | return 0; |
254 | 54 | } |
255 | | |
256 | | REGISTER_COMMAND(CCV_NNC_EWEXP_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
257 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
258 | 1 | { |
259 | 1 | registry->bitmask = _ccv_nnc_ewexp_forw_bitmask; |
260 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
261 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
262 | 1 | } |
263 | | |
264 | | REGISTER_COMMAND(CCV_NNC_EWEXP_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
265 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
266 | 1 | { |
267 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
268 | 1 | registry->bitmask = _ccv_nnc_ewexp_back_bitmask; |
269 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
270 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
271 | 1 | } |
272 | | |
273 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWEXP_FORWARD) |
274 | | #define CMD_EWEXP_FORWARD() ccv_nnc_cmd(CCV_NNC_EWEXP_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
275 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWEXP_BACKWARD) |
276 | | #define CMD_EWEXP_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWEXP_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
277 | | |
278 | | static int _ccv_nnc_ewpow_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
279 | 2 | { |
280 | 2 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
281 | 2 | return 1; |
282 | 0 | return 0; |
283 | 2 | } |
284 | | |
285 | | static int _ccv_nnc_ewpow_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
286 | 9 | { |
287 | | // We only care about the original input. |
288 | 9 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 0) & ~((uint64_t)1u << 2))) == ((0u << 0) | (1u << 1) | (0u << 2)) && output_bitmasks[0] == 1u5 ) |
289 | 5 | return 1; |
290 | 4 | return 0; |
291 | 9 | } |
292 | | |
293 | | static void _ccv_nnc_ewpow_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
294 | 8 | { |
295 | 8 | assert(output_size >= 1); |
296 | 8 | assert(input_size >= 1); |
297 | 8 | int i; |
298 | 16 | for (i = 0; i < output_size; i++8 ) |
299 | 8 | outputs[i] = inputs[0]; |
300 | 8 | } |
301 | | |
302 | | REGISTER_COMMAND(CCV_NNC_EWPOW_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
303 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
304 | 1 | { |
305 | 1 | registry->bitmask = _ccv_nnc_ewpow_forw_bitmask; |
306 | 1 | registry->tensor_auto = _ccv_nnc_ewpow_tensor_auto_forw; |
307 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
308 | 1 | } |
309 | | |
310 | | REGISTER_COMMAND(CCV_NNC_EWPOW_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
311 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu) |
312 | 1 | { |
313 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
314 | 1 | registry->bitmask = _ccv_nnc_ewpow_back_bitmask; |
315 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
316 | 1 | } |
317 | | |
318 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWPOW_FORWARD) |
319 | | #define CMD_EWPOW_FORWARD(_exponent) ccv_nnc_cmd(CCV_NNC_EWPOW_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.pow={.exponent=_exponent}}), 0) |
320 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWPOW_BACKWARD) |
321 | | #define CMD_EWPOW_BACKWARD(_exponent) ccv_nnc_cmd(CCV_NNC_EWPOW_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.pow={.exponent=_exponent}}), 0) |
322 | | |
323 | | static int _ccv_nnc_ewlog_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
324 | 2 | { |
325 | 2 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
326 | 2 | return 1; |
327 | 0 | return 0; |
328 | 2 | } |
329 | | |
330 | | static int _ccv_nnc_ewlog_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
331 | 1.32k | { |
332 | | // We don't care about the original output. |
333 | 1.32k | if ((input_bitmasks[0] & 3u) == 3u && output_bitmasks[0] == 1u442 ) |
334 | 442 | return 1; |
335 | 878 | return 0; |
336 | 1.32k | } |
337 | | |
338 | | REGISTER_COMMAND(CCV_NNC_EWLOG_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
339 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
340 | 1 | { |
341 | 1 | registry->bitmask = _ccv_nnc_ewlog_forw_bitmask; |
342 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
343 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
344 | 1 | } |
345 | | |
346 | | REGISTER_COMMAND(CCV_NNC_EWLOG_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
347 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
348 | 1 | { |
349 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
350 | 1 | registry->bitmask = _ccv_nnc_ewlog_back_bitmask; |
351 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
352 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
353 | 1 | } |
354 | | |
355 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWLOG_FORWARD) |
356 | | #define CMD_EWLOG_FORWARD() ccv_nnc_cmd(CCV_NNC_EWLOG_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
357 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWLOG_BACKWARD) |
358 | | #define CMD_EWLOG_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWLOG_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
359 | | |
360 | | static int _ccv_nnc_ewsqrt_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
361 | 0 | { |
362 | 0 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
363 | 0 | return 1; |
364 | 0 | return 0; |
365 | 0 | } |
366 | | |
367 | | static int _ccv_nnc_ewsqrt_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
368 | 6 | { |
369 | | // We don't care about the original input. |
370 | 6 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 1))) == ((1u << 0) | (0u << 1) | (1u << 2)) && output_bitmasks[0] == 1u2 ) |
371 | 2 | return 1; |
372 | 4 | return 0; |
373 | 6 | } |
374 | | |
375 | | REGISTER_COMMAND(CCV_NNC_EWSQRT_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
376 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
377 | 1 | { |
378 | 1 | registry->bitmask = _ccv_nnc_ewsqrt_forw_bitmask; |
379 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
380 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
381 | 1 | } |
382 | | |
383 | | REGISTER_COMMAND(CCV_NNC_EWSQRT_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
384 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
385 | 1 | { |
386 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
387 | 1 | registry->bitmask = _ccv_nnc_ewsqrt_back_bitmask; |
388 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
389 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
390 | 1 | } |
391 | | |
392 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWSQRT_FORWARD) |
393 | | #define CMD_EWSQRT_FORWARD() ccv_nnc_cmd(CCV_NNC_EWSQRT_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
394 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWSQRT_BACKWARD) |
395 | | #define CMD_EWSQRT_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWSQRT_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
396 | | |
397 | | static int _ccv_nnc_ewsin_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
398 | 2 | { |
399 | 2 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
400 | 2 | return 1; |
401 | 0 | return 0; |
402 | 2 | } |
403 | | |
404 | | static int _ccv_nnc_ewsin_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
405 | 9 | { |
406 | | // We only care about the original input. |
407 | 9 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 0) & ~((uint64_t)1u << 2))) == ((0u << 0) | (1u << 1) | (0u << 2)) && output_bitmasks[0] == 1u5 ) |
408 | 5 | return 1; |
409 | 4 | return 0; |
410 | 9 | } |
411 | | |
412 | | REGISTER_COMMAND(CCV_NNC_EWSIN_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
413 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
414 | 1 | { |
415 | 1 | registry->bitmask = _ccv_nnc_ewsin_forw_bitmask; |
416 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
417 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
418 | 1 | } |
419 | | |
420 | | REGISTER_COMMAND(CCV_NNC_EWSIN_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
421 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu) |
422 | 1 | { |
423 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
424 | 1 | registry->bitmask = _ccv_nnc_ewsin_back_bitmask; |
425 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
426 | 1 | } |
427 | | |
428 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWSIN_FORWARD) |
429 | | #define CMD_EWSIN_FORWARD() ccv_nnc_cmd(CCV_NNC_EWSIN_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
430 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWSIN_BACKWARD) |
431 | | #define CMD_EWSIN_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWSIN_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
432 | | |
433 | | static int _ccv_nnc_ewcos_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
434 | 2 | { |
435 | 2 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
436 | 2 | return 1; |
437 | 0 | return 0; |
438 | 2 | } |
439 | | |
440 | | static int _ccv_nnc_ewcos_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
441 | 9 | { |
442 | | // We only care about the original input. |
443 | 9 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 0) & ~((uint64_t)1u << 2))) == ((0u << 0) | (1u << 1) | (0u << 2)) && output_bitmasks[0] == 1u5 ) |
444 | 5 | return 1; |
445 | 4 | return 0; |
446 | 9 | } |
447 | | |
448 | | REGISTER_COMMAND(CCV_NNC_EWCOS_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
449 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
450 | 1 | { |
451 | 1 | registry->bitmask = _ccv_nnc_ewcos_forw_bitmask; |
452 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
453 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
454 | 1 | } |
455 | | |
456 | | REGISTER_COMMAND(CCV_NNC_EWCOS_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
457 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu) |
458 | 1 | { |
459 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
460 | 1 | registry->bitmask = _ccv_nnc_ewcos_back_bitmask; |
461 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
462 | 1 | } |
463 | | |
464 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWCOS_FORWARD) |
465 | | #define CMD_EWCOS_FORWARD() ccv_nnc_cmd(CCV_NNC_EWCOS_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
466 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWCOS_BACKWARD) |
467 | | #define CMD_EWCOS_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWCOS_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
468 | | |
469 | | static int _ccv_nnc_ewabs_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
470 | 0 | { |
471 | 0 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
472 | 0 | return 1; |
473 | 0 | return 0; |
474 | 0 | } |
475 | | |
476 | | static int _ccv_nnc_ewabs_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
477 | 0 | { |
478 | | // We only care about the original input. |
479 | 0 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 2))) == ((1u << 0) | (1u << 1) | (0u << 2)) && output_bitmasks[0] == 1u) |
480 | 0 | return 1; |
481 | 0 | return 0; |
482 | 0 | } |
483 | | |
484 | | REGISTER_COMMAND(CCV_NNC_EWABS_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
485 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
486 | 1 | { |
487 | 1 | registry->bitmask = _ccv_nnc_ewabs_forw_bitmask; |
488 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
489 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
490 | 1 | } |
491 | | |
492 | | REGISTER_COMMAND(CCV_NNC_EWABS_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
493 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
494 | 1 | { |
495 | 1 | registry->bitmask = _ccv_nnc_ewabs_back_bitmask; |
496 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
497 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
498 | 1 | } |
499 | | |
500 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWABS_FORWARD) |
501 | | #define CMD_EWABS_FORWARD() ccv_nnc_cmd(CCV_NNC_EWABS_FORWARD, 0, ccv_nnc_cmd_auto, 0) |
502 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_EWABS_BACKWARD) |
503 | | #define CMD_EWABS_BACKWARD() ccv_nnc_cmd(CCV_NNC_EWABS_BACKWARD, 0, ccv_nnc_cmd_auto, 0) |
504 | | |
505 | | static int _ccv_nnc_clamp_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
506 | 0 | { |
507 | 0 | if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u) |
508 | 0 | return 1; |
509 | 0 | return 0; |
510 | 0 | } |
511 | | |
512 | | static int _ccv_nnc_clamp_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
513 | 0 | { |
514 | | // We don't care about the original input. |
515 | 0 | if ((input_bitmasks[0] & (7u & ~((uint64_t)1u << 1))) == ((1u << 0) | (0u << 1) | (1u << 2)) && output_bitmasks[0] == 1u) |
516 | 0 | return 1; |
517 | 0 | return 0; |
518 | 0 | } |
519 | | |
520 | | REGISTER_COMMAND(CCV_NNC_CLAMP_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
521 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
522 | 1 | { |
523 | 1 | registry->bitmask = _ccv_nnc_clamp_forw_bitmask; |
524 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs; |
525 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
526 | 1 | } |
527 | | |
528 | | REGISTER_COMMAND(CCV_NNC_CLAMP_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
529 | | FIND_BACKEND(ccv_nnc_ew_cpu_ref.c, gpu/ccv_nnc_ew_gpu_ref.cu, mps/ccv_nnc_ew_mps.m) |
530 | 1 | { |
531 | 1 | registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES; |
532 | 1 | registry->bitmask = _ccv_nnc_clamp_back_bitmask; |
533 | 1 | registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_gradient; |
534 | 1 | registry->allow_inplace = _ccv_nnc_arbitary_inplace; |
535 | 1 | } |
536 | | |
537 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_CLAMP_FORWARD) |
538 | | #define CMD_CLAMP_FORWARD(_min, _max) ccv_nnc_cmd(CCV_NNC_CLAMP_FORWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.clamp={.min=_min,.max=_max}}, 0) |
539 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_CLAMP_BACKWARD) |
540 | | #define CMD_CLAMP_BACKWARD(_min, _max) ccv_nnc_cmd(CCV_NNC_CLAMP_BACKWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.clamp={.min=_min,.max=_max}}, 0) |