/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/norm/ccv_nnc_norm.c
Line | Count | Source
1 | | #include "ccv.h"
2 | | #include "nnc/ccv_nnc.h"
3 | | #include "nnc/ccv_nnc_internal.h"
4 | |
5 | | static int _ccv_nnc_batch_norm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
6 | 675 | { |
7 | | // 5 inputs (x, scale, bias, mean, var)
8 | | // 1 output (y)
9 | 675 | if (input_bitmasks[0] == 31u && output_bitmasks[0] == 1u) |
10 | 0 | return 1; |
11 | | // 5 inputs (x, scale, bias, mean, var) |
12 | | // 5 outputs (y, mean, var, saved_mean, saved_inv_var) |
13 | | // Both the mean and var outputs are in-place with the input mean and var
14 | 675 | if (input_bitmasks[0] == 31u && output_bitmasks[0] == 31u) |
15 | 375 | return 1; |
16 | 300 | return 0; |
17 | 675 | } |
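Illustrative sketch (not part of the listed source): how the 31u/1u constants above decompose when bit i of a bitmask means "tensor i is bound".

#include <assert.h>
#include <stdint.h>

static void batch_norm_forw_bitmask_example(void)
{
	// Inputs x, scale, bias, mean, var occupy bits 0..4 of input_bitmasks[0].
	const uint64_t inputs = (1u << 0) | (1u << 1) | (1u << 2) | (1u << 3) | (1u << 4);
	assert(inputs == 31u);
	// Training call: outputs y, mean, var, saved_mean, saved_inv_var -> 31u as well.
	// Inference call: only y -> 1u.
}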
18 | | |
19 | | static int _ccv_nnc_batch_norm_enforce_inplace(const ccv_nnc_cmd_param_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size) |
20 | 2.27k | { |
21 | 2.27k | if (input_idx == 3 && output_idx == 1)
22 | 91 | return 1;
23 | 2.18k | if (input_idx == 4 && output_idx == 2)
24 | 91 | return 1; |
25 | 2.09k | return 0; |
26 | 2.18k | } |
27 | | |
28 | | static int _ccv_nnc_batch_norm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
29 | 657 | { |
30 | | // 0b110000001100001 |
31 | | // Inputs (gradient, 0, 0, 0, 0, x, scale, 0, 0, 0, 0, 0, 0, saved_mean, saved_inv_var) |
32 | | // Output the propagated error, dscale and dbias |
33 | 657 | if ((input_bitmasks[0] & 24673u) == 24673u && (output_bitmasks[0] & 7u) == 7u)
34 | 377 | return 1; |
35 | 280 | return 0; |
36 | 657 | } |
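Illustrative sketch (not part of the listed source): the 24673u constant above is exactly the five required backward inputs named in the comment.

#include <assert.h>
#include <stdint.h>

static void batch_norm_back_bitmask_example(void)
{
	const uint64_t required = (1u << 0)   // gradient (dy)
		| (1u << 5)                       // x
		| (1u << 6)                       // scale
		| (1u << 13)                      // saved_mean
		| (1u << 14);                     // saved_inv_var
	assert(required == 24673u);           // == 0b110000001100001
	// Output side: dx, dscale and dbias (bits 0..2) must all be requested, hence the 7u mask.
}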
37 | | |
38 | | static void _ccv_nnc_batch_norm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
39 | 1.90k | { |
40 | 1.90k | assert(input_size == 5); |
41 | 1.90k | assert(output_size == 1 || output_size == 5); |
42 | 1.90k | outputs[0] = inputs[0]; |
43 | 1.90k | if (output_size == 1) |
44 | 0 | return; |
45 | 1.90k | int i, j; |
46 | 9.52k | for (i = 1; i < output_size; i++)
47 | 7.62k | {
48 | 7.62k | outputs[i] = inputs[0];
49 | 30.4k | for (j = 0; j < cmd.bnorm.count; j++)
50 | 22.8k | outputs[i].dim[cmd.bnorm.axis[j]] = 1; // Reduce the dimension to 1. |
51 | 7.62k | } |
52 | 1.90k | } |
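For example (hypothetical NHWC shapes, not taken from this file): with inputs[0] of shape {N, H, W, C} and cmd.bnorm.axis = {0, 1, 2}, the auto-inferred shapes work out as follows.

// Hypothetical example: inputs[0] is {16, 8, 8, 32} and cmd.bnorm.axis = {0, 1, 2}.
int stat_dim[4] = { 16, 8, 8, 32 };   // start from inputs[0].dim
for (int j = 0; j < 3; j++)           // cmd.bnorm.count == 3
	stat_dim[j] = 1;                  // reduce axes 0, 1, 2
// stat_dim is now {1, 1, 1, 32}: the shape of mean, var, saved_mean and saved_inv_var;
// outputs[0] (y) keeps the full {16, 8, 8, 32} shape.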
53 | | |
54 | | static void _ccv_nnc_batch_norm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
55 | 1.66k | { |
56 | 1.66k | assert(input_size == 15); |
57 | 1.66k | assert(output_size == 5); |
58 | 1.66k | outputs[0] = inputs[0]; |
59 | 1.66k | int i, j; |
60 | 8.30k | for (i = 1; i < output_size; i++)
61 | 6.64k | {
62 | 6.64k | outputs[i] = inputs[0];
63 | 26.5k | for (j = 0; j < cmd.bnorm.count; j++)
64 | 19.9k | outputs[i].dim[cmd.bnorm.axis[j]] = 1; // Reduce the dimension to 1. |
65 | 6.64k | } |
66 | 1.66k | } |
67 | | |
68 | | REGISTER_COMMAND(CCV_NNC_BATCH_NORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
69 | | FIND_BACKEND(ccv_nnc_batch_norm_cpu_ref.c, gpu/ccv_nnc_batch_norm_gpu_cudnn.cu) |
70 | 1 | { |
71 | 1 | registry->bitmask = _ccv_nnc_batch_norm_forw_bitmask; |
72 | 1 | registry->tensor_auto = _ccv_nnc_batch_norm_tensor_auto_forw; |
73 | 1 | registry->enforce_inplace = _ccv_nnc_batch_norm_enforce_inplace; |
74 | 1 | } |
75 | | |
76 | | REGISTER_COMMAND(CCV_NNC_BATCH_NORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
77 | | FIND_BACKEND(ccv_nnc_batch_norm_cpu_ref.c, gpu/ccv_nnc_batch_norm_gpu_cudnn.cu) |
78 | 1 | { |
79 | 1 | registry->bitmask = _ccv_nnc_batch_norm_back_bitmask; |
80 | 1 | registry->tensor_auto = _ccv_nnc_batch_norm_tensor_auto_back; |
81 | 1 | } |
82 | | |
83 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_BATCH_NORM_FORWARD) |
84 | | #define CMD_BATCH_NORM_FORWARD(_epsilon, _is_test, _momentum, ...) ccv_nnc_cmd(CCV_NNC_BATCH_NORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.bnorm={.epsilon=_epsilon,.is_test=_is_test,.momentum=_momentum,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0) |
85 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_BATCH_NORM_BACKWARD) |
86 | | #define CMD_BATCH_NORM_BACKWARD(_epsilon, _is_test, _momentum, ...) ccv_nnc_cmd(CCV_NNC_BATCH_NORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.bnorm={.epsilon=_epsilon,.is_test=_is_test,.momentum=_momentum,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0) |
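A minimal usage sketch of the CMD_BATCH_NORM_FORWARD macro above; the helpers (ccv_nnc_tensor_new, CPU_TENSOR_NHWC, TENSOR_LIST, ccv_nnc_no_hint, ccv_nnc_cmd_exec) come from the ccv_nnc headers, and the shapes here are illustrative assumptions, not taken from this file.

// Hypothetical training-mode call: normalize a {16, 8, 8, 32} NHWC tensor over axes 0, 1, 2.
ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16, 8, 8, 32), 0);
ccv_nnc_tensor_t* const scale = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, 32), 0);
ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, 32), 0);
ccv_nnc_tensor_t* const mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, 32), 0);
ccv_nnc_tensor_t* const var = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, 32), 0);
ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16, 8, 8, 32), 0);
ccv_nnc_tensor_t* const saved_mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, 32), 0);
ccv_nnc_tensor_t* const saved_inv_var = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, 1, 32), 0);
// epsilon = 1e-4, is_test = 0, momentum = 0.9, reduce axes = 0, 1, 2.
// mean and var appear in both lists because the registry enforces them in-place (see above).
ccv_nnc_cmd_exec(CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), ccv_nnc_no_hint, 0,
	TENSOR_LIST(x, scale, bias, mean, var),
	TENSOR_LIST(y, mean, var, saved_mean, saved_inv_var), 0);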
87 | | |
88 | | static int _ccv_nnc_layer_norm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
89 | 40 | { |
90 | 40 | if (cmd.lnorm.elementwise_affine) |
91 | 40 | { |
92 | | // 3 inputs (x, gamma, beta) |
93 | | // 3 outputs (y, saved_mean, saved_inv_std) |
94 | 40 | if (input_bitmasks[0] == 7u && output_bitmasks[0] == 7u) |
95 | 24 | return 1; |
96 | | // 3 inputs (x, gamma, beta) |
97 | | // 1 output (y) |
98 | 16 | if (input_bitmasks[0] == 7u && output_bitmasks[0] == 1u) |
99 | 0 | return 1; |
100 | 16 | } else { |
101 | | // 1 input (x)
102 | | // 3 outputs (y, saved_mean, saved_inv_std) |
103 | 0 | if (input_bitmasks[0] == 1u && output_bitmasks[0] == 7u) |
104 | 0 | return 1; |
105 | | // 1 input (x)
106 | | // 1 output (y) |
107 | 0 | if (input_bitmasks[0] == 1u && output_bitmasks[0] == 1u) |
108 | 0 | return 1; |
109 | 0 | } |
110 | 16 | return 0; |
111 | 40 | } |
112 | | |
113 | | static int _ccv_nnc_layer_norm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
114 | 239 | { |
115 | 239 | if (cmd.lnorm.elementwise_affine) |
116 | 189 | { |
117 | | // 0b110011001 |
118 | | // Inputs (gradient, 0, 0, x, gamma, 0, 0, saved_mean, saved_inv_std) |
119 | | // Output the propagated error, dgamma and dbeta |
120 | 189 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 7u) == 7u)
121 | 57 | return 1;
122 | 132 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 5u) == 5u)
123 | 2 | return 1;
124 | 130 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 3u) == 3u)
125 | 0 | return 1;
126 | 130 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 1u) == 1u)
127 | 10 | return 1; |
128 | 130 | } else { |
129 | | // 0b1101001 |
130 | | // Inputs (gradient, 0, 0, x, 0, saved_mean, saved_inv_std) |
131 | | // Output the propagated error |
132 | 50 | if ((input_bitmasks[0] & 105u) == 105u && (output_bitmasks[0] & 1u) == 1u)
133 | 20 | return 1; |
134 | 50 | } |
135 | 150 | return 0; |
136 | 239 | } |
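Illustrative sketch (not part of the listed source): the 409u and 105u constants above decompose into the inputs named in the comments.

#include <assert.h>
#include <stdint.h>

static void layer_norm_back_bitmask_example(void)
{
	// elementwise_affine: gradient, x, gamma, saved_mean, saved_inv_std are required.
	const uint64_t affine = (1u << 0) | (1u << 3) | (1u << 4) | (1u << 7) | (1u << 8);
	assert(affine == 409u);    // == 0b110011001
	// no affine: gradient, x, saved_mean, saved_inv_std are required.
	const uint64_t plain = (1u << 0) | (1u << 3) | (1u << 5) | (1u << 6);
	assert(plain == 105u);     // == 0b1101001
}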
137 | | |
138 | | static void _ccv_nnc_layer_norm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
139 | 386 | { |
140 | 386 | assert(input_size == 3 || input_size == 1); |
141 | 386 | assert(output_size == 1 || output_size == 3); |
142 | 386 | outputs[0] = inputs[0]; |
143 | 386 | if (output_size == 1) |
144 | 0 | return; |
145 | 386 | int i, j; |
146 | 1.15k | for (i = 1; i < output_size; i++)
147 | 772 | {
148 | 772 | outputs[i] = inputs[0];
149 | 1.64k | for (j = 0; j < cmd.lnorm.count; j++)
150 | 868 | outputs[i].dim[cmd.lnorm.axis[j]] = 1; // Reduce the dimension to 1. |
151 | 772 | } |
152 | 386 | } |
153 | | |
154 | | static void _ccv_nnc_layer_norm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
155 | 306 | { |
156 | 306 | assert(input_size == 9 || input_size == 7); |
157 | 306 | assert(output_size == 1 || output_size == 3); |
158 | 306 | outputs[0] = inputs[0]; |
159 | 306 | int i, j; |
160 | 908 | for (i = 1; i < output_size; i++)
161 | 602 | {
162 | 602 | outputs[i] = inputs[0];
163 | 1.22k | for (j = 0; j < cmd.lnorm.count; j++)
164 | 620 | outputs[i].dim[cmd.lnorm.axis[j]] = 1; // Reduce the dimension to 1. |
165 | 602 | } |
166 | 306 | } |
167 | | |
168 | | REGISTER_COMMAND(CCV_NNC_LAYER_NORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
169 | | FIND_BACKEND(ccv_nnc_layer_norm_cpu_ref.c, gpu/ccv_nnc_layer_norm_gpu_cudnn.cu, mps/ccv_nnc_layer_norm_mps.m) |
170 | 1 | { |
171 | 1 | registry->bitmask = _ccv_nnc_layer_norm_forw_bitmask; |
172 | 1 | registry->tensor_auto = _ccv_nnc_layer_norm_tensor_auto_forw; |
173 | 1 | } |
174 | | |
175 | | REGISTER_COMMAND(CCV_NNC_LAYER_NORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
176 | | FIND_BACKEND(ccv_nnc_layer_norm_cpu_ref.c, gpu/ccv_nnc_layer_norm_gpu_cudnn.cu, mps/ccv_nnc_layer_norm_mps.m) |
177 | 1 | { |
178 | 1 | registry->bitmask = _ccv_nnc_layer_norm_back_bitmask; |
179 | 1 | registry->tensor_auto = _ccv_nnc_layer_norm_tensor_auto_back; |
180 | 1 | } |
181 | | |
182 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_LAYER_NORM_FORWARD) |
183 | | #define CMD_LAYER_NORM_FORWARD(_epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_LAYER_NORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.lnorm={.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0) |
184 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_LAYER_NORM_BACKWARD) |
185 | | #define CMD_LAYER_NORM_BACKWARD(_epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_LAYER_NORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.lnorm={.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0) |
186 | | |
187 | | static int _ccv_nnc_group_norm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
188 | 0 | { |
189 | 0 | if (cmd.gnorm.elementwise_affine) |
190 | 0 | { |
191 | | // 3 inputs (x, gamma, beta) |
192 | | // 3 outputs (y, saved_mean, saved_inv_std) |
193 | 0 | if (input_bitmasks[0] == 7u && output_bitmasks[0] == 7u) |
194 | 0 | return 1; |
195 | | // 3 inputs (x, gamma, beta) |
196 | | // 1 output (y) |
197 | 0 | if (input_bitmasks[0] == 7u && output_bitmasks[0] == 1u) |
198 | 0 | return 1; |
199 | 0 | } else { |
200 | | // 1 input (x)
201 | | // 3 outputs (y, saved_mean, saved_inv_std) |
202 | 0 | if (input_bitmasks[0] == 1u && output_bitmasks[0] == 7u) |
203 | 0 | return 1; |
204 | | // 1 input (x)
205 | | // 1 output (y) |
206 | 0 | if (input_bitmasks[0] == 1u && output_bitmasks[0] == 1u) |
207 | 0 | return 1; |
208 | 0 | } |
209 | 0 | return 0; |
210 | 0 | } |
211 | | |
212 | | static int _ccv_nnc_group_norm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
213 | 178 | { |
214 | 178 | if (cmd.gnorm.elementwise_affine) |
215 | 108 | { |
216 | | // 0b110011001 |
217 | | // Inputs (gradient, 0, 0, x, gamma, 0, 0, saved_mean, saved_inv_std) |
218 | | // Output the propagated error, dgamma and dbeta |
219 | 108 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 7u) == 7u)
220 | 32 | return 1;
221 | 76 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 5u) == 5u)
222 | 2 | return 1;
223 | 74 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 3u) == 3u)
224 | 0 | return 1;
225 | 74 | if ((input_bitmasks[0] & 409u) == 409u && (output_bitmasks[0] & 1u) == 1u)
226 | 10 | return 1; |
227 | 74 | } else { |
228 | | // 0b1101001 |
229 | | // Inputs (gradient, 0, 0, x, 0, saved_mean, saved_inv_std) |
230 | | // Output the propagated error |
231 | 70 | if ((input_bitmasks[0] & 105u) == 105u && (output_bitmasks[0] & 7u) == 1u)
232 | 28 | return 1; |
233 | 70 | } |
234 | 106 | return 0; |
235 | 178 | } |
236 | | |
237 | | static void _ccv_nnc_group_norm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
238 | 41 | { |
239 | 41 | assert(input_size == 3 || input_size == 1); |
240 | 41 | assert(output_size == 1 || output_size == 3); |
241 | 41 | outputs[0] = inputs[0]; |
242 | 41 | if (output_size == 1) |
243 | 0 | return; |
244 | 41 | int i, j; |
245 | 123 | for (i = 1; i < output_size; i++)
246 | 82 | {
247 | 82 | outputs[i] = inputs[0];
248 | 82 | outputs[i].dim[cmd.gnorm.group_axis] = cmd.gnorm.groups; // Reduce to num_groups.
249 | 130 | for (j = 0; j < cmd.gnorm.reduce_count; j++)
250 | 48 | outputs[i].dim[cmd.gnorm.reduce_axis[j]] = 1; |
251 | 82 | } |
252 | 41 | } |
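For example (hypothetical NCHW shapes, not taken from this file): with inputs[0] of shape {N, C, H, W}, group_axis = 1, groups = G and reduce_axis = {2, 3}, the saved statistics keep one entry per group.

// Hypothetical example: inputs[0] is {2, 16, 8, 8}, group_axis = 1, groups = 4, reduce_axis = {2, 3}.
int stat_dim[4] = { 2, 16, 8, 8 };  // start from inputs[0].dim
stat_dim[1] = 4;                    // the grouped axis shrinks to the number of groups
stat_dim[2] = stat_dim[3] = 1;      // the reduce axes collapse to 1
// stat_dim is now {2, 4, 1, 1}: the shape of saved_mean and saved_inv_std;
// outputs[0] (y) keeps the full {2, 16, 8, 8} shape.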
253 | | |
254 | | static void _ccv_nnc_group_norm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
255 | 15 | { |
256 | 15 | assert(input_size == 9 || input_size == 7); |
257 | 15 | assert(output_size == 1 || output_size == 3); |
258 | 15 | outputs[0] = inputs[0]; |
259 | 15 | int i, j; |
260 | 31 | for (i = 1; i < output_size; i++)
261 | 16 | {
262 | 16 | outputs[i] = inputs[0];
263 | 16 | outputs[i].dim[cmd.gnorm.group_axis] = cmd.gnorm.groups; // Reduce the dimension to num_groups.
264 | 24 | for (j = 0; j < cmd.gnorm.reduce_count; j++)
265 | 8 | outputs[i].dim[cmd.gnorm.reduce_axis[j]] = 1; |
266 | 16 | } |
267 | 15 | } |
268 | | |
269 | | REGISTER_COMMAND(CCV_NNC_GROUP_NORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
270 | | FIND_BACKEND(ccv_nnc_group_norm_cpu_ref.c, gpu/ccv_nnc_group_norm_gpu_cudnn.cu, mps/ccv_nnc_group_norm_mps.m) |
271 | 1 | { |
272 | 1 | registry->bitmask = _ccv_nnc_group_norm_forw_bitmask; |
273 | 1 | registry->tensor_auto = _ccv_nnc_group_norm_tensor_auto_forw; |
274 | 1 | } |
275 | | |
276 | | REGISTER_COMMAND(CCV_NNC_GROUP_NORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
277 | | FIND_BACKEND(ccv_nnc_group_norm_cpu_ref.c, gpu/ccv_nnc_group_norm_gpu_cudnn.cu, mps/ccv_nnc_group_norm_mps.m) |
278 | 1 | { |
279 | 1 | registry->bitmask = _ccv_nnc_group_norm_back_bitmask; |
280 | 1 | registry->tensor_auto = _ccv_nnc_group_norm_tensor_auto_back; |
281 | 1 | } |
282 | | |
283 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_GROUP_NORM_FORWARD) |
284 | | #define CMD_GROUP_NORM_FORWARD(_group_axis, _groups, _epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_GROUP_NORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.gnorm={.group_axis=_group_axis,.groups=_groups,.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.reduce_count=LIST_COUNT(__VA_ARGS__),.reduce_axis={__VA_ARGS__}}}), 0) |
285 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_GROUP_NORM_BACKWARD) |
286 | | #define CMD_GROUP_NORM_BACKWARD(_group_axis, _groups, _epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_GROUP_NORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.gnorm={.group_axis=_group_axis,.groups=_groups,.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.reduce_count=LIST_COUNT(__VA_ARGS__),.reduce_axis={__VA_ARGS__}}}), 0) |
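A minimal usage sketch of the CMD_GROUP_NORM_FORWARD macro above, under the same assumptions as the batch norm example (easy-API helpers from the ccv_nnc headers, illustrative shapes):

// Hypothetical call: 4 groups over the channel axis of an NCHW tensor, reducing over axes 2 and 3.
ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 16, 8, 8), 0);
ccv_nnc_tensor_t* const gamma = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 16, 1, 1), 0);
ccv_nnc_tensor_t* const beta = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 16, 1, 1), 0);
ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 16, 8, 8), 0);
ccv_nnc_tensor_t* const saved_mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 4, 1, 1), 0);
ccv_nnc_tensor_t* const saved_inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 2, 4, 1, 1), 0);
// group_axis = 1, groups = 4, epsilon = 1e-5, elementwise_affine = 1, reduce axes = 2, 3.
ccv_nnc_cmd_exec(CMD_GROUP_NORM_FORWARD(1, 4, 1e-5, 1, 2, 3), ccv_nnc_no_hint, 0,
	TENSOR_LIST(x, gamma, beta), TENSOR_LIST(y, saved_mean, saved_inv_std), 0);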
287 | | |
288 | | static int _ccv_nnc_rmsnorm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
289 | 0 | { |
290 | | // 2 inputs (x, gamma) |
291 | | // 2 outputs (y, saved_inv_std) |
292 | 0 | if (input_bitmasks[0] == 3u && output_bitmasks[0] == 3u) |
293 | 0 | return 1; |
294 | 0 | return 0; |
295 | 0 | } |
296 | | |
297 | | static int _ccv_nnc_rmsnorm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size) |
298 | 47 | { |
299 | | // 1 + 4 + 8 + 32 |
300 | | // Inputs (gradient, 0, x, gamma, 0, saved_inv_std) |
301 | | // Output the propagated error, dgamma |
302 | 47 | if ((input_bitmasks[0] & 45u) == 45u && (output_bitmasks[0] & 3u) == 3u)
303 | 11 | return 1;
304 | 36 | if ((input_bitmasks[0] & 45u) == 45u && (output_bitmasks[0] & 1u) == 1u)
305 | 6 | return 1; |
306 | 30 | return 0; |
307 | 36 | } |
308 | | |
309 | | static void _ccv_nnc_rmsnorm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
310 | 13 | { |
311 | 13 | assert(input_size == 2); |
312 | 13 | assert(output_size == 1 || output_size == 2); |
313 | 13 | outputs[0] = inputs[0]; |
314 | 13 | if (output_size == 1) |
315 | 0 | return; |
316 | 13 | int i, j; |
317 | 26 | for (i = 1; i < output_size; i++)
318 | 13 | {
319 | 13 | outputs[i] = inputs[0];
320 | 52 | for (j = 0; j < cmd.rmsnorm.count; j++)
321 | 39 | outputs[i].dim[cmd.rmsnorm.axis[j]] = 1; // Reduce the dimension to 1. |
322 | 13 | } |
323 | 13 | } |
324 | | |
325 | | static void _ccv_nnc_rmsnorm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
326 | 5 | { |
327 | 5 | assert(input_size == 6); |
328 | 5 | assert(output_size == 1 || output_size == 2); |
329 | 5 | outputs[0] = inputs[0]; |
330 | 5 | int i, j; |
331 | 10 | for (i = 1; i < output_size; i++)
332 | 5 | {
333 | 5 | outputs[i] = inputs[0];
334 | 20 | for (j = 0; j < cmd.rmsnorm.count; j++)
335 | 15 | outputs[i].dim[cmd.rmsnorm.axis[j]] = 1; // Reduce the dimension to 1. |
336 | 5 | } |
337 | 5 | } |
338 | | |
339 | | REGISTER_COMMAND(CCV_NNC_RMSNORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry) |
340 | | FIND_BACKEND(ccv_nnc_rmsnorm_cpu_ref.c, gpu/ccv_nnc_rmsnorm_gpu_cudnn.cu, mps/ccv_nnc_rmsnorm_mps.m) |
341 | 1 | { |
342 | 1 | registry->bitmask = _ccv_nnc_rmsnorm_forw_bitmask; |
343 | 1 | registry->tensor_auto = _ccv_nnc_rmsnorm_tensor_auto_forw; |
344 | 1 | } |
345 | | |
346 | | REGISTER_COMMAND(CCV_NNC_RMSNORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry) |
347 | | FIND_BACKEND(ccv_nnc_rmsnorm_cpu_ref.c, gpu/ccv_nnc_rmsnorm_gpu_cudnn.cu, mps/ccv_nnc_rmsnorm_mps.m) |
348 | 1 | { |
349 | 1 | registry->bitmask = _ccv_nnc_rmsnorm_back_bitmask; |
350 | 1 | registry->tensor_auto = _ccv_nnc_rmsnorm_tensor_auto_back; |
351 | 1 | } |
352 | | |
353 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_RMSNORM_FORWARD) |
354 | | #define CMD_RMSNORM_FORWARD(_epsilon, ...) ccv_nnc_cmd(CCV_NNC_RMSNORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.rmsnorm={.epsilon=_epsilon,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0) |
355 | | //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_RMSNORM_BACKWARD) |
356 | | #define CMD_RMSNORM_BACKWARD(_epsilon, ...) ccv_nnc_cmd(CCV_NNC_RMSNORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.rmsnorm={.epsilon=_epsilon,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0) |
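And a minimal usage sketch of the CMD_RMSNORM_FORWARD macro above, with the same caveats (illustrative shapes, easy-API helpers assumed):

// Hypothetical call: RMSNorm over the last axis of a {4, 128} tensor.
ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 128), 0);
ccv_nnc_tensor_t* const gamma = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 128), 0);
ccv_nnc_tensor_t* const saved_inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 1), 0);
// epsilon = 1e-6, reduce axis = 1; outputs are y and the saved inverse std.
ccv_nnc_cmd_exec(CMD_RMSNORM_FORWARD(1e-6, 1), ccv_nnc_no_hint, 0,
	TENSOR_LIST(x, gamma), TENSOR_LIST(y, saved_inv_std), 0);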