Coverage Report

Created: 2026-04-18 18:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/norm/ccv_nnc_norm.c
Line
Count
Source
1
#include "ccv.h"
2
#include "nnc/ccv_nnc.h"
3
#include "nnc/ccv_nnc_internal.h"
4
5
static int _ccv_nnc_batch_norm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
6
675
{
7
  // 5 inputs (x, scale, bias, mean, var)
8
  // 1 outputs (y)
9
675
  if (input_bitmasks[0] == 31u && output_bitmasks[0] == 1u)
10
0
    return 1;
11
  // 5 inputs (x, scale, bias, mean, var)
12
  // 5 outputs (y, mean, var, saved_mean, saved_inv_var)
13
  // Both mean and var in output is inplace for the input mean, var
14
675
  if (input_bitmasks[0] == 31u && output_bitmasks[0] == 31u)
15
375
    return 1;
16
300
  return 0;
17
675
}
18
19
static int _ccv_nnc_batch_norm_enforce_inplace(const ccv_nnc_cmd_param_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
20
2.27k
{
21
2.27k
  if (input_idx == 3 && 
output_idx == 1455
)
22
91
    return 1;
23
2.18k
  if (input_idx == 4 && 
output_idx == 2454
)
24
91
    return 1;
25
2.09k
  return 0;
26
2.18k
}
27
28
static int _ccv_nnc_batch_norm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
29
657
{
30
  // 0b110000001100001
31
  // Inputs (gradient, 0, 0, 0, 0, x, scale, 0, 0, 0, 0, 0, 0, saved_mean, saved_inv_var)
32
  // Output the propagated error, dscale and dbias
33
657
  if ((input_bitmasks[0] & 24673u) == 24673u && 
(output_bitmasks[0] & 7u) == 7u377
)
34
377
    return 1;
35
280
  return 0;
36
657
}
37
38
static void _ccv_nnc_batch_norm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
39
1.90k
{
40
1.90k
  assert(input_size == 5);
41
1.90k
  assert(output_size == 1 || output_size == 5);
42
1.90k
  outputs[0] = inputs[0];
43
1.90k
  if (output_size == 1)
44
0
    return;
45
1.90k
  int i, j;
46
9.52k
  for (i = 1; i < output_size; 
i++7.62k
)
47
7.62k
  {
48
7.62k
    outputs[i] = inputs[0];
49
30.4k
    for (j = 0; j < cmd.bnorm.count; 
j++22.8k
)
50
22.8k
      outputs[i].dim[cmd.bnorm.axis[j]] = 1; // Reduce the dimension to 1.
51
7.62k
  }
52
1.90k
}
53
54
static void _ccv_nnc_batch_norm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
55
1.66k
{
56
1.66k
  assert(input_size == 15);
57
1.66k
  assert(output_size == 5);
58
1.66k
  outputs[0] = inputs[0];
59
1.66k
  int i, j;
60
8.30k
  for (i = 1; i < output_size; 
i++6.64k
)
61
6.64k
  {
62
6.64k
    outputs[i] = inputs[0];
63
26.5k
    for (j = 0; j < cmd.bnorm.count; 
j++19.9k
)
64
19.9k
      outputs[i].dim[cmd.bnorm.axis[j]] = 1; // Reduce the dimension to 1.
65
6.64k
  }
66
1.66k
}
67
68
REGISTER_COMMAND(CCV_NNC_BATCH_NORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
69
  FIND_BACKEND(ccv_nnc_batch_norm_cpu_ref.c, gpu/ccv_nnc_batch_norm_gpu_cudnn.cu)
70
1
{
71
1
  registry->bitmask = _ccv_nnc_batch_norm_forw_bitmask;
72
1
  registry->tensor_auto = _ccv_nnc_batch_norm_tensor_auto_forw;
73
1
  registry->enforce_inplace = _ccv_nnc_batch_norm_enforce_inplace;
74
1
}
75
76
REGISTER_COMMAND(CCV_NNC_BATCH_NORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
77
  FIND_BACKEND(ccv_nnc_batch_norm_cpu_ref.c, gpu/ccv_nnc_batch_norm_gpu_cudnn.cu)
78
1
{
79
1
  registry->bitmask = _ccv_nnc_batch_norm_back_bitmask;
80
1
  registry->tensor_auto = _ccv_nnc_batch_norm_tensor_auto_back;
81
1
}
82
83
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_BATCH_NORM_FORWARD)
84
#define CMD_BATCH_NORM_FORWARD(_epsilon, _is_test, _momentum, ...) ccv_nnc_cmd(CCV_NNC_BATCH_NORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.bnorm={.epsilon=_epsilon,.is_test=_is_test,.momentum=_momentum,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0)
85
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_BATCH_NORM_BACKWARD)
86
#define CMD_BATCH_NORM_BACKWARD(_epsilon, _is_test, _momentum, ...) ccv_nnc_cmd(CCV_NNC_BATCH_NORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.bnorm={.epsilon=_epsilon,.is_test=_is_test,.momentum=_momentum,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0)
87
88
static int _ccv_nnc_layer_norm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
89
40
{
90
40
  if (cmd.lnorm.elementwise_affine)
91
40
  {
92
    // 3 inputs (x, gamma, beta)
93
    // 3 outputs (y, saved_mean, saved_inv_std)
94
40
    if (input_bitmasks[0] == 7u && output_bitmasks[0] == 7u)
95
24
      return 1;
96
    // 3 inputs (x, gamma, beta)
97
    // 1 output (y)
98
16
    if (input_bitmasks[0] == 7u && output_bitmasks[0] == 1u)
99
0
      return 1;
100
16
  } else {
101
    // 1 inputs (x)
102
    // 3 outputs (y, saved_mean, saved_inv_std)
103
0
    if (input_bitmasks[0] == 1u && output_bitmasks[0] == 7u)
104
0
      return 1;
105
    // 1 inputs (x)
106
    // 1 output (y)
107
0
    if (input_bitmasks[0] == 1u && output_bitmasks[0] == 1u)
108
0
      return 1;
109
0
  }
110
16
  return 0;
111
40
}
112
113
static int _ccv_nnc_layer_norm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
114
239
{
115
239
  if (cmd.lnorm.elementwise_affine)
116
189
  {
117
    // 0b110011001
118
    // Inputs (gradient, 0, 0, x, gamma, 0, 0, saved_mean, saved_inv_std)
119
    // Output the propagated error, dgamma and dbeta
120
189
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 7u) == 7u69
)
121
57
      return 1;
122
132
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 5u) == 5u12
)
123
2
      return 1;
124
130
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 3u) == 3u10
)
125
0
      return 1;
126
130
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 1u) == 1u10
)
127
10
      return 1;
128
130
  } else {
129
    // 0b1101001
130
    // Inputs (gradient, 0, 0, x, 0, saved_mean, saved_inv_std)
131
    // Output the propagated error
132
50
    if ((input_bitmasks[0] & 105u) == 105u && 
(output_bitmasks[0] & 1u) == 1u20
)
133
20
      return 1;
134
50
  }
135
150
  return 0;
136
239
}
137
138
static void _ccv_nnc_layer_norm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
139
386
{
140
386
  assert(input_size == 3 || input_size == 1);
141
386
  assert(output_size == 1 || output_size == 3);
142
386
  outputs[0] = inputs[0];
143
386
  if (output_size == 1)
144
0
    return;
145
386
  int i, j;
146
1.15k
  for (i = 1; i < output_size; 
i++772
)
147
772
  {
148
772
    outputs[i] = inputs[0];
149
1.64k
    for (j = 0; j < cmd.lnorm.count; 
j++868
)
150
868
      outputs[i].dim[cmd.lnorm.axis[j]] = 1; // Reduce the dimension to 1.
151
772
  }
152
386
}
153
154
static void _ccv_nnc_layer_norm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
155
306
{
156
306
  assert(input_size == 9 || input_size == 7);
157
306
  assert(output_size == 1 || output_size == 3);
158
306
  outputs[0] = inputs[0];
159
306
  int i, j;
160
908
  for (i = 1; i < output_size; 
i++602
)
161
602
  {
162
602
    outputs[i] = inputs[0];
163
1.22k
    for (j = 0; j < cmd.lnorm.count; 
j++620
)
164
620
      outputs[i].dim[cmd.lnorm.axis[j]] = 1; // Reduce the dimension to 1.
165
602
  }
166
306
}
167
168
REGISTER_COMMAND(CCV_NNC_LAYER_NORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
169
  FIND_BACKEND(ccv_nnc_layer_norm_cpu_ref.c, gpu/ccv_nnc_layer_norm_gpu_cudnn.cu, mps/ccv_nnc_layer_norm_mps.m)
170
1
{
171
1
  registry->bitmask = _ccv_nnc_layer_norm_forw_bitmask;
172
1
  registry->tensor_auto = _ccv_nnc_layer_norm_tensor_auto_forw;
173
1
}
174
175
REGISTER_COMMAND(CCV_NNC_LAYER_NORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
176
  FIND_BACKEND(ccv_nnc_layer_norm_cpu_ref.c, gpu/ccv_nnc_layer_norm_gpu_cudnn.cu, mps/ccv_nnc_layer_norm_mps.m)
177
1
{
178
1
  registry->bitmask = _ccv_nnc_layer_norm_back_bitmask;
179
1
  registry->tensor_auto = _ccv_nnc_layer_norm_tensor_auto_back;
180
1
}
181
182
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_LAYER_NORM_FORWARD)
183
#define CMD_LAYER_NORM_FORWARD(_epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_LAYER_NORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.lnorm={.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0)
184
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_LAYER_NORM_BACKWARD)
185
#define CMD_LAYER_NORM_BACKWARD(_epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_LAYER_NORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.lnorm={.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0)
186
187
static int _ccv_nnc_group_norm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
188
0
{
189
0
  if (cmd.gnorm.elementwise_affine)
190
0
  {
191
    // 3 inputs (x, gamma, beta)
192
    // 3 outputs (y, saved_mean, saved_inv_std)
193
0
    if (input_bitmasks[0] == 7u && output_bitmasks[0] == 7u)
194
0
      return 1;
195
    // 3 inputs (x, gamma, beta)
196
    // 1 output (y)
197
0
    if (input_bitmasks[0] == 7u && output_bitmasks[0] == 1u)
198
0
      return 1;
199
0
  } else {
200
    // 1 inputs (x)
201
    // 3 outputs (y, saved_mean, saved_inv_std)
202
0
    if (input_bitmasks[0] == 1u && output_bitmasks[0] == 7u)
203
0
      return 1;
204
    // 1 inputs (x)
205
    // 1 output (y)
206
0
    if (input_bitmasks[0] == 1u && output_bitmasks[0] == 1u)
207
0
      return 1;
208
0
  }
209
0
  return 0;
210
0
}
211
212
static int _ccv_nnc_group_norm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
213
178
{
214
178
  if (cmd.gnorm.elementwise_affine)
215
108
  {
216
    // 0b110011001
217
    // Inputs (gradient, 0, 0, x, gamma, 0, 0, saved_mean, saved_inv_std)
218
    // Output the propagated error, dgamma and dbeta
219
108
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 7u) == 7u44
)
220
32
      return 1;
221
76
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 5u) == 5u12
)
222
2
      return 1;
223
74
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 3u) == 3u10
)
224
0
      return 1;
225
74
    if ((input_bitmasks[0] & 409u) == 409u && 
(output_bitmasks[0] & 1u) == 1u10
)
226
10
      return 1;
227
74
  } else {
228
    // 0b1101001
229
    // Inputs (gradient, 0, 0, x, 0, saved_mean, saved_inv_std)
230
    // Output the propagated error
231
70
    if ((input_bitmasks[0] & 105u) == 105u && 
(output_bitmasks[0] & 7u) == 1u28
)
232
28
      return 1;
233
70
  }
234
106
  return 0;
235
178
}
236
237
static void _ccv_nnc_group_norm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
238
41
{
239
41
  assert(input_size == 3 || input_size == 1);
240
41
  assert(output_size == 1 || output_size == 3);
241
41
  outputs[0] = inputs[0];
242
41
  if (output_size == 1)
243
0
    return;
244
41
  int i, j;
245
123
  for (i = 1; i < output_size; 
i++82
)
246
82
  {
247
82
    outputs[i] = inputs[0];
248
82
    outputs[i].dim[cmd.gnorm.group_axis] = cmd.gnorm.groups; // Reduce to num_groups.
249
130
    for (j = 0; j < cmd.gnorm.reduce_count; 
j++48
)
250
48
      outputs[i].dim[cmd.gnorm.reduce_axis[j]] = 1;
251
82
  }
252
41
}
253
254
static void _ccv_nnc_group_norm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
255
15
{
256
15
  assert(input_size == 9 || input_size == 7);
257
15
  assert(output_size == 1 || output_size == 3);
258
15
  outputs[0] = inputs[0];
259
15
  int i, j;
260
31
  for (i = 1; i < output_size; 
i++16
)
261
16
  {
262
16
    outputs[i] = inputs[0];
263
16
    outputs[i].dim[cmd.gnorm.group_axis] = cmd.gnorm.groups; // Reduce the dimension to num_groups.
264
24
    for (j = 0; j < cmd.gnorm.reduce_count; 
j++8
)
265
8
      outputs[i].dim[cmd.gnorm.reduce_axis[j]] = 1;
266
16
  }
267
15
}
268
269
REGISTER_COMMAND(CCV_NNC_GROUP_NORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
270
  FIND_BACKEND(ccv_nnc_group_norm_cpu_ref.c, gpu/ccv_nnc_group_norm_gpu_cudnn.cu, mps/ccv_nnc_group_norm_mps.m)
271
1
{
272
1
  registry->bitmask = _ccv_nnc_group_norm_forw_bitmask;
273
1
  registry->tensor_auto = _ccv_nnc_group_norm_tensor_auto_forw;
274
1
}
275
276
REGISTER_COMMAND(CCV_NNC_GROUP_NORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
277
  FIND_BACKEND(ccv_nnc_group_norm_cpu_ref.c, gpu/ccv_nnc_group_norm_gpu_cudnn.cu, mps/ccv_nnc_group_norm_mps.m)
278
1
{
279
1
  registry->bitmask = _ccv_nnc_group_norm_back_bitmask;
280
1
  registry->tensor_auto = _ccv_nnc_group_norm_tensor_auto_back;
281
1
}
282
283
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_GROUP_NORM_FORWARD)
284
#define CMD_GROUP_NORM_FORWARD(_group_axis, _groups, _epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_GROUP_NORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.gnorm={.group_axis=_group_axis,.groups=_groups,.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.reduce_count=LIST_COUNT(__VA_ARGS__),.reduce_axis={__VA_ARGS__}}}), 0)
285
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_GROUP_NORM_BACKWARD)
286
#define CMD_GROUP_NORM_BACKWARD(_group_axis, _groups, _epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_GROUP_NORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.gnorm={.group_axis=_group_axis,.groups=_groups,.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.reduce_count=LIST_COUNT(__VA_ARGS__),.reduce_axis={__VA_ARGS__}}}), 0)
287
288
static int _ccv_nnc_rmsnorm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
289
0
{
290
0
  if (cmd.rmsnorm.elementwise_affine)
291
0
  {
292
    // 2 inputs (x, gamma)
293
    // 2 outputs (y, saved_inv_std)
294
0
    if (input_bitmasks[0] == 3u && output_bitmasks[0] == 3u)
295
0
      return 1;
296
0
  } else {
297
    // 1 inputs (x)
298
    // 2 outputs (y, saved_inv_std)
299
0
    if (input_bitmasks[0] == 1u && output_bitmasks[0] == 3u)
300
0
      return 1;
301
0
  }
302
0
  return 0;
303
0
}
304
305
static int _ccv_nnc_rmsnorm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
306
82
{
307
82
  if (cmd.rmsnorm.elementwise_affine)
308
47
  {
309
    // 1 + 4 + 8 + 32
310
    // Inputs (gradient, 0, x, gamma, 0, saved_inv_std)
311
    // Output the propagated error, dgamma
312
47
    if ((input_bitmasks[0] & 45u) == 45u && 
(output_bitmasks[0] & 3u) == 3u17
)
313
11
      return 1;
314
36
    if ((input_bitmasks[0] & 45u) == 45u && 
(output_bitmasks[0] & 1u) == 1u6
)
315
6
      return 1;
316
36
  } else {
317
    // 1 + 4 + 16
318
    // Inputs (gradient, 0, x, 0, saved_inv_std)
319
    // Output the propagated error
320
35
    if ((input_bitmasks[0] & 21u) == 21u && 
(output_bitmasks[0] & 1u) == 1u15
)
321
15
      return 1;
322
35
  }
323
50
  return 0;
324
82
}
325
326
static void _ccv_nnc_rmsnorm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
327
26
{
328
26
  assert(input_size == 2 || input_size == 1);
329
26
  assert(output_size == 1 || output_size == 2);
330
26
  outputs[0] = inputs[0];
331
26
  if (output_size == 1)
332
0
    return;
333
26
  int i, j;
334
52
  for (i = 1; i < output_size; 
i++26
)
335
26
  {
336
26
    outputs[i] = inputs[0];
337
104
    for (j = 0; j < cmd.rmsnorm.count; 
j++78
)
338
78
      outputs[i].dim[cmd.rmsnorm.axis[j]] = 1; // Reduce the dimension to 1.
339
26
  }
340
26
}
341
342
static void _ccv_nnc_rmsnorm_tensor_auto_back(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
343
10
{
344
10
  assert(input_size == 6 || input_size == 5);
345
10
  assert(output_size == 1 || output_size == 2);
346
10
  outputs[0] = inputs[0];
347
10
  int i, j;
348
15
  for (i = 1; i < output_size; 
i++5
)
349
5
  {
350
5
    outputs[i] = inputs[0];
351
20
    for (j = 0; j < cmd.rmsnorm.count; 
j++15
)
352
15
      outputs[i].dim[cmd.rmsnorm.axis[j]] = 1; // Reduce the dimension to 1.
353
5
  }
354
10
}
355
356
REGISTER_COMMAND(CCV_NNC_RMSNORM_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
357
  FIND_BACKEND(ccv_nnc_rmsnorm_cpu_ref.c, gpu/ccv_nnc_rmsnorm_gpu_cudnn.cu, mps/ccv_nnc_rmsnorm_mps.m)
358
1
{
359
1
  registry->bitmask = _ccv_nnc_rmsnorm_forw_bitmask;
360
1
  registry->tensor_auto = _ccv_nnc_rmsnorm_tensor_auto_forw;
361
1
}
362
363
REGISTER_COMMAND(CCV_NNC_RMSNORM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
364
  FIND_BACKEND(ccv_nnc_rmsnorm_cpu_ref.c, gpu/ccv_nnc_rmsnorm_gpu_cudnn.cu, mps/ccv_nnc_rmsnorm_mps.m)
365
1
{
366
1
  registry->bitmask = _ccv_nnc_rmsnorm_back_bitmask;
367
1
  registry->tensor_auto = _ccv_nnc_rmsnorm_tensor_auto_back;
368
1
}
369
370
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_RMSNORM_FORWARD)
371
#define CMD_RMSNORM_FORWARD(_epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_RMSNORM_FORWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.rmsnorm={.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0)
372
//@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_RMSNORM_BACKWARD)
373
#define CMD_RMSNORM_BACKWARD(_epsilon, _elementwise_affine, ...) ccv_nnc_cmd(CCV_NNC_RMSNORM_BACKWARD, 0, ((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.rmsnorm={.epsilon=_epsilon,.elementwise_affine=_elementwise_affine,.count=LIST_COUNT(__VA_ARGS__),.axis={__VA_ARGS__}}}), 0)