Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/norm/ccv_nnc_rmsnorm_cpu_ref.c
Line|  Count|Source
   1|       |#include "ccv.h"
   2|       |#include "ccv_internal.h"
   3|       |#include "nnc/ccv_nnc.h"
   4|       |#include "nnc/ccv_nnc_easy.h"
   5|       |#include "nnc/ccv_nnc_internal.h"
   6|       |#ifdef USE_OPENMP
   7|       |#include <omp.h>
   8|       |#endif
   9|       |#ifdef USE_DISPATCH
  10|       |#include <dispatch/dispatch.h>
  11|       |#endif
  12|       |
  13|       |// Shared methods.
  14|       |#include "../_ccv_nnc_cpu_ref.h"
  15|       |
  16|       |static int _ccv_nnc_rmsnorm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
  17|      5|{
  18|      5|	assert(input_size == 2);
  19|      5|	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  20|      5|	ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[1];
  21|      5|	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  22|      5|	ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)outputs[1];
  23|      5|	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  24|      5|	assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  25|       |	// Assuming this is float 32.
  26|      5|	int adim[CCV_NNC_MAX_DIM_ALLOC];
  27|      5|	int rdim[CCV_NNC_MAX_DIM_ALLOC];
  28|      5|	ccv_nnc_tensor_view_get_dim(a, adim);
  29|      5|	ccv_nnc_tensor_view_get_dim(saved_inv_std, rdim);
  30|      5|	assert(ccv_nnc_tensor_view_check_dim(b, adim));
  31|      5|	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  32|      5|	int astride[CCV_NNC_MAX_DIM_ALLOC];
  33|      5|	int bstride[CCV_NNC_MAX_DIM_ALLOC];
  34|      5|	int scale_stride[CCV_NNC_MAX_DIM_ALLOC];
  35|      5|	ccv_nnc_tensor_view_get_stride(a, astride);
  36|      5|	ccv_nnc_tensor_view_get_stride(scale, scale_stride);
  37|      5|	ccv_nnc_tensor_view_get_stride(b, bstride);
  38|       |	// The epsilon is used a little bit differently from batch norm, it is outside of the sqrt in this case.
  39|      5|	const float epsilon = cmd.info.rmsnorm.epsilon;
  40|      5|	int saved_inv_std_stride[CCV_NNC_MAX_DIM_ALLOC];
  41|      5|	ccv_nnc_tensor_view_get_stride(saved_inv_std, saved_inv_std_stride);
  42|      5|	int x;
  43|      5|	int n = 1;
  44|     25|	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
  45|     20|		n *= adim[x];
  46|     25|	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
  47|     20|		n /= rdim[x];
  48|      5|	const float inv_n = 1. / n;
  49|      5|	ccv_nnc_tensor_zero(saved_inv_std);
  50|      5|	float* const ap = a->data.f32;
  51|      5|	float* const varp = saved_inv_std->data.f32;
  52|      5|	int i[CCV_NNC_MAX_DIM + 2];
  53|     27|	for (i[0] = 0; i[0] < adim[0]; i[0]++)
  54|     22|	{
  55|     22|		float* const ap0 = ap + i[0] * astride[0];
  56|     22|		float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
  57|     98|		for (i[1] = 0; i[1] < adim[1]; i[1]++)
  58|     76|		{
  59|     76|			float* ap1 = ap0 + i[1] * astride[1];
  60|     76|			float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
  61|    356|			for (i[2] = 0; i[2] < adim[2]; i[2]++)
  62|    280|			{
  63|    280|				float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
  64|    280|				if (rdim[3] == 1)
  65|  3.08k|					for (x = 0; x < adim[3]; x++)
  66|  2.80k|					{
  67|  2.80k|						float w = ap1[x * astride[3]];
  68|  2.80k|						varp2[0] += w * w;
  69|  2.80k|					}
  70|      0|				else
  71|      0|					for (x = 0; x < adim[3]; x++)
  72|      0|					{
  73|      0|						float w = ap1[x * astride[3]];
  74|      0|						varp2[x] += w * w;
  75|      0|					}
  76|    280|				ap1 += astride[2];
  77|    280|			}
  78|     76|		}
  79|     22|	}
  80|     27|	for (i[0] = 0; i[0] < rdim[0]; i[0]++)
  81|     22|	{
  82|     22|		float* const varp0 = varp + i[0] * saved_inv_std_stride[0];
  83|     44|		for (i[1] = 0; i[1] < rdim[1]; i[1]++)
  84|     22|		{
  85|     22|			float* const varp1 = varp0 + i[1] * saved_inv_std_stride[1];
  86|     44|			for (i[2] = 0; i[2] < rdim[2]; i[2]++)
  87|     22|			{
  88|     22|				float* const varp2 = varp1 + i[2] * saved_inv_std_stride[2];
  89|     44|				for (x = 0; x < rdim[3]; x++)
  90|     22|					varp2[x] = 1. / sqrtf(varp2[x] * inv_n + epsilon);
  91|     22|			}
  92|     22|		}
  93|     22|	}
  94|      5|	float* const scalep = scale->data.f32;
  95|      5|	int sdim[CCV_NNC_MAX_DIM_ALLOC];
  96|      5|	ccv_nnc_tensor_view_get_dim(scale, sdim);
  97|       |	// Do the straight-forward one, y = x * inv_std * scale + bias, we cannot allocate extra memory to help.
  98|       |	// There is no need for precompute since scale / bias is per element.
  99|      5|	float* const bp = b->data.f32;
 100|     27|	for (i[0] = 0; i[0] < adim[0]; i[0]++)
 101|     22|	{
 102|     22|		float* const ap0 = ap + i[0] * astride[0];
 103|     22|		float* const bp0 = bp + i[0] * bstride[0];
 104|     22|		float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
 105|     22|		float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
 106|     98|		for (i[1] = 0; i[1] < adim[1]; i[1]++)
 107|     76|		{
 108|     76|			float* ap1 = ap0 + i[1] * astride[1];
 109|     76|			float* bp1 = bp0 + i[1] * bstride[1];
 110|     76|			float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
 111|     76|			float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
 112|    356|			for (i[2] = 0; i[2] < adim[2]; i[2]++)
 113|    280|			{
 114|    280|				float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
 115|    280|				float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
 116|    280|				if (rdim[3] == 1)
 117|  3.08k|					for (x = 0; x < adim[3]; x++)
 118|  2.80k|						bp1[x] = ap1[x * astride[3]] * varp2[0] * scalep2[sdim[3] == 1 ? 0 : x];
 119|      0|				else
 120|      0|					for (x = 0; x < adim[3]; x++)
 121|      0|						bp1[x] = ap1[x * astride[3]] * varp2[x] * scalep2[sdim[3] == 1 ? 0 : x];
 122|    280|				ap1 += astride[2];
 123|    280|				bp1 += bstride[2];
 124|    280|			}
 125|     76|		}
 126|     22|	}
 127|      5|	return CCV_NNC_EXEC_SUCCESS;
 128|      5|}
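
Note: the forward kernel above is a strided, reduction-aware RMSNorm; per reduction group it computes inv_std = 1 / sqrtf(mean(x^2) + epsilon) and y = x * inv_std * scale. A minimal contiguous 1-D sketch of the same computation (rmsnorm_1d is a hypothetical helper, not part of ccv; it assumes the reduction runs over the whole vector and a per-element scale):

#include <math.h>

// inv_std = 1 / sqrt(mean(x^2) + epsilon); y[i] = x[i] * inv_std * scale[i].
// Mirrors one reduction group of _ccv_nnc_rmsnorm_forw, without views / strides.
static void rmsnorm_1d(const float* x, const float* scale, float* y, int n, float epsilon)
{
	float sumsq = 0;
	int i;
	for (i = 0; i < n; i++)
		sumsq += x[i] * x[i];
	const float inv_std = 1.f / sqrtf(sumsq / n + epsilon);
	for (i = 0; i < n; i++)
		y[i] = x[i] * inv_std * scale[i];
}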
 129|       |
 130|       |static int _ccv_nnc_rmsnorm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 131|      3|{
 132|      3|	assert(input_size == 6);
 133|      3|	assert(output_size >= 1);
 134|      3|	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
 135|      3|	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[2];
 136|      3|	ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[3];
 137|      3|	ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)inputs[5];
 138|      3|	ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
 139|      3|	ccv_nnc_tensor_view_t* const dscale = output_size > 1 ? (ccv_nnc_tensor_view_t*)outputs[1] : 0;
 140|      3|	assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
 141|      3|	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
 142|      3|	assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
 143|       |	// Assuming this is float 32.
 144|      3|	int gdim[CCV_NNC_MAX_DIM_ALLOC];
 145|      3|	int rdim[CCV_NNC_MAX_DIM_ALLOC];
 146|      3|	ccv_nnc_tensor_view_get_dim(g, gdim);
 147|      3|	ccv_nnc_tensor_view_get_dim(saved_inv_std, rdim);
 148|      3|	int sdim[CCV_NNC_MAX_DIM_ALLOC];
 149|      3|	ccv_nnc_tensor_view_get_dim(scale, sdim);
 150|      3|	if (dscale)
 151|      2|		{ assert(ccv_nnc_tensor_view_check_dim(dscale, sdim)); }
 152|      3|	assert(ccv_nnc_tensor_view_check_dim(a, gdim));
 153|      3|	assert(ccv_nnc_tensor_view_check_dim(h, gdim));
 154|      3|	int astride[CCV_NNC_MAX_DIM_ALLOC];
 155|      3|	int gstride[CCV_NNC_MAX_DIM_ALLOC];
 156|      3|	int hstride[CCV_NNC_MAX_DIM_ALLOC];
 157|      3|	int scale_stride[CCV_NNC_MAX_DIM_ALLOC];
 158|      3|	int inv_std_stride[CCV_NNC_MAX_DIM_ALLOC];
 159|      3|	int dscale_stride[CCV_NNC_MAX_DIM_ALLOC];
 160|      3|	ccv_nnc_tensor_view_get_stride(a, astride);
 161|      3|	ccv_nnc_tensor_view_get_stride(g, gstride);
 162|      3|	ccv_nnc_tensor_view_get_stride(h, hstride);
 163|      3|	ccv_nnc_tensor_view_get_stride(scale, scale_stride);
 164|      3|	ccv_nnc_tensor_view_get_stride(saved_inv_std, inv_std_stride);
 165|      3|	if (dscale)
 166|      2|		ccv_nnc_tensor_view_get_stride(dscale, dscale_stride);
 167|       |	// Need to allocate two additional memory:
 168|       |	// 1. normalized a;
 169|       |	// 2. scale * inv_std / n;
 170|      3|	assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC));
 171|      3|	int x;
 172|      3|	int n = 1;
 173|     15|	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
 174|     12|		n *= gdim[x];
 175|     15|	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
 176|     12|		n /= rdim[x];
 177|      3|	int gcount = 1, rcount = 1;
 178|     15|	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
 179|     12|		gcount *= gdim[x], rcount *= rdim[x];
 180|      3|	float* const ah = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * gcount * 2 + sizeof(float) * rcount, CCV_TENSOR_CPU_MEMORY);
 181|      3|	float* const gss = ah + gcount; // g * scale * inv_std
 182|      3|	float* const ahgssr = gss + gcount; // ah * gss then reduced to inv_std dimension.
 183|      3|	int i[CCV_NNC_MAX_DIM + 2];
 184|      3|	float* ahp = ah;
 185|      3|	const float* const inv_stdp = saved_inv_std->data.f32;
 186|      3|	const float* const ap = a->data.f32;
 187|     15|	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 188|     12|	{
 189|     12|		const float* const ap0 = ap + i[0] * astride[0];
 190|     12|		const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
 191|     52|		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 192|     40|		{
 193|     40|			const float* ap1 = ap0 + i[1] * astride[1];
 194|     40|			const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
 195|    184|			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 196|    144|			{
 197|    144|				const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
 198|    144|				if (rdim[3] == 1)
 199|  1.58k|					for (x = 0; x < gdim[3]; x++)
 200|  1.44k|						ahp[x] = ap1[x] * inv_stdp2[0];
 201|      0|				else
 202|      0|					for (x = 0; x < gdim[3]; x++)
 203|      0|						ahp[x] = ap1[x] * inv_stdp2[x];
 204|    144|				ap1 += astride[2];
 205|    144|				ahp += gdim[3];
 206|    144|			}
 207|     40|		}
 208|     12|	}
 209|      3|	if (dscale)
 210|      2|	{
 211|      2|		ccv_nnc_tensor_zero(dscale);
 212|      2|		ahp = ah;
 213|      2|		float* gssp = gss;
 214|      2|		const float* const gp = g->data.f32;
 215|      2|		const float* const scalep = scale->data.f32;
 216|      2|		float* const dscalep = dscale->data.f32;
 217|     12|		for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 218|     10|		{
 219|     10|			const float* const gp0 = gp + i[0] * gstride[0];
 220|     10|			const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
 221|     10|			const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
 222|     10|			float* const dscalep0 = sdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_stride[0];
 223|     46|			for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 224|     36|			{
 225|     36|				const float* gp1 = gp0 + i[1] * gstride[1];
 226|     36|				const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
 227|     36|				const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
 228|     36|				float* const dscalep1 = sdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_stride[1];
 229|    172|				for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 230|    136|				{
 231|    136|					const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
 232|    136|					const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
 233|    136|					float* const dscalep2 = sdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_stride[2];
 234|    136|					if (sdim[3] == 1)
 235|      0|						for (x = 0; x < gdim[3]; x++)
 236|      0|						{
 237|      0|							gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
 238|      0|							dscalep2[0] += ahp[x] * gp1[x];
 239|      0|						}
 240|    136|					else
 241|  1.49k|						for (x = 0; x < gdim[3]; x++)
 242|  1.36k|						{
 243|  1.36k|							gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
 244|  1.36k|							dscalep2[x] += ahp[x] * gp1[x];
 245|  1.36k|						}
 246|    136|					gp1 += gstride[2];
 247|    136|					ahp += gdim[3];
 248|    136|					gssp += gdim[3];
 249|    136|				}
 250|     36|			}
 251|     10|		}
 252|      2|	} else {
 253|      1|		float* gssp = gss;
 254|      1|		const float* const gp = g->data.f32;
 255|      1|		const float* const scalep = scale->data.f32;
 256|      3|		for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 257|      2|		{
 258|      2|			const float* const gp0 = gp + i[0] * gstride[0];
 259|      2|			const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
 260|      2|			const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
 261|      6|			for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 262|      4|			{
 263|      4|				const float* gp1 = gp0 + i[1] * gstride[1];
 264|      4|				const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
 265|      4|				const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
 266|     12|				for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 267|      8|				{
 268|      8|					const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
 269|      8|					const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
 270|      8|					if (sdim[3] == 1)
 271|      0|						for (x = 0; x < gdim[3]; x++)
 272|      0|							gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
 273|      8|					else
 274|     88|						for (x = 0; x < gdim[3]; x++)
 275|     80|							gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
 276|      8|					gp1 += gstride[2];
 277|      8|					gssp += gdim[3];
 278|      8|				}
 279|      4|			}
 280|      2|		}
 281|      1|	}
 282|      3|	ahp = ah;
 283|      3|	float* gssp = gss;
 284|      3|	ccv_nnc_tensor_t ahgssrt = ccv_nnc_tensor(ahgssr, saved_inv_std->info, 0);
 285|      3|	ccv_nnc_tensor_zero(&ahgssrt);
 286|      3|	float* const ahgssrp = ahgssr;
 287|     15|	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 288|     12|	{
 289|     12|		float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
 290|     52|		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 291|     40|		{
 292|     40|			float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
 293|    184|			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 294|    144|			{
 295|    144|				float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
 296|    144|				if (rdim[3] == 1)
 297|  1.58k|					for (x = 0; x < gdim[3]; x++)
 298|  1.44k|						ahgssrp2[0] += ahp[x] * gssp[x];
 299|      0|				else
 300|      0|					for (x = 0; x < gdim[3]; x++)
 301|      0|						ahgssrp2[x] += ahp[x] * gssp[x];
 302|    144|				ahp += gdim[3];
 303|    144|				gssp += gdim[3];
 304|    144|			}
 305|     40|		}
 306|     12|	}
 307|       |	// Now the part to compute dx (h).
 308|      3|	float* const hp = h->data.f32;
 309|      3|	ahp = ah;
 310|      3|	const float inv_n = 1. / n;
 311|      3|	gssp = gss;
 312|     15|	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 313|     12|	{
 314|     12|		float* const hp0 = hp + i[0] * hstride[0];
 315|     12|		const float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
 316|     52|		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 317|     40|		{
 318|     40|			float* hp1 = hp0 + i[1] * hstride[1];
 319|     40|			const float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
 320|    184|			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 321|    144|			{
 322|    144|				const float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
 323|    144|				if (rdim[3] == 1)
 324|  1.58k|					for (x = 0; x < gdim[3]; x++)
 325|  1.44k|						hp1[x] = gssp[x] - inv_n * ahp[x] * ahgssrp2[0];
 326|      0|				else
 327|      0|					for (x = 0; x < gdim[3]; x++)
 328|      0|						hp1[x] = gssp[x] - inv_n * ahp[x] * ahgssrp2[x];
 329|    144|				hp1 += hstride[2];
 330|    144|				ahp += gdim[3];
 331|    144|				gssp += gdim[3];
 332|    144|			}
 333|     40|		}
 334|     12|	}
 335|      3|	return CCV_NNC_EXEC_SUCCESS;
 336|      3|}
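
Note: with xhat = x * inv_std and gss = g * scale * inv_std (the ah and gss workspaces above), the backward kernel computes dx = gss - (1/n) * xhat * sum(xhat * gss), where the sum is reduced to the inv_std dimensions and held in ahgssr, and, when requested, dscale = sum(xhat * g) reduced to the scale dimensions. A contiguous 1-D restatement (rmsnorm_grad_1d is a hypothetical helper, not part of ccv; it assumes one reduction group and a per-element scale, so dscale needs no further reduction):

// dx[i] = gss[i] - (1/n) * xhat[i] * sum_j(xhat[j] * gss[j]);
// dscale[i] = xhat[i] * g[i]. inv_std is the value saved by the forward pass.
static void rmsnorm_grad_1d(const float* g, const float* x, const float* scale, float inv_std, float* dx, float* dscale, int n)
{
	float ahgssr = 0; // sum of xhat * gss, the reduced term
	int i;
	for (i = 0; i < n; i++)
		ahgssr += (x[i] * inv_std) * (g[i] * scale[i] * inv_std);
	for (i = 0; i < n; i++)
	{
		dx[i] = g[i] * scale[i] * inv_std - (x[i] * inv_std) * ahgssr / n;
		dscale[i] = (x[i] * inv_std) * g[i];
	}
}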
 337|       |
 338|       |REGISTER_COMMAND_BACKEND(CCV_NNC_RMSNORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 339|      1|{
 340|      1|	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 341|      1|	registry->tensor_datatypes = CCV_32F;
 342|      1|	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 343|      1|	registry->algorithms = 1;
 344|      1|	registry->exec = _ccv_nnc_rmsnorm_forw;
 345|      1|}
 346|       |
 347|       |REGISTER_COMMAND_BACKEND(CCV_NNC_RMSNORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 348|      1|{
 349|      1|	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 350|      1|	registry->tensor_datatypes = CCV_32F;
 351|      1|	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 352|      1|	registry->algorithms = 1;
 353|      1|	registry->exec = _ccv_nnc_rmsnorm_back;
 354|      1|}
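
Note: these registrations bind the two kernels to the CCV_NNC_RMSNORM_FORWARD / CCV_NNC_RMSNORM_BACKWARD commands for the CPU reference backend. A minimal sketch of driving the forward command, under these assumptions: only epsilon is read from cmd.info.rmsnorm by this reference kernel, the reduction axes follow from the shape of the saved_inv_std output (rdim above), and the tensor/command helpers are the usual ones from nnc/ccv_nnc.h and nnc/ccv_nnc_easy.h:

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

int main(void)
{
	ccv_nnc_init(); // pulls in the registered backends, including CPU_REF
	ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
	ccv_nnc_tensor_t* const scale = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 4), 0);
	ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
	ccv_nnc_tensor_t* const inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1), 0); // reduce over the last axis
	const ccv_nnc_cmd_param_t params = { .rmsnorm = { .epsilon = 1e-6 } };
	ccv_nnc_cmd_exec(ccv_nnc_cmd(CCV_NNC_RMSNORM_FORWARD, 0, params, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(x, scale), TENSOR_LIST(y, inv_std), 0);
	ccv_nnc_tensor_free(x);
	ccv_nnc_tensor_free(scale);
	ccv_nnc_tensor_free(y);
	ccv_nnc_tensor_free(inv_std);
	return 0;
}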