Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/norm/ccv_nnc_layer_norm_cpu_ref.c

#include "ccv.h"
#include "ccv_internal.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include "nnc/ccv_nnc_internal.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif

// Shared methods.
#include "../_ccv_nnc_cpu_ref.h"

static int _ccv_nnc_layer_norm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size == 3 || input_size == 1);
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const scale = input_size >= 2 ? (ccv_nnc_tensor_view_t*)inputs[1] : 0;
  ccv_nnc_tensor_view_t* const bias = input_size >= 3 ? (ccv_nnc_tensor_view_t*)inputs[2] : 0;
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)outputs[1];
  ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)outputs[2];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // Assuming this is float 32.
  int adim[CCV_NNC_MAX_DIM_ALLOC];
  int rdim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, adim);
  ccv_nnc_tensor_view_get_dim(saved_mean, rdim);
  assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim));
  assert(ccv_nnc_tensor_view_check_dim(b, adim));
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  int scale_stride[CCV_NNC_MAX_DIM_ALLOC];
  int bias_stride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_stride(a, astride);
  if (scale)
    ccv_nnc_tensor_view_get_stride(scale, scale_stride);
  if (bias)
    ccv_nnc_tensor_view_get_stride(bias, bias_stride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  // The epsilon is used a little differently from batch norm: here it is added to the mean of squared deviations inside the sqrt.
  const float epsilon = cmd.info.lnorm.epsilon;
  int saved_mean_stride[CCV_NNC_MAX_DIM_ALLOC];
  int saved_inv_std_stride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_stride(saved_mean, saved_mean_stride);
  ccv_nnc_tensor_view_get_stride(saved_inv_std, saved_inv_std_stride);
  int x;
  int n = 1;
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
    n *= adim[x];
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
    n /= rdim[x];
  const float inv_n = 1. / n;
  _ccv_nnc_reduce_sum_forw_cpu_ref(a, saved_mean);
  _ccv_nnc_mul_forw_cpu_ref(inv_n, saved_mean, 0, saved_mean);
  ccv_nnc_tensor_zero(saved_inv_std);
  float* const ap = a->data.f32;
  float* const meanp = saved_mean->data.f32;
  float* const varp = saved_inv_std->data.f32;
  int i[CCV_NNC_MAX_DIM + 2];
  for (i[0] = 0; i[0] < adim[0]; i[0]++)
  {
    float* const ap0 = ap + i[0] * astride[0];
    float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_stride[0];
    float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
    for (i[1] = 0; i[1] < adim[1]; i[1]++)
    {
      float* ap1 = ap0 + i[1] * astride[1];
      float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_stride[1];
      float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
      for (i[2] = 0; i[2] < adim[2]; i[2]++)
      {
        float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_stride[2];
        float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
        if (rdim[3] == 1)
          for (x = 0; x < adim[3]; x++)
          {
            float w = ap1[x * astride[3]] - meanp2[0];
            varp2[0] += w * w;
          }
        else // count 0 in this report: the rdim[3] > 1 path was not exercised
          for (x = 0; x < adim[3]; x++)
          {
            float w = ap1[x * astride[3]] - meanp2[x];
            varp2[x] += w * w;
          }
        ap1 += astride[2];
      }
    }
  }
  for (i[0] = 0; i[0] < rdim[0]; i[0]++)
  {
    float* const varp0 = varp + i[0] * saved_inv_std_stride[0];
    for (i[1] = 0; i[1] < rdim[1]; i[1]++)
    {
      float* const varp1 = varp0 + i[1] * saved_inv_std_stride[1];
      for (i[2] = 0; i[2] < rdim[2]; i[2]++)
      {
        float* const varp2 = varp1 + i[2] * saved_inv_std_stride[2];
        for (x = 0; x < rdim[3]; x++)
          varp2[x] = 1. / sqrtf(varp2[x] * inv_n + epsilon);
      }
    }
  }
  if (cmd.info.lnorm.elementwise_affine)
  {
    assert(scale && bias && "Should have both scale and bias");
    float* const scalep = scale->data.f32;
    float* const biasp = bias->data.f32;
    int sdim[CCV_NNC_MAX_DIM_ALLOC];
    ccv_nnc_tensor_view_get_dim(scale, sdim);
    int bias_dim[CCV_NNC_MAX_DIM_ALLOC];
    ccv_nnc_tensor_view_get_dim(bias, bias_dim);
    // Do the straightforward one, y = (x - mean) * inv_std * scale + bias; we cannot allocate extra memory to help.
    // There is no need to precompute since scale / bias is per element.
    float* const bp = b->data.f32;
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      float* const ap0 = ap + i[0] * astride[0];
      float* const bp0 = bp + i[0] * bstride[0];
      float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_stride[0];
      float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
      float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
      float* const biasp0 = bias_dim[0] == 1 ? biasp : biasp + i[0] * bias_stride[0];
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
      {
        float* ap1 = ap0 + i[1] * astride[1];
        float* bp1 = bp0 + i[1] * bstride[1];
        float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_stride[1];
        float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
        float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
        float* const biasp1 = bias_dim[1] == 1 ? biasp0 : biasp0 + i[1] * bias_stride[1];
        for (i[2] = 0; i[2] < adim[2]; i[2]++)
        {
          float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_stride[2];
          float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
          float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
          float* const biasp2 = bias_dim[2] == 1 ? biasp1 : biasp1 + i[2] * bias_stride[2];
          if (rdim[3] == 1)
            for (x = 0; x < adim[3]; x++)
              bp1[x] = (ap1[x * astride[3]] - meanp2[0]) * varp2[0] * scalep2[sdim[3] == 1 ? 0 : x] + biasp2[bias_dim[3] == 1 ? 0 : x];
          else // count 0 in this report
            for (x = 0; x < adim[3]; x++)
              bp1[x] = (ap1[x * astride[3]] - meanp2[x]) * varp2[x] * scalep2[sdim[3] == 1 ? 0 : x] + biasp2[bias_dim[3] == 1 ? 0 : x];
          ap1 += astride[2];
          bp1 += bstride[2];
        }
      }
    }
  } else {
    // Do the straightforward one, y = (x - mean) * inv_std; we cannot allocate extra memory to help.
    float* const bp = b->data.f32;
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
    {
      float* const ap0 = ap + i[0] * astride[0];
      float* const bp0 = bp + i[0] * bstride[0];
      float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_stride[0];
      float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_stride[0];
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
      {
        float* ap1 = ap0 + i[1] * astride[1];
        float* bp1 = bp0 + i[1] * bstride[1];
        float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_stride[1];
        float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_stride[1];
        for (i[2] = 0; i[2] < adim[2]; i[2]++)
        {
          float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_stride[2];
          float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_stride[2];
          if (rdim[3] == 1)
            for (x = 0; x < adim[3]; x++)
              bp1[x] = (ap1[x * astride[3]] - meanp2[0]) * varp2[0];
          else // count 0 in this report
            for (x = 0; x < adim[3]; x++)
              bp1[x] = (ap1[x * astride[3]] - meanp2[x]) * varp2[x];
          ap1 += astride[2];
          bp1 += bstride[2];
        }
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
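
For reference, here is a minimal, self-contained sketch of the per-group math the strided loops above implement (a hypothetical helper, not part of this file; it assumes one contiguous group of n values and mirrors y = (x - mean) * inv_std * scale + bias, with epsilon added inside the sqrt as in the code):

#include <math.h>

// Layer norm over one contiguous group of n values.
static void layer_norm_1d(const float* x, const float* scale, const float* bias, float* y, int n, float epsilon)
{
  int k;
  float mean = 0;
  for (k = 0; k < n; k++)
    mean += x[k];
  mean /= n; // what the reference stores in saved_mean
  float var = 0;
  for (k = 0; k < n; k++)
  {
    const float w = x[k] - mean;
    var += w * w;
  }
  const float inv_std = 1.f / sqrtf(var / n + epsilon); // what it stores in saved_inv_std
  for (k = 0; k < n; k++)
    y[k] = (x[k] - mean) * inv_std * scale[k] + bias[k]; // drop scale / bias when elementwise_affine is off
}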

static int _ccv_nnc_layer_norm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size == 9 || input_size == 7);
  assert(output_size >= 1);
  const int elementwise_affine = cmd.info.lnorm.elementwise_affine;
  ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[3];
  ccv_nnc_tensor_view_t* const scale = elementwise_affine ? (ccv_nnc_tensor_view_t*)inputs[4] : 0;
  ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)inputs[elementwise_affine ? 7 : 5];
  ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)inputs[elementwise_affine ? 8 : 6];
  ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
  ccv_nnc_tensor_view_t* const dscale = output_size > 1 ? (ccv_nnc_tensor_view_t*)outputs[1] : 0;
  ccv_nnc_tensor_view_t* const dbias = output_size > 2 ? (ccv_nnc_tensor_view_t*)outputs[2] : 0;
  assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // Assuming this is float 32.
  int gdim[CCV_NNC_MAX_DIM_ALLOC];
  int rdim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(g, gdim);
  ccv_nnc_tensor_view_get_dim(saved_mean, rdim);
  assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim));
  int sdim[CCV_NNC_MAX_DIM_ALLOC];
  if (scale)
    ccv_nnc_tensor_view_get_dim(scale, sdim);
  if (dscale)
    { assert(ccv_nnc_tensor_view_check_dim(dscale, sdim)); }
  assert(ccv_nnc_tensor_view_check_dim(a, gdim));
  assert(ccv_nnc_tensor_view_check_dim(h, gdim));
  if (dbias)
    _ccv_nnc_reduce_sum_forw_cpu_ref(g, dbias);
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
  int scale_stride[CCV_NNC_MAX_DIM_ALLOC];
  int mean_stride[CCV_NNC_MAX_DIM_ALLOC];
  int inv_std_stride[CCV_NNC_MAX_DIM_ALLOC];
  int dscale_stride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(g, gstride);
  ccv_nnc_tensor_view_get_stride(h, hstride);
  if (scale)
    ccv_nnc_tensor_view_get_stride(scale, scale_stride);
  ccv_nnc_tensor_view_get_stride(saved_mean, mean_stride);
  ccv_nnc_tensor_view_get_stride(saved_inv_std, inv_std_stride);
  if (dscale)
    ccv_nnc_tensor_view_get_stride(dscale, dscale_stride);
  // We need to allocate two additional buffers:
  // 1. normalized a;
  // 2. scale * inv_std / n.
  assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC));
  int x;
  int n = 1;
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
    n *= gdim[x];
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
    n /= rdim[x];
  int gcount = 1, rcount = 1;
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
    gcount *= gdim[x], rcount *= rdim[x];
  float* const ah = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * gcount * 2 + sizeof(float) * rcount * 2, CCV_TENSOR_CPU_MEMORY);
  float* const gss = ah + gcount; // g * scale * inv_std
  float* const gssr = gss + gcount; // gss reduced to inv_std dimension
  float* const ahgssr = gssr + rcount; // ah * gss then reduced to inv_std dimension.
  int i[CCV_NNC_MAX_DIM + 2];
  float* ahp = ah;
  const float* const meanp = saved_mean->data.f32;
  const float* const inv_stdp = saved_inv_std->data.f32;
  const float* const ap = a->data.f32;
  for (i[0] = 0; i[0] < gdim[0]; i[0]++)
  {
    const float* const ap0 = ap + i[0] * astride[0];
    const float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * mean_stride[0];
    const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
    for (i[1] = 0; i[1] < gdim[1]; i[1]++)
    {
      const float* ap1 = ap0 + i[1] * astride[1];
      const float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * mean_stride[1];
      const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
      for (i[2] = 0; i[2] < gdim[2]; i[2]++)
      {
        const float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * mean_stride[2];
        const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
        if (rdim[3] == 1)
          for (x = 0; x < gdim[3]; x++)
            ahp[x] = (ap1[x] - meanp2[0]) * inv_stdp2[0];
        else // count 0 in this report
          for (x = 0; x < gdim[3]; x++)
            ahp[x] = (ap1[x] - meanp2[x]) * inv_stdp2[x];
        ap1 += astride[2];
        ahp += gdim[3];
      }
    }
  }
  if (dscale)
  {
    ccv_nnc_tensor_zero(dscale);
    ahp = ah;
    float* gssp = gss;
    const float* const gp = g->data.f32;
    const float* const scalep = scale->data.f32;
    float* const dscalep = dscale->data.f32;
    for (i[0] = 0; i[0] < gdim[0]; i[0]++)
    {
      const float* const gp0 = gp + i[0] * gstride[0];
      const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
      const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
      float* const dscalep0 = sdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_stride[0];
      for (i[1] = 0; i[1] < gdim[1]; i[1]++)
      {
        const float* gp1 = gp0 + i[1] * gstride[1];
        const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
        const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
        float* const dscalep1 = sdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_stride[1];
        for (i[2] = 0; i[2] < gdim[2]; i[2]++)
        {
          const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
          const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
          float* const dscalep2 = sdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_stride[2];
          if (sdim[3] == 1) // count 0 in this report
            for (x = 0; x < gdim[3]; x++)
            {
              gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
              dscalep2[0] += ahp[x] * gp1[x];
            }
          else
            for (x = 0; x < gdim[3]; x++)
            {
              gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
              dscalep2[x] += ahp[x] * gp1[x];
            }
          gp1 += gstride[2];
          ahp += gdim[3];
          gssp += gdim[3];
        }
      }
    }
  } else {
    float* gssp = gss;
    const float* const gp = g->data.f32;
    if (elementwise_affine)
    {
      const float* const scalep = scale->data.f32;
      for (i[0] = 0; i[0] < gdim[0]; i[0]++)
      {
        const float* const gp0 = gp + i[0] * gstride[0];
        const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
        const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_stride[0];
        for (i[1] = 0; i[1] < gdim[1]; i[1]++)
        {
          const float* gp1 = gp0 + i[1] * gstride[1];
          const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
          const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_stride[1];
          for (i[2] = 0; i[2] < gdim[2]; i[2]++)
          {
            const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
            const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_stride[2];
            if (sdim[3] == 1) // count 0 in this report
              for (x = 0; x < gdim[3]; x++)
                gssp[x] = gp1[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
            else
              for (x = 0; x < gdim[3]; x++)
                gssp[x] = gp1[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
            gp1 += gstride[2];
            gssp += gdim[3];
          }
        }
      }
    } else {
      for (i[0] = 0; i[0] < gdim[0]; i[0]++)
      {
        const float* const gp0 = gp + i[0] * gstride[0];
        const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_stride[0];
        for (i[1] = 0; i[1] < gdim[1]; i[1]++)
        {
          const float* gp1 = gp0 + i[1] * gstride[1];
          const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_stride[1];
          for (i[2] = 0; i[2] < gdim[2]; i[2]++)
          {
            const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_stride[2];
            for (x = 0; x < gdim[3]; x++)
              gssp[x] = gp1[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
            gp1 += gstride[2];
            gssp += gdim[3];
          }
        }
      }
    }
  }
  ccv_nnc_tensor_t gsst = ccv_nnc_tensor(gss, g->info, 0);
  ccv_nnc_tensor_t gssrt = ccv_nnc_tensor(gssr, saved_mean->info, 0);
  _ccv_nnc_reduce_sum_forw_cpu_ref((ccv_nnc_tensor_view_t*)&gsst, (ccv_nnc_tensor_view_t*)&gssrt);
  ahp = ah;
  float* gssp = gss;
  ccv_nnc_tensor_t ahgssrt = ccv_nnc_tensor(ahgssr, saved_mean->info, 0);
  ccv_nnc_tensor_zero(&ahgssrt);
  float* const ahgssrp = ahgssr;
  for (i[0] = 0; i[0] < gdim[0]; i[0]++)
  {
    float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
    for (i[1] = 0; i[1] < gdim[1]; i[1]++)
    {
      float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
      for (i[2] = 0; i[2] < gdim[2]; i[2]++)
      {
        float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
        if (rdim[3] == 1)
          for (x = 0; x < gdim[3]; x++)
            ahgssrp2[0] += ahp[x] * gssp[x];
        else // count 0 in this report
          for (x = 0; x < gdim[3]; x++)
            ahgssrp2[x] += ahp[x] * gssp[x];
        ahp += gdim[3];
        gssp += gdim[3];
      }
    }
  }
  // Now the part to compute dx (h).
  float* const hp = h->data.f32;
  ahp = ah;
  const float inv_n = 1. / n;
  gssp = gss;
  const float* const gssrp = gssr;
  for (i[0] = 0; i[0] < gdim[0]; i[0]++)
  {
    float* const hp0 = hp + i[0] * hstride[0];
    const float* const gssrp0 = rdim[0] == 1 ? gssrp : gssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
    const float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
    for (i[1] = 0; i[1] < gdim[1]; i[1]++)
    {
      float* hp1 = hp0 + i[1] * hstride[1];
      const float* const gssrp1 = rdim[1] == 1 ? gssrp0 : gssrp0 + i[1] * rdim[2] * rdim[3];
      const float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
      for (i[2] = 0; i[2] < gdim[2]; i[2]++)
      {
        const float* const gssrp2 = rdim[2] == 1 ? gssrp1 : gssrp1 + i[2] * rdim[3];
        const float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
        if (rdim[3] == 1)
          for (x = 0; x < gdim[3]; x++)
            hp1[x] = gssp[x] - inv_n * (gssrp2[0] + ahp[x] * ahgssrp2[0]);
        else // count 0 in this report
          for (x = 0; x < gdim[3]; x++)
            hp1[x] = gssp[x] - inv_n * (gssrp2[x] + ahp[x] * ahgssrp2[x]);
        hp1 += hstride[2];
        ahp += gdim[3];
        gssp += gdim[3];
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
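
In the notation of the workspace buffers above (x-hat is the normalized input stored in ah, s is saved_inv_std, gamma is scale, g is the incoming gradient), the loops implement, per normalization group of size n:

  x-hat  = (x - mu) * s
  gss    = g * gamma * s                         (gamma dropped when there is no scale)
  dbias  = sum(g)                                (computed up front via _ccv_nnc_reduce_sum_forw_cpu_ref)
  dscale = sum(g * x-hat)
  dx     = gss - (1/n) * (sum(gss) + x-hat * sum(x-hat * gss))

The two reductions sum(gss) and sum(x-hat * gss) are what the gssr and ahgssr buffers hold, and the last line is exactly the final loop's hp1[x] = gssp[x] - inv_n * (gssrp2[...] + ahp[x] * ahgssrp2[...]).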

REGISTER_COMMAND_BACKEND(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->algorithms = 1;
  registry->exec = _ccv_nnc_layer_norm_forw;
}

REGISTER_COMMAND_BACKEND(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->algorithms = 1;
  registry->exec = _ccv_nnc_layer_norm_back;
}
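
A minimal usage sketch for these backends (hypothetical and untested; it assumes the usual ccv_nnc helpers ccv_nnc_cmd, ccv_nnc_cmd_auto, ccv_nnc_cmd_exec, ccv_nnc_no_hint, TENSOR_LIST and CPU_TENSOR_NHWC behave as elsewhere in the library, and the shapes are purely illustrative). Note that for this CPU reference exec, the reduced shape of the saved_mean / saved_inv_std outputs is what selects the normalized axes; a graph-level caller would typically also fill in the axis list in cmd.info.lnorm.

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

// Hypothetical driver: layer norm over the last three axes of a 2x2x2x10 tensor.
static void example_layer_norm_forward(void)
{
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
  ccv_nnc_tensor_t* const scale = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 2, 2, 10), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 2, 10), 0);
  // The reduced shape (2, 1, 1, 1) of the statistics outputs picks the normalized axes.
  ccv_nnc_tensor_t* const saved_mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), 0);
  ccv_nnc_tensor_t* const saved_inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 1, 1), 0);
  // ... fill a->data.f32, scale->data.f32 and bias->data.f32 here ...
  ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_LAYER_NORM_FORWARD, 0, ccv_nnc_cmd_auto, 0);
  cmd.info.lnorm.epsilon = 1e-5; // read by the backend as cmd.info.lnorm.epsilon
  cmd.info.lnorm.elementwise_affine = 1; // enables the scale / bias path above
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a, scale, bias), TENSOR_LIST(b, saved_mean, saved_inv_std), 0);
  ccv_nnc_tensor_free(a); ccv_nnc_tensor_free(scale); ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b); ccv_nnc_tensor_free(saved_mean); ccv_nnc_tensor_free(saved_inv_std);
}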