Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/norm/ccv_nnc_layer_norm_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Shared methods.
14
#include "../_ccv_nnc_cpu_ref.h"
15
16
// CPU reference implementation of the layer norm forward pass:
//   b = (a - mean(a)) * inv_std(a) * scale + bias
// mean / inv_std are computed over the axes that are reduced away in the
// saved_mean / saved_inv_std outputs (rdim[x] == 1 marks a reduced axis).
// Inputs: a (activations), scale (gamma), bias (beta).
// Outputs: b (normalized result), saved_mean and saved_inv_std (kept for the
// backward pass).
// Returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_layer_norm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 3);
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* const bias = (ccv_nnc_tensor_view_t*)inputs[2];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
	ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)outputs[1];
	ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)outputs[2];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
	// Assuming this is float 32.
	int adim[CCV_NNC_MAX_DIM_ALLOC];
	int rdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(a, adim);
	ccv_nnc_tensor_view_get_dim(saved_mean, rdim);
	assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim));
	assert(ccv_nnc_tensor_view_check_dim(b, adim));
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	int scale_inc[CCV_NNC_MAX_DIM_ALLOC];
	int bias_inc[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(scale, scale_inc);
	ccv_nnc_tensor_view_get_inc(bias, bias_inc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	// The epsilon is used a little bit differently from batch norm, it is outside of the sqrt in this case.
	const float epsilon = cmd.info.lnorm.epsilon;
	int saved_mean_inc[CCV_NNC_MAX_DIM_ALLOC];
	int saved_inv_std_inc[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_inc(saved_mean, saved_mean_inc);
	ccv_nnc_tensor_view_get_inc(saved_inv_std, saved_inv_std_inc);
	int x;
	// n = number of elements reduced per mean / inv_std entry.
	int n = 1;
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n *= adim[x];
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n /= rdim[x];
	const float inv_n = 1. / n;
	// saved_mean = reduce_sum(a) / n.
	_ccv_nnc_reduce_sum_forw_cpu_ref(a, saved_mean);
	_ccv_nnc_mul_forw_cpu_ref(inv_n, saved_mean, 0, saved_mean);
	// First pass: accumulate sum of squared deviations into saved_inv_std.
	ccv_nnc_tensor_zero(saved_inv_std);
	float* ap = a->data.f32;
	float* const meanp = saved_mean->data.f32;
	float* const varp = saved_inv_std->data.f32;
	int i[CCV_NNC_MAX_DIM + 2];
	for (i[0] = 0; i[0] < adim[0]; i[0]++)
	{
		float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_inc[1] * saved_mean_inc[2] * saved_mean_inc[3];
		float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
		for (i[1] = 0; i[1] < adim[1]; i[1]++)
		{
			float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_inc[2] * saved_mean_inc[3];
			float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
			for (i[2] = 0; i[2] < adim[2]; i[2]++)
			{
				float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_inc[3];
				float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_inc[3];
				if (rdim[3] == 1)
					for (x = 0; x < adim[3]; x++)
					{
						float w = ap[x] - meanp2[0];
						varp2[0] += w * w;
					}
				else
					for (x = 0; x < adim[3]; x++)
					{
						float w = ap[x] - meanp2[x];
						varp2[x] += w * w;
					}
				ap += ainc[3];
			}
			ap += (ainc[2] - adim[2]) * ainc[3];
		}
		ap += (ainc[1] - adim[1]) * ainc[2] * ainc[3];
	}
	// Second pass: turn accumulated variance into 1 / (std + epsilon).
	for (i[0] = 0; i[0] < rdim[0]; i[0]++)
	{
		float* const varp0 = varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
		for (i[1] = 0; i[1] < rdim[1]; i[1]++)
		{
			float* const varp1 = varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
			for (i[2] = 0; i[2] < rdim[2]; i[2]++)
			{
				float* const varp2 = varp1 + i[2] * saved_inv_std_inc[3];
				for (x = 0; x < rdim[3]; x++)
					varp2[x] = 1. / (sqrtf(varp2[x] * inv_n) + epsilon);
			}
		}
	}
	float* const scalep = scale->data.f32;
	float* const biasp = bias->data.f32;
	int sdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(scale, sdim);
	int bias_dim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(bias, bias_dim);
	// Do the straight-forward one, y = (x - mean) * inv_std * scale + bias, we cannot allocate extra memory to help.
	// There is no need for precompute since scale / bias is per element.
	ap = a->data.f32;
	float* bp = b->data.f32;
	for (i[0] = 0; i[0] < adim[0]; i[0]++)
	{
		float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_inc[1] * saved_mean_inc[2] * saved_mean_inc[3];
		float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
		float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_inc[1] * scale_inc[2] * scale_inc[3];
		float* const biasp0 = bias_dim[0] == 1 ? biasp : biasp + i[0] * bias_inc[1] * bias_inc[2] * bias_inc[3];
		for (i[1] = 0; i[1] < adim[1]; i[1]++)
		{
			float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_inc[2] * saved_mean_inc[3];
			float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
			float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_inc[2] * scale_inc[3];
			float* const biasp1 = bias_dim[1] == 1 ? biasp0 : biasp0 + i[1] * bias_inc[2] * bias_inc[3];
			for (i[2] = 0; i[2] < adim[2]; i[2]++)
			{
				float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_inc[3];
				float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_inc[3];
				float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_inc[3];
				float* const biasp2 = bias_dim[2] == 1 ? biasp1 : biasp1 + i[2] * bias_inc[3];
				if (rdim[3] == 1)
					for (x = 0; x < adim[3]; x++)
						bp[x] = (ap[x] - meanp2[0]) * varp2[0] * scalep2[sdim[3] == 1 ? 0 : x] + biasp2[bias_dim[3] == 1 ? 0 : x];
				else
					for (x = 0; x < adim[3]; x++)
						bp[x] = (ap[x] - meanp2[x]) * varp2[x] * scalep2[sdim[3] == 1 ? 0 : x] + biasp2[bias_dim[3] == 1 ? 0 : x];
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - adim[2]) * ainc[3];
			bp += (binc[2] - adim[2]) * binc[3];
		}
		ap += (ainc[1] - adim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - adim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
152
153
// CPU reference implementation of the layer norm backward pass.
// Inputs (by position): g = gradient w.r.t. output, a = forward input,
// scale = gamma, saved_mean / saved_inv_std = statistics from the forward pass.
// Outputs: h = gradient w.r.t. a, dscale = gradient w.r.t. scale,
// dbias = gradient w.r.t. bias (reduce_sum of g).
// Uses scratch workspace for: ah (normalized a), gss (g * scale * inv_std),
// gssr (gss reduced to statistics shape), ahgssr (ah * gss reduced).
// Final gradient: h = gss - (1/n) * (gssr + ah * ahgssr).
// Returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_layer_norm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 9);
	assert(output_size == 3);
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[3];
	ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[4];
	ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)inputs[7];
	ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)inputs[8];
	ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
	ccv_nnc_tensor_view_t* const dscale = (ccv_nnc_tensor_view_t*)outputs[1];
	ccv_nnc_tensor_view_t* const dbias = (ccv_nnc_tensor_view_t*)outputs[2];
	assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
	// Assuming this is float 32.
	int gdim[CCV_NNC_MAX_DIM_ALLOC];
	int rdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(g, gdim);
	ccv_nnc_tensor_view_get_dim(saved_mean, rdim);
	assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim));
	int sdim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(scale, sdim);
	assert(ccv_nnc_tensor_view_check_dim(dscale, sdim));
	assert(ccv_nnc_tensor_view_check_dim(a, gdim));
	assert(ccv_nnc_tensor_view_check_dim(h, gdim));
	// dbias is simply g reduced to the bias shape.
	_ccv_nnc_reduce_sum_forw_cpu_ref(g, dbias);
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int ginc[CCV_NNC_MAX_DIM_ALLOC];
	int hinc[CCV_NNC_MAX_DIM_ALLOC];
	int scale_inc[CCV_NNC_MAX_DIM_ALLOC];
	int mean_inc[CCV_NNC_MAX_DIM_ALLOC];
	int inv_std_inc[CCV_NNC_MAX_DIM_ALLOC];
	int dscale_inc[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(g, ginc);
	ccv_nnc_tensor_view_get_inc(h, hinc);
	ccv_nnc_tensor_view_get_inc(scale, scale_inc);
	ccv_nnc_tensor_view_get_inc(saved_mean, mean_inc);
	ccv_nnc_tensor_view_get_inc(saved_inv_std, inv_std_inc);
	ccv_nnc_tensor_view_get_inc(dscale, dscale_inc);
	// Need to allocate two additional memory:
	// 1. normalized a;
	// 2. scale * inv_std / n;
	assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC));
	int x;
	// n = number of elements reduced per statistics entry.
	int n = 1;
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n *= gdim[x];
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		n /= rdim[x];
	int gcount = 1, rcount = 1;
	for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
		gcount *= gdim[x], rcount *= rdim[x];
	float* const ah = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * gcount * 2 + sizeof(float) * rcount * 2, CCV_TENSOR_CPU_MEMORY);
	float* const gss = ah + gcount; // g * scale * inv_std
	float* const gssr = gss + gcount; // gss reduced to inv_std dimension
	float* const ahgssr = gssr + rcount; // ah * gss then reduced to inv_std dimension.
	int i[CCV_NNC_MAX_DIM + 2];
	// Pass 1: ah = (a - mean) * inv_std (normalized activations, dense layout).
	float* ahp = ah;
	const float* const meanp = saved_mean->data.f32;
	const float* const inv_stdp = saved_inv_std->data.f32;
	const float* ap = a->data.f32;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		const float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * mean_inc[1] * mean_inc[2] * mean_inc[3];
		const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_inc[1] * inv_std_inc[2] * inv_std_inc[3];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			const float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * mean_inc[2] * mean_inc[3];
			const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_inc[2] * inv_std_inc[3];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				const float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * mean_inc[3];
				const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_inc[3];
				if (rdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
						ahp[x] = (ap[x] - meanp2[0]) * inv_stdp2[0];
				else
					for (x = 0; x < gdim[3]; x++)
						ahp[x] = (ap[x] - meanp2[x]) * inv_stdp2[x];
				ap += ainc[3];
				ahp += gdim[3];
			}
			ap += (ainc[2] - gdim[2]) * ainc[3];
		}
		ap += (ainc[1] - gdim[1]) * ainc[2] * ainc[3];
	}
	// Pass 2: gss = g * scale * inv_std, and accumulate dscale += ah * g.
	ccv_nnc_tensor_zero(dscale);
	ahp = ah;
	float* gssp = gss;
	const float* gp = g->data.f32;
	const float* const scalep = scale->data.f32;
	float* const dscalep = dscale->data.f32;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		const float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_inc[1] * inv_std_inc[2] * inv_std_inc[3];
		const float* const scalep0 = sdim[0] == 1 ? scalep : scalep + i[0] * scale_inc[1] * scale_inc[2] * scale_inc[3];
		float* const dscalep0 = sdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_inc[1] * dscale_inc[2] * dscale_inc[3];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			const float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_inc[2] * inv_std_inc[3];
			const float* const scalep1 = sdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_inc[2] * scale_inc[3];
			float* const dscalep1 = sdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_inc[2] * dscale_inc[3];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				const float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_inc[3];
				const float* const scalep2 = sdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_inc[3];
				float* const dscalep2 = sdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_inc[3];
				if (sdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
					{
						gssp[x] = gp[x] * scalep2[0] * inv_stdp2[rdim[3] == 1 ? 0 : x];
						dscalep2[0] += ahp[x] * gp[x];
					}
				else
					for (x = 0; x < gdim[3]; x++)
					{
						gssp[x] = gp[x] * scalep2[x] * inv_stdp2[rdim[3] == 1 ? 0 : x];
						dscalep2[x] += ahp[x] * gp[x];
					}
				gp += ginc[3];
				ahp += gdim[3];
				gssp += gdim[3];
			}
			gp += (ginc[2] - gdim[2]) * ginc[3];
		}
		gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
	}
	// gssr = reduce_sum(gss) down to the statistics shape.
	ccv_nnc_tensor_t gsst = ccv_nnc_tensor(gss, g->info, 0);
	ccv_nnc_tensor_t gssrt = ccv_nnc_tensor(gssr, saved_mean->info, 0);
	_ccv_nnc_reduce_sum_forw_cpu_ref((ccv_nnc_tensor_view_t*)&gsst, (ccv_nnc_tensor_view_t*)&gssrt);
	// Pass 3: ahgssr = reduce_sum(ah * gss) down to the statistics shape.
	ahp = ah;
	gssp = gss;
	ccv_nnc_tensor_t ahgssrt = ccv_nnc_tensor(ahgssr, saved_mean->info, 0);
	ccv_nnc_tensor_zero(&ahgssrt);
	float* const ahgssrp = ahgssr;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
				if (rdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
						ahgssrp2[0] += ahp[x] * gssp[x];
				else
					for (x = 0; x < gdim[3]; x++)
						ahgssrp2[x] += ahp[x] * gssp[x];
				ahp += gdim[3];
				gssp += gdim[3];
			}
		}
	}
	// Now the part to compute dx (h).
	float* hp = h->data.f32;
	ahp = ah;
	const float inv_n = 1. / n;
	gssp = gss;
	const float* const gssrp = gssr;
	for (i[0] = 0; i[0] < gdim[0]; i[0]++)
	{
		const float* const gssrp0 = rdim[0] == 1 ? gssrp : gssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
		const float* const ahgssrp0 = rdim[0] == 1 ? ahgssrp : ahgssrp + i[0] * rdim[1] * rdim[2] * rdim[3];
		for (i[1] = 0; i[1] < gdim[1]; i[1]++)
		{
			const float* const gssrp1 = rdim[1] == 1 ? gssrp0 : gssrp0 + i[1] * rdim[2] * rdim[3];
			const float* const ahgssrp1 = rdim[1] == 1 ? ahgssrp0 : ahgssrp0 + i[1] * rdim[2] * rdim[3];
			for (i[2] = 0; i[2] < gdim[2]; i[2]++)
			{
				const float* const gssrp2 = rdim[2] == 1 ? gssrp1 : gssrp1 + i[2] * rdim[3];
				const float* const ahgssrp2 = rdim[2] == 1 ? ahgssrp1 : ahgssrp1 + i[2] * rdim[3];
				if (rdim[3] == 1)
					for (x = 0; x < gdim[3]; x++)
						hp[x] = gssp[x] - inv_n * (gssrp2[0] + ahp[x] * ahgssrp2[0]);
				else
					for (x = 0; x < gdim[3]; x++)
						hp[x] = gssp[x] - inv_n * (gssrp2[x] + ahp[x] * ahgssrp2[x]);
				hp += hinc[3];
				ahp += gdim[3];
				gssp += gdim[3];
			}
			hp += (hinc[2] - gdim[2]) * hinc[3];
		}
		hp += (hinc[1] - gdim[1]) * hinc[2] * hinc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
344
345
// Register the CPU reference backend for layer norm forward: float32 only,
// any of the three tensor formats, single algorithm.
REGISTER_COMMAND_BACKEND(CCV_NNC_LAYER_NORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_layer_norm_forw;
}
353
354
// Register the CPU reference backend for layer norm backward: float32 only,
// any of the three tensor formats, single algorithm.
REGISTER_COMMAND_BACKEND(CCV_NNC_LAYER_NORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_layer_norm_back;
}