Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/norm/ccv_nnc_batch_norm_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Shared methods.
14
#include "../_ccv_nnc_cpu_ref.h"
15
16
static int _ccv_nnc_batch_norm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
17
24
{
18
24
  assert(input_size == 5);
19
24
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
20
24
  ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[1];
21
24
  ccv_nnc_tensor_view_t* const bias = (ccv_nnc_tensor_view_t*)inputs[2];
22
24
  ccv_nnc_tensor_view_t* const mean = (ccv_nnc_tensor_view_t*)inputs[3];
23
24
  ccv_nnc_tensor_view_t* const var = (ccv_nnc_tensor_view_t*)inputs[4];
24
24
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
25
24
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
26
24
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
27
24
  // Assuming this is float 32.
28
24
  int adim[CCV_NNC_MAX_DIM_ALLOC];
29
24
  int rdim[CCV_NNC_MAX_DIM_ALLOC];
30
24
  ccv_nnc_tensor_view_get_dim(a, adim);
31
24
  ccv_nnc_tensor_view_get_dim(scale, rdim);
32
24
  assert(ccv_nnc_tensor_view_check_dim(bias, rdim));
33
24
  assert(ccv_nnc_tensor_view_check_dim(mean, rdim));
34
24
  assert(ccv_nnc_tensor_view_check_dim(var, rdim));
35
24
  assert(ccv_nnc_tensor_view_check_dim(b, adim));
36
24
  assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change for other values of CCV_NNC_MAX_DIM.
37
24
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
38
24
  int binc[CCV_NNC_MAX_DIM_ALLOC];
39
24
  int scale_inc[CCV_NNC_MAX_DIM_ALLOC];
40
24
  int bias_inc[CCV_NNC_MAX_DIM_ALLOC];
41
24
  ccv_nnc_tensor_view_get_inc(a, ainc);
42
24
  ccv_nnc_tensor_view_get_inc(scale, scale_inc);
43
24
  ccv_nnc_tensor_view_get_inc(bias, bias_inc);
44
24
  ccv_nnc_tensor_view_get_inc(b, binc);
45
24
  const float epsilon = cmd.info.bnorm.epsilon;
46
24
  if (!cmd.info.bnorm.is_test)
47
24
  {
48
24
    assert(output_size == 5);
49
24
    // Both are in-place.
50
24
    assert(inputs[3]->data.f32 == outputs[1]->data.f32);
51
24
    assert(inputs[4]->data.f32 == outputs[2]->data.f32);
52
24
    ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)outputs[3];
53
24
    ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)outputs[4];
54
24
    assert(ccv_nnc_tensor_view_check_dim(saved_mean, rdim));
55
24
    assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim));
56
24
    int saved_mean_inc[CCV_NNC_MAX_DIM_ALLOC];
57
24
    int saved_inv_std_inc[CCV_NNC_MAX_DIM_ALLOC];
58
24
    ccv_nnc_tensor_view_get_inc(saved_mean, saved_mean_inc);
59
24
    ccv_nnc_tensor_view_get_inc(saved_inv_std, saved_inv_std_inc);
60
24
    int i[CCV_NNC_MAX_DIM + 2];
61
24
    int x;
62
24
    int batch_size = 1;
63
120
    for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
64
96
      batch_size *= adim[x];
65
120
    for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
66
96
      batch_size /= rdim[x];
67
24
    const float inv_batch_size = 1. / batch_size;
68
24
    _ccv_nnc_reduce_sum_forw_cpu_ref(a, saved_mean);
69
24
    _ccv_nnc_mul_forw_cpu_ref(inv_batch_size, saved_mean, 0, saved_mean);
70
24
    // Copy this into running mean / var.
71
24
    _ccv_nnc_add_forw_cpu_ref(cmd.info.bnorm.momentum, 1. - cmd.info.bnorm.momentum, mean, saved_mean, mean);
72
24
    ccv_nnc_tensor_zero(saved_inv_std);
73
24
    float* ap = a->data.f32;
74
24
    float* const meanp = saved_mean->data.f32;
75
24
    float* const varp = saved_inv_std->data.f32;
76
174
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
77
150
    {
78
150
      float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_inc[1] * saved_mean_inc[2] * saved_mean_inc[3];
79
150
      float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
80
722
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
81
572
      {
82
572
        float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_inc[2] * saved_mean_inc[3];
83
572
        float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
84
2.80k
        for (i[2] = 0; i[2] < adim[2]; i[2]++)
85
2.23k
        {
86
2.23k
          float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_inc[3];
87
2.23k
          float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_inc[3];
88
2.23k
          if (rdim[3] == 1)
89
0
            for (x = 0; x < adim[3]; x++)
90
0
            {
91
0
              float w = ap[x] - meanp2[0];
92
0
              varp2[0] += w * w;
93
0
            }
94
2.23k
          else
95
24.5k
            for (x = 0; x < adim[3]; x++)
96
22.3k
            {
97
22.3k
              float w = ap[x] - meanp2[x];
98
22.3k
              varp2[x] += w * w;
99
22.3k
            }
100
2.23k
          ap += ainc[3];
101
2.23k
        }
102
572
        ap += (ainc[2] - adim[2]) * ainc[3];
103
572
      }
104
150
      ap += (ainc[1] - adim[1]) * ainc[2] * ainc[3];
105
150
    }
106
24
    _ccv_nnc_mul_forw_cpu_ref(inv_batch_size, saved_inv_std, 0, saved_inv_std);
107
24
    _ccv_nnc_add_forw_cpu_ref(cmd.info.bnorm.momentum, 1. - cmd.info.bnorm.momentum, var, saved_inv_std, var);
108
48
    for (i[0] = 0; i[0] < rdim[0]; i[0]++)
109
24
    {
110
24
      float* const varp0 = varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
111
48
      for (i[1] = 0; i[1] < rdim[1]; i[1]++)
112
24
      {
113
24
        float* const varp1 = varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
114
48
        for (i[2] = 0; i[2] < rdim[2]; i[2]++)
115
24
        {
116
24
          float* const varp2 = varp1 + i[2] * saved_inv_std_inc[3];
117
264
          for (x = 0; x < rdim[3]; x++)
118
240
            varp2[x] = 1. / sqrtf(varp2[x] + epsilon);
119
24
        }
120
24
      }
121
24
    }
122
24
    float* const scalep = scale->data.f32;
123
24
    float* const biasp = bias->data.f32;
124
24
    // Now that mean and inv_std are computed, go ahead and stretch a.
125
24
    if (flags & CCV_NNC_ZERO_MEMORY_ALLOC)
126
0
    {
127
0
      // Do the straightforward one, y = (x - mean) * inv_std * scale + bias; we cannot allocate extra memory to help.
128
0
      ap = a->data.f32;
129
0
      float* bp = b->data.f32;
130
0
      for (i[0] = 0; i[0] < adim[0]; i[0]++)
131
0
      {
132
0
        float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * saved_mean_inc[1] * saved_mean_inc[2] * saved_mean_inc[3];
133
0
        float* const varp0 = rdim[0] == 1 ? varp : varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
134
0
        float* const scalep0 = rdim[0] == 1 ? scalep : scalep + i[0] * scale_inc[1] * scale_inc[2] * scale_inc[3];
135
0
        float* const biasp0 = rdim[0] == 1 ? biasp : biasp + i[0] * bias_inc[1] * bias_inc[2] * bias_inc[3];
136
0
        for (i[1] = 0; i[1] < adim[1]; i[1]++)
137
0
        {
138
0
          float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * saved_mean_inc[2] * saved_mean_inc[3];
139
0
          float* const varp1 = rdim[1] == 1 ? varp0 : varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
140
0
          float* const scalep1 = rdim[1] == 1 ? scalep0 : scalep0 + i[1] * scale_inc[2] * scale_inc[3];
141
0
          float* const biasp1 = rdim[1] == 1 ? biasp0 : biasp0 + i[1] * bias_inc[2] * bias_inc[3];
142
0
          for (i[2] = 0; i[2] < adim[2]; i[2]++)
143
0
          {
144
0
            float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * saved_mean_inc[3];
145
0
            float* const varp2 = rdim[2] == 1 ? varp1 : varp1 + i[2] * saved_inv_std_inc[3];
146
0
            float* const scalep2 = rdim[2] == 1 ? scalep1 : scalep1 + i[2] * scale_inc[3];
147
0
            float* const biasp2 = rdim[2] == 1 ? biasp1 : biasp1 + i[2] * bias_inc[3];
148
0
            if (rdim[3] == 1)
149
0
              for (x = 0; x < adim[3]; x++)
150
0
                bp[x] = (ap[x] - meanp2[0]) * varp2[0] * scalep2[0] + biasp2[0];
151
0
            else
152
0
              for (x = 0; x < adim[3]; x++)
153
0
                bp[x] = (ap[x] - meanp2[x]) * varp2[x] * scalep2[x] + biasp2[x];
154
0
            ap += ainc[3];
155
0
            bp += binc[3];
156
0
          }
157
0
          ap += (ainc[2] - adim[2]) * ainc[3];
158
0
          bp += (binc[2] - adim[2]) * binc[3];
159
0
        }
160
0
        ap += (ainc[1] - adim[1]) * ainc[2] * ainc[3];
161
0
        bp += (binc[1] - adim[1]) * binc[2] * binc[3];
162
0
      }
163
24
    } else {
164
24
      // If we allocate extra memory, we can convert y = (x - mean) * inv_std * scale + bias
165
24
      // to y = x * inv_std * scale + (bias - mean * inv_std * scale)
166
24
      // we can pre-compute nscale = inv_std * scale, nbias = bias - mean * inv_std * scale
167
24
      int count = 1;
168
120
      for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
169
96
        count *= rdim[x];
170
24
      float* const nscalep = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * count * 2, CCV_TENSOR_CPU_MEMORY);
171
24
      float* const nbiasp = nscalep + count;
172
48
      for (i[0] = 0; i[0] < rdim[0]; i[0]++)
173
24
      {
174
24
        float* const meanp0 = meanp + i[0] * saved_mean_inc[1] * saved_mean_inc[2] * saved_mean_inc[3];
175
24
        float* const varp0 = varp + i[0] * saved_inv_std_inc[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
176
24
        float* const scalep0 = scalep + i[0] * scale_inc[1] * scale_inc[2] * scale_inc[3];
177
24
        float* const biasp0 = biasp + i[0] * bias_inc[1] * bias_inc[2] * bias_inc[3];
178
24
        float* const nscalep0 = nscalep + i[0] * rdim[1] * rdim[2] * rdim[3];
179
24
        float* const nbiasp0 = nbiasp + i[0] * rdim[1] * rdim[2] * rdim[3];
180
48
        for (i[1] = 0; i[1] < rdim[1]; i[1]++)
181
24
        {
182
24
          float* const meanp1 = meanp0 + i[1] * saved_mean_inc[2] * saved_mean_inc[3];
183
24
          float* const varp1 = varp0 + i[1] * saved_inv_std_inc[2] * saved_inv_std_inc[3];
184
24
          float* const scalep1 = scalep0 + i[1] * scale_inc[2] * scale_inc[3];
185
24
          float* const biasp1 = biasp0 + i[1] * bias_inc[2] * bias_inc[3];
186
24
          float* const nscalep1 = nscalep0 + i[1] * rdim[2] * rdim[3];
187
24
          float* const nbiasp1 = nbiasp0 + i[1] * rdim[2] * rdim[3];
188
48
          for (i[2] = 0; i[2] < rdim[2]; i[2]++)
189
24
          {
190
24
            float* const meanp2 = meanp1 + i[2] * saved_mean_inc[3];
191
24
            float* const varp2 = varp1 + i[2] * saved_inv_std_inc[3];
192
24
            float* const scalep2 = scalep1 + i[2] * scale_inc[3];
193
24
            float* const biasp2 = biasp1 + i[2] * bias_inc[3];
194
24
            float* const nscalep2 = nscalep1 + i[2] * rdim[3];
195
24
            float* const nbiasp2 = nbiasp1 + i[2] * rdim[3];
196
264
            for (x = 0; x < rdim[3]; x++)
197
240
            {
198
240
              const float w = varp2[x] * scalep2[x];
199
240
              nscalep2[x] = w;
200
240
              nbiasp2[x] = biasp2[x] - meanp2[x] * w;
201
240
            }
202
24
          }
203
24
        }
204
24
      }
205
24
      ap = a->data.f32;
206
24
      float* bp = b->data.f32;
207
174
      for (i[0] = 0; i[0] < adim[0]; i[0]++)
208
150
      {
209
150
        float* const nscalep0 = rdim[0] == 1 ? nscalep : nscalep + i[0] * rdim[1] * rdim[2] * rdim[3];
210
150
        float* const nbiasp0 = rdim[0] == 1 ? nbiasp : nbiasp + i[0] * rdim[1] * rdim[2] * rdim[3];
211
722
        for (i[1] = 0; i[1] < adim[1]; i[1]++)
212
572
        {
213
572
          float* const nscalep1 = rdim[1] == 1 ? nscalep0 : nscalep0 + i[1] * rdim[2] * rdim[3];
214
572
          float* const nbiasp1 = rdim[1] == 1 ? nbiasp0 : nbiasp0 + i[1] * rdim[2] * rdim[3];
215
2.80k
          for (i[2] = 0; i[2] < adim[2]; i[2]++)
216
2.23k
          {
217
2.23k
            float* const nscalep2 = rdim[2] == 1 ? nscalep1 : nscalep1 + i[2] * rdim[3];
218
2.23k
            float* const nbiasp2 = rdim[2] == 1 ? nbiasp1 : nbiasp1 + i[2] * rdim[3];
219
2.23k
            if (rdim[3] == 1)
220
0
              for (x = 0; x < adim[3]; x++)
221
0
                bp[x] = ap[x] * nscalep2[0] + nbiasp2[0];
222
2.23k
            else
223
24.5k
              for (x = 0; x < adim[3]; x++)
224
22.3k
                bp[x] = ap[x] * nscalep2[x] + nbiasp2[x];
225
2.23k
            ap += ainc[3];
226
2.23k
            bp += binc[3];
227
2.23k
          }
228
572
          ap += (ainc[2] - adim[2]) * ainc[3];
229
572
          bp += (binc[2] - adim[2]) * binc[3];
230
572
        }
231
150
        ap += (ainc[1] - adim[1]) * ainc[2] * ainc[3];
232
150
        bp += (binc[1] - adim[1]) * binc[2] * binc[3];
233
150
      }
234
24
    }
235
24
  } else {
236
0
    assert(output_size >= 1);
237
0
    int mean_inc[CCV_NNC_MAX_DIM_ALLOC];
238
0
    int var_inc[CCV_NNC_MAX_DIM_ALLOC];
239
0
    ccv_nnc_tensor_view_get_inc(mean, mean_inc);
240
0
    ccv_nnc_tensor_view_get_inc(var, var_inc);
241
0
    int i[CCV_NNC_MAX_DIM + 2];
242
0
    int x;
243
0
    assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC));
244
0
    int count = 1;
245
0
    for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
246
0
      count *= rdim[x];
247
0
    float* const meanp = mean->data.f32;
248
0
    float* const varp = var->data.f32;
249
0
    float* const scalep = scale->data.f32;
250
0
    float* const biasp = bias->data.f32;
251
0
    float* const nscalep = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * count * 2, CCV_TENSOR_CPU_MEMORY);
252
0
    float* const nbiasp = nscalep + count;
253
0
    for (i[0] = 0; i[0] < rdim[0]; i[0]++)
254
0
    {
255
0
      float* const meanp0 = meanp + i[0] * mean_inc[1] * mean_inc[2] * mean_inc[3];
256
0
      float* const varp0 = varp + i[0] * var_inc[1] * var_inc[2] * var_inc[3];
257
0
      float* const scalep0 = scalep + i[0] * scale_inc[1] * scale_inc[2] * scale_inc[3];
258
0
      float* const biasp0 = biasp + i[0] * bias_inc[1] * bias_inc[2] * bias_inc[3];
259
0
      float* const nscalep0 = nscalep + i[0] * rdim[1] * rdim[2] * rdim[3];
260
0
      float* const nbiasp0 = nbiasp + i[0] * rdim[1] * rdim[2] * rdim[3];
261
0
      for (i[1] = 0; i[1] < rdim[1]; i[1]++)
262
0
      {
263
0
        float* const meanp1 = meanp0 + i[1] * mean_inc[2] * mean_inc[3];
264
0
        float* const varp1 = varp0 + i[1] * var_inc[2] * var_inc[3];
265
0
        float* const scalep1 = scalep0 + i[1] * scale_inc[2] * scale_inc[3];
266
0
        float* const biasp1 = biasp0 + i[1] * bias_inc[2] * bias_inc[3];
267
0
        float* const nscalep1 = nscalep0 + i[1] * rdim[2] * rdim[3];
268
0
        float* const nbiasp1 = nbiasp0 + i[1] * rdim[2] * rdim[3];
269
0
        for (i[2] = 0; i[2] < rdim[2]; i[2]++)
270
0
        {
271
0
          float* const meanp2 = meanp1 + i[2] * mean_inc[3];
272
0
          float* const varp2 = varp1 + i[2] * var_inc[3];
273
0
          float* const scalep2 = scalep1 + i[2] * scale_inc[3];
274
0
          float* const biasp2 = biasp1 + i[2] * bias_inc[3];
275
0
          float* const nscalep2 = nscalep1 + i[2] * rdim[3];
276
0
          float* const nbiasp2 = nbiasp1 + i[2] * rdim[3];
277
0
          for (x = 0; x < rdim[3]; x++)
278
0
          {
279
0
            const float w = scalep2[x] / (sqrtf(varp2[x]) + epsilon);
280
0
            nscalep2[x] = w;
281
0
            nbiasp2[x] = biasp2[x] - meanp2[x] * w;
282
0
          }
283
0
        }
284
0
      }
285
0
    }
286
0
    float* ap = a->data.f32;
287
0
    float* bp = b->data.f32;
288
0
    for (i[0] = 0; i[0] < adim[0]; i[0]++)
289
0
    {
290
0
      float* const nscalep0 = rdim[0] == 1 ? nscalep : nscalep + i[0] * rdim[1] * rdim[2] * rdim[3];
291
0
      float* const nbiasp0 = rdim[0] == 1 ? nbiasp : nbiasp + i[0] * rdim[1] * rdim[2] * rdim[3];
292
0
      for (i[1] = 0; i[1] < adim[1]; i[1]++)
293
0
      {
294
0
        float* const nscalep1 = rdim[1] == 1 ? nscalep0 : nscalep0 + i[1] * rdim[2] * rdim[3];
295
0
        float* const nbiasp1 = rdim[1] == 1 ? nbiasp0 : nbiasp0 + i[1] * rdim[2] * rdim[3];
296
0
        for (i[2] = 0; i[2] < adim[2]; i[2]++)
297
0
        {
298
0
          float* const nscalep2 = rdim[2] == 1 ? nscalep1 : nscalep1 + i[2] * rdim[3];
299
0
          float* const nbiasp2 = rdim[2] == 1 ? nbiasp1 : nbiasp1 + i[2] * rdim[3];
300
0
          if (rdim[3] == 1)
301
0
            for (x = 0; x < adim[3]; x++)
302
0
              bp[x] = ap[x] * nscalep2[0] + nbiasp2[0];
303
0
          else
304
0
            for (x = 0; x < adim[3]; x++)
305
0
              bp[x] = ap[x] * nscalep2[x] + nbiasp2[x];
306
0
          ap += ainc[3];
307
0
          bp += binc[3];
308
0
        }
309
0
        ap += (ainc[2] - adim[2]) * ainc[3];
310
0
        bp += (binc[2] - adim[2]) * binc[3];
311
0
      }
312
0
      ap += (ainc[1] - adim[1]) * ainc[2] * ainc[3];
313
0
      bp += (binc[1] - adim[1]) * binc[2] * binc[3];
314
0
    }
315
0
  }
316
24
  return CCV_NNC_EXEC_SUCCESS;
317
24
}
318
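Note on the forward kernel above: rather than evaluating y = (x - mean) * inv_std * scale + bias element by element, the covered path (source lines 163-234) folds the per-channel statistics into nscale = inv_std * scale and nbias = bias - mean * inv_std * scale, leaving a single multiply-add in the hot loop. A minimal standalone sketch of that refactoring, assuming a flat NHWC float32 layout; the names and signature here are illustrative, not part of the library's API:

static void batch_norm_apply(const float* x, float* y, const int rows, const int channels, const float* mean, const float* inv_std, const float* scale, const float* bias, float* nscale, float* nbias)
{
  int i, c;
  // Fold the four per-channel arrays into two, once per invocation.
  for (c = 0; c < channels; c++)
  {
    nscale[c] = inv_std[c] * scale[c];
    nbias[c] = bias[c] - mean[c] * nscale[c];
  }
  // The hot loop is now a single multiply-add per element.
  for (i = 0; i < rows; i++)
    for (c = 0; c < channels; c++)
      y[i * channels + c] = x[i * channels + c] * nscale[c] + nbias[c];
}

One subtlety worth flagging: the training path computes 1. / sqrtf(var + epsilon) (source line 118), while the uncovered inference path computes scale / (sqrtf(var) + epsilon) (source line 279), applying epsilon outside the square root; whether that asymmetry is intentional is not evident from this report.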
319
static int _ccv_nnc_batch_norm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
320
3
{
321
3
  assert(input_size == 15);
322
3
  assert(output_size == 5);
323
3
  ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
324
3
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[5];
325
3
  ccv_nnc_tensor_view_t* const scale = (ccv_nnc_tensor_view_t*)inputs[6];
326
3
  ccv_nnc_tensor_view_t* const saved_mean = (ccv_nnc_tensor_view_t*)inputs[13];
327
3
  ccv_nnc_tensor_view_t* const saved_inv_std = (ccv_nnc_tensor_view_t*)inputs[14];
328
3
  ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
329
3
  ccv_nnc_tensor_view_t* const dscale = (ccv_nnc_tensor_view_t*)outputs[1];
330
3
  ccv_nnc_tensor_view_t* const dbias = (ccv_nnc_tensor_view_t*)outputs[2];
331
3
  assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
332
3
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
333
3
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
334
3
  // Assuming this is float 32.
335
3
  int gdim[CCV_NNC_MAX_DIM_ALLOC];
336
3
  int rdim[CCV_NNC_MAX_DIM_ALLOC];
337
3
  ccv_nnc_tensor_view_get_dim(g, gdim);
338
3
  ccv_nnc_tensor_view_get_dim(scale, rdim);
339
3
  assert(ccv_nnc_tensor_view_check_dim(saved_mean, rdim));
340
3
  assert(ccv_nnc_tensor_view_check_dim(saved_inv_std, rdim));
341
3
  assert(ccv_nnc_tensor_view_check_dim(dscale, rdim));
342
3
  assert(ccv_nnc_tensor_view_check_dim(dbias, rdim));
343
3
  assert(ccv_nnc_tensor_view_check_dim(a, gdim));
344
3
  assert(ccv_nnc_tensor_view_check_dim(h, gdim));
345
3
  _ccv_nnc_reduce_sum_forw_cpu_ref(g, dbias);
346
3
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
347
3
  int ginc[CCV_NNC_MAX_DIM_ALLOC];
348
3
  int hinc[CCV_NNC_MAX_DIM_ALLOC];
349
3
  int mean_inc[CCV_NNC_MAX_DIM_ALLOC];
350
3
  int inv_std_inc[CCV_NNC_MAX_DIM_ALLOC];
351
3
  int dscale_inc[CCV_NNC_MAX_DIM_ALLOC];
352
3
  int dbias_inc[CCV_NNC_MAX_DIM_ALLOC];
353
3
  ccv_nnc_tensor_view_get_inc(a, ainc);
354
3
  ccv_nnc_tensor_view_get_inc(g, ginc);
355
3
  ccv_nnc_tensor_view_get_inc(h, hinc);
356
3
  ccv_nnc_tensor_view_get_inc(saved_mean, mean_inc);
357
3
  ccv_nnc_tensor_view_get_inc(saved_inv_std, inv_std_inc);
358
3
  ccv_nnc_tensor_view_get_inc(dscale, dscale_inc);
359
3
  ccv_nnc_tensor_view_get_inc(dbias, dbias_inc);
360
3
  // Need to allocate two additional buffers:
361
3
  // 1. normalized a;
362
3
  // 2. scale * inv_std / batch_size;
363
3
  assert(!(flags & CCV_NNC_ZERO_MEMORY_ALLOC));
364
3
  int x;
365
3
  int batch_size = 1;
366
15
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
367
12
    batch_size *= gdim[x];
368
15
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
369
12
    batch_size /= rdim[x];
370
3
  int gcount = 1, rcount = 1;
371
15
  for (x = 0; x < CCV_NNC_MAX_DIM + 2; x++)
372
12
    gcount *= gdim[x], rcount *= rdim[x];
373
3
  float* const ah = (float*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * gcount + sizeof(float) * rcount, CCV_TENSOR_CPU_MEMORY);
374
3
  float* const sisb = ah + gcount;
375
3
  ccv_nnc_tensor_t sisbt = ccv_nnc_tensor(sisb, scale->info, 0);
376
3
  _ccv_nnc_mul_forw_cpu_ref(1. / batch_size, scale, saved_inv_std, (ccv_nnc_tensor_view_t*)&sisbt);
377
3
  int i[CCV_NNC_MAX_DIM + 2];
378
3
  float* ap = a->data.f32;
379
3
  float* ahp = ah;
380
3
  float* const meanp = saved_mean->data.f32;
381
3
  float* const inv_stdp = saved_inv_std->data.f32;
382
9
  for (i[0] = 0; i[0] < gdim[0]; i[0]++)
383
6
  {
384
6
    float* const meanp0 = rdim[0] == 1 ? meanp : meanp + i[0] * mean_inc[1] * mean_inc[2] * mean_inc[3];
385
6
    float* const inv_stdp0 = rdim[0] == 1 ? inv_stdp : inv_stdp + i[0] * inv_std_inc[1] * inv_std_inc[2] * inv_std_inc[3];
386
18
    for (i[1] = 0; i[1] < gdim[1]; i[1]++)
387
12
    {
388
12
      float* const meanp1 = rdim[1] == 1 ? meanp0 : meanp0 + i[1] * mean_inc[2] * mean_inc[3];
389
12
      float* const inv_stdp1 = rdim[1] == 1 ? inv_stdp0 : inv_stdp0 + i[1] * inv_std_inc[2] * inv_std_inc[3];
390
36
      for (i[2] = 0; i[2] < gdim[2]; i[2]++)
391
24
      {
392
24
        float* const meanp2 = rdim[2] == 1 ? meanp1 : meanp1 + i[2] * mean_inc[3];
393
24
        float* const inv_stdp2 = rdim[2] == 1 ? inv_stdp1 : inv_stdp1 + i[2] * inv_std_inc[3];
394
24
        if (rdim[3] == 1)
395
0
          for (x = 0; x < gdim[3]; x++)
396
0
            ahp[x] = (ap[x] - meanp2[0]) * inv_stdp2[0];
397
24
        else
398
264
          for (x = 0; x < gdim[3]; x++)
399
240
            ahp[x] = (ap[x] - meanp2[x]) * inv_stdp2[x];
400
24
        ap += ainc[3];
401
24
        ahp += gdim[3];
402
24
      }
403
12
      ap += (ainc[2] - gdim[2]) * ainc[3];
404
12
    }
405
6
    ap += (ainc[1] - gdim[1]) * ainc[2] * ainc[3];
406
6
  }
407
3
  ccv_nnc_tensor_zero(dscale);
408
3
  ahp = ah;
409
3
  float* gp = g->data.f32;
410
3
  float* const dscalep = dscale->data.f32;
411
9
  for (i[0] = 0; i[0] < gdim[0]; i[0]++)
412
6
  {
413
6
    float* const dscalep0 = rdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_inc[1] * dscale_inc[2] * dscale_inc[3];
414
18
    for (i[1] = 0; i[1] < gdim[1]; i[1]++)
415
12
    {
416
12
      float* const dscalep1 = rdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_inc[2] * dscale_inc[3];
417
36
      for (i[2] = 0; i[2] < gdim[2]; i[2]++)
418
24
      {
419
24
        float* const dscalep2 = rdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_inc[3];
420
24
        if (rdim[3] == 1)
421
0
          for (x = 0; x < gdim[3]; x++)
422
0
            dscalep2[0] += ahp[x] * gp[x];
423
24
        else
424
264
          for (x = 0; x < gdim[3]; x++)
425
240
            dscalep2[x] += ahp[x] * gp[x];
426
24
        gp += ginc[3];
427
24
        ahp += gdim[3];
428
24
      }
429
12
      gp += (ginc[2] - gdim[2]) * ginc[3];
430
12
    }
431
6
    gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
432
6
  }
433
3
  // Now the part to compute dx (h).
434
3
  ap = a->data.f32;
435
3
  gp = g->data.f32;
436
3
  float* hp = h->data.f32;
437
3
  ahp = ah;
438
3
  float* const sisbp = sisb;
439
3
  float* const dbiasp = dbias->data.f32;
440
9
  for (i[0] = 0; i[0] < gdim[0]; i[0]++)
441
6
  {
442
6
    float* const sisbp0 = rdim[0] == 1 ? sisbp : sisbp + i[0] * rdim[1] * rdim[2] * rdim[3];
443
6
    float* const dscalep0 = rdim[0] == 1 ? dscalep : dscalep + i[0] * dscale_inc[1] * dscale_inc[2] * dscale_inc[3];
444
6
    float* const dbiasp0 = rdim[0] == 1 ? dbiasp : dbiasp + i[0] * dbias_inc[1] * dbias_inc[2] * dbias_inc[3];
445
18
    for (i[1] = 0; i[1] < gdim[1]; i[1]++)
446
12
    {
447
12
      float* const sisbp1 = rdim[1] == 1 ? sisbp0 : sisbp0 + i[1] * rdim[2] * rdim[3];
448
12
      float* const dscalep1 = rdim[1] == 1 ? dscalep0 : dscalep0 + i[1] * dscale_inc[2] * dscale_inc[3];
449
12
      float* const dbiasp1 = rdim[1] == 1 ? dbiasp0 : dbiasp0 + i[1] * dbias_inc[2] * dbias_inc[3];
450
36
      for (i[2] = 0; i[2] < gdim[2]; i[2]++)
451
24
      {
452
24
        float* const sisbp2 = rdim[2] == 1 ? sisbp1 : sisbp1 + i[2] * rdim[3];
453
24
        float* const dscalep2 = rdim[2] == 1 ? dscalep1 : dscalep1 + i[2] * dscale_inc[3];
454
24
        float* const dbiasp2 = rdim[2] == 1 ? dbiasp1 : dbiasp1 + i[2] * dbias_inc[3];
455
24
        if (rdim[3] == 1)
456
0
          for (x = 0; x < gdim[3]; x++)
457
0
            hp[x] = sisbp2[0] * (batch_size * gp[x] - dbiasp2[0] - ahp[x] * dscalep2[0]);
458
24
        else
459
264
          for (x = 0; x < gdim[3]; x++)
460
240
            hp[x] = sisbp2[x] * (batch_size * gp[x] - dbiasp2[x] - ahp[x] * dscalep2[x]);
461
24
        ap += ainc[3];
462
24
        gp += ginc[3];
463
24
        hp += hinc[3];
464
24
        ahp += gdim[3];
465
24
      }
466
12
      ap += (ainc[2] - gdim[2]) * ainc[3];
467
12
      gp += (ginc[2] - gdim[2]) * ginc[3];
468
12
      hp += (hinc[2] - gdim[2]) * hinc[3];
469
12
    }
470
6
    ap += (ainc[1] - gdim[1]) * ainc[2] * ainc[3];
471
6
    gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
472
6
    hp += (hinc[1] - gdim[1]) * hinc[2] * hinc[3];
473
6
  }
474
3
  return CCV_NNC_EXEC_SUCCESS;
475
3
}
476
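For reference, the dx computation at source lines 455-460 is the standard batch normalization gradient. Writing m = batch_size, x_hat for the normalized input held in ah, and g for the incoming gradient, the backward kernel evaluates

  dL/dx = (scale * inv_std / m) * (m * g - sum(g) - x_hat * sum(g * x_hat))

where sum(g) is dbias (reduced at source line 345), sum(g * x_hat) is dscale (accumulated at source lines 411-432), and the leading factor scale * inv_std / m is the precomputed sisb buffer (source line 376).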
477
REGISTER_COMMAND_BACKEND(CCV_NNC_BATCH_NORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
478
1
{
479
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
480
1
  registry->tensor_datatypes = CCV_32F;
481
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
482
1
  registry->algorithms = 1;
483
1
  registry->exec = _ccv_nnc_batch_norm_forw;
484
1
}
485
486
REGISTER_COMMAND_BACKEND(CCV_NNC_BATCH_NORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
487
1
{
488
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
489
1
  registry->tensor_datatypes = CCV_32F;
490
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
491
1
  registry->algorithms = 1;
492
1
  registry->exec = _ccv_nnc_batch_norm_back;
493
1
}
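Finally, a sketch of how these registered backends might be driven through the generic command interface, in the style of the nnc test suite. The CMD_BATCH_NORM_FORWARD argument order (epsilon, is_test, momentum, then the axes to reduce over) and the shapes below are assumptions for illustration, not taken from this report:

  // Hypothetical training-mode invocation: 5 inputs and 5 outputs, with mean
  // and var updated in place to satisfy the asserts at source lines 50-51.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 5, 5, 10), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 5, 5, 10), 0);
  ccv_nnc_tensor_t* const scale = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const var = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const saved_mean = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const saved_inv_std = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_cmd_exec(CMD_BATCH_NORM_FORWARD(1e-4, 0, 0.9, 0, 1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a, scale, bias, mean, var),
    TENSOR_LIST(b, mean, var, saved_mean, saved_inv_std), 0);
  // Tensor frees (ccv_nnc_tensor_free) omitted for brevity.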