Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/blas/ccv_nnc_add_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include <ccv.h>
2
#include <ccv_internal.h>
3
#include <nnc/ccv_nnc.h>
4
#include <nnc/ccv_nnc_easy.h>
5
#include <nnc/ccv_nnc_internal.h>
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Shared methods.
14
#include "../_ccv_nnc_cpu_ref.h"
15
16
/**
 * Reference CPU kernel computing c = p * a + q * b element-wise over float32
 * tensors, with limited broadcasting support on b's dimensions.
 *
 * @param p Scalar multiplier applied to a.
 * @param q Scalar multiplier applied to b (must be 0 when b == 0).
 * @param a First input tensor view (required).
 * @param b Second input tensor view; may be 0, in which case c = p * a.
 * @param c Output tensor view; written in full.
 *
 * NOTE(review): assumes all tensors are 32-bit float (data.f32 is accessed
 * directly); callers must guarantee this.
 */
void _ccv_nnc_add_forw_cpu_ref(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
{
	if (b == 0)
	{
		// Unary case: c = p * a. q must not be set, otherwise the caller's
		// intent is ambiguous. It cannot be set otherwise we have trouble.
		assert(q == 0);
		if (p == 1)
		{
			// Identity scale: plain copy.
			_ccv_nnc_tensor_transfer_cpu_ref(a, c);
			return;
		} else if (p == 0) {
			// Zero scale: just clear the output.
			ccv_nnc_tensor_zero(c);
			return;
		}
		// General scale path. Assuming this is float 32.
		int dim[CCV_NNC_MAX_DIM + 2];
		int ainc[CCV_NNC_MAX_DIM + 2];
		int cinc[CCV_NNC_MAX_DIM + 2];
		// Only up to CCV_NNC_MAX_DIM + 2 dimensions are supported here.
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		int x;
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: both tensors are dense, so a single flat
			// for-loop over every element suffices.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = p * a->data.f32[x];
			return;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		// ainc/cinc hold the per-axis increments of the (possibly strided)
		// tensor views, per ccv_nnc_tensor_view_get_inc.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the ainc[3] is the same as dim[3]: the two
			// innermost axes are contiguous, so fuse them into one loop.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = p * ap[x];
					ap += ainc[2] * ainc[3];
					cp += cinc[2] * cinc[3];
				}
				// Skip the padding rows of the view between i[1] slices.
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return;
		}
		// Non-optimal case, need to do skip copy: advance the pointers by the
		// view increments after every innermost row.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = p * ap[x];
					ap += ainc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
		return;
	}
	// Binary case: c = p * a + q * b, where b may broadcast into a's shape.
	int cdim[CCV_NNC_MAX_DIM + 2];
	assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
	ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
	assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
	assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
	// a_check_dim / b_check_dim: whether each input already matches the
	// broadcast output shape exactly (i.e. no broadcasting needed for it).
	const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
	const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
	// Fast paths for the trivial scalar combinations.
	if (p == 1 && q == 1 && a_check_dim && b_check_dim)
	{
		_ccv_nnc_ewsum_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
			a, b
		}, 2, &c, 1);
		return;
	} else if (p == 1 && q == 0 && a_check_dim) {
		_ccv_nnc_tensor_transfer_cpu_ref(a, c);
		return;
	} else if (p == 0 && q == 1 && b_check_dim) {
		_ccv_nnc_tensor_transfer_cpu_ref(b, c);
		return;
	} else if (p == 0 && q == 0) {
		ccv_nnc_tensor_zero(c);
		return;
	}
	// General path. Assuming this is float 32.
	int adim[CCV_NNC_MAX_DIM + 2];
	int bdim[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_get_dim(a, adim);
	ccv_nnc_tensor_view_get_dim(b, bdim);
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(ccv_nnc_tensor_view_check_dim(c, cdim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
	{
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		// Super optimal case, just do one for-loop for sum.
		for (x = 0; x < tensor_count; x++)
			c->data.f32[x] = p * a->data.f32[x] + q * b->data.f32[x];
		return;
	}
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	ccv_nnc_tensor_view_get_inc(c, cinc);
	int i[CCV_NNC_MAX_DIM + 2];
	float* ap = a->data.f32;
	float* bp = b->data.f32;
	float* cp = c->data.f32;
	const int count = cdim[2] * cdim[3];
	if (ainc[3] == cdim[3] && binc[3] == cdim[3] && cinc[3] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
	{
		// Special casing if the ainc[3] is the same as dim[3]: the last two
		// axes are dense and un-broadcast, so fuse them into a single loop.
		for (i[0] = 0; i[0] < cdim[0]; i[0]++)
		{
			// A dim of 1 means "broadcast": stay on the same slice (offset 0).
			float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
			float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
			for (i[1] = 0; i[1] < cdim[1]; i[1]++)
			{
				float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
				float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
				for (x = 0; x < count; x++)
					cp[x] = p * ap1[x] + q * bp1[x];
				cp += cinc[2] * cinc[3];
			}
			cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
		}
		return;
	}
	// Non-optimal case, need to do skip copy and handle broadcasting.
	for (i[0] = 0; i[0] < cdim[0]; i[0]++)
	{
		float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
		float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
		for (i[1] = 0; i[1] < cdim[1]; i[1]++)
		{
			float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
			float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
			for (i[2] = 0; i[2] < cdim[2]; i[2]++)
			{
				float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
				float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
				// Innermost axis: either side may broadcast its single value.
				if (adim[3] == 1)
					for (x = 0; x < cdim[3]; x++)
						cp[x] = p * ap2[0] + q * bp2[x];
				else if (bdim[3] == 1)
					for (x = 0; x < cdim[3]; x++)
						cp[x] = p * ap2[x] + q * bp2[0];
				else
					for (x = 0; x < cdim[3]; x++)
						cp[x] = p * ap2[x] + q * bp2[x];
				cp += cinc[3];
			}
			cp += (cinc[2] - cdim[2]) * cinc[3];
		}
		cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
	}
}
192
193
// Command entry point for CCV_NNC_ADD_FORWARD on the CPU reference backend.
// Computes outputs[0] = blas.a[0] * inputs[0] + blas.a[1] * inputs[1] by
// delegating to the shared reference kernel.
static int _ccv_nnc_add_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size == 2);
	const float p = cmd.info.blas.a[0];
	const float q = cmd.info.blas.a[1];
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* const c = (ccv_nnc_tensor_view_t*)outputs[0];
	_ccv_nnc_add_forw_cpu_ref(p, q, a, b, c);
	return CCV_NNC_EXEC_SUCCESS;
}
199
200
/*
 * Backward pass for CCV_NNC_ADD: given the incoming gradient g = inputs[0],
 * writes outputs[0] = blas.a[0] * g and outputs[1] = blas.a[1] * g. When an
 * output is smaller than g (its forward input was broadcast), the gradient
 * is accumulated (summed) over the broadcast axes into the smaller tensor.
 * When g itself is absent (inputs[0] == 0), the gradient of a sum w.r.t.
 * each addend is the constant scale, so outputs are filled with blas.a[i].
 */
static int _ccv_nnc_add_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	if (inputs[0] == 0)
	{
		// No incoming gradient: fill each requested output with its scale.
		if (outputs[0])
			_ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]);
		if (output_size > 1 && outputs[1])
			_ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[1], cmd.info.blas.a[1]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	int gdim[CCV_NNC_MAX_DIM + 2];
	int ginc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_get_dim(g, gdim);
	// ginc: per-axis increments of the (possibly strided) gradient view.
	ccv_nnc_tensor_view_get_inc(g, ginc);
	if (outputs[0])
	{
		ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
		if (ccv_nnc_tensor_view_check_dim(a, gdim))
			// Same shape as g: plain scale, reuse the forward kernel with b == 0.
			_ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
		else {
			// a was broadcast in the forward pass: sum p * g over the axes
			// where adim[k] == 1.
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			const float p = cmd.info.blas.a[0];
			int adim[CCV_NNC_MAX_DIM + 2];
			int ainc[CCV_NNC_MAX_DIM + 2];
			ccv_nnc_tensor_view_get_dim(a, adim);
			ccv_nnc_tensor_view_get_inc(a, ainc);
			int i[CCV_NNC_MAX_DIM + 2];
			int x;
			float* const ap = a->data.f32;
			float* gp = g->data.f32;
			// zeroing out so that we can accumulate.
			ccv_nnc_tensor_zero(a);
			// Non-optimal case, need to do skip copy and handle broadcasting.
			for (i[0] = 0; i[0] < gdim[0]; i[0]++)
			{
				// adim[k] == 1 means the axis was broadcast: keep accumulating
				// into the same slice instead of advancing.
				float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
				for (i[1] = 0; i[1] < gdim[1]; i[1]++)
				{
					float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
					for (i[2] = 0; i[2] < gdim[2]; i[2]++)
					{
						float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
						if (adim[3] == 1)
							for (x = 0; x < gdim[3]; x++)
								ap2[0] += p * gp[x];
						else
							for (x = 0; x < gdim[3]; x++)
								ap2[x] += p * gp[x];
						gp += ginc[3];
					}
					// Skip the view padding between slices of g.
					gp += (ginc[2] - gdim[2]) * ginc[3];
				}
				gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
			}
		}
	}
	if (output_size > 1 && outputs[1])
	{
		// Same treatment for the second operand's gradient, scaled by a[1].
		ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
		if (ccv_nnc_tensor_view_check_dim(a, gdim))
			_ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[1], 0, (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[1]);
		else {
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			const float p = cmd.info.blas.a[1];
			int adim[CCV_NNC_MAX_DIM + 2];
			int ainc[CCV_NNC_MAX_DIM + 2];
			ccv_nnc_tensor_view_get_dim(a, adim);
			ccv_nnc_tensor_view_get_inc(a, ainc);
			int i[CCV_NNC_MAX_DIM + 2];
			int x;
			float* const ap = a->data.f32;
			float* gp = g->data.f32;
			// zeroing out so that we can accumulate.
			ccv_nnc_tensor_zero(a);
			// Non-optimal case, need to do skip copy and handle broadcasting.
			for (i[0] = 0; i[0] < gdim[0]; i[0]++)
			{
				float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
				for (i[1] = 0; i[1] < gdim[1]; i[1]++)
				{
					float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
					for (i[2] = 0; i[2] < gdim[2]; i[2]++)
					{
						float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
						if (adim[3] == 1)
							for (x = 0; x < gdim[3]; x++)
								ap2[0] += p * gp[x];
						else
							for (x = 0; x < gdim[3]; x++)
								ap2[x] += p * gp[x];
						gp += ginc[3];
					}
					gp += (ginc[2] - gdim[2]) * ginc[3];
				}
				gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
301
302
// Register the CPU reference implementation of ADD forward. The reference
// kernel only understands 32-bit float data living in CPU memory, in any of
// the three layout formats.
REGISTER_COMMAND_BACKEND(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_add_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
310
311
// Register the CPU reference implementation of ADD backward, mirroring the
// forward registration: float32 only, CPU memory, all layout formats.
REGISTER_COMMAND_BACKEND(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_add_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}