Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_add_cpu_ref.c
Line   Count  Source
   1          #include "ccv.h"
   2          #include "ccv_internal.h"
   3          #include "nnc/ccv_nnc.h"
   4          #include "nnc/ccv_nnc_easy.h"
   5          #include "nnc/ccv_nnc_internal.h"
   6          #ifdef USE_OPENMP
   7          #include <omp.h>
   8          #endif
   9          #ifdef USE_DISPATCH
  10          #include <dispatch/dispatch.h>
  11          #endif
  12
  13          // Shared methods.
  14          #include "../_ccv_nnc_cpu_ref.h"
  15
  16          void _ccv_nnc_add_forw_cpu_ref(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
  17   28.9k  {
  18   28.9k    if (b == 0)
  19   17.0k    {
  20              // It cannot be set otherwise we have trouble.
  21   17.0k      assert(q == 0);
  22   17.0k      if (p == 1)
  23   11.8k      {
  24   11.8k        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c);
  25   11.8k        return;
  26   11.8k      } else if (p == 0) {
  27       0        ccv_nnc_tensor_zero(c);
  28       0        return;
  29       0      }
  30              // Assuming this is float 32.
  31   5.17k      int dim[CCV_NNC_MAX_DIM_ALLOC];
  32   5.17k      int astride[CCV_NNC_MAX_DIM_ALLOC];
  33   5.17k      int cstride[CCV_NNC_MAX_DIM_ALLOC];
  34   5.17k      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  35   5.17k      assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
  36   5.17k      ccv_nnc_tensor_view_get_dim(a, dim);
  37   5.17k      assert(ccv_nnc_tensor_view_check_dim(c, dim));
  38   5.17k      int x;
  39   5.17k      if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c))
  40   5.17k      {
  41                // Super optimal case, just do one for-loop for sum.
  42   5.17k        const int tensor_count = ccv_nnc_tensor_count(a->info);
  43   11.8k        for (x = 0; x < tensor_count; x++)
  44   6.67k          c->data.f32[x] = p * a->data.f32[x];
  45   5.17k        return;
  46   5.17k      }
  47       0      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  48       0      ccv_nnc_tensor_view_get_stride(a, astride);
  49       0      ccv_nnc_tensor_view_get_stride(c, cstride);
  50       0      int i[CCV_NNC_MAX_DIM + 2];
  51       0      float* const ap = a->data.f32;
  52       0      float* const cp = c->data.f32;
  53       0      const int count = dim[2] * dim[3];
  54       0      if (astride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && cstride[3] == 1)
  55       0      {
  56                // Special casing if the ainc[3] is the same as dim[3]
  57       0        for (i[0] = 0; i[0] < dim[0]; i[0]++)
  58       0        {
  59       0          float* ap0 = ap + i[0] * astride[0];
  60       0          float* cp0 = cp + i[0] * cstride[0];
  61       0          for (i[1] = 0; i[1] < dim[1]; i[1]++)
  62       0          {
  63       0            for (x = 0; x < count; x++)
  64       0              cp0[x] = p * ap0[x];
  65       0            ap0 += astride[1];
  66       0            cp0 += cstride[1];
  67       0          }
  68       0        }
  69       0        return;
  70       0      }
  71              // Non-optimal case, need to do skip copy.
  72       0      for (i[0] = 0; i[0] < dim[0]; i[0]++)
  73       0      {
  74       0        float* const ap0 = ap + i[0] * astride[0];
  75       0        float* const cp0 = cp + i[0] * cstride[0];
  76       0        for (i[1] = 0; i[1] < dim[1]; i[1]++)
  77       0        {
  78       0          float* ap1 = ap0 + i[1] * astride[1];
  79       0          float* cp1 = cp0 + i[1] * cstride[1];
  80       0          for (i[2] = 0; i[2] < dim[2]; i[2]++)
  81       0          {
  82       0            for (x = 0; x < dim[3]; x++)
  83       0              cp1[x * cstride[3]] = p * ap1[x * astride[3]];
  84       0            ap1 += astride[2];
  85       0            cp1 += cstride[2];
  86       0          }
  87       0        }
  88       0      }
  89       0      return;
  90       0    }
  91   11.9k    int cdim[CCV_NNC_MAX_DIM_ALLOC];
  92   11.9k    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  93   11.9k    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  94   11.9k    ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
  95   11.9k    ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
  96   11.9k    assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
  97   11.9k    assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
  98   11.9k    const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
  99   11.9k    const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
 100   11.9k    if (p == 1 && q == 1 && a_check_dim && b_check_dim)
 101       7    {
 102       7      _ccv_nnc_ewsum_forw_cpu_ref_f32((ccv_nnc_tensor_view_t*[]){
 103       7        a, b
 104       7      }, 2, &c, 1);
 105       7      return;
 106   11.9k    } else if (p == 1 && q == 0 && a_check_dim) {
 107       0      _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c);
 108       0      return;
 109   11.9k    } else if (p == 0 && q == 1 && b_check_dim) {
 110       0      _ccv_nnc_tensor_transfer_cpu_ref_f32(b, c);
 111       0      return;
 112   11.9k    } else if (p == 0 && q == 0) {
 113       0      ccv_nnc_tensor_zero(c);
 114       0      return;
 115       0    }
 116            // Assuming this is float 32.
 117   11.9k    int adim[CCV_NNC_MAX_DIM_ALLOC];
 118   11.9k    int bdim[CCV_NNC_MAX_DIM_ALLOC];
 119   11.9k    ccv_nnc_tensor_view_get_dim(a, adim);
 120   11.9k    ccv_nnc_tensor_view_get_dim(b, bdim);
 121   11.9k    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
 122   11.9k    assert(ccv_nnc_tensor_view_check_dim(c, cdim));
 123   11.9k    int x;
 124   11.9k    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
 125   11.7k    {
 126   11.7k      const int tensor_count = ccv_nnc_tensor_count(a->info);
 127              // Super optimal case, just do one for-loop for sum.
 128   23.9k      for (x = 0; x < tensor_count; x++)
 129   12.2k        c->data.f32[x] = p * a->data.f32[x] + q * b->data.f32[x];
 130   11.7k      return;
 131   11.7k    }
 132     120    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
 133     120    int astride[CCV_NNC_MAX_DIM_ALLOC];
 134     120    int bstride[CCV_NNC_MAX_DIM_ALLOC];
 135     120    int cstride[CCV_NNC_MAX_DIM_ALLOC];
 136     120    ccv_nnc_tensor_view_get_stride(a, astride);
 137     120    ccv_nnc_tensor_view_get_stride(b, bstride);
 138     120    ccv_nnc_tensor_view_get_stride(c, cstride);
 139     120    int i[CCV_NNC_MAX_DIM + 2];
 140     120    float* const ap = a->data.f32;
 141     120    float* const bp = b->data.f32;
 142     120    float* const cp = c->data.f32;
 143     120    const int count = cdim[2] * cdim[3];
 144     120    if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2] && astride[3] == 1 && bstride[3] == 1)
 145       0    {
 146              // Special casing if the ainc[3] is the same as dim[3]
 147       0      for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 148       0      {
 149       0        float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 150       0        float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 151       0        float* cp0 = cp + i[0] * cstride[0];
 152       0        for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 153       0        {
 154       0          float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 155       0          float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 156       0          for (x = 0; x < count; x++)
 157       0            cp0[x] = p * ap1[x] + q * bp1[x];
 158       0          cp0 += cstride[1];
 159       0        }
 160       0      }
 161       0      return;
 162       0    }
 163            // Non-optimal case, need to do skip copy and handle broadcasting.
 164     354    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 165     234    {
 166     234      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 167     234      float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 168     234      float* const cp0 = cp + i[0] * cstride[0];
 169     894      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 170     660      {
 171     660        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 172     660        float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 173     660        float* cp1 = cp0 + i[1] * cstride[1];
 174   3.28k        for (i[2] = 0; i[2] < cdim[2]; i[2]++)
 175   2.62k        {
 176   2.62k          float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 177   2.62k          float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 178   2.62k          if (adim[3] == 1)
 179     412            for (x = 0; x < cdim[3]; x++)
 180     208              cp1[x] = p * ap2[0] + q * bp2[x * bstride[3]];
 181   2.42k          else if (bdim[3] == 1)
 182   8.55k            for (x = 0; x < cdim[3]; x++)
 183   7.78k              cp1[x] = p * ap2[x * astride[3]] + q * bp2[0];
 184   1.65k          else
 185   8.57k            for (x = 0; x < cdim[3]; x++)
 186   6.92k              cp1[x] = p * ap2[x * astride[3]] + q * bp2[x * bstride[3]];
 187   2.62k          cp1 += cstride[2];
 188   2.62k        }
 189     660      }
 190     234    }
 191     120  }
 192
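The reference kernel above computes c = p * a + q * b elementwise, broadcasting b up to the output shape when its dimensions differ from a's. For reference, a minimal standalone sketch of the arithmetic in the contiguous fast path (lines 128-129), using a hypothetical helper name that is not part of ccv_nnc:

#include <stddef.h>

/* Hypothetical helper, not part of ccv_nnc: the elementwise scaled sum
 * performed by the "super optimal" contiguous path above. */
static void add_forw_dense(const float p, const float q,
	const float* const a, const float* const b, float* const c, const size_t n)
{
	size_t x;
	for (x = 0; x < n; x++)
		c[x] = p * a[x] + q * b[x]; /* c = p * a + q * b, element by element */
}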
 193          static int _ccv_nnc_add_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 194   11.8k  {
 195   11.8k    assert(input_size == 2);
 196   11.8k    _ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[0], cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
 197   11.8k    return CCV_NNC_EXEC_SUCCESS;
 198   11.8k  }
 199
 200          static int _ccv_nnc_add_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 201   11.8k  {
 202   11.8k    if (inputs[0] == 0)
 203       0    {
 204       0      if (outputs[0])
 205       0        _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]);
 206       0      if (output_size > 1 && outputs[1])
 207       0        _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[1], cmd.info.blas.a[1]);
 208       0      return CCV_NNC_EXEC_SUCCESS;
 209       0    }
 210   11.8k    int gdim[CCV_NNC_MAX_DIM_ALLOC];
 211   11.8k    int gstride[CCV_NNC_MAX_DIM_ALLOC];
 212   11.8k    ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
 213   11.8k    ccv_nnc_tensor_view_get_dim(g, gdim);
 214   11.8k    ccv_nnc_tensor_view_get_stride(g, gstride);
 215   11.8k    if (outputs[0])
 216   11.8k    {
 217   11.8k      ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 218   11.8k      if (ccv_nnc_tensor_view_check_dim(a, gdim))
 219   11.8k        _ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 220       1      else {
 221       1        assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
 222       1        const float p = cmd.info.blas.a[0];
 223       1        int adim[CCV_NNC_MAX_DIM_ALLOC];
 224       1        int astride[CCV_NNC_MAX_DIM_ALLOC];
 225       1        ccv_nnc_tensor_view_get_dim(a, adim);
 226       1        ccv_nnc_tensor_view_get_stride(a, astride);
 227       1        int i[CCV_NNC_MAX_DIM + 2];
 228       1        int x;
 229       1        float* const ap = a->data.f32;
 230       1        float* const gp = g->data.f32;
 231                // zeroing out so that we can accumulate.
 232       1        ccv_nnc_tensor_zero(a);
 233                // Non-optimal case, need to do skip copy and handle broadcasting.
 234       2        for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 235       1        {
 236       1          float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 237       1          float* const gp0 = gp + i[0] * gstride[0];
 238       2          for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 239       1          {
 240       1            float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 241       1            float* gp1 = gp0 + i[1] * gstride[1];
 242       5            for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 243       4            {
 244       4              float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 245       4              if (adim[3] == 1)
 246      12                for (x = 0; x < gdim[3]; x++)
 247       8                  ap2[0] += p * gp1[x];
 248       0              else
 249       0                for (x = 0; x < gdim[3]; x++)
 250       0                  ap2[x] += p * gp1[x];
 251       4              gp1 += gstride[2];
 252       4            }
 253       1          }
 254       1        }
 255       1      }
 256   11.8k    }
 257   11.8k    if (output_size > 1 && outputs[1])
 258   5.18k    {
 259   5.18k      ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 260   5.18k      if (ccv_nnc_tensor_view_check_dim(a, gdim))
 261   5.17k        _ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[1], 0, (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[1]);
 262      10      else {
 263      10        assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
 264      10        const float p = cmd.info.blas.a[1];
 265      10        int adim[CCV_NNC_MAX_DIM_ALLOC];
 266      10        int astride[CCV_NNC_MAX_DIM_ALLOC];
 267      10        ccv_nnc_tensor_view_get_dim(a, adim);
 268      10        ccv_nnc_tensor_view_get_stride(a, astride);
 269      10        int i[CCV_NNC_MAX_DIM + 2];
 270      10        int x;
 271      10        float* const ap = a->data.f32;
 272      10        float* const gp = g->data.f32;
 273                // zeroing out so that we can accumulate.
 274      10        ccv_nnc_tensor_zero(a);
 275                // Non-optimal case, need to do skip copy and handle broadcasting.
 276      74        for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 277      64        {
 278      64          float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 279      64          float* const gp0 = gp + i[0] * gstride[0];
 280     330          for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 281     266          {
 282     266            float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 283     266            float* gp1 = gp0 + i[1] * gstride[1];
 284   1.41k            for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 285   1.15k            {
 286   1.15k              float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 287   1.15k              if (adim[3] == 1)
 288   4.32k                for (x = 0; x < gdim[3]; x++)
 289   3.94k                  ap2[0] += p * gp1[x];
 290     768              else
 291   3.63k                for (x = 0; x < gdim[3]; x++)
 292   2.86k                  ap2[x] += p * gp1[x];
 293   1.15k              gp1 += gstride[2];
 294   1.15k            }
 295     266          }
 296      64        }
 297      10      }
 298   5.18k    }
 299   11.8k    return CCV_NNC_EXEC_SUCCESS;
 300   11.8k  }
 301
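The backward pass above applies the chain rule for c = p * a + q * b: the gradient with respect to a is p * g and with respect to b is q * g (lines 219 and 261 delegate the same-shape case to _ccv_nnc_add_forw_cpu_ref), and when an output is smaller than g because its input was broadcast, the scaled gradients are accumulated into the shared element after zeroing. A minimal standalone sketch of the same-shape rule, with a hypothetical helper name that is not part of ccv_nnc:

#include <stddef.h>

/* Hypothetical helper, not part of ccv_nnc: same-shape backward rule for
 * c = p * a + q * b, given the incoming gradient g. */
static void add_back_dense(const float p, const float q,
	const float* const g, float* const da, float* const db, const size_t n)
{
	size_t x;
	for (x = 0; x < n; x++)
	{
		if (da)
			da[x] = p * g[x]; /* dL/da = p * dL/dc */
		if (db)
			db[x] = q * g[x]; /* dL/db = q * dL/dc */
	}
}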
 302          REGISTER_COMMAND_BACKEND(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 303       1  {
 304       1    registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 305       1    registry->tensor_datatypes = CCV_32F;
 306       1    registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 307       1    registry->algorithms = 1;
 308       1    registry->exec = _ccv_nnc_add_forw;
 309       1  }
 310
 311          REGISTER_COMMAND_BACKEND(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 312       1  {
 313       1    registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 314       1    registry->tensor_datatypes = CCV_32F;
 315       1    registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 316       1    registry->algorithms = 1;
 317       1    registry->exec = _ccv_nnc_add_back;
 318       1  }
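These registrations bind the reference kernels above to CCV_NNC_ADD_FORWARD and CCV_NNC_ADD_BACKWARD for float32 CPU tensors. A minimal sketch of driving the forward backend through the public API, assuming the CMD_ADD_FORWARD, CPU_TENSOR_NHWC and TENSOR_LIST convenience macros behave as they do in the ccv_nnc test suite; with non-view, same-shape tensors and p = 0.5, q = 0.2 this would exercise the contiguous path at lines 124-129:

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

int main(void)
{
	ccv_nnc_init(); /* register command backends, including the CPU_REF ones above */
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2), 0);
	int i;
	for (i = 0; i < 4; i++)
		a->data.f32[i] = i + 1, b->data.f32[i] = 10 * (i + 1);
	/* c = 0.5 * a + 0.2 * b */
	ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}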