Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_mul_cpu_ref.c

Line | Count | Source
   1 |       | #include "ccv.h"
   2 |       | #include "ccv_internal.h"
   3 |       | #include "nnc/ccv_nnc.h"
   4 |       | #include "nnc/ccv_nnc_easy.h"
   5 |       | #include "nnc/ccv_nnc_internal.h"
   6 |       | #ifdef USE_OPENMP
   7 |       | #include <omp.h>
   8 |       | #endif
   9 |       | #ifdef USE_DISPATCH
  10 |       | #include <dispatch/dispatch.h>
  11 |       | #endif
  12 |       |
  13 |       | // Shared methods.
  14 |       | #include "../_ccv_nnc_cpu_ref.h"
  15 |       |
  16 |       | void _ccv_nnc_mul_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
  17 | 10.6k | {
  18 | 10.6k |   if (b == 0)
  19 |   137 |   {
  20 |   137 |     if (p == 1)
  21 |     0 |     {
  22 |     0 |       _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c);
  23 |     0 |       return;
  24 |   137 |     } else if (p == 0) {
  25 |     0 |       ccv_nnc_tensor_zero(c);
  26 |     0 |       return;
  27 |     0 |     }
  28 |       |     // Assuming this is float 32.
  29 |   137 |     int dim[CCV_NNC_MAX_DIM_ALLOC];
  30 |   137 |     int astride[CCV_NNC_MAX_DIM_ALLOC];
  31 |   137 |     int cstride[CCV_NNC_MAX_DIM_ALLOC];
  32 |   137 |     assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  33 |   137 |     assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
  34 |   137 |     ccv_nnc_tensor_view_get_dim(a, dim);
  35 |   137 |     assert(ccv_nnc_tensor_view_check_dim(c, dim));
  36 |   137 |     int x;
  37 |   137 |     if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c))
  38 |   135 |     {
  39 |       |       // Super optimal case, just do one for-loop for sum.
  40 |   135 |       const int tensor_count = ccv_nnc_tensor_count(a->info);
  41 | 8.39M |       for (x = 0; x < tensor_count; x++)
  42 | 8.39M |         c->data.f32[x] = p * a->data.f32[x];
  43 |   135 |       return;
  44 |   135 |     }
  45 |     2 |     assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  46 |     2 |     ccv_nnc_tensor_view_get_stride(a, astride);
  47 |     2 |     ccv_nnc_tensor_view_get_stride(c, cstride);
  48 |     2 |     int i[CCV_NNC_MAX_DIM + 2];
  49 |     2 |     float* const ap = a->data.f32;
  50 |     2 |     float* const cp = c->data.f32;
  51 |     2 |     const int count = dim[2] * dim[3];
  52 |     2 |     if (astride[2] == dim[3] && cstride[2] == dim[3])
  53 |     2 |     {
  54 |       |       // Special casing if the ainc[3] is the same as dim[3]
  55 |     4 |       for (i[0] = 0; i[0] < dim[0]; i[0]++)
  56 |     2 |       {
  57 |     2 |         float* ap0 = ap + i[0] * astride[0];
  58 |     2 |         float* cp0 = cp + i[0] * cstride[0];
  59 |     4 |         for (i[1] = 0; i[1] < dim[1]; i[1]++)
  60 |     2 |         {
  61 |     4 |           for (x = 0; x < count; x++)
  62 |     2 |             cp0[x] = p * ap0[x];
  63 |     2 |           ap0 += astride[1];
  64 |     2 |           cp0 += cstride[1];
  65 |     2 |         }
  66 |     2 |       }
  67 |     2 |       return;
  68 |     2 |     }
  69 |       |     // Non-optimal case, need to do skip copy.
  70 |     0 |     for (i[0] = 0; i[0] < dim[0]; i[0]++)
  71 |     0 |     {
  72 |     0 |       float* const ap0 = ap + i[0] * astride[0];
  73 |     0 |       float* const cp0 = cp + i[0] * cstride[0];
  74 |     0 |       for (i[1] = 0; i[1] < dim[1]; i[1]++)
  75 |     0 |       {
  76 |     0 |         float* ap1 = ap0 + i[1] * astride[1];
  77 |     0 |         float* cp1 = cp0 + i[1] * cstride[1];
  78 |     0 |         for (i[2] = 0; i[2] < dim[2]; i[2]++)
  79 |     0 |         {
  80 |     0 |           for (x = 0; x < dim[3]; x++)
  81 |     0 |             cp1[x] = p * ap1[x];
  82 |     0 |           ap1 += astride[2];
  83 |     0 |           cp1 += cstride[2];
  84 |     0 |         }
  85 |     0 |       }
  86 |     0 |     }
  87 |     0 |     return;
  88 |     2 |   }
  89 | 10.4k |   int cdim[CCV_NNC_MAX_DIM_ALLOC];
  90 | 10.4k |   assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  91 | 10.4k |   assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  92 | 10.4k |   ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
  93 | 10.4k |   ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
  94 | 10.4k |   assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
  95 | 10.4k |   assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
  96 | 10.4k |   const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
  97 | 10.4k |   const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
  98 | 10.4k |   if (p == 1 && a_check_dim && b_check_dim)
  99 | 10.4k |   {
 100 | 10.4k |     _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
 101 | 10.4k |       a, b
 102 | 10.4k |     }, 2, &c, 1);
 103 | 10.4k |     return;
 104 | 10.4k |   } else if (p == 0) {
 105 |     0 |     ccv_nnc_tensor_zero(c);
 106 |     0 |     return;
 107 |     0 |   }
 108 |       |   // Assuming this is float 32.
 109 |    31 |   int adim[CCV_NNC_MAX_DIM_ALLOC];
 110 |    31 |   int bdim[CCV_NNC_MAX_DIM_ALLOC];
 111 |    31 |   ccv_nnc_tensor_view_get_dim(a, adim);
 112 |    31 |   ccv_nnc_tensor_view_get_dim(b, bdim);
 113 |    31 |   int astride[CCV_NNC_MAX_DIM_ALLOC];
 114 |    31 |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
 115 |    31 |   int cstride[CCV_NNC_MAX_DIM_ALLOC];
 116 |    31 |   assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
 117 |    31 |   assert(ccv_nnc_tensor_view_check_dim(c, cdim));
 118 |    31 |   int x;
 119 |    31 |   if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
 120 |     9 |   {
 121 |     9 |     const int tensor_count = ccv_nnc_tensor_count(a->info);
 122 |       |     // Super optimal case, just do one for-loop for sum.
 123 |    99 |     for (x = 0; x < tensor_count; x++)
 124 |    90 |       c->data.f32[x] = p * a->data.f32[x] * b->data.f32[x];
 125 |     9 |     return;
 126 |     9 |   }
 127 |    22 |   assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
 128 |    22 |   ccv_nnc_tensor_view_get_stride(a, astride);
 129 |    22 |   ccv_nnc_tensor_view_get_stride(b, bstride);
 130 |    22 |   ccv_nnc_tensor_view_get_stride(c, cstride);
 131 |    22 |   int i[CCV_NNC_MAX_DIM + 2];
 132 |    22 |   float* const ap = a->data.f32;
 133 |    22 |   float* const bp = b->data.f32;
 134 |    22 |   float* const cp = c->data.f32;
 135 |    22 |   const int count = cdim[2] * cdim[3];
 136 |    22 |   if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
 137 |     0 |   {
 138 |       |     // Special casing if the ainc[3] is the same as dim[3]
 139 |     0 |     for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 140 |     0 |     {
 141 |     0 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 142 |     0 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 143 |     0 |       float* cp0 = cp + i[0] * cstride[0];
 144 |     0 |       for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 145 |     0 |       {
 146 |     0 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 147 |     0 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 148 |     0 |         for (x = 0; x < count; x++)
 149 |     0 |           cp0[x] = p * ap1[x] * bp1[x];
 150 |     0 |         cp0 += cstride[1];
 151 |     0 |       }
 152 |     0 |     }
 153 |     0 |     return;
 154 |     0 |   }
 155 |       |   // Non-optimal case, need to do skip copy and handle broadcasting.
 156 |   136 |   for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 157 |   114 |   {
 158 |   114 |     float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 159 |   114 |     float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 160 |   114 |     float* const cp0 = cp + i[0] * cstride[0];
 161 |   542 |     for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 162 |   428 |     {
 163 |   428 |       float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 164 |   428 |       float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 165 |   428 |       float* cp1 = cp0 + i[1] * cstride[1];
 166 | 2.11k |       for (i[2] = 0; i[2] < cdim[2]; i[2]++)
 167 | 1.68k |       {
 168 | 1.68k |         float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 169 | 1.68k |         float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 170 | 1.68k |         if (adim[3] == 1)
 171 |    29 |           for (x = 0; x < cdim[3]; x++)
 172 |    19 |             cp1[x] = p * ap2[0] * bp2[x];
 173 | 1.67k |         else if (bdim[3] == 1)
 174 | 11.3k |           for (x = 0; x < cdim[3]; x++)
 175 | 10.3k |             cp1[x] = p * ap2[x] * bp2[0];
 176 |   650 |         else
 177 | 4.57k |           for (x = 0; x < cdim[3]; x++)
 178 | 3.92k |             cp1[x] = p * ap2[x] * bp2[x];
 179 | 1.68k |         cp1 += cstride[2];
 180 | 1.68k |       }
 181 |   428 |     }
 182 |   114 |   }
 183 |    22 | }
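
Note: _ccv_nnc_mul_forw_cpu_ref computes c = p * a * b elementwise, broadcasting b against a when their dimensions differ, and degenerates to c = p * a when b is 0. As a reading aid, here is a minimal standalone sketch of that broadcasting rule for an [n, m] tensor multiplied by a [1, m] row; it deliberately skips the tensor-view/stride handling above, and the helper name broadcast_mul_2d is hypothetical, not part of the library.

#include <stdio.h>

/* Hypothetical sketch: c[i][j] = p * a[i][j] * b[j], i.e. b is broadcast across rows. */
static void broadcast_mul_2d(const float p, const float* const a, const float* const b, float* const c, const int n, const int m)
{
  int i, j;
  for (i = 0; i < n; i++)
    for (j = 0; j < m; j++)
      c[i * m + j] = p * a[i * m + j] * b[j];
}

int main(void)
{
  const float a[2 * 3] = { 1, 2, 3, 4, 5, 6 };
  const float b[3] = { 10, 20, 30 };
  float c[2 * 3];
  broadcast_mul_2d(0.5f, a, b, c, 2, 3);
  printf("%g %g %g\n", c[0], c[1], c[2]); /* prints 5 20 45 */
  return 0;
}
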
 184 |       |
 185 |       | static int _ccv_nnc_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 186 | 4.24k | {
 187 | 4.24k |   assert(input_size == 2);
 188 | 4.24k |   _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
 189 | 4.24k |   return CCV_NNC_EXEC_SUCCESS;
 190 | 4.24k | }
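
Note: _ccv_nnc_mul_forw is the CPU entry point that ccv_nnc dispatches to for CCV_NNC_MUL_FORWARD (see the REGISTER_COMMAND_BACKEND block below). A typical invocation from user code, patterned loosely on the ccv_nnc test suite, would look roughly like the sketch below; the convenience macros used here (CMD_MUL_FORWARD, CPU_TENSOR_NHWC, TENSOR_LIST) are assumptions to be checked against the headers rather than something this file defines.

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

/* Sketch: compute c = 0.5 * a * b on CPU through the command interface. */
static void example_mul_forward(void)
{
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  /* ... fill a->data.f32 and b->data.f32 ... */
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
}
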
 191 |       |
 192 |       | static int _ccv_nnc_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 193 | 4.12k | {
 194 | 4.12k |   int gdim[CCV_NNC_MAX_DIM_ALLOC];
 195 | 4.12k |   int no_broadcasting = 1;
 196 | 4.12k |   if (outputs[0])
 197 | 2.11k |   {
 198 | 2.11k |     assert(input_size >= 3 && inputs[2]);
 199 | 2.11k |     ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim);
 200 | 2.11k |     ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim);
 201 | 2.11k |     no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim));
 202 | 2.11k |   }
 203 | 4.12k |   if (no_broadcasting && output_size > 1 && outputs[1])
 204 | 4.11k |   {
 205 | 4.11k |     assert(inputs[1]);
 206 | 4.11k |     ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim);
 207 | 4.11k |     ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim);
 208 | 4.11k |     no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim));
 209 | 4.11k |   }
 210 | 4.12k |   if (no_broadcasting)
 211 | 4.11k |   {
 212 | 4.11k |     if (outputs[0])
 213 | 2.10k |     {
 214 | 2.10k |       if (inputs[0] == 0)
 215 |     0 |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[2], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 216 | 2.10k |       else
 217 | 2.10k |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
 218 | 2.10k |     }
 219 | 4.11k |     if (output_size > 1 && outputs[1])
 220 | 4.11k |     {
 221 | 4.11k |       if (inputs[0] == 0)
 222 |     0 |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[1], 0, (ccv_nnc_tensor_view_t*)outputs[1]);
 223 | 4.11k |       else
 224 | 4.11k |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]);
 225 | 4.11k |     }
 226 | 4.11k |     return CCV_NNC_EXEC_SUCCESS;
 227 | 4.11k |   }
 228 |    13 |   int adim[CCV_NNC_MAX_DIM_ALLOC];
 229 |    13 |   int bdim[CCV_NNC_MAX_DIM_ALLOC];
 230 |    13 |   int astride[CCV_NNC_MAX_DIM_ALLOC];
 231 |    13 |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
 232 |    13 |   int i[CCV_NNC_MAX_DIM + 2];
 233 |    13 |   int x;
 234 |    13 |   const float p = cmd.info.blas.a[0];
 235 |       |   // Now the case we need broadcasting.
 236 |    13 |   if (inputs[0] == 0)
 237 |     3 |   {
 238 |     3 |     if (outputs[0])
 239 |     3 |     {
 240 |     3 |       ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 241 |     3 |       ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
 242 |     3 |       ccv_nnc_tensor_view_get_dim(a, adim);
 243 |     3 |       ccv_nnc_tensor_view_get_dim(b, bdim);
 244 |     3 |       ccv_nnc_tensor_view_get_stride(a, astride);
 245 |     3 |       ccv_nnc_tensor_view_get_stride(b, bstride);
 246 |     3 |       ccv_nnc_tensor_zero(a);
 247 |     3 |       float* const ap = a->data.f32;
 248 |     3 |       float* const bp = b->data.f32;
 249 |     6 |       for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 250 |     3 |       {
 251 |     3 |         float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 252 |     3 |         float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 253 |     6 |         for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 254 |     3 |         {
 255 |     3 |           float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 256 |     3 |           float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 257 |    11 |           for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 258 |     8 |           {
 259 |     8 |             float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 260 |     8 |             float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 261 |     8 |             if (adim[3] == 1)
 262 |    12 |               for (x = 0; x < gdim[3]; x++)
 263 |     8 |                 ap2[0] += p * bp2[x];
 264 |     4 |             else if (bdim[3] == 1)
 265 |     0 |               for (x = 0; x < gdim[3]; x++)
 266 |     0 |                 ap2[x] += p * bp2[0];
 267 |     4 |             else
 268 |    16 |               for (x = 0; x < gdim[3]; x++)
 269 |    12 |                 ap2[x] += p * bp2[x];
 270 |     8 |           }
 271 |     3 |         }
 272 |     3 |       }
 273 |     3 |     }
 274 |     3 |     if (output_size > 1 && outputs[1])
 275 |     3 |     {
 276 |     3 |       ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 277 |     3 |       ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
 278 |     3 |       ccv_nnc_tensor_view_get_dim(a, adim);
 279 |     3 |       ccv_nnc_tensor_view_get_dim(b, bdim);
 280 |     3 |       ccv_nnc_tensor_view_get_stride(a, astride);
 281 |     3 |       ccv_nnc_tensor_view_get_stride(b, bstride);
 282 |     3 |       ccv_nnc_tensor_zero(a);
 283 |     3 |       float* const ap = a->data.f32;
 284 |     3 |       float* const bp = b->data.f32;
 285 |     6 |       for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 286 |     3 |       {
 287 |     3 |         float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 288 |     3 |         float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 289 |     6 |         for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 290 |     3 |         {
 291 |     3 |           float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 292 |     3 |           float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 293 |    11 |           for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 294 |     8 |           {
 295 |     8 |             float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 296 |     8 |             float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 297 |     8 |             if (adim[3] == 1)
 298 |     0 |               for (x = 0; x < gdim[3]; x++)
 299 |     0 |                 ap2[0] += p * bp2[x];
 300 |     8 |             else if (bdim[3] == 1)
 301 |    12 |               for (x = 0; x < gdim[3]; x++)
 302 |     8 |                 ap2[x] += p * bp2[0];
 303 |     4 |             else
 304 |    16 |               for (x = 0; x < gdim[3]; x++)
 305 |    12 |                 ap2[x] += p * bp2[x];
 306 |     8 |           }
 307 |     3 |         }
 308 |     3 |       }
 309 |     3 |     }
 310 |     3 |     return CCV_NNC_EXEC_SUCCESS;
 311 |     3 |   }
 312 |    10 |   int gstride[CCV_NNC_MAX_DIM_ALLOC];
 313 |    10 |   ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
 314 |    10 |   ccv_nnc_tensor_view_get_dim(g, gdim);
 315 |    10 |   ccv_nnc_tensor_view_get_stride(g, gstride);
 316 |    10 |   if (outputs[0])
 317 |     9 |   {
 318 |     9 |     ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 319 |     9 |     ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
 320 |     9 |     ccv_nnc_tensor_view_get_dim(a, adim);
 321 |     9 |     ccv_nnc_tensor_view_get_dim(b, bdim);
 322 |     9 |     ccv_nnc_tensor_view_get_stride(a, astride);
 323 |     9 |     ccv_nnc_tensor_view_get_stride(b, bstride);
 324 |     9 |     ccv_nnc_tensor_zero(a);
 325 |     9 |     float* const ap = a->data.f32;
 326 |     9 |     float* const bp = b->data.f32;
 327 |     9 |     float* const gp = g->data.f32;
 328 |    61 |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 329 |    52 |     {
 330 |    52 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 331 |    52 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 332 |    52 |       float* const gp0 = gp + i[0] * gstride[0];
 333 |   250 |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 334 |   198 |       {
 335 |   198 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 336 |   198 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 337 |   198 |         float* gp1 = gp0 + i[1] * gstride[1];
 338 |   979 |         for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 339 |   781 |         {
 340 |   781 |           float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 341 |   781 |           float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 342 |   781 |           if (adim[3] == 1)
 343 |    12 |             for (x = 0; x < gdim[3]; x++)
 344 |     8 |               ap2[0] += p * gp1[x] * bp2[x];
 345 |   777 |           else if (bdim[3] == 1)
 346 | 5.73k |             for (x = 0; x < gdim[3]; x++)
 347 | 5.22k |               ap2[x] += p * gp1[x] * bp2[0];
 348 |   264 |           else
 349 | 1.62k |             for (x = 0; x < gdim[3]; x++)
 350 | 1.36k |               ap2[x] += p * gp1[x] * bp2[x];
 351 |   781 |           gp1 += gstride[2];
 352 |   781 |         }
 353 |   198 |       }
 354 |    52 |     }
 355 |     9 |   }
 356 |    10 |   if (output_size > 1 && outputs[1])
 357 |    10 |   {
 358 |    10 |     ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 359 |    10 |     ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
 360 |    10 |     ccv_nnc_tensor_view_get_dim(a, adim);
 361 |    10 |     ccv_nnc_tensor_view_get_dim(b, bdim);
 362 |    10 |     ccv_nnc_tensor_view_get_stride(a, astride);
 363 |    10 |     ccv_nnc_tensor_view_get_stride(b, bstride);
 364 |    10 |     ccv_nnc_tensor_zero(a);
 365 |    10 |     float* const ap = a->data.f32;
 366 |    10 |     float* const bp = b->data.f32;
 367 |    10 |     float* const gp = g->data.f32;
 368 |    63 |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 369 |    53 |     {
 370 |    53 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 371 |    53 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 372 |    53 |       float* const gp0 = gp + i[0] * gstride[0];
 373 |   252 |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 374 |   199 |       {
 375 |   199 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 376 |   199 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 377 |   199 |         float* gp1 = gp0 + i[1] * gstride[1];
 378 |   982 |         for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 379 |   783 |         {
 380 |   783 |           float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 381 |   783 |           float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 382 |   783 |           if (adim[3] == 1)
 383 | 5.73k |             for (x = 0; x < gdim[3]; x++)
 384 | 5.22k |               ap2[0] += p * gp1[x] * bp2[x];
 385 |   268 |           else if (bdim[3] == 1)
 386 |    12 |             for (x = 0; x < gdim[3]; x++)
 387 |     8 |               ap2[x] += p * gp1[x] * bp2[0];
 388 |   264 |           else
 389 | 1.62k |             for (x = 0; x < gdim[3]; x++)
 390 | 1.36k |               ap2[x] += p * gp1[x] * bp2[x];
 391 |   783 |           gp1 += gstride[2];
 392 |   783 |         }
 393 |   199 |       }
 394 |    53 |     }
 395 |    10 |   }
 396 |    10 |   return CCV_NNC_EXEC_SUCCESS;
 397 |    13 | }
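
Note: for c = p * a * b, the backward pass above computes da = p * g * b and db = p * g * a, where g is the incoming gradient (inputs[0], with inputs[1] and inputs[2] being the forward-pass a and b). When an operand was broadcast in the forward pass, its gradient must be reduced (summed) over the broadcast axes, which is why the broadcasting branches zero the output tensor first and then accumulate with +=. A minimal standalone sketch of that reduction for a [1, m] operand broadcast across n rows (the helper name reduce_grad_b is hypothetical):

#include <string.h>

/* Sketch: db[j] = sum over i of p * g[i][j] * a[i][j], mirroring the += accumulation above. */
static void reduce_grad_b(const float p, const float* const g, const float* const a, float* const db, const int n, const int m)
{
  int i, j;
  memset(db, 0, sizeof(float) * m);
  for (i = 0; i < n; i++)
    for (j = 0; j < m; j++)
      db[j] += p * g[i * m + j] * a[i * m + j];
}
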
 398 |       |
 399 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 400 |     1 | {
 401 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 402 |     1 |   registry->tensor_datatypes = CCV_32F;
 403 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 404 |     1 |   registry->algorithms = 1;
 405 |     1 |   registry->exec = _ccv_nnc_mul_forw;
 406 |     1 | }
 407 |       |
 408 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 409 |     1 | {
 410 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 411 |     1 |   registry->tensor_datatypes = CCV_32F;
 412 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 413 |     1 |   registry->algorithms = 1;
 414 |     1 |   registry->exec = _ccv_nnc_mul_back;
 415 |     1 | }
 416 |       |
 417 |       | static int _ccv_nnc_scalar_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 418 |    58 | {
 419 |    58 |   _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 420 |    58 |   return CCV_NNC_EXEC_SUCCESS;
 421 |    58 | }
 422 |       | static int _ccv_nnc_scalar_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 423 |    21 | {
 424 |    21 |   if (inputs[0])
 425 |    21 |     _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 426 |     0 |   else
 427 |     0 |     _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]);
 428 |    21 |   return CCV_NNC_EXEC_SUCCESS;
 429 |    21 | }
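
Note: the scalar variant reuses the same kernel with b == 0, so the forward pass is simply c = p * a. Since dc/da = p, the backward pass multiplies the incoming gradient by p, and when no gradient tensor is supplied it fills the output with the constant p via _ccv_nnc_tensor_set_cpu_ref_f32. A one-line sketch of the scalar rule, independent of the ccv_nnc tensor types:

/* Sketch: y = p * x in the forward pass, dL/dx = p * dL/dy in the backward pass. */
static float scalar_mul(const float p, const float x) { return p * x; }
static float scalar_mul_grad(const float p, const float dy) { return p * dy; }
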
 430 |       |
 431 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 432 |     1 | {
 433 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 434 |     1 |   registry->tensor_datatypes = CCV_32F;
 435 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 436 |     1 |   registry->algorithms = 1;
 437 |     1 |   registry->exec = _ccv_nnc_scalar_mul_forw;
 438 |     1 | }
 439 |       |
 440 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 441 |     1 | {
 442 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 443 |     1 |   registry->tensor_datatypes = CCV_32F;
 444 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 445 |     1 |   registry->algorithms = 1;
 446 |     1 |   registry->exec = _ccv_nnc_scalar_mul_back;
 447 |     1 | }