Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/blas/ccv_nnc_mul_cpu_ref.c
Line | Count | Source
   1 |       | #include <ccv.h>
   2 |       | #include <ccv_internal.h>
   3 |       | #include <nnc/ccv_nnc.h>
   4 |       | #include <nnc/ccv_nnc_easy.h>
   5 |       | #include <nnc/ccv_nnc_internal.h>
   6 |       | #ifdef USE_OPENMP
   7 |       | #include <omp.h>
   8 |       | #endif
   9 |       | #ifdef USE_DISPATCH
  10 |       | #include <dispatch/dispatch.h>
  11 |       | #endif
  12 |       |
  13 |       | // Shared methods.
  14 |       | #include "../_ccv_nnc_cpu_ref.h"
  15 |       |
  16 |       | void _ccv_nnc_mul_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
  17 |    40 | {
  18 |    40 |   if (b == 0)
  19 |    32 |   {
  20 |    32 |     if (p == 1)
  21 |     0 |     {
  22 |     0 |       _ccv_nnc_tensor_transfer_cpu_ref(a, c);
  23 |     0 |       return;
  24 |    32 |     } else if (p == 0) {
  25 |     0 |       ccv_nnc_tensor_zero(c);
  26 |     0 |       return;
  27 |     0 |     }
  28 |    32 |     // Assuming this is float 32.
  29 |    32 |     int dim[CCV_NNC_MAX_DIM + 2];
  30 |    32 |     int ainc[CCV_NNC_MAX_DIM + 2];
  31 |    32 |     int cinc[CCV_NNC_MAX_DIM + 2];
  32 |    32 |     assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
  33 |    32 |     assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
  34 |    32 |     ccv_nnc_tensor_view_get_dim(a, dim);
  35 |    32 |     assert(ccv_nnc_tensor_view_check_dim(c, dim));
  36 |    32 |     int x;
  37 |    32 |     if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c))
  38 |    32 |     {
  39 |    32 |       // Super optimal case, just do one for-loop for the multiplication.
  40 |    32 |       const int tensor_count = ccv_nnc_tensor_count(a->info);
  41 |   343 |       for (x = 0; x < tensor_count; x++)
  42 |   311 |         c->data.f32[x] = p * a->data.f32[x];
  43 |    32 |       return;
  44 |    32 |     }
  45 |     0 |     assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change for other values of CCV_NNC_MAX_DIM.
  46 |     0 |     ccv_nnc_tensor_view_get_inc(a, ainc);
  47 |     0 |     ccv_nnc_tensor_view_get_inc(c, cinc);
  48 |     0 |     int i[CCV_NNC_MAX_DIM + 2];
  49 |     0 |     float* ap = a->data.f32;
  50 |     0 |     float* cp = c->data.f32;
  51 |     0 |     const int count = dim[2] * dim[3];
  52 |     0 |     if (ainc[3] == dim[3] && cinc[3] == dim[3])
  53 |     0 |     {
  54 |     0 |       // Special case when ainc[3] is the same as dim[3].
  55 |     0 |       for (i[0] = 0; i[0] < dim[0]; i[0]++)
  56 |     0 |       {
  57 |     0 |         for (i[1] = 0; i[1] < dim[1]; i[1]++)
  58 |     0 |         {
  59 |     0 |           for (x = 0; x < count; x++)
  60 |     0 |             cp[x] = p * ap[x];
  61 |     0 |           ap += ainc[2] * ainc[3];
  62 |     0 |           cp += cinc[2] * cinc[3];
  63 |     0 |         }
  64 |     0 |         ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
  65 |     0 |         cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
  66 |     0 |       }
  67 |     0 |       return;
  68 |     0 |     }
  69 |     0 |     // Non-optimal case, need to do a skip copy.
  70 |     0 |     for (i[0] = 0; i[0] < dim[0]; i[0]++)
  71 |     0 |     {
  72 |     0 |       for (i[1] = 0; i[1] < dim[1]; i[1]++)
  73 |     0 |       {
  74 |     0 |         for (i[2] = 0; i[2] < dim[2]; i[2]++)
  75 |     0 |         {
  76 |     0 |           for (x = 0; x < dim[3]; x++)
  77 |     0 |             cp[x] = p * ap[x];
  78 |     0 |           ap += ainc[3];
  79 |     0 |           cp += cinc[3];
  80 |     0 |         }
  81 |     0 |         ap += (ainc[2] - dim[2]) * ainc[3];
  82 |     0 |         cp += (cinc[2] - dim[2]) * cinc[3];
  83 |     0 |       }
  84 |     0 |       ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
  85 |     0 |       cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
  86 |     0 |     }
  87 |     0 |     return;
  88 |     0 |   }
  89 |     8 |   int cdim[CCV_NNC_MAX_DIM + 2];
  90 |     8 |   assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
  91 |     8 |   assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
  92 |     8 |   ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
  93 |     8 |   ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
  94 |     8 |   assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
  95 |     8 |   assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
  96 |     8 |   const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
  97 |     8 |   const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
  98 |     8 |   if (p == 1 && a_check_dim && b_check_dim)
  99 |     0 |   {
 100 |     0 |     _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
 101 |     0 |       a, b
 102 |     0 |     }, 2, &c, 1);
 103 |     0 |     return;
 104 |     8 |   } else if (p == 0) {
 105 |     0 |     ccv_nnc_tensor_zero(c);
 106 |     0 |     return;
 107 |     0 |   }
 108 |     8 |   // Assuming this is float 32.
 109 |     8 |   int adim[CCV_NNC_MAX_DIM + 2];
 110 |     8 |   int bdim[CCV_NNC_MAX_DIM + 2];
 111 |     8 |   ccv_nnc_tensor_view_get_dim(a, adim);
 112 |     8 |   ccv_nnc_tensor_view_get_dim(b, bdim);
 113 |     8 |   int ainc[CCV_NNC_MAX_DIM + 2];
 114 |     8 |   int binc[CCV_NNC_MAX_DIM + 2];
 115 |     8 |   int cinc[CCV_NNC_MAX_DIM + 2];
 116 |     8 |   assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
 117 |     8 |   assert(ccv_nnc_tensor_view_check_dim(c, cdim));
 118 |     8 |   int x;
 119 |     8 |   if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
 120 |     3 |   {
 121 |     3 |     const int tensor_count = ccv_nnc_tensor_count(a->info);
 122 |     3 |     // Super optimal case, just do one for-loop for the multiplication.
 123 |    33 |     for (x = 0; x < tensor_count; x++)
 124 |    30 |       c->data.f32[x] = p * a->data.f32[x] * b->data.f32[x];
 125 |     3 |     return;
 126 |     3 |   }
 127 |     5 |   assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change for other values of CCV_NNC_MAX_DIM.
 128 |     5 |   ccv_nnc_tensor_view_get_inc(a, ainc);
 129 |     5 |   ccv_nnc_tensor_view_get_inc(b, binc);
 130 |     5 |   ccv_nnc_tensor_view_get_inc(c, cinc);
 131 |     5 |   int i[CCV_NNC_MAX_DIM + 2];
 132 |     5 |   float* ap = a->data.f32;
 133 |     5 |   float* bp = b->data.f32;
 134 |     5 |   float* cp = c->data.f32;
 135 |     5 |   const int count = cdim[2] * cdim[3];
 136 |     5 |   if (ainc[3] == cdim[3] && binc[3] == cdim[3] && cinc[3] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
 137 |     0 |   {
 138 |     0 |     // Special case when ainc[3] is the same as cdim[3].
 139 |     0 |     for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 140 |     0 |     {
 141 |     0 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
 142 |     0 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
 143 |     0 |       for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 144 |     0 |       {
 145 |     0 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
 146 |     0 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
 147 |     0 |         for (x = 0; x < count; x++)
 148 |     0 |           cp[x] = p * ap1[x] * bp1[x];
 149 |     0 |         cp += cinc[2] * cinc[3];
 150 |     0 |       }
 151 |     0 |       cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
 152 |     0 |     }
 153 |     0 |     return;
 154 |     0 |   }
 155 |     5 |   // Non-optimal case, need to do a skip copy and handle broadcasting.
 156 |    18 |   for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 157 |    13 |   {
 158 |    13 |     float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
 159 |    13 |     float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
 160 |    52 |     for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 161 |    39 |     {
 162 |    39 |       float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
 163 |    39 |       float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
 164 |   182 |       for (i[2] = 0; i[2] < cdim[2]; i[2]++)
 165 |   143 |       {
 166 |   143 |         float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
 167 |   143 |         float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
 168 |   143 |         if (adim[3] == 1)
 169 |    12 |           for (x = 0; x < cdim[3]; x++)
 170 |     8 |             cp[x] = p * ap2[0] * bp2[x];
 171 |   139 |         else if (bdim[3] == 1)
 172 |   101 |           for (x = 0; x < cdim[3]; x++)
 173 |   100 |             cp[x] = p * ap2[x] * bp2[0];
 174 |   138 |         else
 175 | 1.50k |           for (x = 0; x < cdim[3]; x++)
 176 | 1.36k |             cp[x] = p * ap2[x] * bp2[x];
 177 |   143 |         cp += cinc[3];
 178 |   143 |       }
 179 |    39 |       cp += (cinc[2] - cdim[2]) * cinc[3];
 180 |    39 |     }
 181 |    13 |     cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
 182 |    13 |   }
 183 |     5 | }
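
The broadcasting loop above (source lines 156-182) advances an operand's pointer only along dimensions whose extent is greater than 1; an operand with extent 1 is held fixed and reused, which is what gives c = p * a * b its NumPy-style broadcast semantics. Below is a minimal standalone sketch of the same rule, reduced to 2-D plain C arrays; the function name, shapes, and values are illustrative assumptions, not part of the file under test.

#include <stdio.h>

/* Broadcast multiply c = p * a * b in 2-D: an operand whose extent is 1
 * along a dimension is held fixed while the output index advances. */
static void mul_broadcast_2d(const float p, const float* const a, const int arows, const int acols,
	const float* const b, const int brows, const int bcols,
	float* const c, const int crows, const int ccols)
{
	int y, x;
	for (y = 0; y < crows; y++)
	{
		const float* const ap = a + (arows == 1 ? 0 : y * acols);
		const float* const bp = b + (brows == 1 ? 0 : y * bcols);
		for (x = 0; x < ccols; x++)
			c[y * ccols + x] = p * ap[acols == 1 ? 0 : x] * bp[bcols == 1 ? 0 : x];
	}
}

int main(void)
{
	const float a[2 * 3] = { 1, 2, 3, 4, 5, 6 }; // 2x3
	const float b[1 * 3] = { 10, 20, 30 };       // 1x3, broadcast across rows
	float c[2 * 3];
	int x;
	mul_broadcast_2d(0.5f, a, 2, 3, b, 1, 3, c, 2, 3);
	for (x = 0; x < 6; x++)
		printf("%g ", c[x]); // prints: 5 20 45 20 50 90
	printf("\n");
	return 0;
}
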
 184 |       |
 185 |       | static int _ccv_nnc_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 186 |     5 | {
 187 |     5 |   assert(input_size == 2);
 188 |     5 |   _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
 189 |     5 |   return CCV_NNC_EXEC_SUCCESS;
 190 |     5 | }
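
_ccv_nnc_mul_forw above just unpacks the scalar p from cmd.info.blas.a[0] and hands the two inputs to the reference kernel. A backend like this is normally reached through ccv_nnc_cmd_exec once it is registered (see the REGISTER_COMMAND_BACKEND blocks further down). A minimal sketch of such a call, assuming the usual convenience macros from nnc/ccv_nnc_easy.h (CMD_MUL_FORWARD, TENSOR_LIST, CPU_TENSOR_NHWC) behave as in the ccv test suite:

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

int main(void)
{
	ccv_nnc_init();
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
	a->data.f32[0] = 1; a->data.f32[1] = 2; a->data.f32[2] = 3;
	b->data.f32[0] = 4; b->data.f32[1] = 5; b->data.f32[2] = 6;
	// c = 0.5 * a * b, dispatched to _ccv_nnc_mul_forw through the registry.
	ccv_nnc_cmd_exec(CMD_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}
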
 191 |       |
 192 |       | static int _ccv_nnc_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 193 |     2 | {
 194 |     2 |   int gdim[CCV_NNC_MAX_DIM + 2];
 195 |     2 |   int no_broadcasting = 1;
 196 |     2 |   if (outputs[0])
 197 |     2 |   {
 198 |     2 |     assert(input_size >= 3 && inputs[2]);
 199 |     2 |     ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim);
 200 |     2 |     ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim);
 201 |     2 |     no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim));
 202 |     2 |   }
 203 |     2 |   if (no_broadcasting && output_size > 1 && outputs[1])
 204 |     0 |   {
 205 |     0 |     assert(inputs[1]);
 206 |     0 |     ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim);
 207 |     0 |     ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim);
 208 |     0 |     no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim));
 209 |     0 |   }
 210 |     2 |   if (no_broadcasting)
 211 |     0 |   {
 212 |     0 |     if (outputs[0])
 213 |     0 |     {
 214 |     0 |       if (inputs[0] == 0)
 215 |     0 |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[2], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 216 |     0 |       else
 217 |     0 |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
 218 |     0 |     }
 219 |     0 |     if (output_size > 1 && outputs[1])
 220 |     0 |     {
 221 |     0 |       if (inputs[0] == 0)
 222 |     0 |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[1], 0, (ccv_nnc_tensor_view_t*)outputs[1]);
 223 |     0 |       else
 224 |     0 |         _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]);
 225 |     0 |     }
 226 |     0 |     return CCV_NNC_EXEC_SUCCESS;
 227 |     0 |   }
 228 |     2 |   int adim[CCV_NNC_MAX_DIM + 2];
 229 |     2 |   int bdim[CCV_NNC_MAX_DIM + 2];
 230 |     2 |   int ainc[CCV_NNC_MAX_DIM + 2];
 231 |     2 |   int binc[CCV_NNC_MAX_DIM + 2];
 232 |     2 |   int i[CCV_NNC_MAX_DIM + 2];
 233 |     2 |   int x;
 234 |     2 |   const float p = cmd.info.blas.a[0];
 235 |     2 |   // Now the case where we need broadcasting.
 236 |     2 |   if (inputs[0] == 0)
 237 |     0 |   {
 238 |     0 |     if (outputs[0])
 239 |     0 |     {
 240 |     0 |       ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 241 |     0 |       ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
 242 |     0 |       ccv_nnc_tensor_view_get_dim(a, adim);
 243 |     0 |       ccv_nnc_tensor_view_get_dim(b, bdim);
 244 |     0 |       ccv_nnc_tensor_view_get_inc(a, ainc);
 245 |     0 |       ccv_nnc_tensor_view_get_inc(b, binc);
 246 |     0 |       ccv_nnc_tensor_zero(a);
 247 |     0 |       float* const ap = a->data.f32;
 248 |     0 |       float* const bp = b->data.f32;
 249 |     0 |       for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 250 |     0 |       {
 251 |     0 |         float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
 252 |     0 |         float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
 253 |     0 |         for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 254 |     0 |         {
 255 |     0 |           float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
 256 |     0 |           float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
 257 |     0 |           for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 258 |     0 |           {
 259 |     0 |             float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
 260 |     0 |             float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
 261 |     0 |             if (adim[3] == 1)
 262 |     0 |               for (x = 0; x < gdim[3]; x++)
 263 |     0 |                 ap2[0] += p * bp2[x];
 264 |     0 |             else if (bdim[3] == 1)
 265 |     0 |               for (x = 0; x < gdim[3]; x++)
 266 |     0 |                 ap2[x] += p * bp2[0];
 267 |     0 |             else
 268 |     0 |               for (x = 0; x < gdim[3]; x++)
 269 |     0 |                 ap2[x] += p * bp2[x];
 270 |     0 |           }
 271 |     0 |         }
 272 |     0 |       }
 273 |     0 |     }
 274 |     0 |     if (output_size > 1 && outputs[1])
 275 |     0 |     {
 276 |     0 |       ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 277 |     0 |       ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
 278 |     0 |       ccv_nnc_tensor_view_get_dim(a, adim);
 279 |     0 |       ccv_nnc_tensor_view_get_dim(b, bdim);
 280 |     0 |       ccv_nnc_tensor_view_get_inc(a, ainc);
 281 |     0 |       ccv_nnc_tensor_view_get_inc(b, binc);
 282 |     0 |       ccv_nnc_tensor_zero(a);
 283 |     0 |       float* const ap = a->data.f32;
 284 |     0 |       float* const bp = b->data.f32;
 285 |     0 |       for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 286 |     0 |       {
 287 |     0 |         float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
 288 |     0 |         float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
 289 |     0 |         for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 290 |     0 |         {
 291 |     0 |           float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
 292 |     0 |           float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
 293 |     0 |           for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 294 |     0 |           {
 295 |     0 |             float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
 296 |     0 |             float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
 297 |     0 |             if (adim[3] == 1)
 298 |     0 |               for (x = 0; x < gdim[3]; x++)
 299 |     0 |                 ap2[0] += p * bp2[x];
 300 |     0 |             else if (bdim[3] == 1)
 301 |     0 |               for (x = 0; x < gdim[3]; x++)
 302 |     0 |                 ap2[x] += p * bp2[0];
 303 |     0 |             else
 304 |     0 |               for (x = 0; x < gdim[3]; x++)
 305 |     0 |                 ap2[x] += p * bp2[x];
 306 |     0 |           }
 307 |     0 |         }
 308 |     0 |       }
 309 |     0 |     }
 310 |     0 |     return CCV_NNC_EXEC_SUCCESS;
 311 |     0 |   }
 312 |     2 |   int ginc[CCV_NNC_MAX_DIM + 2];
 313 |     2 |   ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
 314 |     2 |   ccv_nnc_tensor_view_get_dim(g, gdim);
 315 |     2 |   ccv_nnc_tensor_view_get_inc(g, ginc);
 316 |     2 |   if (outputs[0])
 317 |     2 |   {
 318 |     2 |     ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 319 |     2 |     ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
 320 |     2 |     ccv_nnc_tensor_view_get_dim(a, adim);
 321 |     2 |     ccv_nnc_tensor_view_get_dim(b, bdim);
 322 |     2 |     ccv_nnc_tensor_view_get_inc(a, ainc);
 323 |     2 |     ccv_nnc_tensor_view_get_inc(b, binc);
 324 |     2 |     ccv_nnc_tensor_zero(a);
 325 |     2 |     float* const ap = a->data.f32;
 326 |     2 |     float* const bp = b->data.f32;
 327 |     2 |     float* gp = g->data.f32;
 328 |     5 |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 329 |     3 |     {
 330 |     3 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
 331 |     3 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
 332 |     8 |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 333 |     5 |       {
 334 |     5 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
 335 |     5 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
 336 |    14 |         for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 337 |     9 |         {
 338 |     9 |           float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
 339 |     9 |           float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
 340 |     9 |           if (adim[3] == 1)
 341 |     0 |             for (x = 0; x < gdim[3]; x++)
 342 |     0 |               ap2[0] += p * gp[x] * bp2[x];
 343 |     9 |           else if (bdim[3] == 1)
 344 |   101 |             for (x = 0; x < gdim[3]; x++)
 345 |   100 |               ap2[x] += p * gp[x] * bp2[0];
 346 |     8 |           else
 347 |    88 |             for (x = 0; x < gdim[3]; x++)
 348 |    80 |               ap2[x] += p * gp[x] * bp2[x];
 349 |     9 |           gp += ginc[3];
 350 |     9 |         }
 351 |     5 |         gp += (ginc[2] - gdim[2]) * ginc[3];
 352 |     5 |       }
 353 |     3 |       gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
 354 |     3 |     }
 355 |     2 |   }
 356 |     2 |   if (output_size > 1 && outputs[1])
 357 |     2 |   {
 358 |     2 |     ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 359 |     2 |     ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
 360 |     2 |     ccv_nnc_tensor_view_get_dim(a, adim);
 361 |     2 |     ccv_nnc_tensor_view_get_dim(b, bdim);
 362 |     2 |     ccv_nnc_tensor_view_get_inc(a, ainc);
 363 |     2 |     ccv_nnc_tensor_view_get_inc(b, binc);
 364 |     2 |     ccv_nnc_tensor_zero(a);
 365 |     2 |     float* const ap = a->data.f32;
 366 |     2 |     float* const bp = b->data.f32;
 367 |     2 |     float* gp = g->data.f32;
 368 |     5 |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 369 |     3 |     {
 370 |     3 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
 371 |     3 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
 372 |     8 |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 373 |     5 |       {
 374 |     5 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
 375 |     5 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
 376 |    14 |         for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 377 |     9 |         {
 378 |     9 |           float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
 379 |     9 |           float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
 380 |     9 |           if (adim[3] == 1)
 381 |   101 |             for (x = 0; x < gdim[3]; x++)
 382 |   100 |               ap2[0] += p * gp[x] * bp2[x];
 383 |     8 |           else if (bdim[3] == 1)
 384 |     0 |             for (x = 0; x < gdim[3]; x++)
 385 |     0 |               ap2[x] += p * gp[x] * bp2[0];
 386 |     8 |           else
 387 |    88 |             for (x = 0; x < gdim[3]; x++)
 388 |    80 |               ap2[x] += p * gp[x] * bp2[x];
 389 |     9 |           gp += ginc[3];
 390 |     9 |         }
 391 |     5 |         gp += (ginc[2] - gdim[2]) * ginc[3];
 392 |     5 |       }
 393 |     3 |       gp += (ginc[1] - gdim[1]) * ginc[2] * ginc[3];
 394 |     3 |     }
 395 |     2 |   }
 396 |     2 |   return CCV_NNC_EXEC_SUCCESS;
 397 |     2 | }
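
For c = p * a * b with upstream gradient g, the backward pass above computes da = p * g * b and db = p * g * a. When an operand was broadcast in the forward pass, its gradient has to be reduced by summing over the broadcast dimensions, which is why each output is zeroed with ccv_nnc_tensor_zero and then accumulated with +=. A reduced 2-D sketch of that reduction (names and shapes are illustrative assumptions):

/* Gradient of c = p * a * b with respect to a 1 x cols operand b that was
 * broadcast across rows in the forward pass: db accumulates over rows. */
static void mul_grad_broadcast_b(const float p, const float* const g, const float* const a,
	float* const db, const int rows, const int cols)
{
	int y, x;
	for (x = 0; x < cols; x++)
		db[x] = 0; // mirrors the ccv_nnc_tensor_zero call before accumulation
	for (y = 0; y < rows; y++)
		for (x = 0; x < cols; x++)
			db[x] += p * g[y * cols + x] * a[y * cols + x];
}
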
 398 |       |
 399 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 400 |     1 | {
 401 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 402 |     1 |   registry->tensor_datatypes = CCV_32F;
 403 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 404 |     1 |   registry->algorithms = 1;
 405 |     1 |   registry->exec = _ccv_nnc_mul_forw;
 406 |     1 | }
 407 |       |
 408 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 409 |     1 | {
 410 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 411 |     1 |   registry->tensor_datatypes = CCV_32F;
 412 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 413 |     1 |   registry->algorithms = 1;
 414 |     1 |   registry->exec = _ccv_nnc_mul_back;
 415 |     1 | }
 416 |       |
 417 |       | static int _ccv_nnc_scalar_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 418 |     6 | {
 419 |     6 |   _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 420 |     6 |   return CCV_NNC_EXEC_SUCCESS;
 421 |     6 | }
 422 |       | static int _ccv_nnc_scalar_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 423 |     2 | {
 424 |     2 |   if (inputs[0])
 425 |     2 |     _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]);
 426 |     0 |   else
 427 |     0 |     _ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]);
 428 |     2 |   return CCV_NNC_EXEC_SUCCESS;
 429 |     2 | }
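
For the scalar form c = p * a, the gradient is simply da = p * g, so _ccv_nnc_scalar_mul_back reuses the forward kernel on the incoming gradient; when no gradient tensor is supplied (inputs[0] == 0, i.e. an implicit all-ones gradient), da is the constant p, which is what the _ccv_nnc_tensor_set_cpu_ref branch fills in. The same rule on a plain array (an illustrative sketch; g == 0 stands in for the missing gradient tensor):

/* da[x] = p * g[x] for c = p * a; with g implicitly all ones, da[x] = p. */
static void scalar_mul_grad(const float p, const float* const g, float* const da, const int n)
{
	int x;
	for (x = 0; x < n; x++)
		da[x] = g ? p * g[x] : p;
}
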
 430 |       |
 431 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 432 |     1 | {
 433 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 434 |     1 |   registry->tensor_datatypes = CCV_32F;
 435 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 436 |     1 |   registry->algorithms = 1;
 437 |     1 |   registry->exec = _ccv_nnc_scalar_mul_forw;
 438 |     1 | }
 439 |       |
 440 |       | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 441 |     1 | {
 442 |     1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 443 |     1 |   registry->tensor_datatypes = CCV_32F;
 444 |     1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 445 |     1 |   registry->algorithms = 1;
 446 |     1 |   registry->exec = _ccv_nnc_scalar_mul_back;
 447 |     1 | }