Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_cmul_cpu_ref.c
Line |  Count | Source
   1 |        | #include "ccv.h"
   2 |        | #include "ccv_internal.h"
   3 |        | #include "nnc/ccv_nnc.h"
   4 |        | #include "nnc/ccv_nnc_easy.h"
   5 |        | #include "nnc/ccv_nnc_internal.h"
   6 |        | #ifdef USE_OPENMP
   7 |        | #include <omp.h>
   8 |        | #endif
   9 |        | #ifdef USE_DISPATCH
  10 |        | #include <dispatch/dispatch.h>
  11 |        | #endif
  12 |        |
  13 |        | // Shared methods.
  14 |        | #include "../_ccv_nnc_cpu_ref.h"
  15 |        |
void _ccv_nnc_cmul_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
17
5
{
18
5
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
19
5
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
20
5
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
21
5
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
22
5
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
23
5
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
24
5
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
25
5
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
26
5
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
27
  // Assuming this is float 32.
28
5
  int adim[CCV_NNC_MAX_DIM_ALLOC];
29
5
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
30
5
  ccv_nnc_tensor_view_get_dim(a, adim);
31
5
  ccv_nnc_tensor_view_get_dim(b, bdim);
32
5
  int astride[CCV_NNC_MAX_DIM_ALLOC];
33
5
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
34
5
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
35
5
  assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
36
5
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
37
5
  int x;
38
5
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
39
4
  {
40
4
    const int tensor_count = ccv_nnc_tensor_count(a->info);
41
4
    assert(tensor_count % 2 == 0);
42
    // Super optimal case, just do one for-loop for sum.
43
214
    
for (x = 0; 4
x < tensor_count;
x += 2210
)
44
210
    {
45
210
      const float a0 = a->data.f32[x];
46
210
      const float a1 = a->data.f32[x + 1];
47
210
      const float b0 = b->data.f32[x];
48
210
      const float b1 = b->data.f32[x + 1];
49
210
      c->data.f32[x] = a0 * b0 - a1 * b1;
50
210
      c->data.f32[x + 1] = a0 * b1 + a1 * b0;
51
210
    }
52
4
    return;
53
4
  }
54
1
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
55
1
  ccv_nnc_tensor_view_get_stride(a, astride);
56
1
  ccv_nnc_tensor_view_get_stride(b, bstride);
57
1
  ccv_nnc_tensor_view_get_stride(c, cstride);
58
1
  int i[CCV_NNC_MAX_DIM + 2];
59
1
  float* const ap = a->data.f32;
60
1
  float* const bp = b->data.f32;
61
1
  float* const cp = c->data.f32;
62
1
  const int count = cdim[2] * cdim[3];
63
1
  assert(count % 2 == 0);
64
1
  if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
65
0
  {
66
    // Special casing if the ainc[3] is the same as dim[3]
67
0
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
68
0
    {
69
0
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
70
0
      float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
71
0
      float* cp0 = cp + i[0] * cstride[0];
72
0
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
73
0
      {
74
0
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
75
0
        float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
76
0
        for (x = 0; x < count; x += 2)
77
0
        {
78
0
          const float a0 = ap1[x];
79
0
          const float a1 = ap1[x + 1];
80
0
          const float b0 = bp1[x];
81
0
          const float b1 = bp1[x + 1];
82
0
          cp0[x] = a0 * b0 - a1 * b1;
83
0
          cp0[x + 1] = a0 * b1 + a1 * b0;
84
0
        }
85
0
        cp0 += cstride[1];
86
0
      }
87
0
    }
88
0
    return;
89
0
  }
90
1
  assert(adim[3] == cdim[3]);
91
1
  assert(bdim[3] == cdim[3]);
92
  // Non-optimal case, need to do skip copy and handle broadcasting.
93
2
  
for (i[0] = 0; 1
i[0] < cdim[0];
i[0]++1
)
94
1
  {
95
1
    float* const ap0 = adim[0] == 1 ? ap : 
ap + i[0] * astride[0]0
;
96
1
    float* const bp0 = bdim[0] == 1 ? bp : 
bp + i[0] * bstride[0]0
;
97
1
    float* const cp0 = cp + i[0] * cstride[0];
98
6
    for (i[1] = 0; i[1] < cdim[1]; 
i[1]++5
)
99
5
    {
100
5
      float* const ap1 = adim[1] == 1 ? 
ap00
: ap0 + i[1] * astride[1];
101
5
      float* const bp1 = bdim[1] == 1 ? 
bp00
: bp0 + i[1] * bstride[1];
102
5
      float* cp1 = cp0 + i[1] * cstride[1];
103
45
      for (i[2] = 0; i[2] < cdim[2]; 
i[2]++40
)
104
40
      {
105
40
        float* const ap2 = adim[2] == 1 ? 
ap10
: ap1 + i[2] * astride[2];
106
40
        float* const bp2 = bdim[2] == 1 ? bp1 : 
bp1 + i[2] * bstride[2]0
;
107
2.60k
        for (x = 0; x < cdim[3]; 
x += 22.56k
)
108
2.56k
        {
109
2.56k
          const float a0 = ap2[x];
110
2.56k
          const float a1 = ap2[x + 1];
111
2.56k
          const float b0 = bp2[x];
112
2.56k
          const float b1 = bp2[x + 1];
113
2.56k
          cp1[x] = a0 * b0 - a1 * b1;
114
2.56k
          cp1[x + 1] = a0 * b1 + a1 * b0;
115
2.56k
        }
116
40
        cp1 += cstride[2];
117
40
      }
118
5
    }
119
1
  }
120
1
}
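For reference, lines 49-50 and 113-114 above apply the standard complex product to interleaved (real, imaginary) float pairs:

(a_0 + a_1 i)(b_0 + b_1 i) = (a_0 b_0 - a_1 b_1) + (a_0 b_1 + a_1 b_0)\,i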
 121 |        |
 122 |        | void _ccv_nnc_cmul_conj_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
 123 |      5 | {
 124 |      5 |   int cdim[CCV_NNC_MAX_DIM_ALLOC];
 125 |      5 |   assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
 126 |      5 |   assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
 127 |      5 |   ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
 128 |      5 |   ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
 129 |      5 |   assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
 130 |      5 |   assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
 131 |      5 |   const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
 132 |      5 |   const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
 133 |        |   // Assuming this is float 32.
 134 |      5 |   int adim[CCV_NNC_MAX_DIM_ALLOC];
 135 |      5 |   int bdim[CCV_NNC_MAX_DIM_ALLOC];
 136 |      5 |   ccv_nnc_tensor_view_get_dim(a, adim);
 137 |      5 |   ccv_nnc_tensor_view_get_dim(b, bdim);
 138 |      5 |   int astride[CCV_NNC_MAX_DIM_ALLOC];
 139 |      5 |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
 140 |      5 |   int cstride[CCV_NNC_MAX_DIM_ALLOC];
 141 |      5 |   assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
 142 |      5 |   assert(ccv_nnc_tensor_view_check_dim(c, cdim));
 143 |      5 |   int x;
 144 |      5 |   if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
 145 |      5 |   {
 146 |      5 |     const int tensor_count = ccv_nnc_tensor_count(a->info);
 147 |      5 |     assert(tensor_count % 2 == 0);
 148 |        |     // Super optimal case, just do one for-loop for sum.
 149 |    410 |     for (x = 0; x < tensor_count; x += 2)
 150 |    405 |     {
 151 |    405 |       const float a0 = a->data.f32[x];
 152 |    405 |       const float a1 = a->data.f32[x + 1];
 153 |    405 |       const float b0 = b->data.f32[x];
 154 |    405 |       const float b1 = b->data.f32[x + 1];
 155 |    405 |       c->data.f32[x] = a0 * b0 + a1 * b1;
 156 |    405 |       c->data.f32[x + 1] = -a0 * b1 + a1 * b0;
 157 |    405 |     }
 158 |      5 |     return;
 159 |      5 |   }
 160 |      0 |   assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
 161 |      0 |   ccv_nnc_tensor_view_get_stride(a, astride);
 162 |      0 |   ccv_nnc_tensor_view_get_stride(b, bstride);
 163 |      0 |   ccv_nnc_tensor_view_get_stride(c, cstride);
 164 |      0 |   int i[CCV_NNC_MAX_DIM + 2];
 165 |      0 |   float* const ap = a->data.f32;
 166 |      0 |   float* const bp = b->data.f32;
 167 |      0 |   float* const cp = c->data.f32;
 168 |      0 |   const int count = cdim[2] * cdim[3];
 169 |      0 |   assert(count % 2 == 0);
 170 |      0 |   if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
 171 |      0 |   {
 172 |        |     // Special casing if the ainc[3] is the same as dim[3]
 173 |      0 |     for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 174 |      0 |     {
 175 |      0 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 176 |      0 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 177 |      0 |       float* cp0 = cp + i[0] * cstride[0];
 178 |      0 |       for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 179 |      0 |       {
 180 |      0 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 181 |      0 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 182 |      0 |         for (x = 0; x < count; x += 2)
 183 |      0 |         {
 184 |      0 |           const float a0 = ap1[x];
 185 |      0 |           const float a1 = ap1[x + 1];
 186 |      0 |           const float b0 = bp1[x];
 187 |      0 |           const float b1 = bp1[x + 1];
 188 |      0 |           cp0[x] = a0 * b0 + a1 * b1;
 189 |      0 |           cp0[x + 1] = -a0 * b1 + a1 * b0;
 190 |      0 |         }
 191 |      0 |         cp0 += cstride[1];
 192 |      0 |       }
 193 |      0 |     }
 194 |      0 |     return;
 195 |      0 |   }
 196 |      0 |   assert(adim[3] == cdim[3]);
 197 |      0 |   assert(bdim[3] == cdim[3]);
 198 |        |   // Non-optimal case, need to do skip copy and handle broadcasting.
 199 |      0 |   for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 200 |      0 |   {
 201 |      0 |     float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 202 |      0 |     float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 203 |      0 |     float* const cp0 = cp + i[0] * cstride[0];
 204 |      0 |     for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 205 |      0 |     {
 206 |      0 |       float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 207 |      0 |       float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 208 |      0 |       float* cp1 = cp0 + i[1] * cstride[1];
 209 |      0 |       for (i[2] = 0; i[2] < cdim[2]; i[2]++)
 210 |      0 |       {
 211 |      0 |         float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 212 |      0 |         float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 213 |      0 |         for (x = 0; x < cdim[3]; x += 2)
 214 |      0 |         {
 215 |      0 |           const float a0 = ap2[x];
 216 |      0 |           const float a1 = ap2[x + 1];
 217 |      0 |           const float b0 = bp2[x];
 218 |      0 |           const float b1 = bp2[x + 1];
 219 |      0 |           cp1[x] = a0 * b0 + a1 * b1;
 220 |      0 |           cp1[x + 1] = -a0 * b1 + a1 * b0;
 221 |      0 |         }
 222 |      0 |         cp1 += cstride[2];
 223 |      0 |       }
 224 |      0 |     }
 225 |      0 |   }
 226 |      0 | }
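For reference, this variant multiplies by the conjugate of the second operand; lines 155-156 and 219-220 compute

a \cdot \overline{b} = (a_0 + a_1 i)(b_0 - b_1 i) = (a_0 b_0 + a_1 b_1) + (a_1 b_0 - a_0 b_1)\,i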
 227 |        |
 228 |        | void _ccv_nnc_conj_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const c)
 229 |      0 | {
 230 |      0 |   int cdim[CCV_NNC_MAX_DIM_ALLOC];
 231 |      0 |   assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
 232 |      0 |   ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
 233 |      0 |   assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
 234 |      0 |   const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
 235 |        |   // Assuming this is float 32.
 236 |      0 |   int adim[CCV_NNC_MAX_DIM_ALLOC];
 237 |      0 |   ccv_nnc_tensor_view_get_dim(a, adim);
 238 |      0 |   int astride[CCV_NNC_MAX_DIM_ALLOC];
 239 |      0 |   int cstride[CCV_NNC_MAX_DIM_ALLOC];
 240 |      0 |   assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
 241 |      0 |   assert(ccv_nnc_tensor_view_check_dim(c, cdim));
 242 |      0 |   int x;
 243 |      0 |   if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim)
 244 |      0 |   {
 245 |      0 |     const int tensor_count = ccv_nnc_tensor_count(a->info);
 246 |      0 |     assert(tensor_count % 2 == 0);
 247 |        |     // Super optimal case, just do one for-loop for sum.
 248 |      0 |     for (x = 0; x < tensor_count; x += 2)
 249 |      0 |     {
 250 |      0 |       c->data.f32[x] = a->data.f32[x];
 251 |      0 |       c->data.f32[x + 1] = -a->data.f32[x + 1];
 252 |      0 |     }
 253 |      0 |     return;
 254 |      0 |   }
 255 |      0 |   assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
 256 |      0 |   ccv_nnc_tensor_view_get_stride(a, astride);
 257 |      0 |   ccv_nnc_tensor_view_get_stride(c, cstride);
 258 |      0 |   int i[CCV_NNC_MAX_DIM + 2];
 259 |      0 |   float* const ap = a->data.f32;
 260 |      0 |   float* const cp = c->data.f32;
 261 |      0 |   const int count = cdim[2] * cdim[3];
 262 |      0 |   assert(count % 2 == 0);
 263 |      0 |   if (astride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2])
 264 |      0 |   {
 265 |        |     // Special casing if the ainc[3] is the same as dim[3]
 266 |      0 |     for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 267 |      0 |     {
 268 |      0 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 269 |      0 |       float* cp0 = cp + i[0] * cstride[0];
 270 |      0 |       for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 271 |      0 |       {
 272 |      0 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 273 |      0 |         for (x = 0; x < count; x += 2)
 274 |      0 |         {
 275 |      0 |           cp0[x] = ap1[x];
 276 |      0 |           cp0[x + 1] = -ap1[x + 1];
 277 |      0 |         }
 278 |      0 |         cp0 += cstride[1];
 279 |      0 |       }
 280 |      0 |     }
 281 |      0 |     return;
 282 |      0 |   }
 283 |      0 |   assert(adim[3] == cdim[3]);
 284 |        |   // Non-optimal case, need to do skip copy and handle broadcasting.
 285 |      0 |   for (i[0] = 0; i[0] < cdim[0]; i[0]++)
 286 |      0 |   {
 287 |      0 |     float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 288 |      0 |     float* const cp0 = cp + i[0] * cstride[0];
 289 |      0 |     for (i[1] = 0; i[1] < cdim[1]; i[1]++)
 290 |      0 |     {
 291 |      0 |       float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 292 |      0 |       float* cp1 = cp0 + i[1] * cstride[1];
 293 |      0 |       for (i[2] = 0; i[2] < cdim[2]; i[2]++)
 294 |      0 |       {
 295 |      0 |         float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 296 |      0 |         for (x = 0; x < cdim[3]; x += 2)
 297 |      0 |         {
 298 |      0 |           cp1[x] = ap2[x];
 299 |      0 |           cp1[x + 1] = -ap2[x + 1];
 300 |      0 |         }
 301 |      0 |         cp1 += cstride[2];
 302 |      0 |       }
 303 |      0 |     }
 304 |      0 |   }
 305 |      0 | }
 306 |        |
 307 |        | static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 308 |      5 | {
 309 |      5 |   assert(input_size == 2);
 310 |      5 |   _ccv_nnc_cmul_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
 311 |      5 |   return CCV_NNC_EXEC_SUCCESS;
 312 |      5 | }
 313 |        |
 314 |        | static int _ccv_nnc_cmul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 315 |      3 | {
 316 |      3 |   int gdim[CCV_NNC_MAX_DIM_ALLOC];
 317 |      3 |   int no_broadcasting = 1;
 318 |      3 |   if (outputs[0])
 319 |      3 |   {
 320 |      3 |     assert(input_size >= 3 && inputs[2]);
 321 |      3 |     ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim);
 322 |      3 |     ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim);
 323 |      3 |     no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim));
 324 |      3 |   }
 325 |      3 |   if (no_broadcasting && output_size > 1 && outputs[1])
 326 |      2 |   {
 327 |      2 |     assert(inputs[1]);
 328 |      2 |     ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim);
 329 |      2 |     ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim);
 330 |      2 |     no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim));
 331 |      2 |   }
 332 |        |   // We compute with the conjugation of the gradient output similar to PyTorch: https://pytorch.org/docs/stable/notes/autograd.html#autograd-for-complex-numbers
 333 |        |   // Note that in the absence of gradient output, we simply compute the conjugation of the other input.
 334 |      3 |   if (no_broadcasting)
 335 |      3 |   {
 336 |      3 |     if (outputs[0])
 337 |      3 |     {
 338 |      3 |       if (inputs[0] == 0)
 339 |      0 |         _ccv_nnc_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
 340 |      3 |       else
 341 |      3 |         _ccv_nnc_cmul_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
 342 |      3 |     }
 343 |      3 |     if (output_size > 1 && outputs[1])
 344 |      2 |     {
 345 |      2 |       if (inputs[0] == 0)
 346 |      0 |         _ccv_nnc_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]);
 347 |      2 |       else
 348 |      2 |         _ccv_nnc_cmul_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]);
 349 |      2 |     }
 350 |      3 |     return CCV_NNC_EXEC_SUCCESS;
 351 |      3 |   }
 352 |      0 |   int adim[CCV_NNC_MAX_DIM_ALLOC];
 353 |      0 |   int bdim[CCV_NNC_MAX_DIM_ALLOC];
 354 |      0 |   int astride[CCV_NNC_MAX_DIM_ALLOC];
 355 |      0 |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
 356 |      0 |   int i[CCV_NNC_MAX_DIM + 2];
 357 |      0 |   int x;
 358 |        |   // Now the case we need broadcasting.
 359 |      0 |   if (inputs[0] == 0)
 360 |      0 |   {
 361 |      0 |     if (outputs[0])
 362 |      0 |     {
 363 |      0 |       ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 364 |      0 |       ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
 365 |      0 |       ccv_nnc_tensor_view_get_dim(a, adim);
 366 |      0 |       ccv_nnc_tensor_view_get_dim(b, bdim);
 367 |      0 |       ccv_nnc_tensor_view_get_stride(a, astride);
 368 |      0 |       ccv_nnc_tensor_view_get_stride(b, bstride);
 369 |      0 |       ccv_nnc_tensor_zero(a);
 370 |      0 |       float* const ap = a->data.f32;
 371 |      0 |       float* const bp = b->data.f32;
 372 |      0 |       for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 373 |      0 |       {
 374 |      0 |         float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 375 |      0 |         float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 376 |      0 |         for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 377 |      0 |         {
 378 |      0 |           float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 379 |      0 |           float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 380 |      0 |           for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 381 |      0 |           {
 382 |      0 |             float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 383 |      0 |             float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 384 |      0 |             for (x = 0; x < gdim[3]; x++)
 385 |      0 |               ap2[x] += bp2[x];
 386 |      0 |           }
 387 |      0 |         }
 388 |      0 |       }
 389 |      0 |     }
 390 |      0 |     if (output_size > 1 && outputs[1])
 391 |      0 |     {
 392 |      0 |       ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 393 |      0 |       ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
 394 |      0 |       ccv_nnc_tensor_view_get_dim(a, adim);
 395 |      0 |       ccv_nnc_tensor_view_get_dim(b, bdim);
 396 |      0 |       ccv_nnc_tensor_view_get_stride(a, astride);
 397 |      0 |       ccv_nnc_tensor_view_get_stride(b, bstride);
 398 |      0 |       ccv_nnc_tensor_zero(a);
 399 |      0 |       float* const ap = a->data.f32;
 400 |      0 |       float* const bp = b->data.f32;
 401 |      0 |       for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 402 |      0 |       {
 403 |      0 |         float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 404 |      0 |         float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 405 |      0 |         for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 406 |      0 |         {
 407 |      0 |           float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 408 |      0 |           float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 409 |      0 |           for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 410 |      0 |           {
 411 |      0 |             float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 412 |      0 |             float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 413 |      0 |             for (x = 0; x < gdim[3]; x++)
 414 |      0 |               ap2[x] += bp2[x];
 415 |      0 |           }
 416 |      0 |         }
 417 |      0 |       }
 418 |      0 |     }
 419 |      0 |     return CCV_NNC_EXEC_SUCCESS;
 420 |      0 |   }
 421 |      0 |   int gstride[CCV_NNC_MAX_DIM_ALLOC];
 422 |      0 |   ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
 423 |      0 |   ccv_nnc_tensor_view_get_dim(g, gdim);
 424 |      0 |   ccv_nnc_tensor_view_get_stride(g, gstride);
 425 |      0 |   if (outputs[0])
 426 |      0 |   {
 427 |      0 |     ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0];
 428 |      0 |     ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
 429 |      0 |     ccv_nnc_tensor_view_get_dim(a, adim);
 430 |      0 |     ccv_nnc_tensor_view_get_dim(b, bdim);
 431 |      0 |     ccv_nnc_tensor_view_get_stride(a, astride);
 432 |      0 |     ccv_nnc_tensor_view_get_stride(b, bstride);
 433 |      0 |     ccv_nnc_tensor_zero(a);
 434 |      0 |     float* const ap = a->data.f32;
 435 |      0 |     float* const bp = b->data.f32;
 436 |      0 |     float* const gp = g->data.f32;
 437 |      0 |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 438 |      0 |     {
 439 |      0 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 440 |      0 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 441 |      0 |       float* const gp0 = gp + i[0] * gstride[0];
 442 |      0 |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 443 |      0 |       {
 444 |      0 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 445 |      0 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 446 |      0 |         float* gp1 = gp0 + i[1] * gstride[1];
 447 |      0 |         for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 448 |      0 |         {
 449 |      0 |           float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 450 |      0 |           float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 451 |      0 |           for (x = 0; x < gdim[3]; x += 2)
 452 |      0 |           {
 453 |      0 |             const float g0 = gp1[x];
 454 |      0 |             const float g1 = gp1[x + 1];
 455 |      0 |             const float b0 = bp2[x];
 456 |      0 |             const float b1 = bp2[x + 1];
 457 |      0 |             ap2[x] += g0 * b0 + g1 * b1;
 458 |      0 |             ap2[x + 1] += -g0 * b1 + g1 * b0;
 459 |      0 |           }
 460 |      0 |           gp1 += gstride[2];
 461 |      0 |         }
 462 |      0 |       }
 463 |      0 |     }
 464 |      0 |   }
 465 |      0 |   if (output_size > 1 && outputs[1])
 466 |      0 |   {
 467 |      0 |     ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1];
 468 |      0 |     ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1];
 469 |      0 |     ccv_nnc_tensor_view_get_dim(a, adim);
 470 |      0 |     ccv_nnc_tensor_view_get_dim(b, bdim);
 471 |      0 |     ccv_nnc_tensor_view_get_stride(a, astride);
 472 |      0 |     ccv_nnc_tensor_view_get_stride(b, bstride);
 473 |      0 |     ccv_nnc_tensor_zero(a);
 474 |      0 |     float* const ap = a->data.f32;
 475 |      0 |     float* const bp = b->data.f32;
 476 |      0 |     float* const gp = g->data.f32;
 477 |      0 |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
 478 |      0 |     {
 479 |      0 |       float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
 480 |      0 |       float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
 481 |      0 |       float* const gp0 = gp + i[0] * gstride[0];
 482 |      0 |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)
 483 |      0 |       {
 484 |      0 |         float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
 485 |      0 |         float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
 486 |      0 |         float* gp1 = gp0 + i[1] * gstride[1];
 487 |      0 |         for (i[2] = 0; i[2] < gdim[2]; i[2]++)
 488 |      0 |         {
 489 |      0 |           float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
 490 |      0 |           float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
 491 |      0 |           for (x = 0; x < gdim[3]; x += 2)
 492 |      0 |           {
 493 |      0 |             const float g0 = gp1[x];
 494 |      0 |             const float g1 = gp1[x + 1];
 495 |      0 |             const float b0 = bp2[x];
 496 |      0 |             const float b1 = bp2[x + 1];
 497 |      0 |             ap2[x] += g0 * b0 + g1 * b1;
 498 |      0 |             ap2[x + 1] += -g0 * b1 + g1 * b0;
 499 |      0 |           }
 500 |      0 |           gp1 += gstride[2];
 501 |      0 |         }
 502 |      0 |       }
 503 |      0 |     }
 504 |      0 |   }
 505 |      0 |   return CCV_NNC_EXEC_SUCCESS;
 506 |      0 | }
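In the no-broadcast path (lines 334-351), the backward pass reuses the conjugate multiply above: each requested input gradient pairs the incoming gradient g with the conjugate of the other forward operand,

\frac{\partial L}{\partial a} = g \cdot \overline{b}, \qquad \frac{\partial L}{\partial b} = g \cdot \overline{a},

which is the convention the PyTorch autograd notes describe for complex numbers (see the comment at lines 332-333). When no gradient is supplied (inputs[0] == 0), the conjugate of the other operand is written out directly via _ccv_nnc_conj_forw_cpu_ref.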
 507 |        |
 508 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 509 |      1 | {
 510 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 511 |      1 |   registry->tensor_datatypes = CCV_32F;
 512 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 513 |      1 |   registry->algorithms = 1;
 514 |      1 |   registry->exec = _ccv_nnc_cmul_forw;
 515 |      1 | }
 516 |        |
 517 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 518 |      1 | {
 519 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
 520 |      1 |   registry->tensor_datatypes = CCV_32F;
 521 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 522 |      1 |   registry->algorithms = 1;
 523 |      1 |   registry->exec = _ccv_nnc_cmul_back;
 524 |      1 | }
 525 |        |
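A minimal sketch of how this CPU reference backend could be exercised through the public nnc API; it assumes the usual CMD_CMUL_FORWARD(), CPU_TENSOR_NHWC() and TENSOR_LIST() convenience macros, and the tensor sizes and values are illustrative only:

#include <ccv.h>
#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>
#include <stdio.h>

int main(void)
{
	// Two complex vectors of length 2, stored as interleaved (real, imag) float pairs.
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
	a->data.f32[0] = 1; a->data.f32[1] = 2; a->data.f32[2] = 3; a->data.f32[3] = 4;
	b->data.f32[0] = 5; b->data.f32[1] = 6; b->data.f32[2] = 7; b->data.f32[3] = 8;
	// Dispatches to _ccv_nnc_cmul_forw on the CPU_REF backend registered above.
	ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	// Expected: (1 + 2i)(5 + 6i) = -7 + 16i and (3 + 4i)(7 + 8i) = -11 + 52i.
	printf("%g %g %g %g\n", c->data.f32[0], c->data.f32[1], c->data.f32[2], c->data.f32[3]);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}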