Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_transpose_cpu_ref.c
Line |  Count | Source
   1 |        | #include "ccv.h"
   2 |        | #include "ccv_internal.h"
   3 |        | #include "nnc/ccv_nnc.h"
   4 |        | #include "nnc/ccv_nnc_easy.h"
   5 |        | #include "nnc/ccv_nnc_internal.h"
   6 |        | #ifdef USE_OPENMP
   7 |        | #include <omp.h>
   8 |        | #endif
   9 |        | #ifdef USE_DISPATCH
  10 |        | #include <dispatch/dispatch.h>
  11 |        | #endif
  12 |        |
  13 |        | static int _ccv_nnc_conv_transpose_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
  14 |      9 | {
  15 |      9 |   assert(input_size >= 2);
  16 |      9 |   const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
  17 |      9 |   const ccv_nnc_tensor_t* w = inputs[1];
  18 |      9 |   assert(CCV_IS_TENSOR_CONTIGUOUS(w));
  19 |      9 |   const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0;
  20 |      9 |   assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
  21 |      9 |   assert(output_size == 1);
  22 |      9 |   ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
  23 |      9 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
  24 |      9 |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
  25 |      9 |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
  26 |      9 |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
  27 |      9 |   assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
  28 |      9 |   const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
  29 |      9 |   const int groups = cmd.info.convolution_transpose.groups;
  30 |      9 |   assert(cmd.info.convolution_transpose.count % groups == 0);
  31 |      9 |   const int group_size = cmd.info.convolution_transpose.count / groups;
  32 |        |   // Make sure the weights output dimension matches the network convolution kernels
  33 |      9 |   int astride[CCV_NNC_MAX_DIM_ALLOC];
  34 |      9 |   ccv_nnc_tensor_view_get_stride(a, astride);
  35 |      9 |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
  36 |      9 |   ccv_nnc_tensor_view_get_stride(b, bstride);
  37 |      9 |   assert(!bias || bias->info.dim[0] == cmd.info.convolution_transpose.count);
  38 |      9 |   const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1;
  39 |      9 |   const int dilation[CCV_NNC_MAX_DIM] = {
  40 |      9 |     ccv_max(cmd.info.convolution_transpose.dilation[0], 1),
  41 |      9 |     ccv_max(cmd.info.convolution_transpose.dilation[1], 1)
  42 |      9 |   };
  43 |      9 |   if (a->info.format == CCV_TENSOR_FORMAT_NHWC)
  44 |      5 |   {
  45 |        |     // Make sure the weights dimension matches the network dimension
  46 |      5 |     assert(w->info.dim[1] == cmd.info.size.dim[0]);
  47 |      5 |     assert(w->info.dim[2] == cmd.info.size.dim[1]);
  48 |      5 |     assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == cmd.info.convolution_transpose.count);
  49 |      5 |     const int wdim[CCV_NNC_MAX_DIM] = {
  50 |      5 |       (w->info.dim[1] - 1) * dilation[0] + 1,
  51 |      5 |       (w->info.dim[2] - 1) * dilation[1] + 1
  52 |      5 |     };
  53 |      5 |     assert(w->info.dim[0] == adim[CCV_NNC_MAX_DIM]);
  54 |      5 |     assert(b->info.format == CCV_TENSOR_FORMAT_NHWC);
  55 |      5 |     const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
  56 |      5 |     const int input_channel_size = w->info.dim[0] / groups;
  57 |      5 |     const int hwc = w->info.dim[1] * w->info.dim[2] * channel_size;
  58 |      5 |     assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution_transpose.count);
  59 |    389 |     parallel_for(idx, cmd.info.convolution_transpose.count * batch_size) {
  60 |    389 |       int c;
  61 |    389 |       const int bidx = idx / cmd.info.convolution_transpose.count;
  62 |    389 |       const int k = idx % cmd.info.convolution_transpose.count;
  63 |    389 |       const int gidx = k / group_size;
  64 |    389 |       float* ap = a->data.f32 + bidx * astride[0];
  65 |    389 |       float* bp = b->data.f32 + bidx * bstride[0] + k;
  66 |        |       // kernel weight for one dim.
  67 |    389 |       float* wp = w->data.f32 + (k % group_size);
  68 |    389 |       float biasval = bias ? bias->data.f32[k] : 0;
  69 |        |       // This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
  70 |    389 |       int i[CCV_NNC_MAX_DIM];
  71 |    389 |       int n[CCV_NNC_MAX_DIM];
  72 |    389 |       int d[CCV_NNC_MAX_DIM];
  73 |    389 |       int m[CCV_NNC_MAX_DIM];
  74 |    389 |       int j[CCV_NNC_MAX_DIM];
  75 |  86.4k |       for (i[0] = 0; i[0] < bdim[0]; i[0]++)
  76 |  86.0k |       {
  77 |  19.3M |         for (i[1] = 0; i[1] < bdim[1]; i[1]++)
  78 |  19.2M |           bp[i[1] * bstride[CCV_NNC_MAX_DIM]] = biasval;
  79 |  86.0k |         bp += bstride[CCV_NNC_MAX_DIM - 1];
  80 |  86.0k |       }
  81 |    389 |       bp = b->data.f32 + bidx * bstride[0] + k;
  82 |  43.4k |       for (i[0] = 0; i[0] < adim[0]; i[0]++)
  83 |  43.0k |       {
  84 |  43.0k |         SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim, n, m);
  85 |  43.0k |         m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
  86 |  43.0k |         const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
  87 |  43.0k |         d[0] = n0 * dilation[0] - n[0];
  88 |  43.0k |         n[0] = n0;
  89 |  43.0k |         m[0] = m[0] - n[0];
  90 |  43.0k |         float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
  91 |  4.85M |         for (i[1] = 0; i[1] < adim[1]; i[1]++)
  92 |  4.81M |         {
  93 |  4.81M |           SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim, n, m);
  94 |  4.81M |           m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
  95 |  4.81M |           const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
  96 |  4.81M |           d[1] = n1 * dilation[1] - n[1];
  97 |  4.81M |           n[1] = n1;
  98 |  4.81M |           m[1] = m[1] - n[1];
  99 |  4.81M |           float* wpz = wpu + n[1] * channel_size;
 100 |  4.81M |           const float* const apz = ap + i[1] * astride[CCV_NNC_MAX_DIM];
 101 |  4.81M |           float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM];
 102 |  38.2M |           for (j[0] = 0; j[0] < m[0]; j[0]++)
 103 |  33.4M |           {
 104 |   265M |             for (j[1] = 0; j[1] < m[1]; j[1]++)
 105 |   232M |             {
 106 |   232M |               float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]];
 107 |  22.5G |               for (c = 0; c < input_channel_size; c++)
 108 |  22.3G |                 p += wpz[j[1] * channel_size + (c + gidx * input_channel_size) * hwc] * apz[c + gidx * input_channel_size];
 109 |   232M |               bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]] = p;
 110 |   232M |             }
 111 |  33.4M |             wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
 112 |  33.4M |             bpz += bstride[CCV_NNC_MAX_DIM - 1] * dilation[0];
 113 |  33.4M |           }
 114 |  4.81M |         }
 115 |  43.0k |         ap += astride[CCV_NNC_MAX_DIM - 1];
 116 |  43.0k |         bp += bstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
 117 |  43.0k |       }
 118 |    389 |     } parallel_endfor
 119 |      5 |   } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW) {
 120 |        |     // Make sure the weights dimension matches the network dimension
 121 |      4 |     assert(w->info.dim[1] * groups == cmd.info.convolution_transpose.count);
 122 |      4 |     assert(w->info.dim[2] == cmd.info.size.dim[0]);
 123 |      4 |     assert(w->info.dim[3] == cmd.info.size.dim[1]);
 124 |      4 |     const int wdim[CCV_NNC_MAX_DIM] = {
 125 |      4 |       (w->info.dim[2] - 1) * dilation[0] + 1,
 126 |      4 |       (w->info.dim[3] - 1) * dilation[1] + 1
 127 |      4 |     };
 128 |      4 |     assert(w->info.dim[0] == adim[0]);
 129 |      4 |     assert(b->info.format == CCV_TENSOR_FORMAT_NCHW);
 130 |      4 |     const int channel_size = w->info.dim[1];
 131 |      4 |     const int input_channel_size = w->info.dim[0] / groups;
 132 |      4 |     const int hw = w->info.dim[2] * w->info.dim[3];
 133 |      4 |     const int chw = channel_size * hw;
 134 |      4 |     assert(bdim[0] == cmd.info.convolution_transpose.count);
 135 |    197 |     parallel_for(idx, cmd.info.convolution_transpose.count * batch_size) {
 136 |    197 |       int c;
 137 |    197 |       const int bidx = idx / cmd.info.convolution_transpose.count;
 138 |    197 |       const int k = idx % cmd.info.convolution_transpose.count;
 139 |    197 |       const int gidx = k / group_size;
 140 |    197 |       float* ap = a->data.f32 + bidx * astride[0];
 141 |    197 |       float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
 142 |        |       // kernel weight for one dim.
 143 |    197 |       float* wp = w->data.f32 + (k % group_size) * hw;
 144 |    197 |       float biasval = bias ? bias->data.f32[k] : 0;
 145 |        |       // This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
 146 |    197 |       int i[CCV_NNC_MAX_DIM];
 147 |    197 |       int n[CCV_NNC_MAX_DIM];
 148 |    197 |       int d[CCV_NNC_MAX_DIM];
 149 |    197 |       int m[CCV_NNC_MAX_DIM];
 150 |    197 |       int j[CCV_NNC_MAX_DIM];
 151 |  43.2k |       for (i[0] = 0; i[0] < bdim[1]; i[0]++)
 152 |  43.0k |       {
 153 |  9.67M |         for (i[1] = 0; i[1] < bdim[2]; i[1]++)
 154 |  9.63M |           bp[i[1]] = biasval;
 155 |  43.0k |         bp += bstride[CCV_NNC_MAX_DIM];
 156 |  43.0k |       }
 157 |    197 |       bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
 158 |  21.7k |       for (i[0] = 0; i[0] < adim[1]; i[0]++)
 159 |  21.5k |       {
 160 |  21.5k |         SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim + 1, n, m);
 161 |  21.5k |         m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
 162 |  21.5k |         const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
 163 |  21.5k |         d[0] = n0 * dilation[0] - n[0];
 164 |  21.5k |         n[0] = n0;
 165 |  21.5k |         m[0] = m[0] - n[0];
 166 |  21.5k |         float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM + 1];
 167 |  2.42M |         for (i[1] = 0; i[1] < adim[2]; i[1]++)
 168 |  2.40M |         {
 169 |  2.40M |           SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim + 1, n, m);
 170 |  2.40M |           m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
 171 |  2.40M |           const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
 172 |  2.40M |           d[1] = n1 * dilation[1] - n[1];
 173 |  2.40M |           n[1] = n1;
 174 |  2.40M |           m[1] = m[1] - n[1];
 175 |  2.40M |           float* wpz = wpu + n[1];
 176 |  2.40M |           const float* apz = ap + i[1];
 177 |  2.40M |           float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM + 1];
 178 |  19.1M |           for (j[0] = 0; j[0] < m[0]; j[0]++)
 179 |  16.7M |           {
 180 |   132M |             for (j[1] = 0; j[1] < m[1]; j[1]++)
 181 |   116M |             {
 182 |   116M |               float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]];
 183 |  11.2G |               for (c = 0; c < input_channel_size; c++)
 184 |  11.1G |                 p += wpz[j[1] + (c + gidx * input_channel_size) * chw] * apz[(c + gidx * input_channel_size) * astride[1]];
 185 |   116M |               bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]] = p;
 186 |   116M |             }
 187 |  16.7M |             wpz += w->info.dim[CCV_NNC_MAX_DIM + 1];
 188 |  16.7M |             bpz += bstride[CCV_NNC_MAX_DIM] * dilation[0];
 189 |  16.7M |           }
 190 |  2.40M |         }
 191 |  21.5k |         ap += astride[CCV_NNC_MAX_DIM];
 192 |  21.5k |         bp += bstride[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
 193 |  21.5k |       }
 194 |    197 |     } parallel_endfor
 195 |      4 |   }
 196 |      9 |   return CCV_NNC_EXEC_SUCCESS;
 197 |      9 | }
 198 |        |
 199 |        | static int _ccv_nnc_conv_transpose_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
 200 |      0 | {
 201 |      0 |   return CCV_NNC_EXEC_INVALID;
 202 |      0 | }
 203 |        |
 204 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 205 |      1 | {
 206 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
 207 |      1 |   registry->tensor_datatypes = CCV_32F;
 208 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 209 |      1 |   registry->algorithms = 1;
 210 |      1 |   registry->exec = _ccv_nnc_conv_transpose_forw;
 211 |      1 | }
 212 |        |
 213 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
 214 |      1 | {
 215 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
 216 |      1 |   registry->tensor_datatypes = CCV_32F;
 217 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 218 |      1 |   registry->algorithms = 1;
 219 |      1 |   registry->exec = _ccv_nnc_conv_transpose_back;
 220 |      1 | }
 221 |        |
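
Note on the hot loops above: the least obvious step is the conversion at lines 85-89 (and the mirrored lines 161-165) from the border-clipped offsets produced by SET_BORDER_OFFSET_SIZE_FOR into dilation-strided kernel tap indices. The standalone sketch below repeats that arithmetic outside the library; treating n as the first usable position and m as the usable span inside the dilated kernel window is an assumption about the macro's outputs, and the example values are hypothetical.

    /* A minimal sketch (not part of ccv) of the dilation bookkeeping at lines 85-89
     * and 161-165. The meaning assigned to n and m is an assumption about what
     * SET_BORDER_OFFSET_SIZE_FOR produces; the arithmetic itself mirrors those lines. */
    #include <stdio.h>

    static void dilated_taps(int n, int m, const int dilation,
                             int* const first_tap, int* const skip, int* const tap_count)
    {
        m = (m + n - 1) / dilation + 1;               /* one past the last tap whose dilated position lies in n..n+m-1 */
        const int n0 = (n + dilation - 1) / dilation; /* first kernel tap whose dilated position is >= n */
        *skip = n0 * dilation - n;                    /* offset introduced by rounding n up to a tap position */
        *first_tap = n0;
        *tap_count = m - n0;                          /* number of taps that actually contribute */
    }

    int main(void)
    {
        /* Hypothetical case: dilation 2, with the first 3 dilated positions clipped
         * by the border (n = 3) and a remaining span of 2 positions (m = 2). */
        int first_tap, skip, tap_count;
        dilated_taps(3, 2, 2, &first_tap, &skip, &tap_count);
        printf("first tap %d, skip %d, taps %d\n", first_tap, skip, tap_count); /* prints: first tap 2, skip 1, taps 1 */
        return 0;
    }

Under that reading, skip corresponds to d[0]/d[1], which offset the output pointer at lines 101 and 177, and tap_count corresponds to the adjusted m[0]/m[1] that bound the j loops at lines 102, 104, 178, and 180.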