Coverage Report

Created: 2025-02-24 17:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_transpose_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// CPU reference implementation of 2D transposed convolution (a.k.a. deconvolution).
// Scatters each input pixel, weighted by the (possibly dilated) kernel, into the
// larger output tensor. Supports NHWC and NCHW layouts, grouped convolution and
// per-axis dilation. Returns CCV_NNC_EXEC_SUCCESS on completion.
static int _ccv_nnc_conv_transpose_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 2);
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; // input feature map
	const ccv_nnc_tensor_t* w = inputs[1]; // kernel weights, must be contiguous
	assert(CCV_IS_TENSOR_CONTIGUOUS(w));
	const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0; // optional bias
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; // output feature map
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	// Skip the batch dimension (if present) so adim/bdim point at the spatial dims.
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
	const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
	const int groups = cmd.info.convolution_transpose.groups;
	assert(cmd.info.convolution_transpose.count % groups == 0);
	const int group_size = cmd.info.convolution_transpose.count / groups; // output channels per group
	// Make sure the weights output dimension matches the network convolution kernels
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	int bstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(b, bstride);
	assert(!bias || bias->info.dim[0] == cmd.info.convolution_transpose.count);
	const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1;
	// Dilation of 0 is treated as 1 (no dilation).
	const int dilation[CCV_NNC_MAX_DIM] = {
		ccv_max(cmd.info.convolution_transpose.dilation[0], 1),
		ccv_max(cmd.info.convolution_transpose.dilation[1], 1)
	};
	if (a->info.format == CCV_TENSOR_FORMAT_NHWC)
	{
		// Make sure the weights dimension matches the network dimension
		assert(w->info.dim[1] == cmd.info.size.dim[0]);
		assert(w->info.dim[2] == cmd.info.size.dim[1]);
		assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == cmd.info.convolution_transpose.count);
		// Effective (dilated) kernel extent per spatial axis.
		const int wdim[CCV_NNC_MAX_DIM] = {
			(w->info.dim[1] - 1) * dilation[0] + 1,
			(w->info.dim[2] - 1) * dilation[1] + 1
		};
		assert(w->info.dim[0] == adim[CCV_NNC_MAX_DIM]);
		assert(b->info.format == CCV_TENSOR_FORMAT_NHWC);
		const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1]; // output channels per group in w
		const int input_channel_size = w->info.dim[0] / groups; // input channels per group
		const int hwc = w->info.dim[1] * w->info.dim[2] * channel_size; // stride between input-channel slices of w
		assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution_transpose.count);
		// One parallel task per (batch, output channel) pair.
		parallel_for(idx, cmd.info.convolution_transpose.count * batch_size) {
			int c;
			const int bidx = idx / cmd.info.convolution_transpose.count; // batch index
			const int k = idx % cmd.info.convolution_transpose.count; // output channel
			const int gidx = k / group_size; // group this output channel belongs to
			float* ap = a->data.f32 + bidx * astride[0];
			float* bp = b->data.f32 + bidx * bstride[0] + k;
			// Kernel weights for this output channel (within its group).
			float* wp = w->data.f32 + (k % group_size);
			float biasval = bias ? bias->data.f32[k] : 0;
			// Per-task scratch indices; n/m are filled by SET_BORDER_OFFSET_SIZE_FOR.
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			// First pass: initialize the whole output plane for channel k with the bias.
			for (i[0] = 0; i[0] < bdim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < bdim[1]; i[1]++)
					bp[i[1] * bstride[CCV_NNC_MAX_DIM]] = biasval;
				bp += bstride[CCV_NNC_MAX_DIM - 1];
			}
			bp = b->data.f32 + bidx * bstride[0] + k;
			// Second pass: scatter-accumulate each input pixel into the output.
			for (i[0] = 0; i[0] < adim[0]; i[0]++)
			{
				// n/m: offset into the kernel and count of valid taps after border clipping.
				SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim, n, m);
				// Convert the clipped extent from dilated coordinates to kernel-tap counts.
				m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
				const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
				d[0] = n0 * dilation[0] - n[0]; // residual output offset of the first valid tap
				n[0] = n0;
				m[0] = m[0] - n[0];
				float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
				for (i[1] = 0; i[1] < adim[1]; i[1]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim, n, m);
					m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
					const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
					d[1] = n1 * dilation[1] - n[1];
					n[1] = n1;
					m[1] = m[1] - n[1];
					float* wpz = wpu + n[1] * channel_size;
					const float* const apz = ap + i[1] * astride[CCV_NNC_MAX_DIM];
					float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM];
					for (j[0] = 0; j[0] < m[0]; j[0]++)
					{
						for (j[1] = 0; j[1] < m[1]; j[1]++)
						{
							float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]];
							// Accumulate over input channels of this group.
							for (c = 0; c < input_channel_size; c++)
								p += wpz[j[1] * channel_size + (c + gidx * input_channel_size) * hwc] * apz[c + gidx * input_channel_size];
							bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]] = p;
						}
						wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
						bpz += bstride[CCV_NNC_MAX_DIM - 1] * dilation[0];
					}
				}
				ap += astride[CCV_NNC_MAX_DIM - 1];
				// Advance bp by the number of output rows this input row maps onto.
				bp += bstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
			}
		} parallel_endfor
	} else if (a->info.format == CCV_TENSOR_FORMAT_NCHW) {
		// Make sure the weights dimension matches the network dimension
		assert(w->info.dim[1] * groups == cmd.info.convolution_transpose.count);
		assert(w->info.dim[2] == cmd.info.size.dim[0]);
		assert(w->info.dim[3] == cmd.info.size.dim[1]);
		// Effective (dilated) kernel extent per spatial axis.
		const int wdim[CCV_NNC_MAX_DIM] = {
			(w->info.dim[2] - 1) * dilation[0] + 1,
			(w->info.dim[3] - 1) * dilation[1] + 1
		};
		assert(w->info.dim[0] == adim[0]);
		assert(b->info.format == CCV_TENSOR_FORMAT_NCHW);
		const int channel_size = w->info.dim[1]; // output channels per group in w
		const int input_channel_size = w->info.dim[0] / groups; // input channels per group
		const int hw = w->info.dim[2] * w->info.dim[3]; // size of one kernel plane
		const int chw = channel_size * hw; // stride between input-channel slices of w
		assert(bdim[0] == cmd.info.convolution_transpose.count);
		// One parallel task per (batch, output channel) pair.
		parallel_for(idx, cmd.info.convolution_transpose.count * batch_size) {
			int c;
			const int bidx = idx / cmd.info.convolution_transpose.count; // batch index
			const int k = idx % cmd.info.convolution_transpose.count; // output channel
			const int gidx = k / group_size; // group this output channel belongs to
			float* ap = a->data.f32 + bidx * astride[0];
			float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
			// Kernel weights for this output channel (within its group).
			float* wp = w->data.f32 + (k % group_size) * hw;
			float biasval = bias ? bias->data.f32[k] : 0;
			// Per-task scratch indices; n/m are filled by SET_BORDER_OFFSET_SIZE_FOR.
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			// First pass: initialize the whole output plane for channel k with the bias.
			for (i[0] = 0; i[0] < bdim[1]; i[0]++)
			{
				for (i[1] = 0; i[1] < bdim[2]; i[1]++)
					bp[i[1]] = biasval;
				bp += bstride[CCV_NNC_MAX_DIM];
			}
			bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
			// Second pass: scatter-accumulate each input pixel into the output.
			for (i[0] = 0; i[0] < adim[1]; i[0]++)
			{
				// bdim + 1 skips the channel dim so spatial dims line up for NCHW.
				SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim + 1, n, m);
				// Convert the clipped extent from dilated coordinates to kernel-tap counts.
				m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
				const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
				d[0] = n0 * dilation[0] - n[0]; // residual output offset of the first valid tap
				n[0] = n0;
				m[0] = m[0] - n[0];
				float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM + 1];
				for (i[1] = 0; i[1] < adim[2]; i[1]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim + 1, n, m);
					m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
					const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
					d[1] = n1 * dilation[1] - n[1];
					n[1] = n1;
					m[1] = m[1] - n[1];
					float* wpz = wpu + n[1];
					const float* apz = ap + i[1];
					float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM + 1];
					for (j[0] = 0; j[0] < m[0]; j[0]++)
					{
						for (j[1] = 0; j[1] < m[1]; j[1]++)
						{
							float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]];
							// Accumulate over input channels of this group.
							for (c = 0; c < input_channel_size; c++)
								p += wpz[j[1] + (c + gidx * input_channel_size) * chw] * apz[(c + gidx * input_channel_size) * astride[1]];
							bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]] = p;
						}
						wpz += w->info.dim[CCV_NNC_MAX_DIM + 1];
						bpz += bstride[CCV_NNC_MAX_DIM] * dilation[0];
					}
				}
				ap += astride[CCV_NNC_MAX_DIM];
				// Advance bp by the number of output rows this input row maps onto.
				bp += bstride[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
			}
		} parallel_endfor
	}
	return CCV_NNC_EXEC_SUCCESS;
}
198
199
// Backward (gradient) pass for transposed convolution is not implemented in the
// CPU reference backend; always reports an invalid execution.
static int _ccv_nnc_conv_transpose_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	return CCV_NNC_EXEC_INVALID;
}
203
204
// Register the CPU reference backend for the forward transposed-convolution command.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Both NHWC and NCHW layouts are handled by _ccv_nnc_conv_transpose_forw.
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
	registry->tensor_datatypes = CCV_32F; // single-precision float only
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_conv_transpose_forw;
}
212
213
// Register the CPU reference backend for the backward transposed-convolution command.
// NOTE(review): only NHWC is advertised here while the forward backend advertises
// NHWC|NCHW — presumably irrelevant since the exec is an unimplemented stub that
// returns CCV_NNC_EXEC_INVALID, but worth confirming if a real backward is added.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
	registry->tensor_datatypes = CCV_32F; // single-precision float only
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_conv_transpose_back;
}
221