/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_transpose_cpu_ref.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_conv_transpose_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 9 | { |
15 | 9 | assert(input_size >= 2); |
16 | 9 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
17 | 9 | const ccv_nnc_tensor_t* w = inputs[1]; |
18 | 9 | assert(CCV_IS_TENSOR_CONTIGUOUS(w)); |
19 | 9 | const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 00 ; |
20 | 9 | assert(!bias || !CCV_IS_TENSOR_VIEW(bias)); |
21 | 9 | assert(output_size == 1); |
22 | 9 | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; |
23 | 9 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
24 | 9 | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
25 | 9 | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim6 : a->info.dim + 13 ; |
26 | 9 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
27 | 9 | assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2); |
28 | 9 | const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim6 : b->info.dim + 13 ; |
29 | 9 | const int groups = cmd.info.convolution_transpose.groups; |
30 | 9 | assert(cmd.info.convolution_transpose.count % groups == 0); |
31 | 9 | const int group_size = cmd.info.convolution_transpose.count / groups; |
32 | | // Fetch the strides of a and b; the bias, when present, must have exactly convolution_transpose.count entries |
33 | 9 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
34 | 9 | ccv_nnc_tensor_view_get_stride(a, astride); |
35 | 9 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
36 | 9 | ccv_nnc_tensor_view_get_stride(b, bstride); |
37 | 9 | assert(!bias || bias->info.dim[0] == cmd.info.convolution_transpose.count); |
38 | 9 | const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0]3 : 16 ; |
39 | 9 | const int dilation[CCV_NNC_MAX_DIM] = { |
40 | 9 | ccv_max(cmd.info.convolution_transpose.dilation[0], 1), |
41 | 9 | ccv_max(cmd.info.convolution_transpose.dilation[1], 1) |
42 | 9 | }; |
43 | 9 | if (a->info.format == CCV_TENSOR_FORMAT_NHWC) |
44 | 5 | { |
45 | | // Make sure the weights dimension matches the network dimension |
46 | 5 | assert(w->info.dim[1] == cmd.info.size.dim[0]); |
47 | 5 | assert(w->info.dim[2] == cmd.info.size.dim[1]); |
48 | 5 | assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == cmd.info.convolution_transpose.count); |
49 | 5 | const int wdim[CCV_NNC_MAX_DIM] = { |
50 | 5 | (w->info.dim[1] - 1) * dilation[0] + 1, |
51 | 5 | (w->info.dim[2] - 1) * dilation[1] + 1 |
52 | 5 | }; |
53 | 5 | assert(w->info.dim[0] == adim[CCV_NNC_MAX_DIM]); |
54 | 5 | assert(b->info.format == CCV_TENSOR_FORMAT_NHWC); |
55 | 5 | const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1]; |
56 | 5 | const int input_channel_size = w->info.dim[0] / groups; |
57 | 5 | const int hwc = w->info.dim[1] * w->info.dim[2] * channel_size; |
58 | 5 | assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution_transpose.count); |
59 | 389 | parallel_for5 (idx, cmd.info.convolution_transpose.count * batch_size) { |
60 | 389 | int c; |
61 | 389 | const int bidx = idx / cmd.info.convolution_transpose.count; |
62 | 389 | const int k = idx % cmd.info.convolution_transpose.count; |
63 | 389 | const int gidx = k / group_size; |
64 | 389 | float* ap = a->data.f32 + bidx * astride[0]; |
65 | 389 | float* bp = b->data.f32 + bidx * bstride[0] + k; |
66 | | // Base of the kernel weights for output channel k: offset by its index within the group (k % group_size). |
67 | 389 | float* wp = w->data.f32 + (k % group_size); |
68 | 389 | float biasval = bias ? bias->data.f32[k] : 00 ; |
69 | | // This block runs once per parallel_for iteration, so it is the place to declare per-iteration temporary variables. |
70 | 389 | int i[CCV_NNC_MAX_DIM]; |
71 | 389 | int n[CCV_NNC_MAX_DIM]; |
72 | 389 | int d[CCV_NNC_MAX_DIM]; |
73 | 389 | int m[CCV_NNC_MAX_DIM]; |
74 | 389 | int j[CCV_NNC_MAX_DIM]; |
75 | 86.4k | for (i[0] = 0; i[0] < bdim[0]; i[0]++86.0k ) |
76 | 86.0k | { |
77 | 19.3M | for (i[1] = 0; i[1] < bdim[1]; i[1]++19.2M ) |
78 | 19.2M | bp[i[1] * bstride[CCV_NNC_MAX_DIM]] = biasval; |
79 | 86.0k | bp += bstride[CCV_NNC_MAX_DIM - 1]; |
80 | 86.0k | } |
81 | 389 | bp = b->data.f32 + bidx * bstride[0] + k; |
82 | 43.4k | for (i[0] = 0; i[0] < adim[0]; i[0]++43.0k ) |
83 | 43.0k | { |
84 | 43.0k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim, n, m); |
85 | 43.0k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
86 | 43.0k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
87 | 43.0k | d[0] = n0 * dilation[0] - n[0]; |
88 | 43.0k | n[0] = n0; |
89 | 43.0k | m[0] = m[0] - n[0]; |
90 | 43.0k | float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
91 | 4.85M | for (i[1] = 0; i[1] < adim[1]; i[1]++4.81M ) |
92 | 4.81M | { |
93 | 4.81M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim, n, m); |
94 | 4.81M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
95 | 4.81M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
96 | 4.81M | d[1] = n1 * dilation[1] - n[1]; |
97 | 4.81M | n[1] = n1; |
98 | 4.81M | m[1] = m[1] - n[1]; |
99 | 4.81M | float* wpz = wpu + n[1] * channel_size; |
100 | 4.81M | const float* const apz = ap + i[1] * astride[CCV_NNC_MAX_DIM]; |
101 | 4.81M | float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM]; |
102 | 38.2M | for (j[0] = 0; j[0] < m[0]; j[0]++33.4M ) |
103 | 33.4M | { |
104 | 265M | for (j[1] = 0; j[1] < m[1]; j[1]++232M ) |
105 | 232M | { |
106 | 232M | float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]]; |
107 | 22.5G | for (c = 0; c < input_channel_size; c++22.3G ) |
108 | 22.3G | p += wpz[j[1] * channel_size + (c + gidx * input_channel_size) * hwc] * apz[c + gidx * input_channel_size]; |
109 | 232M | bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]] = p; |
110 | 232M | } |
111 | 33.4M | wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
112 | 33.4M | bpz += bstride[CCV_NNC_MAX_DIM - 1] * dilation[0]; |
113 | 33.4M | } |
114 | 4.81M | } |
115 | 43.0k | ap += astride[CCV_NNC_MAX_DIM - 1]; |
116 | 43.0k | bp += bstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
117 | 43.0k | } |
118 | 389 | } parallel_endfor |
119 | 5 | } else if (4 a->info.format == CCV_TENSOR_FORMAT_NCHW4 ) { |
120 | | // Make sure the weights dimension matches the network dimension |
121 | 4 | assert(w->info.dim[1] * groups == cmd.info.convolution_transpose.count); |
122 | 4 | assert(w->info.dim[2] == cmd.info.size.dim[0]); |
123 | 4 | assert(w->info.dim[3] == cmd.info.size.dim[1]); |
124 | 4 | const int wdim[CCV_NNC_MAX_DIM] = { |
125 | 4 | (w->info.dim[2] - 1) * dilation[0] + 1, |
126 | 4 | (w->info.dim[3] - 1) * dilation[1] + 1 |
127 | 4 | }; |
128 | 4 | assert(w->info.dim[0] == adim[0]); |
129 | 4 | assert(b->info.format == CCV_TENSOR_FORMAT_NCHW); |
130 | 4 | const int channel_size = w->info.dim[1]; |
131 | 4 | const int input_channel_size = w->info.dim[0] / groups; |
132 | 4 | const int hw = w->info.dim[2] * w->info.dim[3]; |
133 | 4 | const int chw = channel_size * hw; |
134 | 4 | assert(bdim[0] == cmd.info.convolution_transpose.count); |
135 | 197 | parallel_for4 (idx, cmd.info.convolution_transpose.count * batch_size) { |
136 | 197 | int c; |
137 | 197 | const int bidx = idx / cmd.info.convolution_transpose.count; |
138 | 197 | const int k = idx % cmd.info.convolution_transpose.count; |
139 | 197 | const int gidx = k / group_size; |
140 | 197 | float* ap = a->data.f32 + bidx * astride[0]; |
141 | 197 | float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1]; |
142 | | // Base of the kernel weights for output channel k: its index within the group (k % group_size) times the kernel plane size hw. |
143 | 197 | float* wp = w->data.f32 + (k % group_size) * hw; |
144 | 197 | float biasval = bias ? bias->data.f32[k] : 00 ; |
145 | | // This block runs once per parallel_for iteration, so it is the place to declare per-iteration temporary variables. |
146 | 197 | int i[CCV_NNC_MAX_DIM]; |
147 | 197 | int n[CCV_NNC_MAX_DIM]; |
148 | 197 | int d[CCV_NNC_MAX_DIM]; |
149 | 197 | int m[CCV_NNC_MAX_DIM]; |
150 | 197 | int j[CCV_NNC_MAX_DIM]; |
151 | 43.2k | for (i[0] = 0; i[0] < bdim[1]; i[0]++43.0k ) |
152 | 43.0k | { |
153 | 9.67M | for (i[1] = 0; i[1] < bdim[2]; i[1]++9.63M ) |
154 | 9.63M | bp[i[1]] = biasval; |
155 | 43.0k | bp += bstride[CCV_NNC_MAX_DIM]; |
156 | 43.0k | } |
157 | 197 | bp = b->data.f32 + bidx * bstride[0] + k * bstride[1]; |
158 | 21.7k | for (i[0] = 0; i[0] < adim[1]; i[0]++21.5k ) |
159 | 21.5k | { |
160 | 21.5k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim + 1, n, m); |
161 | 21.5k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
162 | 21.5k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
163 | 21.5k | d[0] = n0 * dilation[0] - n[0]; |
164 | 21.5k | n[0] = n0; |
165 | 21.5k | m[0] = m[0] - n[0]; |
166 | 21.5k | float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM + 1]; |
167 | 2.42M | for (i[1] = 0; i[1] < adim[2]; i[1]++2.40M ) |
168 | 2.40M | { |
169 | 2.40M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim + 1, n, m); |
170 | 2.40M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
171 | 2.40M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
172 | 2.40M | d[1] = n1 * dilation[1] - n[1]; |
173 | 2.40M | n[1] = n1; |
174 | 2.40M | m[1] = m[1] - n[1]; |
175 | 2.40M | float* wpz = wpu + n[1]; |
176 | 2.40M | const float* apz = ap + i[1]; |
177 | 2.40M | float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM + 1]; |
178 | 19.1M | for (j[0] = 0; j[0] < m[0]; j[0]++16.7M ) |
179 | 16.7M | { |
180 | 132M | for (j[1] = 0; j[1] < m[1]; j[1]++116M ) |
181 | 116M | { |
182 | 116M | float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]]; |
183 | 11.2G | for (c = 0; c < input_channel_size; c++11.1G ) |
184 | 11.1G | p += wpz[j[1] + (c + gidx * input_channel_size) * chw] * apz[(c + gidx * input_channel_size) * astride[1]]; |
185 | 116M | bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]] = p; |
186 | 116M | } |
187 | 16.7M | wpz += w->info.dim[CCV_NNC_MAX_DIM + 1]; |
188 | 16.7M | bpz += bstride[CCV_NNC_MAX_DIM] * dilation[0]; |
189 | 16.7M | } |
190 | 2.40M | } |
191 | 21.5k | ap += astride[CCV_NNC_MAX_DIM]; |
192 | 21.5k | bp += bstride[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
193 | 21.5k | } |
194 | 197 | } parallel_endfor |
195 | 4 | } |
196 | 9 | return CCV_NNC_EXEC_SUCCESS; |
197 | 9 | } |
198 | | |
199 | | static int _ccv_nnc_conv_transpose_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
200 | 0 | { |
201 | 0 | return CCV_NNC_EXEC_INVALID; |
202 | 0 | } |
203 | | |
204 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
205 | 1 | { |
206 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
207 | 1 | registry->tensor_datatypes = CCV_32F; |
208 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
209 | 1 | registry->algorithms = 1; |
210 | 1 | registry->exec = _ccv_nnc_conv_transpose_forw; |
211 | 1 | } |
212 | | |
213 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
214 | 1 | { |
215 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC; |
216 | 1 | registry->tensor_datatypes = CCV_32F; |
217 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
218 | 1 | registry->algorithms = 1; |
219 | 1 | registry->exec = _ccv_nnc_conv_transpose_back; |
220 | 1 | } |
221 | | |