/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_transpose_cpu_ref.c
Line | Count | Source |
#include "ccv.h"
#include "ccv_internal.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include "nnc/ccv_nnc_internal.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif
12 | | |
13 | | static int _ccv_nnc_conv_transpose_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 10 | { |
15 | 10 | assert(input_size >= 2); |
16 | 10 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
17 | 10 | const ccv_nnc_tensor_t* w = inputs[1]; |
18 | 10 | assert(CCV_IS_TENSOR_CONTIGUOUS(w)); |
19 | 10 | const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 00 ; |
20 | 10 | assert(!bias || !CCV_IS_TENSOR_VIEW(bias)); |
21 | 10 | assert(output_size == 1); |
22 | 10 | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; |
23 | 10 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
24 | 10 | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
25 | 10 | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim6 : a->info.dim + 14 ; |
26 | 10 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
27 | 10 | assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2); |
28 | 10 | const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim6 : b->info.dim + 14 ; |
29 | 10 | const int groups = cmd.info.convolution_transpose.groups; |
30 | 10 | assert(cmd.info.convolution_transpose.count % groups == 0); |
31 | 10 | const int group_size = cmd.info.convolution_transpose.count / groups; |
32 | | // Make sure the weights output dimension matches the network convolution kernels |
33 | 10 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
34 | 10 | ccv_nnc_tensor_view_get_stride(a, astride); |
35 | 10 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
36 | 10 | ccv_nnc_tensor_view_get_stride(b, bstride); |
37 | 10 | assert(!bias || bias->info.dim[0] == cmd.info.convolution_transpose.count); |
38 | 10 | const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0]4 : 16 ; |
39 | 10 | const int dilation[CCV_NNC_MAX_DIM] = { |
40 | 10 | ccv_max(cmd.info.convolution_transpose.dilation[0], 1), |
41 | 10 | ccv_max(cmd.info.convolution_transpose.dilation[1], 1) |
42 | 10 | }; |
43 | 10 | if (a->info.format == CCV_TENSOR_FORMAT_NHWC) |
44 | 6 | { |
45 | | // Make sure the weights dimension matches the network dimension |
46 | 6 | assert(w->info.dim[1] == cmd.info.size.dim[0]); |
47 | 6 | assert(w->info.dim[2] == cmd.info.size.dim[1]); |
48 | 6 | assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == cmd.info.convolution_transpose.count); |
49 | 6 | const int wdim[CCV_NNC_MAX_DIM] = { |
50 | 6 | (w->info.dim[1] - 1) * dilation[0] + 1, |
51 | 6 | (w->info.dim[2] - 1) * dilation[1] + 1 |
52 | 6 | }; |
53 | 6 | assert(w->info.dim[0] == adim[CCV_NNC_MAX_DIM]); |
54 | 6 | assert(b->info.format == CCV_TENSOR_FORMAT_NHWC); |
55 | 6 | const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1]; |
56 | 6 | const int input_channel_size = w->info.dim[0] / groups; |
57 | 6 | const int hwc = w->info.dim[1] * w->info.dim[2] * channel_size; |
58 | 6 | assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution_transpose.count); |
59 | 149 | parallel_for6 (idx, cmd.info.convolution_transpose.count * batch_size) { |
60 | 149 | int c; |
61 | 149 | const int bidx = idx / cmd.info.convolution_transpose.count; |
62 | 149 | const int k = idx % cmd.info.convolution_transpose.count; |
63 | 149 | const int gidx = k / group_size; |
64 | 149 | float* ap = a->data.f32 + bidx * astride[0]; |
65 | 149 | float* bp = b->data.f32 + bidx * bstride[0] + k; |
66 | | // kernel weight for one dim. |
67 | 149 | float* wp = w->data.f32 + (k % group_size); |
68 | 149 | float biasval = bias ? bias->data.f32[k] : 00 ; |
69 | | // This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables. |
70 | 149 | int i[CCV_NNC_MAX_DIM]; |
71 | 149 | int n[CCV_NNC_MAX_DIM]; |
72 | 149 | int d[CCV_NNC_MAX_DIM]; |
73 | 149 | int m[CCV_NNC_MAX_DIM]; |
74 | 149 | int j[CCV_NNC_MAX_DIM]; |
75 | 32.4k | for (i[0] = 0; i[0] < bdim[0]; i[0]++32.2k ) |
76 | 32.2k | { |
77 | 7.25M | for (i[1] = 0; i[1] < bdim[1]; i[1]++7.22M ) |
78 | 7.22M | bp[i[1] * bstride[CCV_NNC_MAX_DIM]] = biasval; |
79 | 32.2k | bp += bstride[CCV_NNC_MAX_DIM - 1]; |
80 | 32.2k | } |
81 | 149 | bp = b->data.f32 + bidx * bstride[0] + k; |
82 | 16.2k | for (i[0] = 0; i[0] < adim[0]; i[0]++16.1k ) |
83 | 16.1k | { |
84 | 16.1k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim, n, m); |
85 | 16.1k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
86 | 16.1k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
87 | 16.1k | d[0] = n0 * dilation[0] - n[0]; |
88 | 16.1k | n[0] = n0; |
89 | 16.1k | m[0] = m[0] - n[0]; |
90 | 16.1k | float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
91 | 1.82M | for (i[1] = 0; i[1] < adim[1]; i[1]++1.80M ) |
92 | 1.80M | { |
93 | 1.80M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim, n, m); |
94 | 1.80M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
95 | 1.80M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
96 | 1.80M | d[1] = n1 * dilation[1] - n[1]; |
97 | 1.80M | n[1] = n1; |
98 | 1.80M | m[1] = m[1] - n[1]; |
99 | 1.80M | float* wpz = wpu + n[1] * channel_size; |
100 | 1.80M | const float* const apz = ap + i[1] * astride[CCV_NNC_MAX_DIM]; |
101 | 1.80M | float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM]; |
102 | 14.3M | for (j[0] = 0; j[0] < m[0]; j[0]++12.5M ) |
103 | 12.5M | { |
104 | 99.7M | for (j[1] = 0; j[1] < m[1]; j[1]++87.1M ) |
105 | 87.1M | { |
106 | 87.1M | float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]]; |
107 | 8.45G | for (c = 0; c < input_channel_size; c++8.36G ) |
108 | 8.36G | p += wpz[j[1] * channel_size + (c + gidx * input_channel_size) * hwc] * apz[c + gidx * input_channel_size]; |
109 | 87.1M | bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM]] = p; |
110 | 87.1M | } |
111 | 12.5M | wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
112 | 12.5M | bpz += bstride[CCV_NNC_MAX_DIM - 1] * dilation[0]; |
113 | 12.5M | } |
114 | 1.80M | } |
115 | 16.1k | ap += astride[CCV_NNC_MAX_DIM - 1]; |
116 | 16.1k | bp += bstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
117 | 16.1k | } |
118 | 149 | } parallel_endfor |
119 | 6 | } else if (4 a->info.format == CCV_TENSOR_FORMAT_NCHW4 ) { |
120 | | // Make sure the weights dimension matches the network dimension |
121 | 4 | assert(w->info.dim[1] * groups == cmd.info.convolution_transpose.count); |
122 | 4 | assert(w->info.dim[2] == cmd.info.size.dim[0]); |
123 | 4 | assert(w->info.dim[3] == cmd.info.size.dim[1]); |
124 | 4 | const int wdim[CCV_NNC_MAX_DIM] = { |
125 | 4 | (w->info.dim[2] - 1) * dilation[0] + 1, |
126 | 4 | (w->info.dim[3] - 1) * dilation[1] + 1 |
127 | 4 | }; |
128 | 4 | assert(w->info.dim[0] == adim[0]); |
129 | 4 | assert(b->info.format == CCV_TENSOR_FORMAT_NCHW); |
130 | 4 | const int channel_size = w->info.dim[1]; |
131 | 4 | const int input_channel_size = w->info.dim[0] / groups; |
132 | 4 | const int hw = w->info.dim[2] * w->info.dim[3]; |
133 | 4 | const int chw = channel_size * hw; |
134 | 4 | assert(bdim[0] == cmd.info.convolution_transpose.count); |
135 | 53 | parallel_for4 (idx, cmd.info.convolution_transpose.count * batch_size) { |
136 | 53 | int c; |
137 | 53 | const int bidx = idx / cmd.info.convolution_transpose.count; |
138 | 53 | const int k = idx % cmd.info.convolution_transpose.count; |
139 | 53 | const int gidx = k / group_size; |
140 | 53 | float* ap = a->data.f32 + bidx * astride[0]; |
141 | 53 | float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1]; |
142 | | // kernel weight for one dim. |
143 | 53 | float* wp = w->data.f32 + (k % group_size) * hw; |
144 | 53 | float biasval = bias ? bias->data.f32[k] : 00 ; |
145 | | // This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables. |
146 | 53 | int i[CCV_NNC_MAX_DIM]; |
147 | 53 | int n[CCV_NNC_MAX_DIM]; |
148 | 53 | int d[CCV_NNC_MAX_DIM]; |
149 | 53 | int m[CCV_NNC_MAX_DIM]; |
150 | 53 | int j[CCV_NNC_MAX_DIM]; |
151 | 10.8k | for (i[0] = 0; i[0] < bdim[1]; i[0]++10.7k ) |
152 | 10.7k | { |
153 | 2.41M | for (i[1] = 0; i[1] < bdim[2]; i[1]++2.40M ) |
154 | 2.40M | bp[i[1]] = biasval; |
155 | 10.7k | bp += bstride[CCV_NNC_MAX_DIM]; |
156 | 10.7k | } |
157 | 53 | bp = b->data.f32 + bidx * bstride[0] + k * bstride[1]; |
158 | 5.43k | for (i[0] = 0; i[0] < adim[1]; i[0]++5.38k ) |
159 | 5.38k | { |
160 | 5.38k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, bdim + 1, n, m); |
161 | 5.38k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
162 | 5.38k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
163 | 5.38k | d[0] = n0 * dilation[0] - n[0]; |
164 | 5.38k | n[0] = n0; |
165 | 5.38k | m[0] = m[0] - n[0]; |
166 | 5.38k | float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM + 1]; |
167 | 607k | for (i[1] = 0; i[1] < adim[2]; i[1]++602k ) |
168 | 602k | { |
169 | 602k | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, bdim + 1, n, m); |
170 | 602k | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
171 | 602k | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
172 | 602k | d[1] = n1 * dilation[1] - n[1]; |
173 | 602k | n[1] = n1; |
174 | 602k | m[1] = m[1] - n[1]; |
175 | 602k | float* wpz = wpu + n[1]; |
176 | 602k | const float* apz = ap + i[1]; |
177 | 602k | float* bpz = bp + d[0] * bstride[CCV_NNC_MAX_DIM] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * bstride[CCV_NNC_MAX_DIM + 1]; |
178 | 4.78M | for (j[0] = 0; j[0] < m[0]; j[0]++4.18M ) |
179 | 4.18M | { |
180 | 33.2M | for (j[1] = 0; j[1] < m[1]; j[1]++29.0M ) |
181 | 29.0M | { |
182 | 29.0M | float p = bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]]; |
183 | 2.81G | for (c = 0; c < input_channel_size; c++2.78G ) |
184 | 2.78G | p += wpz[j[1] + (c + gidx * input_channel_size) * chw] * apz[(c + gidx * input_channel_size) * astride[1]]; |
185 | 29.0M | bpz[j[1] * dilation[1] * bstride[CCV_NNC_MAX_DIM + 1]] = p; |
186 | 29.0M | } |
187 | 4.18M | wpz += w->info.dim[CCV_NNC_MAX_DIM + 1]; |
188 | 4.18M | bpz += bstride[CCV_NNC_MAX_DIM] * dilation[0]; |
189 | 4.18M | } |
190 | 602k | } |
191 | 5.38k | ap += astride[CCV_NNC_MAX_DIM]; |
192 | 5.38k | bp += bstride[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
193 | 5.38k | } |
194 | 53 | } parallel_endfor |
195 | 4 | } |
196 | 10 | return CCV_NNC_EXEC_SUCCESS; |
197 | 10 | } |
198 | | |
199 | | static int _ccv_nnc_conv_transpose_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
200 | 0 | { |
201 | 0 | return CCV_NNC_EXEC_INVALID; |
202 | 0 | } |
203 | | |
204 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
205 | 1 | { |
206 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
207 | 1 | registry->tensor_datatypes = CCV_32F; |
208 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
209 | 1 | registry->algorithms = 1; |
210 | 1 | registry->exec = _ccv_nnc_conv_transpose_forw; |
211 | 1 | } |
212 | | |
213 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
214 | 1 | { |
215 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC; |
216 | 1 | registry->tensor_datatypes = CCV_32F; |
217 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
218 | 1 | registry->algorithms = 1; |
219 | 1 | registry->exec = _ccv_nnc_conv_transpose_back; |
220 | 1 | } |
221 | | |