/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 751 | { |
15 | 751 | assert(input_size >= 2); |
16 | 751 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
17 | 751 | const ccv_nnc_tensor_t* w = inputs[1]; |
18 | 751 | assert(CCV_IS_TENSOR_CONTIGUOUS(w)); |
19 | 751 | const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0; |
20 | 751 | assert(!bias || !CCV_IS_TENSOR_VIEW(bias)); |
21 | 751 | assert(output_size == 1); |
22 | 751 | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; |
23 | 751 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
24 | 751 | const int size_nd = ccv_nnc_tensor_nd(cmd.info.size.dim) - 1; |
25 | 751 | assert(size_nd == 2 || size_nd == 3); |
26 | 751 | assert(a_nd == size_nd + 1 || a_nd == size_nd + 2); |
27 | 751 | const int* adim = (a_nd == size_nd + 1) ? a->info.dim : a->info.dim + 1; |
28 | 751 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
29 | 751 | assert(b_nd == size_nd + 1 || b_nd == size_nd + 2); |
30 | 751 | const int* bdim = (b_nd == size_nd + 1) ? b->info.dim : b->info.dim + 1; |
31 | 751 | const int groups = cmd.info.convolution.groups; |
32 | 751 | assert(cmd.info.convolution.count % groups == 0); |
33 | 751 | const int group_size = cmd.info.convolution.count / groups; |
34 | | // Make sure the weights output dimension matches the network convolution kernels |
35 | 751 | assert(w->info.dim[0] == cmd.info.convolution.count); |
36 | 751 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
37 | 751 | ccv_nnc_tensor_view_get_stride(a, astride); |
38 | 751 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
39 | 751 | ccv_nnc_tensor_view_get_stride(b, bstride); |
40 | 751 | assert(!bias || bias->info.dim[0] == cmd.info.convolution.count); |
41 | 751 | const int batch_size = (a_nd == size_nd + 2) ? a->info.dim[0] : 1; |
42 | 751 | int dilation[size_nd]; |
43 | 751 | int i; |
44 | 2.25k | for (i = 0; i < size_nd; i++) |
45 | 1.50k | dilation[i] = ccv_max(cmd.info.convolution.dilation[i], 1); |
46 | 751 | if (a->info.format == CCV_TENSOR_FORMAT_NHWC) |
47 | 749 | { |
48 | | // Make sure the weights dimension matches the network dimension |
49 | 2.24k | for (i = 0; i < size_nd; i++) |
50 | 1.50k | { assert(w->info.dim[i + 1] == cmd.info.size.dim[i]); } |
51 | 749 | int wdim[size_nd]; |
52 | 2.24k | for (i = 0; i < size_nd; i++) |
53 | 1.50k | wdim[i] = (w->info.dim[i + 1] - 1) * dilation[i] + 1; |
54 | 749 | assert(w->info.dim[size_nd + 1] * groups == adim[size_nd]); |
55 | 749 | assert(b->info.format == CCV_TENSOR_FORMAT_NHWC); |
56 | 749 | const int channel_size = w->info.dim[size_nd + 1]; |
57 | 749 | assert(bdim[size_nd] == cmd.info.convolution.count); |
58 | 749 | if (size_nd == 2) |
59 | 747 | { |
60 | 41.4k | parallel_for(idx, cmd.info.convolution.count * batch_size) { |
61 | 41.4k | int c; |
62 | 41.4k | const int bidx = idx / cmd.info.convolution.count; |
63 | 41.4k | const int k = idx % cmd.info.convolution.count; |
64 | 41.4k | const int gidx = k / group_size; |
65 | 41.4k | float* ap = a->data.f32 + bidx * astride[0]; |
66 | 41.4k | float* bp = b->data.f32 + bidx * bstride[0] + k; |
67 | | // Weights for one output filter (output channel k). |
68 | 41.4k | float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size; |
69 | 41.4k | float biasval = bias ? bias->data.f32[k] : 0; |
70 | | // This block will run on each for-loop iteration, so you can use it to set up temporary variables. |
71 | 41.4k | int i[2]; |
72 | 41.4k | int n[2]; |
73 | 41.4k | int d[2]; |
74 | 41.4k | int m[2]; |
75 | 41.4k | int j[2]; |
76 | 2.01M | for (i[0] = 0; i[0] < bdim[0]; i[0]++) |
77 | 1.96M | { |
78 | 1.96M | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m); |
79 | 1.96M | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
80 | 1.96M | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
81 | 1.96M | d[0] = n0 * dilation[0] - n[0]; |
82 | 1.96M | n[0] = n0; |
83 | 1.96M | m[0] = m[0] - n[0]; |
84 | 1.96M | float* wpu = wp + n[0] * w->info.dim[2] * channel_size; |
85 | 181M | for (i[1] = 0; i[1] < bdim[1]; i[1]++) |
86 | 179M | { |
87 | 179M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m); |
88 | 179M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
89 | 179M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
90 | 179M | d[1] = n1 * dilation[1] - n[1]; |
91 | 179M | n[1] = n1; |
92 | 179M | m[1] = m[1] - n[1]; |
93 | 179M | float p = biasval; |
94 | 179M | float* wpz = wpu + n[1] * channel_size; |
95 | 179M | float* apz = ap + d[0] * astride[1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[2] + gidx * channel_size; |
96 | 1.02G | for (j[0] = 0; j[0] < m[0]; j[0]++) |
97 | 843M | { |
98 | 5.46G | for (j[1] = 0; j[1] < m[1]; j[1]++) |
99 | 116G | for (c = 0; c < channel_size; c++) |
100 | 111G | p += wpz[j[1] * channel_size + c] * apz[j[1] * dilation[1] * astride[2] + c]; |
101 | 843M | wpz += w->info.dim[2] * channel_size; |
102 | 843M | apz += astride[1] * dilation[0]; |
103 | 843M | } |
104 | 179M | bp[i[1] * bstride[2]] = p; |
105 | 179M | } |
106 | 1.96M | bp += bstride[1]; |
107 | 1.96M | ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
108 | 1.96M | } |
109 | 41.4k | } parallel_endfor |
110 | 747 | } else if (size_nd == 3) { |
111 | 2 | if (a_nd == size_nd + 1) |
112 | 5 | for (i = a_nd; i > 0; i--) |
113 | 4 | astride[i] = astride[i - 1]; |
114 | 2 | if (b_nd == size_nd + 1) |
115 | 5 | for (i = b_nd; i > 0; i--) |
116 | 4 | bstride[i] = bstride[i - 1]; |
117 | 1.54k | parallel_for(idx, cmd.info.convolution.count * batch_size) { |
118 | 1.54k | int c; |
119 | 1.54k | const int bidx = idx / cmd.info.convolution.count; |
120 | 1.54k | const int k = idx % cmd.info.convolution.count; |
121 | 1.54k | const int gidx = k / group_size; |
122 | 1.54k | float* ap = a->data.f32 + bidx * astride[0]; |
123 | 1.54k | float* bp = b->data.f32 + bidx * bstride[0] + k; |
124 | | // Weights for one output filter (output channel k). |
125 | 1.54k | float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3] * channel_size; |
126 | 1.54k | float biasval = bias ? bias->data.f32[k] : 0; |
127 | | // This block will run on each for-loop iteration, so you can use it to set up temporary variables. |
128 | 1.54k | int i[3]; |
129 | 1.54k | int n[3]; |
130 | 1.54k | int d[3]; |
131 | 1.54k | int m[3]; |
132 | 1.54k | int j[3]; |
133 | 6.17k | for (i[0] = 0; i[0] < bdim[0]; i[0]++) |
134 | 4.63k | { |
135 | 4.63k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m); |
136 | 4.63k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
137 | 4.63k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
138 | 4.63k | d[0] = n0 * dilation[0] - n[0]; |
139 | 4.63k | n[0] = n0; |
140 | 4.63k | m[0] = m[0] - n[0]; |
141 | 4.63k | float* wpu = wp + n[0] * w->info.dim[2] * w->info.dim[3] * channel_size; |
142 | 4.63k | float* bpu = bp; |
143 | 522k | for (i[1] = 0; i[1] < bdim[1]; i[1]++) |
144 | 517k | { |
145 | 517k | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m); |
146 | 517k | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
147 | 517k | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
148 | 517k | d[1] = n1 * dilation[1] - n[1]; |
149 | 517k | n[1] = n1; |
150 | 517k | m[1] = m[1] - n[1]; |
151 | 58.3M | for (i[2] = 0; i[2] < bdim[2]; i[2]++) |
152 | 57.8M | { |
153 | 57.8M | SET_BORDER_OFFSET_SIZE_FOR(2, i, hint, wdim, adim, n, m); |
154 | 57.8M | m[2] = (m[2] + n[2] - 1) / dilation[2] + 1; |
155 | 57.8M | const int n2 = (n[2] + dilation[2] - 1) / dilation[2]; |
156 | 57.8M | d[2] = n2 * dilation[2] - n[2]; |
157 | 57.8M | n[2] = n2; |
158 | 57.8M | m[2] = m[2] - n[2]; |
159 | 57.8M | float p = biasval; |
160 | 57.8M | float* wpz = wpu + n[1] * w->info.dim[3] * channel_size + n[2] * channel_size; |
161 | 57.8M | float* apz = ap + d[0] * astride[1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[2] + (ccv_max(i[2] * hint.stride.dim[2] - hint.border.begin[2], 0) + d[2]) * astride[3] + gidx * channel_size; |
162 | 193M | for (j[0] = 0; j[0] < m[0]; j[0]++) |
163 | 135M | { |
164 | 1.08G | for (j[1] = 0; j[1] < m[1]; j[1]++) |
165 | 7.54G | for (j[2] = 0; j[2] < m[2]; j[2]++) |
166 | 26.3G | for (c = 0; c < channel_size; c++) |
167 | 19.7G | p += wpz[(j[1] * w->info.dim[3] + j[2]) * channel_size + c] * apz[j[1] * dilation[1] * astride[2] + j[2] * dilation[2] * astride[3] + c]; |
168 | 135M | wpz += w->info.dim[2] * w->info.dim[3] * channel_size; |
169 | 135M | apz += astride[1] * dilation[0]; |
170 | 135M | } |
171 | 57.8M | bpu[i[2] * bstride[3]] = p; |
172 | 57.8M | } |
173 | 517k | bpu += bstride[2]; |
174 | 517k | } |
175 | 4.63k | bp += bstride[1]; |
176 | 4.63k | ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
177 | 4.63k | } |
178 | 1.54k | } parallel_endfor |
179 | 2 | } else { |
180 | 0 | assert(0 && "Cannot support 1d or 4d convolution."); |
181 | 0 | } |
182 | 749 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW) { |
183 | | // Make sure the weights dimension matches the network dimension |
184 | 7 | for (i = 0; i < size_nd; i++) |
185 | 5 | { assert(w->info.dim[i + 2] == cmd.info.size.dim[i]); } |
186 | 2 | int wdim[size_nd]; |
187 | 7 | for (i = 0; i < size_nd; i++) |
188 | 5 | wdim[i] = (w->info.dim[i + 2] - 1) * dilation[i] + 1; |
189 | 2 | assert(w->info.dim[1] * groups == adim[0]); |
190 | 2 | assert(b->info.format == CCV_TENSOR_FORMAT_NCHW); |
191 | 2 | const int channel_size = w->info.dim[1]; |
192 | 2 | assert(bdim[0] == cmd.info.convolution.count); |
193 | 2 | if (size_nd == 2) |
194 | 1 | { |
195 | 1 | const int hw = w->info.dim[2] * w->info.dim[3]; |
196 | 1.53k | parallel_for(idx, cmd.info.convolution.count * batch_size) { |
197 | 1.53k | int c; |
198 | 1.53k | const int bidx = idx / cmd.info.convolution.count; |
199 | 1.53k | const int k = idx % cmd.info.convolution.count; |
200 | 1.53k | const int gidx = k / group_size; |
201 | 1.53k | float* ap = a->data.f32 + bidx * astride[0]; |
202 | 1.53k | float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1]; |
203 | | // Weights for one output filter (output channel k). |
204 | 1.53k | float* wp = w->data.f32 + k * hw * channel_size; |
205 | 1.53k | float biasval = bias ? bias->data.f32[k] : 0; |
206 | | // This block will run on each for-loop iteration, so you can use it to set up temporary variables. |
207 | 1.53k | int i[2]; |
208 | 1.53k | int n[2]; |
209 | 1.53k | int d[2]; |
210 | 1.53k | int m[2]; |
211 | 1.53k | int j[2]; |
212 | 173k | for (i[0] = 0; i[0] < bdim[1]; i[0]++) |
213 | 172k | { |
214 | 172k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim + 1, n, m); |
215 | 172k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
216 | 172k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
217 | 172k | d[0] = n0 * dilation[0] - n[0]; |
218 | 172k | n[0] = n0; |
219 | 172k | m[0] = m[0] - n[0]; |
220 | 172k | float* wpu = wp + n[0] * w->info.dim[3]; |
221 | 19.4M | for (i[1] = 0; i[1] < bdim[2]; i[1]++) |
222 | 19.2M | { |
223 | 19.2M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim + 1, n, m); |
224 | 19.2M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
225 | 19.2M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
226 | 19.2M | d[1] = n1 * dilation[1] - n[1]; |
227 | 19.2M | n[1] = n1; |
228 | 19.2M | m[1] = m[1] - n[1]; |
229 | 19.2M | float p = biasval; |
230 | 19.2M | float* wpz = wpu + n[1]; |
231 | 19.2M | float* apz = ap + d[0] * astride[2] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[3] + gidx * channel_size * astride[1]; |
232 | 153M | for (j[0] = 0; j[0] < m[0]; j[0]++) |
233 | 133M | { |
234 | 1.06G | for (j[1] = 0; j[1] < m[1]; j[1]++) |
235 | 3.71G | for (c = 0; c < channel_size; c++) |
236 | 2.78G | p += wpz[j[1] + c * hw] * apz[j[1] * dilation[1] * astride[3] + c * astride[1]]; |
237 | 133M | wpz += w->info.dim[3]; |
238 | 133M | apz += astride[2] * dilation[0]; |
239 | 133M | } |
240 | 19.2M | bp[i[1]] = p; |
241 | 19.2M | } |
242 | 172k | bp += bstride[2]; |
243 | 172k | ap += astride[2] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
244 | 172k | } |
245 | 1.53k | } parallel_endfor |
246 | 1 | } else if (size_nd == 3) { |
247 | 1 | if (a_nd == size_nd + 1) |
248 | 0 | for (i = a_nd; i > 0; i--) |
249 | 0 | astride[i] = astride[i - 1]; |
250 | 1 | if (b_nd == size_nd + 1) |
251 | 0 | for (i = b_nd; i > 0; i--) |
252 | 0 | bstride[i] = bstride[i - 1]; |
253 | 1 | const int hw = w->info.dim[2] * w->info.dim[3] * w->info.dim[4]; |
254 | 1.53k | parallel_for(idx, cmd.info.convolution.count * batch_size) { |
255 | 1.53k | int c; |
256 | 1.53k | const int bidx = idx / cmd.info.convolution.count; |
257 | 1.53k | const int k = idx % cmd.info.convolution.count; |
258 | 1.53k | const int gidx = k / group_size; |
259 | 1.53k | float* ap = a->data.f32 + bidx * astride[0]; |
260 | 1.53k | float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1]; |
261 | | // Weights for one output filter (output channel k). |
262 | 1.53k | float* wp = w->data.f32 + k * hw * channel_size; |
263 | 1.53k | float biasval = bias ? bias->data.f32[k] : 0; |
264 | | // This block will run on each for-loop iteration, so you can use it to set up temporary variables. |
265 | 1.53k | int i[3]; |
266 | 1.53k | int n[3]; |
267 | 1.53k | int d[3]; |
268 | 1.53k | int m[3]; |
269 | 1.53k | int j[3]; |
270 | 6.14k | for (i[0] = 0; i[0] < bdim[1]; i[0]++) |
271 | 4.60k | { |
272 | 4.60k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim + 1, n, m); |
273 | 4.60k | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
274 | 4.60k | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
275 | 4.60k | d[0] = n0 * dilation[0] - n[0]; |
276 | 4.60k | n[0] = n0; |
277 | 4.60k | m[0] = m[0] - n[0]; |
278 | 4.60k | float* wpu = wp + n[0] * w->info.dim[3] * w->info.dim[4]; |
279 | 520k | for (i[1] = 0; i[1] < bdim[2]; i[1]++) |
280 | 516k | { |
281 | 516k | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim + 1, n, m); |
282 | 516k | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
283 | 516k | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
284 | 516k | d[1] = n1 * dilation[1] - n[1]; |
285 | 516k | n[1] = n1; |
286 | 516k | m[1] = m[1] - n[1]; |
287 | 58.3M | for (i[2] = 0; i[2] < bdim[3]; i[2]++) |
288 | 57.8M | { |
289 | 57.8M | SET_BORDER_OFFSET_SIZE_FOR(2, i, hint, wdim, adim + 1, n, m); |
290 | 57.8M | m[2] = (m[2] + n[2] - 1) / dilation[2] + 1; |
291 | 57.8M | const int n2 = (n[2] + dilation[2] - 1) / dilation[2]; |
292 | 57.8M | d[2] = n2 * dilation[2] - n[2]; |
293 | 57.8M | n[2] = n2; |
294 | 57.8M | m[2] = m[2] - n[2]; |
295 | 57.8M | float p = biasval; |
296 | 57.8M | float* wpz = wpu + n[1] * w->info.dim[4] + n[2]; |
297 | 57.8M | float* apz = ap + d[0] * astride[2] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[3] + (ccv_max(i[2] * hint.stride.dim[2] - hint.border.begin[2], 0) + d[2]) * astride[4] + gidx * channel_size * astride[1]; |
298 | 192M | for (j[0] = 0; j[0] < m[0]; j[0]++) |
299 | 134M | { |
300 | 1.07G | for (j[1] = 0; j[1] < m[1]; j[1]++) |
301 | 7.44G | for (j[2] = 0; j[2] < m[2]; j[2]++) |
302 | 26.0G | for (c = 0; c < channel_size; c++) |
303 | 19.5G | p += wpz[j[1] * w->info.dim[4] + j[2] + c * hw] * apz[j[1] * dilation[1] * astride[3] + j[2] * dilation[2] * astride[4] + c * astride[1]]; |
304 | 134M | wpz += w->info.dim[3] * w->info.dim[4]; |
305 | 134M | apz += astride[2] * dilation[0]; |
306 | 134M | } |
307 | 57.8M | bp[i[1] * bstride[3] + i[2]] = p; |
308 | 57.8M | } |
309 | 516k | } |
310 | 4.60k | bp += bstride[2]; |
311 | 4.60k | ap += astride[2] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
312 | 4.60k | } |
313 | 1.53k | } parallel_endfor |
314 | 1 | } else { |
315 | 0 | assert(0 && "Cannot support 1d or 4d convolution."); |
316 | 0 | } |
317 | 2 | } |
318 | 751 | return CCV_NNC_EXEC_SUCCESS; |
319 | 751 | } |
320 | | |
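A minimal usage sketch for the forward path above, assuming the convenience helpers from nnc's easy API (CMD_CONVOLUTION_FORWARD, CPU_TENSOR_NHWC, TENSOR_LIST, ccv_nnc_hint_auto); the shapes here (a 32x32x2 NHWC input, four 5x5 filters, one group) are illustrative, not taken from this file:

// Hypothetical example: 4 filters of 5x5 over 2 input channels, groups = 1.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 32, 2), 0); // input
ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 5, 5, 2), 0); // weights
ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 32, 4), 0); // output
const ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, 4, 5, 5, 2); // groups, count, kernel dims + channels
const ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info); // derives stride / border
ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);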
321 | | static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
322 | 1.11k | { |
323 | | // inputs: gradient, forw prop input, [w] |
324 | | // outputs: [output gradient], weight updates, bias updates |
325 | 1.11k | assert(input_size >= 2 && output_size >= 2); |
326 | 1.11k | const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients |
327 | 1.11k | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1]; |
328 | 1.11k | ccv_nnc_tensor_t* w = output_size > 1 ? outputs[1] : 0; |
329 | 1.11k | assert(CCV_IS_TENSOR_CONTIGUOUS(w)); |
330 | 1.11k | ccv_nnc_tensor_t* bias = output_size > 2 ? outputs[2] : 0; |
331 | 1.11k | assert(!bias || !CCV_IS_TENSOR_VIEW(bias)); |
332 | 1.11k | ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients |
333 | 1.11k | if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0 |
334 | 1.11k | { |
335 | 1.11k | if (w) |
336 | 1.11k | memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info)); |
337 | 1.11k | if (bias) |
338 | 1.10k | memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info)); |
339 | 1.11k | } |
340 | 1.11k | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
341 | 1.11k | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
342 | 1.11k | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1; |
343 | 1.11k | const int g_nd = ccv_nnc_tensor_nd(g->info.dim); |
344 | 1.11k | assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2); |
345 | 1.11k | const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1; |
346 | 1.11k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
347 | 1.11k | ccv_nnc_tensor_view_get_stride(a, astride); |
348 | 1.11k | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
349 | 1.11k | ccv_nnc_tensor_view_get_stride(g, gstride); |
350 | 1.11k | const int groups = cmd.info.convolution.groups; |
351 | 1.11k | if (w) |
352 | 1.11k | assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]); |
353 | 1.11k | assert(cmd.info.convolution.count % groups == 0); |
354 | 1.11k | const int group_size = cmd.info.convolution.count / groups; |
355 | 1.11k | const int channel_size = w ? w->info.dim[CCV_NNC_MAX_DIM + 1] : inputs[2]->info.dim[CCV_NNC_MAX_DIM + 1]; |
356 | 1.11k | const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1; |
357 | 1.11k | const int dilation[CCV_NNC_MAX_DIM] = { |
358 | 1.11k | ccv_max(cmd.info.convolution.dilation[0], 1), |
359 | 1.11k | ccv_max(cmd.info.convolution.dilation[1], 1) |
360 | 1.11k | }; |
361 | 1.11k | const int wdim[CCV_NNC_MAX_DIM] = { |
362 | 1.11k | (w->info.dim[1] - 1) * dilation[0] + 1, |
363 | 1.11k | (w->info.dim[2] - 1) * dilation[1] + 1 |
364 | 1.11k | }; |
365 | 1.11k | if (w) |
366 | 1.11k | { |
367 | 47.4k | parallel_for(k, cmd.info.convolution.count) { |
368 | 47.4k | int c; |
369 | 47.4k | const int gidx = k / group_size; |
370 | | // Weights for one output filter (output channel k). |
371 | 47.4k | float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; |
372 | 47.4k | float biasval = 0; |
373 | 47.4k | int i[CCV_NNC_MAX_DIM]; |
374 | 47.4k | int n[CCV_NNC_MAX_DIM]; |
375 | 47.4k | int d[CCV_NNC_MAX_DIM]; |
376 | 47.4k | int m[CCV_NNC_MAX_DIM]; |
377 | 47.4k | int j[CCV_NNC_MAX_DIM]; |
378 | 47.4k | int bidx; |
379 | 100k | for (bidx = 0; bidx < batch_size; bidx++) |
380 | 53.1k | { |
381 | 53.1k | const float* ap = a->data.f32 + bidx * astride[0]; |
382 | 53.1k | const float* gp = g->data.f32 + bidx * gstride[0] + k; |
383 | 1.44M | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
384 | 1.39M | { |
385 | 1.39M | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m); |
386 | 1.39M | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
387 | 1.39M | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
388 | 1.39M | d[0] = n0 * dilation[0] - n[0]; |
389 | 1.39M | n[0] = n0; |
390 | 1.39M | m[0] = m[0] - n[0]; |
391 | 1.39M | float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
392 | 93.5M | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
393 | 92.1M | { |
394 | 92.1M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m); |
395 | 92.1M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
396 | 92.1M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
397 | 92.1M | d[1] = n1 * dilation[1] - n[1]; |
398 | 92.1M | n[1] = n1; |
399 | 92.1M | m[1] = m[1] - n[1]; |
400 | 92.1M | const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]]; |
401 | 92.1M | if (v == 0) // shortcut if v is zero |
402 | 11.8M | continue; |
403 | 80.2M | biasval += v; |
404 | 80.2M | float* wpz = wpu + n[1] * channel_size; |
405 | 80.2M | const float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM] + gidx * channel_size; |
406 | 629M | for (j[0] = 0; j[0] < m[0]; j[0]++) |
407 | 549M | { |
408 | 4.31G | for (j[1] = 0; j[1] < m[1]; j[1]++) |
409 | 15.9G | for (c = 0; c < channel_size; c++) |
410 | 12.1G | wpz[j[1] * channel_size + c] += v * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM] + c]; |
411 | 549M | wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
412 | 549M | apz += astride[CCV_NNC_MAX_DIM - 1] * dilation[0]; |
413 | 549M | } |
414 | 80.2M | } |
415 | 1.39M | gp += gstride[CCV_NNC_MAX_DIM - 1]; |
416 | 1.39M | ap += astride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
417 | 1.39M | } |
418 | 53.1k | } |
419 | 47.4k | if (bias) |
420 | 47.4k | bias->data.f32[k] = biasval; |
421 | 47.4k | } parallel_endfor |
422 | 1.11k | } |
423 | | // If h is available, we need to propagate the gradients back. |
424 | 1.11k | if (h) |
425 | 1.01k | { |
426 | 1.01k | assert(h); |
427 | 1.01k | const int h_nd = ccv_nnc_tensor_nd(h->info.dim); |
428 | 1.01k | assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2); |
429 | 1.01k | const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? h->info.dim : h->info.dim + 1; |
430 | 1.01k | int hstride[CCV_NNC_MAX_DIM_ALLOC]; |
431 | 1.01k | ccv_nnc_tensor_view_get_stride(h, hstride); |
432 | | // reset it to 0. |
433 | 1.01k | ccv_nnc_tensor_zero(h); |
434 | 1.01k | w = inputs[2]; |
435 | 1.01k | assert(CCV_IS_TENSOR_CONTIGUOUS(w)); |
436 | 1.01k | int bidx; |
437 | 2.08k | for (bidx = 0; bidx < batch_size; bidx++) |
438 | 1.07k | { |
439 | 1.07k | int k; |
440 | 51.0k | for (k = 0; k < cmd.info.convolution.count; k++) |
441 | 49.9k | { |
442 | 49.9k | int c; |
443 | 49.9k | const int gidx = k / group_size; |
444 | 49.9k | float* hp = h->data.f32 + bidx * hstride[0]; |
445 | 49.9k | const float* gp = g->data.f32 + bidx * gstride[0] + k; |
446 | | // Weights for one output filter (output channel k). |
447 | 49.9k | float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size; |
448 | | // This block will run on each for-loop iteration, so you can use it to set up temporary variables. |
449 | 49.9k | int i[CCV_NNC_MAX_DIM]; |
450 | 49.9k | int n[CCV_NNC_MAX_DIM]; |
451 | 49.9k | int d[CCV_NNC_MAX_DIM]; |
452 | 49.9k | int m[CCV_NNC_MAX_DIM]; |
453 | 49.9k | int j[CCV_NNC_MAX_DIM]; |
454 | 1.34M | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
455 | 1.29M | { |
456 | 1.29M | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, hdim, n, m); |
457 | 1.29M | m[0] = (m[0] + n[0] - 1) / dilation[0] + 1; |
458 | 1.29M | const int n0 = (n[0] + dilation[0] - 1) / dilation[0]; |
459 | 1.29M | d[0] = n0 * dilation[0] - n[0]; |
460 | 1.29M | n[0] = n0; |
461 | 1.29M | m[0] = m[0] - n[0]; |
462 | 1.29M | const float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
463 | 90.3M | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
464 | 89.1M | { |
465 | 89.1M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, hdim, n, m); |
466 | 89.1M | m[1] = (m[1] + n[1] - 1) / dilation[1] + 1; |
467 | 89.1M | const int n1 = (n[1] + dilation[1] - 1) / dilation[1]; |
468 | 89.1M | d[1] = n1 * dilation[1] - n[1]; |
469 | 89.1M | n[1] = n1; |
470 | 89.1M | m[1] = m[1] - n[1]; |
471 | 89.1M | const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]]; |
472 | 89.1M | if (v == 0) // shortcut if v is zero |
473 | 9.26M | continue; |
474 | 79.8M | const float* wpz = wpu + n[1] * channel_size; |
475 | 79.8M | float* hpz = hp + d[0] * hstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * hstride[CCV_NNC_MAX_DIM] + gidx * channel_size; |
476 | 626M | for (j[0] = 0; j[0] < m[0]; j[0]++) |
477 | 547M | { |
478 | 4.30G | for (j[1] = 0; j[1] < m[1]; j[1]++) |
479 | 15.8G | for (c = 0; c < channel_size; c++) |
480 | 12.1G | hpz[j[1] * dilation[1] * hstride[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c]; |
481 | 547M | wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size; |
482 | 547M | hpz += hstride[CCV_NNC_MAX_DIM - 1] * dilation[0]; |
483 | 547M | } |
484 | 79.8M | } |
485 | 1.29M | gp += gstride[CCV_NNC_MAX_DIM - 1]; |
486 | 1.29M | hp += hstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); |
487 | 1.29M | } |
488 | 49.9k | } |
489 | 1.07k | } |
490 | 1.01k | } |
491 | 1.11k | return CCV_NNC_EXEC_SUCCESS; |
492 | 1.11k | } |
493 | | |
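Each SET_BORDER_OFFSET_SIZE_FOR call above yields n (pixels clipped at the leading border) and m (valid extent) in dilated-pixel units; the four lines that follow every call convert those to kernel-tap units. A standalone sketch of that remapping (my own illustration, not part of this file):

// Remap a border-clipped, dilated window from pixel units to kernel taps.
static void remap_for_dilation(const int dilation, int* const n, int* const m, int* const d)
{
	*m = (*m + *n - 1) / dilation + 1; // taps whose dilated position falls before n + m
	const int n0 = (*n + dilation - 1) / dilation; // index of the first tap at or past the border
	*d = n0 * dilation - *n; // input offset from the first valid pixel to that first tap
	*n = n0;
	*m = *m - *n; // number of taps that actually land inside the valid region
}

For example, with dilation = 2 and a 5-tap kernel (wdim = 9 dilated pixels), the taps sit at positions 0, 2, 4, 6, 8. If the border clips n = 3 leading pixels and m = 6 remain valid, the remap gives n = 2 (the first in-bounds tap, at position 4), d = 1 (its offset past the first valid input pixel), and m = 3 remaining taps.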
494 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
495 | 1 | { |
496 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
497 | 1 | registry->tensor_datatypes = CCV_32F; |
498 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
499 | 1 | registry->algorithms = 1; |
500 | 1 | registry->exec = _ccv_nnc_conv_forw; |
501 | 1 | } |
502 | | |
503 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
504 | 1 | { |
505 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC; |
506 | 1 | registry->tensor_datatypes = CCV_32F; |
507 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
508 | 1 | registry->algorithms = 1; |
509 | 1 | registry->exec = _ccv_nnc_conv_back; |
510 | 1 | } |
511 | | |
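And a matching backward-pass sketch, continuing the hypothetical forward example above (note the backward backend registers NHWC only). Inputs are the output gradient, the forward input, and the weights; outputs are the input gradient plus the weight and bias updates, which get zeroed first because flags omits CCV_NNC_ACCUMULATE_OUTPUT:

ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 32, 4), 0); // gradient w.r.t. b
ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 32, 2), 0); // gradient w.r.t. a
ccv_nnc_tensor_t* const dw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 5, 5, 2), 0); // weight updates
ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0); // bias updates
const ccv_nnc_cmd_t back = CMD_CONVOLUTION_BACKWARD(1, 4, 5, 5, 2);
// Same hint as the forward pass; a and w are the tensors from the forward sketch.
ccv_nnc_cmd_exec(back, hint, 0, TENSOR_LIST(g, a, w), TENSOR_LIST(h, dw, dbias), 0);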