/* /home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c */
#include "ccv.h"
#include "ccv_internal.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"
#include "nnc/ccv_nnc_internal.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif

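// The parallel_for / parallel_endfor pairs below fan the per-filter work out
// across OpenMP or libdispatch when either is compiled in (hence the
// conditional includes above), and fall back to a plain serial loop otherwise.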
static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 2);
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	const ccv_nnc_tensor_t* w = inputs[1];
	assert(CCV_IS_TENSOR_CONTIGUOUS(w));
	const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
	const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
	const int groups = cmd.info.convolution.groups;
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	// Make sure the weights output dimension matches the network convolution kernels
	assert(w->info.dim[0] == cmd.info.convolution.count);
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	int bstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(b, bstride);
	assert(!bias || bias->info.dim[0] == cmd.info.convolution.count);
	const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1;
	const int dilation[CCV_NNC_MAX_DIM] = {
		ccv_max(cmd.info.convolution.dilation[0], 1),
		ccv_max(cmd.info.convolution.dilation[1], 1)
	};
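	// Dilation values below 1 are treated as 1 (no dilation). A dilated kernel of
	// size k covers an effective extent of (k - 1) * dilation + 1 input pixels per
	// axis, which is what the wdim arrays below compute.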
	if (a->info.format == CCV_TENSOR_FORMAT_NHWC)
	{
		// Make sure the weights dimension matches the network dimension
		assert(w->info.dim[1] == cmd.info.size.dim[0]);
		assert(w->info.dim[2] == cmd.info.size.dim[1]);
		const int wdim[CCV_NNC_MAX_DIM] = {
			(w->info.dim[1] - 1) * dilation[0] + 1,
			(w->info.dim[2] - 1) * dilation[1] + 1
		};
		assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
		assert(b->info.format == CCV_TENSOR_FORMAT_NHWC);
		const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
		assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution.count);
		parallel_for(idx, cmd.info.convolution.count * batch_size) {
			int c;
			const int bidx = idx / cmd.info.convolution.count;
			const int k = idx % cmd.info.convolution.count;
			const int gidx = k / group_size;
			float* ap = a->data.f32 + bidx * astride[0];
			float* bp = b->data.f32 + bidx * bstride[0] + k;
			// kernel weight for one dim.
			float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
			float biasval = bias ? bias->data.f32[k] : 0;
			// This block runs on every iteration of the parallel loop, so it is the
			// place to set up per-iteration temporary variables.
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
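			// Per spatial axis: i is the output coordinate, n the index of the first
			// kernel tap that lands inside the input, m the count of in-bounds taps,
			// d the input offset that aligns that first tap under dilation, and j
			// walks the taps. E.g. kernel 3, dilation 2, window clipped by 3 rows at
			// the border: n = ceil(3 / 2) = 2 taps skipped, d = 2 * 2 - 3 = 1 row.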
			for (i[0] = 0; i[0] < bdim[0]; i[0]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
				m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
				const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
				d[0] = n0 * dilation[0] - n[0];
				n[0] = n0;
				m[0] = m[0] - n[0];
				float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
				for (i[1] = 0; i[1] < bdim[1]; i[1]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
					m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
					const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
					d[1] = n1 * dilation[1] - n[1];
					n[1] = n1;
					m[1] = m[1] - n[1];
					float p = biasval;
					float* wpz = wpu + n[1] * channel_size;
					float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM] + gidx * channel_size;
					for (j[0] = 0; j[0] < m[0]; j[0]++)
					{
						for (j[1] = 0; j[1] < m[1]; j[1]++)
							for (c = 0; c < channel_size; c++)
								p += wpz[j[1] * channel_size + c] * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM] + c];
						wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
						apz += astride[CCV_NNC_MAX_DIM - 1] * dilation[0];
					}
					bp[i[1] * bstride[CCV_NNC_MAX_DIM]] = p;
				}
				bp += bstride[CCV_NNC_MAX_DIM - 1];
				ap += astride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
			}
		} parallel_endfor
	} else if (a->info.format == CCV_TENSOR_FORMAT_NCHW) {
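		// This NCHW path mirrors the NHWC loop above; only the stride and index
		// arithmetic changes, since channels now precede the spatial dimensions.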
		// Make sure the weights dimension matches the network dimension
		assert(w->info.dim[2] == cmd.info.size.dim[0]);
		assert(w->info.dim[3] == cmd.info.size.dim[1]);
		const int wdim[CCV_NNC_MAX_DIM] = {
			(w->info.dim[2] - 1) * dilation[0] + 1,
			(w->info.dim[3] - 1) * dilation[1] + 1
		};
		assert(w->info.dim[1] * groups == adim[0]);
		assert(b->info.format == CCV_TENSOR_FORMAT_NCHW);
		const int channel_size = w->info.dim[1];
		const int hw = w->info.dim[2] * w->info.dim[3];
		assert(bdim[0] == cmd.info.convolution.count);
		parallel_for(idx, cmd.info.convolution.count * batch_size) {
			int c;
			const int bidx = idx / cmd.info.convolution.count;
			const int k = idx % cmd.info.convolution.count;
			const int gidx = k / group_size;
			float* ap = a->data.f32 + bidx * astride[0];
			float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
			// kernel weight for one dim.
			float* wp = w->data.f32 + k * hw * channel_size;
			float biasval = bias ? bias->data.f32[k] : 0;
			// This block runs on every iteration of the parallel loop, so it is the
			// place to set up per-iteration temporary variables.
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			for (i[0] = 0; i[0] < bdim[1]; i[0]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim + 1, n, m);
				m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
				const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
				d[0] = n0 * dilation[0] - n[0];
				n[0] = n0;
				m[0] = m[0] - n[0];
				float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM + 1];
				for (i[1] = 0; i[1] < bdim[2]; i[1]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim + 1, n, m);
					m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
					const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
					d[1] = n1 * dilation[1] - n[1];
					n[1] = n1;
					m[1] = m[1] - n[1];
					float p = biasval;
					float* wpz = wpu + n[1];
					float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM + 1] + gidx * channel_size * astride[1];
					for (j[0] = 0; j[0] < m[0]; j[0]++)
					{
						for (j[1] = 0; j[1] < m[1]; j[1]++)
							for (c = 0; c < channel_size; c++)
								p += wpz[j[1] + c * hw] * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM + 1] + c * astride[1]];
						wpz += w->info.dim[CCV_NNC_MAX_DIM + 1];
						apz += astride[CCV_NNC_MAX_DIM] * dilation[0];
					}
					bp[i[1]] = p;
				}
				bp += bstride[CCV_NNC_MAX_DIM];
				ap += astride[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
			}
		} parallel_endfor
	}
	return CCV_NNC_EXEC_SUCCESS;
}
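
/* A usage sketch (not part of this file): driving this reference kernel through
 * the public command API. TENSOR_LIST, CPU_TENSOR_NHWC and ccv_nnc_hint_auto
 * come from ccv_nnc_easy.h / ccv_nnc.h; the CMD_CONVOLUTION_FORWARD parameter
 * order (groups, filter count, kernel dims...) is assumed from the test suite.
 *
 *   ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 64, 3), 0);
 *   ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 5, 5, 3), 0);
 *   ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
 *   ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 64, 8), 0);
 *   ccv_nnc_cmd_t cmd = CMD_CONVOLUTION_FORWARD(1, 8, 5, 5, 3);
 *   ccv_nnc_hint_t hint = ccv_nnc_hint_auto(cmd.info, a->info, b->info);
 *   ccv_nnc_cmd_exec(cmd, hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
 */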
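
// Backward pass. For every output position, the incoming gradient value g both
// accumulates into the weight update (dw[k] += g * a over the receptive field)
// and, when an input gradient h is requested, scatters back into the input
// (dh += g * w): the same window walk as the forward pass with the roles of
// reads and writes exchanged.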
static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// inputs: gradient, forw prop input, [w]
	// outputs: [output gradient], weight updates, bias updates
	assert(input_size >= 2 && output_size >= 2);
	const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_t* w = output_size > 1 ? outputs[1] : 0;
	assert(CCV_IS_TENSOR_CONTIGUOUS(w));
	ccv_nnc_tensor_t* bias = output_size > 2 ? outputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients
	if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
	{
		if (w)
			memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info));
		if (bias)
			memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info));
	}
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
	assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
	const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1;
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	int gstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(g, gstride);
	const int groups = cmd.info.convolution.groups;
	if (w)
		assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	const int channel_size = w ? w->info.dim[CCV_NNC_MAX_DIM + 1] : inputs[2]->info.dim[CCV_NNC_MAX_DIM + 1];
	const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1;
	const int dilation[CCV_NNC_MAX_DIM] = {
		ccv_max(cmd.info.convolution.dilation[0], 1),
		ccv_max(cmd.info.convolution.dilation[1], 1)
	};
	// w may be omitted (output_size == 1); fall back to the forward weights in
	// inputs[2] for the kernel shape so wdim doesn't dereference a null pointer.
	const ccv_nnc_tensor_t* const wt = w ? w : inputs[2];
	const int wdim[CCV_NNC_MAX_DIM] = {
		(wt->info.dim[1] - 1) * dilation[0] + 1,
		(wt->info.dim[2] - 1) * dilation[1] + 1
	};
	if (w)
	{
		parallel_for(k, cmd.info.convolution.count) {
			int c;
			const int gidx = k / group_size;
			// kernel weight for one dim.
			float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3];
			float biasval = 0;
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			int bidx;
			for (bidx = 0; bidx < batch_size; bidx++)
			{
				const float* ap = a->data.f32 + bidx * astride[0];
				const float* gp = g->data.f32 + bidx * gstride[0] + k;
				for (i[0] = 0; i[0] < gdim[0]; i[0]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
					m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
					const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
					d[0] = n0 * dilation[0] - n[0];
					n[0] = n0;
					m[0] = m[0] - n[0];
					float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					for (i[1] = 0; i[1] < gdim[1]; i[1]++)
					{
						SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
						m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
						const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
						d[1] = n1 * dilation[1] - n[1];
						n[1] = n1;
						m[1] = m[1] - n[1];
						const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]];
						if (v == 0) // shortcut if v is zero
							continue;
						biasval += v;
						float* wpz = wpu + n[1] * channel_size;
						const float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM] + gidx * channel_size;
						for (j[0] = 0; j[0] < m[0]; j[0]++)
						{
							for (j[1] = 0; j[1] < m[1]; j[1]++)
								for (c = 0; c < channel_size; c++)
									wpz[j[1] * channel_size + c] += v * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM] + c];
							wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
							apz += astride[CCV_NNC_MAX_DIM - 1] * dilation[0];
						}
					}
					gp += gstride[CCV_NNC_MAX_DIM - 1];
					ap += astride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
				}
			}
			if (bias)
				bias->data.f32[k] = biasval;
		} parallel_endfor
	}
	// If h is available, we need to propagate the gradients back to the input.
	if (h)
	{
		assert(h);
		const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
		assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2);
		const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? h->info.dim : h->info.dim + 1;
		int hstride[CCV_NNC_MAX_DIM_ALLOC];
		ccv_nnc_tensor_view_get_stride(h, hstride);
		// reset it to 0.
		ccv_nnc_tensor_zero(h);
		w = inputs[2];
		assert(CCV_IS_TENSOR_CONTIGUOUS(w));
		int bidx;
		for (bidx = 0; bidx < batch_size; bidx++)
		{
			int k;
			for (k = 0; k < cmd.info.convolution.count; k++)
			{
				int c;
				const int gidx = k / group_size;
				float* hp = h->data.f32 + bidx * hstride[0];
				const float* gp = g->data.f32 + bidx * gstride[0] + k;
				// kernel weight for one dim.
				float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
				// This block runs on every iteration of the loop, so it is the place
				// to set up per-iteration temporary variables.
				int i[CCV_NNC_MAX_DIM];
				int n[CCV_NNC_MAX_DIM];
				int d[CCV_NNC_MAX_DIM];
				int m[CCV_NNC_MAX_DIM];
				int j[CCV_NNC_MAX_DIM];
				for (i[0] = 0; i[0] < gdim[0]; i[0]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, hdim, n, m);
					m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
					const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
					d[0] = n0 * dilation[0] - n[0];
					n[0] = n0;
					m[0] = m[0] - n[0];
					const float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					for (i[1] = 0; i[1] < gdim[1]; i[1]++)
					{
						SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, hdim, n, m);
						m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
						const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
						d[1] = n1 * dilation[1] - n[1];
						n[1] = n1;
						m[1] = m[1] - n[1];
						const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]];
						if (v == 0) // shortcut if v is zero
							continue;
						const float* wpz = wpu + n[1] * channel_size;
						float* hpz = hp + d[0] * hstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * hstride[CCV_NNC_MAX_DIM] + gidx * channel_size;
						for (j[0] = 0; j[0] < m[0]; j[0]++)
						{
							for (j[1] = 0; j[1] < m[1]; j[1]++)
								for (c = 0; c < channel_size; c++)
									hpz[j[1] * dilation[1] * hstride[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c];
							wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
							hpz += hstride[CCV_NNC_MAX_DIM - 1] * dilation[0];
						}
					}
					gp += gstride[CCV_NNC_MAX_DIM - 1];
					hp += hstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
				}
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}

REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_conv_forw;
}

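// Note: the backward implementation above only walks NHWC layouts, so unlike
// the forward registration it claims NHWC support only.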
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->algorithms = 1;
	registry->exec = _ccv_nnc_conv_back;
}