/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_add_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | void _ccv_nnc_add_forw_cpu_ref(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
17 | 28.9k | { |
18 | 28.9k | if (b == 0) |
19 | 17.0k | { |
20 | | // It cannot be set otherwise we have trouble. |
21 | 17.0k | assert(q == 0); |
22 | 17.0k | if (p == 1) |
23 | 11.8k | { |
24 | 11.8k | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c); |
25 | 11.8k | return; |
26 | 11.8k | } else if (p == 0) { |
27 | 0 | ccv_nnc_tensor_zero(c); |
28 | 0 | return; |
29 | 0 | } |
30 | | // Assuming this is float 32. |
31 | 5.17k | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
32 | 5.17k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
33 | 5.17k | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
34 | 5.17k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
35 | 5.17k | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
36 | 5.17k | ccv_nnc_tensor_view_get_dim(a, dim); |
37 | 5.17k | assert(ccv_nnc_tensor_view_check_dim(c, dim)); |
38 | 5.17k | int x; |
39 | 5.17k | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c)) |
40 | 5.17k | { |
41 | | // Super optimal case, just do one for-loop for sum. |
42 | 5.17k | const int tensor_count = ccv_nnc_tensor_count(a->info); |
43 | 11.8k | for (x = 0; x < tensor_count; x++) |
44 | 6.67k | c->data.f32[x] = p * a->data.f32[x]; |
45 | 5.17k | return; |
46 | 5.17k | } |
47 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
48 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
49 | 0 | ccv_nnc_tensor_view_get_stride(c, cstride); |
50 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
51 | 0 | float* const ap = a->data.f32; |
52 | 0 | float* const cp = c->data.f32; |
53 | 0 | const int count = dim[2] * dim[3]; |
54 | 0 | if (astride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && cstride[3] == 1) |
55 | 0 | { |
56 | | // Special casing if the ainc[3] is the same as dim[3] |
57 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
58 | 0 | { |
59 | 0 | float* ap0 = ap + i[0] * astride[0]; |
60 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
61 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
62 | 0 | { |
63 | 0 | for (x = 0; x < count; x++) |
64 | 0 | cp0[x] = p * ap0[x]; |
65 | 0 | ap0 += astride[1]; |
66 | 0 | cp0 += cstride[1]; |
67 | 0 | } |
68 | 0 | } |
69 | 0 | return; |
70 | 0 | } |
71 | | // Non-optimal case, need to do skip copy. |
72 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
73 | 0 | { |
74 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
75 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
76 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
77 | 0 | { |
78 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
79 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
80 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
81 | 0 | { |
82 | 0 | for (x = 0; x < dim[3]; x++) |
83 | 0 | cp1[x * cstride[3]] = p * ap1[x * astride[3]]; |
84 | 0 | ap1 += astride[2]; |
85 | 0 | cp1 += cstride[2]; |
86 | 0 | } |
87 | 0 | } |
88 | 0 | } |
89 | 0 | return; |
90 | 0 | } |
91 | 11.9k | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
92 | 11.9k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
93 | 11.9k | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
94 | 11.9k | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
95 | 11.9k | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
96 | 11.9k | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
97 | 11.9k | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
98 | 11.9k | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
99 | 11.9k | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
100 | 11.9k | if (p == 1 && q == 1 && a_check_dim && b_check_dim) |
101 | 7 | { |
102 | 7 | _ccv_nnc_ewsum_forw_cpu_ref_f32((ccv_nnc_tensor_view_t*[]){ |
103 | 7 | a, b |
104 | 7 | }, 2, &c, 1); |
105 | 7 | return; |
106 | 11.9k | } else if (p == 1 && q == 0 && a_check_dim) { |
107 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c); |
108 | 0 | return; |
109 | 11.9k | } else if (p == 0 && q == 1 && b_check_dim) { |
110 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f32(b, c); |
111 | 0 | return; |
112 | 11.9k | } else if (p == 0 && q == 0) { |
113 | 0 | ccv_nnc_tensor_zero(c); |
114 | 0 | return; |
115 | 0 | } |
116 | | // Assuming this is float 32. |
117 | 11.9k | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
118 | 11.9k | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
119 | 11.9k | ccv_nnc_tensor_view_get_dim(a, adim); |
120 | 11.9k | ccv_nnc_tensor_view_get_dim(b, bdim); |
121 | 11.9k | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
122 | 11.9k | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
123 | 11.9k | int x; |
124 | 11.9k | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim) |
125 | 11.7k | { |
126 | 11.7k | const int tensor_count = ccv_nnc_tensor_count(a->info); |
127 | | // Super optimal case, just do one for-loop for sum. |
128 | 23.9k | for (x = 0; x < tensor_count; x++) |
129 | 12.2k | c->data.f32[x] = p * a->data.f32[x] + q * b->data.f32[x]; |
130 | 11.7k | return; |
131 | 11.7k | } |
132 | 120 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
133 | 120 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
134 | 120 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
135 | 120 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
136 | 120 | ccv_nnc_tensor_view_get_stride(a, astride); |
137 | 120 | ccv_nnc_tensor_view_get_stride(b, bstride); |
138 | 120 | ccv_nnc_tensor_view_get_stride(c, cstride); |
139 | 120 | int i[CCV_NNC_MAX_DIM + 2]; |
140 | 120 | float* const ap = a->data.f32; |
141 | 120 | float* const bp = b->data.f32; |
142 | 120 | float* const cp = c->data.f32; |
143 | 120 | const int count = cdim[2] * cdim[3]; |
144 | 120 | if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2] && astride[3] == 1 && bstride[3] == 1) |
145 | 0 | { |
146 | | // Special casing if the ainc[3] is the same as dim[3] |
147 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
148 | 0 | { |
149 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
150 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
151 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
152 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
153 | 0 | { |
154 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
155 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
156 | 0 | for (x = 0; x < count; x++) |
157 | 0 | cp0[x] = p * ap1[x] + q * bp1[x]; |
158 | 0 | cp0 += cstride[1]; |
159 | 0 | } |
160 | 0 | } |
161 | 0 | return; |
162 | 0 | } |
163 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
164 | 354 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
165 | 234 | { |
166 | 234 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
167 | 234 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
168 | 234 | float* const cp0 = cp + i[0] * cstride[0]; |
169 | 894 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
170 | 660 | { |
171 | 660 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
172 | 660 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
173 | 660 | float* cp1 = cp0 + i[1] * cstride[1]; |
174 | 3.28k | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
175 | 2.62k | { |
176 | 2.62k | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
177 | 2.62k | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
178 | 2.62k | if (adim[3] == 1) |
179 | 412 | for (x = 0; x < cdim[3]; x++) |
180 | 208 | cp1[x] = p * ap2[0] + q * bp2[x * bstride[3]]; |
181 | 2.42k | else if (bdim[3] == 1) |
182 | 8.55k | for (x = 0; x < cdim[3]; x++) |
183 | 7.78k | cp1[x] = p * ap2[x * astride[3]] + q * bp2[0]; |
184 | 1.65k | else |
185 | 8.57k | for (x = 0; x < cdim[3]; x++) |
186 | 6.92k | cp1[x] = p * ap2[x * astride[3]] + q * bp2[x * bstride[3]]; |
187 | 2.62k | cp1 += cstride[2]; |
188 | 2.62k | } |
189 | 660 | } |
190 | 234 | } |
191 | 120 | } |
192 | | |
193 | | static int _ccv_nnc_add_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
194 | 11.8k | { |
195 | 11.8k | assert(input_size == 2); |
196 | 11.8k | _ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[0], cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]); |
197 | 11.8k | return CCV_NNC_EXEC_SUCCESS; |
198 | 11.8k | } |
199 | | |
200 | | static int _ccv_nnc_add_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
201 | 11.8k | { |
202 | 11.8k | if (inputs[0] == 0) |
203 | 0 | { |
204 | 0 | if (outputs[0]) |
205 | 0 | _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]); |
206 | 0 | if (output_size > 1 && outputs[1]) |
207 | 0 | _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[1], cmd.info.blas.a[1]); |
208 | 0 | return CCV_NNC_EXEC_SUCCESS; |
209 | 0 | } |
210 | 11.8k | int gdim[CCV_NNC_MAX_DIM_ALLOC]; |
211 | 11.8k | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
212 | 11.8k | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
213 | 11.8k | ccv_nnc_tensor_view_get_dim(g, gdim); |
214 | 11.8k | ccv_nnc_tensor_view_get_stride(g, gstride); |
215 | 11.8k | if (outputs[0]) |
216 | 11.8k | { |
217 | 11.8k | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
218 | 11.8k | if (ccv_nnc_tensor_view_check_dim(a, gdim)) |
219 | 11.8k | _ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
220 | 1 | else { |
221 | 1 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
222 | 1 | const float p = cmd.info.blas.a[0]; |
223 | 1 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
224 | 1 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
225 | 1 | ccv_nnc_tensor_view_get_dim(a, adim); |
226 | 1 | ccv_nnc_tensor_view_get_stride(a, astride); |
227 | 1 | int i[CCV_NNC_MAX_DIM + 2]; |
228 | 1 | int x; |
229 | 1 | float* const ap = a->data.f32; |
230 | 1 | float* const gp = g->data.f32; |
231 | | // zeroing out so that we can accumulate. |
232 | 1 | ccv_nnc_tensor_zero(a); |
233 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
234 | 2 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
235 | 1 | { |
236 | 1 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
237 | 1 | float* const gp0 = gp + i[0] * gstride[0]; |
238 | 2 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
239 | 1 | { |
240 | 1 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
241 | 1 | float* gp1 = gp0 + i[1] * gstride[1]; |
242 | 5 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
243 | 4 | { |
244 | 4 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
245 | 4 | if (adim[3] == 1) |
246 | 12 | for (x = 0; x < gdim[3]; x++) |
247 | 8 | ap2[0] += p * gp1[x]; |
248 | 0 | else |
249 | 0 | for (x = 0; x < gdim[3]; x++) |
250 | 0 | ap2[x] += p * gp1[x]; |
251 | 4 | gp1 += gstride[2]; |
252 | 4 | } |
253 | 1 | } |
254 | 1 | } |
255 | 1 | } |
256 | 11.8k | } |
257 | 11.8k | if (output_size > 1 && outputs[1]) |
258 | 5.18k | { |
259 | 5.18k | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
260 | 5.18k | if (ccv_nnc_tensor_view_check_dim(a, gdim)) |
261 | 5.17k | _ccv_nnc_add_forw_cpu_ref(cmd.info.blas.a[1], 0, (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[1]); |
262 | 10 | else { |
263 | 10 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
264 | 10 | const float p = cmd.info.blas.a[1]; |
265 | 10 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
266 | 10 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
267 | 10 | ccv_nnc_tensor_view_get_dim(a, adim); |
268 | 10 | ccv_nnc_tensor_view_get_stride(a, astride); |
269 | 10 | int i[CCV_NNC_MAX_DIM + 2]; |
270 | 10 | int x; |
271 | 10 | float* const ap = a->data.f32; |
272 | 10 | float* const gp = g->data.f32; |
273 | | // zeroing out so that we can accumulate. |
274 | 10 | ccv_nnc_tensor_zero(a); |
275 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
276 | 74 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
277 | 64 | { |
278 | 64 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
279 | 64 | float* const gp0 = gp + i[0] * gstride[0]; |
280 | 330 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
281 | 266 | { |
282 | 266 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
283 | 266 | float* gp1 = gp0 + i[1] * gstride[1]; |
284 | 1.41k | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
285 | 1.15k | { |
286 | 1.15k | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
287 | 1.15k | if (adim[3] == 1) |
288 | 4.32k | for (x = 0; x < gdim[3]; x++) |
289 | 3.94k | ap2[0] += p * gp1[x]; |
290 | 768 | else |
291 | 3.63k | for (x = 0; x < gdim[3]; x++) |
292 | 2.86k | ap2[x] += p * gp1[x]; |
293 | 1.15k | gp1 += gstride[2]; |
294 | 1.15k | } |
295 | 266 | } |
296 | 64 | } |
297 | 10 | } |
298 | 5.18k | } |
299 | 11.8k | return CCV_NNC_EXEC_SUCCESS; |
300 | 11.8k | } |
301 | | |
302 | | REGISTER_COMMAND_BACKEND(CCV_NNC_ADD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
303 | 1 | { |
304 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
305 | 1 | registry->tensor_datatypes = CCV_32F; |
306 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
307 | 1 | registry->algorithms = 1; |
308 | 1 | registry->exec = _ccv_nnc_add_forw; |
309 | 1 | } |
310 | | |
311 | | REGISTER_COMMAND_BACKEND(CCV_NNC_ADD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
312 | 1 | { |
313 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
314 | 1 | registry->tensor_datatypes = CCV_32F; |
315 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
316 | 1 | registry->algorithms = 1; |
317 | 1 | registry->exec = _ccv_nnc_add_back; |
318 | 1 | } |
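
Notes on the kernels above.

The forward kernel _ccv_nnc_add_forw_cpu_ref computes c = p * a + q * b elementwise, broadcasting a or b along any dimension whose size is 1, and falls back to strided loops whenever an operand is a tensor view. The standalone C sketch below mirrors only the broadcasting arithmetic on plain float arrays; the function name and the fixed dense 4-D shapes are hypothetical and it does not use the ccv_nnc tensor types.

#include <stdio.h>

/* Hypothetical sketch of the broadcasting arithmetic in _ccv_nnc_add_forw_cpu_ref:
 * c = p * a + q * b over dense 4-D arrays, where any dimension of size 1 in a or b
 * is broadcast against c. */
static void add_broadcast_4d(const float p, const float* const a, const int adim[4], const float q, const float* const b, const int bdim[4], float* const c, const int cdim[4])
{
	int i0, i1, i2, x;
	for (i0 = 0; i0 < cdim[0]; i0++)
		for (i1 = 0; i1 < cdim[1]; i1++)
			for (i2 = 0; i2 < cdim[2]; i2++)
				for (x = 0; x < cdim[3]; x++)
				{
					/* Clamp an index to 0 wherever the input dimension is 1 (broadcast). */
					const int ai = (((adim[0] == 1 ? 0 : i0) * adim[1] + (adim[1] == 1 ? 0 : i1)) * adim[2] + (adim[2] == 1 ? 0 : i2)) * adim[3] + (adim[3] == 1 ? 0 : x);
					const int bi = (((bdim[0] == 1 ? 0 : i0) * bdim[1] + (bdim[1] == 1 ? 0 : i1)) * bdim[2] + (bdim[2] == 1 ? 0 : i2)) * bdim[3] + (bdim[3] == 1 ? 0 : x);
					const int ci = ((i0 * cdim[1] + i1) * cdim[2] + i2) * cdim[3] + x;
					c[ci] = p * a[ai] + q * b[bi];
				}
}

int main(void)
{
	/* a is 1x1x2x3, b is 1x1x1x3: b broadcasts along the row dimension. */
	const float a[6] = { 1, 2, 3, 4, 5, 6 };
	const float b[3] = { 10, 20, 30 };
	float c[6];
	const int adim[4] = { 1, 1, 2, 3 }, bdim[4] = { 1, 1, 1, 3 }, cdim[4] = { 1, 1, 2, 3 };
	int x;
	add_broadcast_4d(1, a, adim, 1, b, bdim, c, cdim);
	for (x = 0; x < 6; x++)
		printf("%g ", c[x]); /* prints 11 22 33 14 25 36 */
	printf("\n");
	return 0;
}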
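
The backward kernel _ccv_nnc_add_back scales the incoming gradient g by the same coefficients, giving da = p * g and db = q * g; when an output tensor was broadcast in the forward pass, the kernel zeroes it first and then sums the scaled gradient over the broadcast dimensions. Below is a minimal sketch of that reduction for a single broadcast (last) dimension, with hypothetical names and plain arrays rather than ccv_nnc tensors.

/* Hypothetical sketch of the gradient reduction in _ccv_nnc_add_back when an
 * output was broadcast along the last dimension: zero the output, then
 * accumulate p * g over that dimension (g is rows x cols, da is rows x 1). */
static void add_back_reduce_last(const float p, const float* const g, const int rows, const int cols, float* const da)
{
	int i, x;
	for (i = 0; i < rows; i++)
		da[i] = 0;
	for (i = 0; i < rows; i++)
		for (x = 0; x < cols; x++)
			da[i] += p * g[i * cols + x];
}

Applied to g = [[1, 2], [3, 4]] with p = 2, this yields da = [6, 14], matching the zero-then-accumulate loops in the listing at file lines 232-254 and 274-296.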
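
Both kernels are reached through the REGISTER_COMMAND_BACKEND entries at the bottom of the file. A typical invocation goes through ccv_nnc_cmd_exec; the snippet below is a sketch in the style of the ccv_nnc tests, and the convenience macros (CMD_ADD_FORWARD, CPU_TENSOR_NHWC, TENSOR_LIST) are assumed to be the ones declared in nnc/ccv_nnc_easy.h rather than verified against a specific release.

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
	ccv_nnc_init();
	/* Compute c = 0.5 * a + 0.2 * b on 2x3 float tensors through the CPU
	 * reference backend registered above (macro spellings assumed). */
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
	int x;
	for (x = 0; x < 6; x++)
		a->data.f32[x] = x, b->data.f32[x] = 1;
	ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 0.2), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	/* c->data.f32 now holds 0.2, 0.7, 1.2, 1.7, 2.2, 2.7. */
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}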