/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_mul_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | void _ccv_nnc_mul_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
17 | 10.6k | { |
18 | 10.6k | if (b == 0) |
19 | 147 | { |
20 | 147 | if (p == 1) |
21 | 0 | { |
22 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c); |
23 | 0 | return; |
24 | 147 | } else if (p == 0) { |
25 | 0 | ccv_nnc_tensor_zero(c); |
26 | 0 | return; |
27 | 0 | } |
28 | | // Assuming this is float 32. |
29 | 147 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
30 | 147 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
31 | 147 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
32 | 147 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
33 | 147 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
34 | 147 | ccv_nnc_tensor_view_get_dim(a, dim); |
35 | 147 | assert(ccv_nnc_tensor_view_check_dim(c, dim)); |
36 | 147 | int x; |
37 | 147 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c)) |
38 | 145 | { |
39 | | // Super optimal case, just do one for-loop for sum. |
40 | 145 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
41 | 8.39M | for (x = 0; x < tensor_count; x++8.39M ) |
42 | 8.39M | c->data.f32[x] = p * a->data.f32[x]; |
43 | 145 | return; |
44 | 145 | } |
45 | 147 | assert(CCV_NNC_MAX_DIM == 2)2 ; // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
46 | 2 | ccv_nnc_tensor_view_get_stride(a, astride); |
47 | 2 | ccv_nnc_tensor_view_get_stride(c, cstride); |
48 | 2 | int i[CCV_NNC_MAX_DIM + 2]; |
49 | 2 | float* const ap = a->data.f32; |
50 | 2 | float* const cp = c->data.f32; |
51 | 2 | const int count = dim[2] * dim[3]; |
52 | 2 | if (astride[2] == dim[3] && cstride[2] == dim[3]) |
53 | 2 | { |
54 | | // Special casing if the ainc[3] is the same as dim[3] |
55 | 4 | for (i[0] = 0; i[0] < dim[0]; i[0]++2 ) |
56 | 2 | { |
57 | 2 | float* ap0 = ap + i[0] * astride[0]; |
58 | 2 | float* cp0 = cp + i[0] * cstride[0]; |
59 | 4 | for (i[1] = 0; i[1] < dim[1]; i[1]++2 ) |
60 | 2 | { |
61 | 4 | for (x = 0; x < count; x++2 ) |
62 | 2 | cp0[x] = p * ap0[x]; |
63 | 2 | ap0 += astride[1]; |
64 | 2 | cp0 += cstride[1]; |
65 | 2 | } |
66 | 2 | } |
67 | 2 | return; |
68 | 2 | } |
69 | | // Non-optimal case, need to do skip copy. |
70 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
71 | 0 | { |
72 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
73 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
74 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
75 | 0 | { |
76 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
77 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
78 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
79 | 0 | { |
80 | 0 | for (x = 0; x < dim[3]; x++) |
81 | 0 | cp1[x] = p * ap1[x]; |
82 | 0 | ap1 += astride[2]; |
83 | 0 | cp1 += cstride[2]; |
84 | 0 | } |
85 | 0 | } |
86 | 0 | } |
87 | 0 | return; |
88 | 2 | } |
89 | 10.4k | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
90 | 10.4k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
91 | 10.4k | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
92 | 10.4k | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
93 | 10.4k | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
94 | 10.4k | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
95 | 10.4k | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
96 | 10.4k | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
97 | 10.4k | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
98 | 10.4k | if (p == 1 && a_check_dim10.4k && b_check_dim10.4k ) |
99 | 10.4k | { |
100 | 10.4k | _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){ |
101 | 10.4k | a, b |
102 | 10.4k | }, 2, &c, 1); |
103 | 10.4k | return; |
104 | 10.4k | } else if (33 p == 033 ) { |
105 | 0 | ccv_nnc_tensor_zero(c); |
106 | 0 | return; |
107 | 0 | } |
108 | | // Assuming this is float 32. |
109 | 33 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
110 | 33 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
111 | 33 | ccv_nnc_tensor_view_get_dim(a, adim); |
112 | 33 | ccv_nnc_tensor_view_get_dim(b, bdim); |
113 | 33 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
114 | 33 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
115 | 33 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
116 | 33 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
117 | 33 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
118 | 33 | int x; |
119 | 33 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim27 ) |
120 | 9 | { |
121 | 9 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
122 | | // Super optimal case, just do one for-loop for sum. |
123 | 99 | for (x = 0; x < tensor_count; x++90 ) |
124 | 90 | c->data.f32[x] = p * a->data.f32[x] * b->data.f32[x]; |
125 | 9 | return; |
126 | 9 | } |
127 | 33 | assert(CCV_NNC_MAX_DIM == 2)24 ; // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
128 | 24 | ccv_nnc_tensor_view_get_stride(a, astride); |
129 | 24 | ccv_nnc_tensor_view_get_stride(b, bstride); |
130 | 24 | ccv_nnc_tensor_view_get_stride(c, cstride); |
131 | 24 | int i[CCV_NNC_MAX_DIM + 2]; |
132 | 24 | float* const ap = a->data.f32; |
133 | 24 | float* const bp = b->data.f32; |
134 | 24 | float* const cp = c->data.f32; |
135 | 24 | const int count = cdim[2] * cdim[3]; |
136 | 24 | if (astride[2] == cdim[3] && bstride[2] == cdim[3]19 && cstride[2] == cdim[3]8 && adim[2] == cdim[2]8 && bdim[2] == cdim[2]7 ) |
137 | 0 | { |
138 | | // Special casing if the ainc[3] is the same as dim[3] |
139 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
140 | 0 | { |
141 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
142 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
143 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
144 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
145 | 0 | { |
146 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
147 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
148 | 0 | for (x = 0; x < count; x++) |
149 | 0 | cp0[x] = p * ap1[x] * bp1[x]; |
150 | 0 | cp0 += cstride[1]; |
151 | 0 | } |
152 | 0 | } |
153 | 0 | return; |
154 | 0 | } |
155 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
156 | 154 | for (i[0] = 0; 24 i[0] < cdim[0]; i[0]++130 ) |
157 | 130 | { |
158 | 130 | float* const ap0 = adim[0] == 1 ? ap8 : ap + i[0] * astride[0]122 ; |
159 | 130 | float* const bp0 = bdim[0] == 1 ? bp18 : bp + i[0] * bstride[0]112 ; |
160 | 130 | float* const cp0 = cp + i[0] * cstride[0]; |
161 | 622 | for (i[1] = 0; i[1] < cdim[1]; i[1]++492 ) |
162 | 492 | { |
163 | 492 | float* const ap1 = adim[1] == 1 ? ap08 : ap0 + i[1] * astride[1]484 ; |
164 | 492 | float* const bp1 = bdim[1] == 1 ? bp0300 : bp0 + i[1] * bstride[1]192 ; |
165 | 492 | float* cp1 = cp0 + i[1] * cstride[1]; |
166 | 2.43k | for (i[2] = 0; i[2] < cdim[2]; i[2]++1.94k ) |
167 | 1.94k | { |
168 | 1.94k | float* const ap2 = adim[2] == 1 ? ap17 : ap1 + i[2] * astride[2]1.93k ; |
169 | 1.94k | float* const bp2 = bdim[2] == 1 ? bp11.68k : bp1 + i[2] * bstride[2]258 ; |
170 | 1.94k | if (adim[3] == 1) |
171 | 29 | for (x = 0; 10 x < cdim[3]; x++19 ) |
172 | 19 | cp1[x] = p * ap2[0] * bp2[x]; |
173 | 1.93k | else if (bdim[3] == 1) |
174 | 14.1k | for (x = 0; 1.28k x < cdim[3]; x++12.9k ) |
175 | 12.9k | cp1[x] = p * ap2[x] * bp2[0]; |
176 | 650 | else |
177 | 4.57k | for (x = 0; 650 x < cdim[3]; x++3.92k ) |
178 | 3.92k | cp1[x] = p * ap2[x] * bp2[x]; |
179 | 1.94k | cp1 += cstride[2]; |
180 | 1.94k | } |
181 | 492 | } |
182 | 130 | } |
183 | 24 | } |
184 | | |
185 | | static int _ccv_nnc_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
186 | 4.25k | { |
187 | 4.25k | assert(input_size == 2); |
188 | 4.25k | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]); |
189 | 4.25k | return CCV_NNC_EXEC_SUCCESS; |
190 | 4.25k | } |
191 | | |
192 | | static int _ccv_nnc_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
193 | 4.12k | { |
194 | 4.12k | int gdim[CCV_NNC_MAX_DIM_ALLOC]; |
195 | 4.12k | int no_broadcasting = 1; |
196 | 4.12k | if (outputs[0]) |
197 | 2.12k | { |
198 | 2.12k | assert(input_size >= 3 && inputs[2]); |
199 | 2.12k | ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim); |
200 | 2.12k | ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim); |
201 | 2.12k | no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim)2.11k ); |
202 | 2.12k | } |
203 | 4.12k | if (no_broadcasting && output_size > 14.11k && outputs[1]4.11k ) |
204 | 4.11k | { |
205 | 4.11k | assert(inputs[1]); |
206 | 4.11k | ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim); |
207 | 4.11k | ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim); |
208 | 4.11k | no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim)4.11k ); |
209 | 4.11k | } |
210 | 4.12k | if (no_broadcasting) |
211 | 4.11k | { |
212 | 4.11k | if (outputs[0]) |
213 | 2.10k | { |
214 | 2.10k | if (inputs[0] == 0) |
215 | 0 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[2], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
216 | 2.10k | else |
217 | 2.10k | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]); |
218 | 2.10k | } |
219 | 4.11k | if (output_size > 1 && outputs[1]) |
220 | 4.11k | { |
221 | 4.11k | if (inputs[0] == 0) |
222 | 0 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[1], 0, (ccv_nnc_tensor_view_t*)outputs[1]); |
223 | 4.11k | else |
224 | 4.11k | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]); |
225 | 4.11k | } |
226 | 4.11k | return CCV_NNC_EXEC_SUCCESS; |
227 | 4.11k | } |
228 | 14 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
229 | 14 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
230 | 14 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
231 | 14 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
232 | 14 | int i[CCV_NNC_MAX_DIM + 2]; |
233 | 14 | int x; |
234 | 14 | const float p = cmd.info.blas.a[0]; |
235 | | // Now the case we need broadcasting. |
236 | 14 | if (inputs[0] == 0) |
237 | 3 | { |
238 | 3 | if (outputs[0]) |
239 | 3 | { |
240 | 3 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
241 | 3 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
242 | 3 | ccv_nnc_tensor_view_get_dim(a, adim); |
243 | 3 | ccv_nnc_tensor_view_get_dim(b, bdim); |
244 | 3 | ccv_nnc_tensor_view_get_stride(a, astride); |
245 | 3 | ccv_nnc_tensor_view_get_stride(b, bstride); |
246 | 3 | ccv_nnc_tensor_zero(a); |
247 | 3 | float* const ap = a->data.f32; |
248 | 3 | float* const bp = b->data.f32; |
249 | 6 | for (i[0] = 0; i[0] < gdim[0]; i[0]++3 ) |
250 | 3 | { |
251 | 3 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]0 ; |
252 | 3 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]0 ; |
253 | 6 | for (i[1] = 0; i[1] < gdim[1]; i[1]++3 ) |
254 | 3 | { |
255 | 3 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]0 ; |
256 | 3 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]0 ; |
257 | 11 | for (i[2] = 0; i[2] < gdim[2]; i[2]++8 ) |
258 | 8 | { |
259 | 8 | float* const ap2 = adim[2] == 1 ? ap12 : ap1 + i[2] * astride[2]6 ; |
260 | 8 | float* const bp2 = bdim[2] == 1 ? bp16 : bp1 + i[2] * bstride[2]2 ; |
261 | 8 | if (adim[3] == 1) |
262 | 12 | for (x = 0; 4 x < gdim[3]; x++8 ) |
263 | 8 | ap2[0] += p * bp2[x]; |
264 | 4 | else if (bdim[3] == 1) |
265 | 0 | for (x = 0; x < gdim[3]; x++) |
266 | 0 | ap2[x] += p * bp2[0]; |
267 | 4 | else |
268 | 16 | for (x = 0; 4 x < gdim[3]; x++12 ) |
269 | 12 | ap2[x] += p * bp2[x]; |
270 | 8 | } |
271 | 3 | } |
272 | 3 | } |
273 | 3 | } |
274 | 3 | if (output_size > 1 && outputs[1]) |
275 | 3 | { |
276 | 3 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
277 | 3 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
278 | 3 | ccv_nnc_tensor_view_get_dim(a, adim); |
279 | 3 | ccv_nnc_tensor_view_get_dim(b, bdim); |
280 | 3 | ccv_nnc_tensor_view_get_stride(a, astride); |
281 | 3 | ccv_nnc_tensor_view_get_stride(b, bstride); |
282 | 3 | ccv_nnc_tensor_zero(a); |
283 | 3 | float* const ap = a->data.f32; |
284 | 3 | float* const bp = b->data.f32; |
285 | 6 | for (i[0] = 0; i[0] < gdim[0]; i[0]++3 ) |
286 | 3 | { |
287 | 3 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]0 ; |
288 | 3 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]0 ; |
289 | 6 | for (i[1] = 0; i[1] < gdim[1]; i[1]++3 ) |
290 | 3 | { |
291 | 3 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]0 ; |
292 | 3 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]0 ; |
293 | 11 | for (i[2] = 0; i[2] < gdim[2]; i[2]++8 ) |
294 | 8 | { |
295 | 8 | float* const ap2 = adim[2] == 1 ? ap16 : ap1 + i[2] * astride[2]2 ; |
296 | 8 | float* const bp2 = bdim[2] == 1 ? bp12 : bp1 + i[2] * bstride[2]6 ; |
297 | 8 | if (adim[3] == 1) |
298 | 0 | for (x = 0; x < gdim[3]; x++) |
299 | 0 | ap2[0] += p * bp2[x]; |
300 | 8 | else if (bdim[3] == 1) |
301 | 12 | for (x = 0; 4 x < gdim[3]; x++8 ) |
302 | 8 | ap2[x] += p * bp2[0]; |
303 | 4 | else |
304 | 16 | for (x = 0; 4 x < gdim[3]; x++12 ) |
305 | 12 | ap2[x] += p * bp2[x]; |
306 | 8 | } |
307 | 3 | } |
308 | 3 | } |
309 | 3 | } |
310 | 3 | return CCV_NNC_EXEC_SUCCESS; |
311 | 3 | } |
312 | 11 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
313 | 11 | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
314 | 11 | ccv_nnc_tensor_view_get_dim(g, gdim); |
315 | 11 | ccv_nnc_tensor_view_get_stride(g, gstride); |
316 | 11 | if (outputs[0]) |
317 | 10 | { |
318 | 10 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
319 | 10 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
320 | 10 | ccv_nnc_tensor_view_get_dim(a, adim); |
321 | 10 | ccv_nnc_tensor_view_get_dim(b, bdim); |
322 | 10 | ccv_nnc_tensor_view_get_stride(a, astride); |
323 | 10 | ccv_nnc_tensor_view_get_stride(b, bstride); |
324 | 10 | ccv_nnc_tensor_zero(a); |
325 | 10 | float* const ap = a->data.f32; |
326 | 10 | float* const bp = b->data.f32; |
327 | 10 | float* const gp = g->data.f32; |
328 | 70 | for (i[0] = 0; i[0] < gdim[0]; i[0]++60 ) |
329 | 60 | { |
330 | 60 | float* const ap0 = adim[0] == 1 ? ap2 : ap + i[0] * astride[0]58 ; |
331 | 60 | float* const bp0 = bdim[0] == 1 ? bp4 : bp + i[0] * bstride[0]56 ; |
332 | 60 | float* const gp0 = gp + i[0] * gstride[0]; |
333 | 290 | for (i[1] = 0; i[1] < gdim[1]; i[1]++230 ) |
334 | 230 | { |
335 | 230 | float* const ap1 = adim[1] == 1 ? ap02 : ap0 + i[1] * astride[1]228 ; |
336 | 230 | float* const bp1 = bdim[1] == 1 ? bp0134 : bp0 + i[1] * bstride[1]96 ; |
337 | 230 | float* gp1 = gp0 + i[1] * gstride[1]; |
338 | 1.13k | for (i[2] = 0; i[2] < gdim[2]; i[2]++909 ) |
339 | 909 | { |
340 | 909 | float* const ap2 = adim[2] == 1 ? ap11 : ap1 + i[2] * astride[2]908 ; |
341 | 909 | float* const bp2 = bdim[2] == 1 ? bp1781 : bp1 + i[2] * bstride[2]128 ; |
342 | 909 | if (adim[3] == 1) |
343 | 12 | for (x = 0; 4 x < gdim[3]; x++8 ) |
344 | 8 | ap2[0] += p * gp1[x] * bp2[x]; |
345 | 905 | else if (bdim[3] == 1) |
346 | 7.14k | for (x = 0; 641 x < gdim[3]; x++6.50k ) |
347 | 6.50k | ap2[x] += p * gp1[x] * bp2[0]; |
348 | 264 | else |
349 | 1.62k | for (x = 0; 264 x < gdim[3]; x++1.36k ) |
350 | 1.36k | ap2[x] += p * gp1[x] * bp2[x]; |
351 | 909 | gp1 += gstride[2]; |
352 | 909 | } |
353 | 230 | } |
354 | 60 | } |
355 | 10 | } |
356 | 11 | if (output_size > 1 && outputs[1]) |
357 | 11 | { |
358 | 11 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
359 | 11 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
360 | 11 | ccv_nnc_tensor_view_get_dim(a, adim); |
361 | 11 | ccv_nnc_tensor_view_get_dim(b, bdim); |
362 | 11 | ccv_nnc_tensor_view_get_stride(a, astride); |
363 | 11 | ccv_nnc_tensor_view_get_stride(b, bstride); |
364 | 11 | ccv_nnc_tensor_zero(a); |
365 | 11 | float* const ap = a->data.f32; |
366 | 11 | float* const bp = b->data.f32; |
367 | 11 | float* const gp = g->data.f32; |
368 | 72 | for (i[0] = 0; i[0] < gdim[0]; i[0]++61 ) |
369 | 61 | { |
370 | 61 | float* const ap0 = adim[0] == 1 ? ap5 : ap + i[0] * astride[0]56 ; |
371 | 61 | float* const bp0 = bdim[0] == 1 ? bp3 : bp + i[0] * bstride[0]58 ; |
372 | 61 | float* const gp0 = gp + i[0] * gstride[0]; |
373 | 292 | for (i[1] = 0; i[1] < gdim[1]; i[1]++231 ) |
374 | 231 | { |
375 | 231 | float* const ap1 = adim[1] == 1 ? ap0135 : ap0 + i[1] * astride[1]96 ; |
376 | 231 | float* const bp1 = bdim[1] == 1 ? bp03 : bp0 + i[1] * bstride[1]228 ; |
377 | 231 | float* gp1 = gp0 + i[1] * gstride[1]; |
378 | 1.14k | for (i[2] = 0; i[2] < gdim[2]; i[2]++911 ) |
379 | 911 | { |
380 | 911 | float* const ap2 = adim[2] == 1 ? ap1781 : ap1 + i[2] * astride[2]130 ; |
381 | 911 | float* const bp2 = bdim[2] == 1 ? bp13 : bp1 + i[2] * bstride[2]908 ; |
382 | 911 | if (adim[3] == 1) |
383 | 7.14k | for (x = 0; 643 x < gdim[3]; x++6.50k ) |
384 | 6.50k | ap2[0] += p * gp1[x] * bp2[x]; |
385 | 268 | else if (bdim[3] == 1) |
386 | 12 | for (x = 0; 4 x < gdim[3]; x++8 ) |
387 | 8 | ap2[x] += p * gp1[x] * bp2[0]; |
388 | 264 | else |
389 | 1.62k | for (x = 0; 264 x < gdim[3]; x++1.36k ) |
390 | 1.36k | ap2[x] += p * gp1[x] * bp2[x]; |
391 | 911 | gp1 += gstride[2]; |
392 | 911 | } |
393 | 231 | } |
394 | 61 | } |
395 | 11 | } |
396 | 11 | return CCV_NNC_EXEC_SUCCESS; |
397 | 14 | } |
398 | | |
399 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
400 | 1 | { |
401 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
402 | 1 | registry->tensor_datatypes = CCV_32F; |
403 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
404 | 1 | registry->algorithms = 1; |
405 | 1 | registry->exec = _ccv_nnc_mul_forw; |
406 | 1 | } |
407 | | |
408 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
409 | 1 | { |
410 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
411 | 1 | registry->tensor_datatypes = CCV_32F; |
412 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
413 | 1 | registry->algorithms = 1; |
414 | 1 | registry->exec = _ccv_nnc_mul_back; |
415 | 1 | } |
416 | | |
417 | | static void _ccv_nnc_scalar_mul_forw_cpu_ref_i32(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const c) |
418 | 1 | { |
419 | | // Assuming this is int. |
420 | 1 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
421 | 1 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
422 | 1 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
423 | 1 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
424 | 1 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
425 | 1 | ccv_nnc_tensor_view_get_dim(a, dim); |
426 | 1 | assert(ccv_nnc_tensor_view_check_dim(c, dim)); |
427 | 1 | int x; |
428 | 1 | if (CCV_IS_TENSOR_CONTIGUOUS(a) && CCV_IS_TENSOR_CONTIGUOUS(c)) |
429 | 1 | { |
430 | | // Super optimal case, just do one for-loop for sum. |
431 | 1 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
432 | 7 | for (x = 0; x < tensor_count; x++6 ) |
433 | 6 | c->data.i32[x] = (int)(p * (float)a->data.i32[x]); |
434 | 1 | return; |
435 | 1 | } |
436 | 1 | assert(CCV_NNC_MAX_DIM == 2)0 ; // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
437 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
438 | 0 | ccv_nnc_tensor_view_get_stride(c, cstride); |
439 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
440 | 0 | int* const ap = a->data.i32; |
441 | 0 | int* const cp = c->data.i32; |
442 | 0 | const int count = dim[2] * dim[3]; |
443 | 0 | if (astride[2] == dim[3] && cstride[2] == dim[3]) |
444 | 0 | { |
445 | | // Special casing if the ainc[3] is the same as dim[3] |
446 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
447 | 0 | { |
448 | 0 | int* ap0 = ap + i[0] * astride[0]; |
449 | 0 | int* cp0 = cp + i[0] * cstride[0]; |
450 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
451 | 0 | { |
452 | 0 | for (x = 0; x < count; x++) |
453 | 0 | cp0[x] = (int)(p * (float)ap0[x]); |
454 | 0 | ap0 += astride[1]; |
455 | 0 | cp0 += cstride[1]; |
456 | 0 | } |
457 | 0 | } |
458 | 0 | return; |
459 | 0 | } |
460 | | // Non-optimal case, need to do skip copy. |
461 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
462 | 0 | { |
463 | 0 | int* const ap0 = ap + i[0] * astride[0]; |
464 | 0 | int* const cp0 = cp + i[0] * cstride[0]; |
465 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
466 | 0 | { |
467 | 0 | int* ap1 = ap0 + i[1] * astride[1]; |
468 | 0 | int* cp1 = cp0 + i[1] * cstride[1]; |
469 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
470 | 0 | { |
471 | 0 | for (x = 0; x < dim[3]; x++) |
472 | 0 | cp1[x] = (int)(p * (float)ap1[x]); |
473 | 0 | ap1 += astride[2]; |
474 | 0 | cp1 += cstride[2]; |
475 | 0 | } |
476 | 0 | } |
477 | 0 | } |
478 | 0 | } |
479 | | |
480 | | static int _ccv_nnc_scalar_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
481 | 65 | { |
482 | 65 | if (inputs[0]->info.datatype == CCV_32S) |
483 | 1 | { |
484 | 1 | _ccv_nnc_scalar_mul_forw_cpu_ref_i32(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[0]); |
485 | 1 | return CCV_NNC_EXEC_SUCCESS; |
486 | 1 | } |
487 | 64 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
488 | 64 | return CCV_NNC_EXEC_SUCCESS; |
489 | 65 | } |
490 | | static int _ccv_nnc_scalar_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
491 | 25 | { |
492 | 25 | if (inputs[0]) |
493 | 25 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
494 | 0 | else |
495 | 0 | _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]); |
496 | 25 | return CCV_NNC_EXEC_SUCCESS; |
497 | 25 | } |
498 | | |
499 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
500 | 1 | { |
501 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
502 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
503 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
504 | 1 | registry->algorithms = 1; |
505 | 1 | registry->exec = _ccv_nnc_scalar_mul_forw; |
506 | 1 | } |
507 | | |
508 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
509 | 1 | { |
510 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
511 | 1 | registry->tensor_datatypes = CCV_32F; |
512 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
513 | 1 | registry->algorithms = 1; |
514 | 1 | registry->exec = _ccv_nnc_scalar_mul_back; |
515 | 1 | } |