/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_mul_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
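| | // Reference CPU kernel for elementwise multiply: when b is NULL it computes c = p * a; |
| | // otherwise it computes c = p * a * b, broadcasting a and b against the output shape. |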
16 | | void _ccv_nnc_mul_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
17 | 10.6k | { |
18 | 10.6k | if (b == 0) |
19 | 137 | { |
20 | 137 | if (p == 1) |
21 | 0 | { |
22 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, c); |
23 | 0 | return; |
24 | 137 | } else if (p == 0) { |
25 | 0 | ccv_nnc_tensor_zero(c); |
26 | 0 | return; |
27 | 0 | } |
28 | | // Assuming this is float 32. |
29 | 137 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
30 | 137 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
31 | 137 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
32 | 137 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
33 | 137 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
34 | 137 | ccv_nnc_tensor_view_get_dim(a, dim); |
35 | 137 | assert(ccv_nnc_tensor_view_check_dim(c, dim)); |
36 | 137 | int x; |
37 | 137 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c)) |
38 | 135 | { |
39 | | // Super optimal case, just do one for-loop for the multiplication. |
40 | 135 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
41 | 8.39M | for (x = 0; x < tensor_count; x++) |
42 | 8.39M | c->data.f32[x] = p * a->data.f32[x]; |
43 | 135 | return; |
44 | 135 | } |
45 | 2 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
46 | 2 | ccv_nnc_tensor_view_get_stride(a, astride); |
47 | 2 | ccv_nnc_tensor_view_get_stride(c, cstride); |
48 | 2 | int i[CCV_NNC_MAX_DIM + 2]; |
49 | 2 | float* const ap = a->data.f32; |
50 | 2 | float* const cp = c->data.f32; |
51 | 2 | const int count = dim[2] * dim[3]; |
52 | 2 | if (astride[2] == dim[3] && cstride[2] == dim[3]) |
53 | 2 | { |
54 | | // Special case when astride[2] and cstride[2] equal dim[3] (rows are contiguous). |
55 | 4 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
56 | 2 | { |
57 | 2 | float* ap0 = ap + i[0] * astride[0]; |
58 | 2 | float* cp0 = cp + i[0] * cstride[0]; |
59 | 4 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
60 | 2 | { |
61 | 4 | for (x = 0; x < count; x++) |
62 | 2 | cp0[x] = p * ap0[x]; |
63 | 2 | ap0 += astride[1]; |
64 | 2 | cp0 += cstride[1]; |
65 | 2 | } |
66 | 2 | } |
67 | 2 | return; |
68 | 2 | } |
69 | | // Non-optimal case, need to do skip copy. |
70 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
71 | 0 | { |
72 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
73 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
74 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
75 | 0 | { |
76 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
77 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
78 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
79 | 0 | { |
80 | 0 | for (x = 0; x < dim[3]; x++) |
81 | 0 | cp1[x] = p * ap1[x]; |
82 | 0 | ap1 += astride[2]; |
83 | 0 | cp1 += cstride[2]; |
84 | 0 | } |
85 | 0 | } |
86 | 0 | } |
87 | 0 | return; |
88 | 2 | } |
89 | 10.4k | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
90 | 10.4k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
91 | 10.4k | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
92 | 10.4k | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
93 | 10.4k | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
94 | 10.4k | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
95 | 10.4k | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
96 | 10.4k | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
97 | 10.4k | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
98 | 10.4k | if (p == 1 && a_check_dim && b_check_dim) |
99 | 10.4k | { |
100 | 10.4k | _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){ |
101 | 10.4k | a, b |
102 | 10.4k | }, 2, &c, 1); |
103 | 10.4k | return; |
104 | 10.4k | } else if (p == 0) { |
105 | 0 | ccv_nnc_tensor_zero(c); |
106 | 0 | return; |
107 | 0 | } |
108 | | // Assuming this is float 32. |
109 | 31 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
110 | 31 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
111 | 31 | ccv_nnc_tensor_view_get_dim(a, adim); |
112 | 31 | ccv_nnc_tensor_view_get_dim(b, bdim); |
113 | 31 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
114 | 31 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
115 | 31 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
116 | 31 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
117 | 31 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
118 | 31 | int x; |
119 | 31 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim) |
120 | 9 | { |
121 | 9 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
122 | | // Super optimal case, just do one for-loop for the multiplication. |
123 | 99 | for (x = 0; x < tensor_count; x++) |
124 | 90 | c->data.f32[x] = p * a->data.f32[x] * b->data.f32[x]; |
125 | 9 | return; |
126 | 9 | } |
127 | 22 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
128 | 22 | ccv_nnc_tensor_view_get_stride(a, astride); |
129 | 22 | ccv_nnc_tensor_view_get_stride(b, bstride); |
130 | 22 | ccv_nnc_tensor_view_get_stride(c, cstride); |
131 | 22 | int i[CCV_NNC_MAX_DIM + 2]; |
132 | 22 | float* const ap = a->data.f32; |
133 | 22 | float* const bp = b->data.f32; |
134 | 22 | float* const cp = c->data.f32; |
135 | 22 | const int count = cdim[2] * cdim[3]; |
136 | 22 | if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2]) |
137 | 0 | { |
138 | | // Special case when the innermost rows of a, b and c are contiguous and match cdim. |
139 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
140 | 0 | { |
141 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
142 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
143 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
144 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
145 | 0 | { |
146 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
147 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
148 | 0 | for (x = 0; x < count; x++) |
149 | 0 | cp0[x] = p * ap1[x] * bp1[x]; |
150 | 0 | cp0 += cstride[1]; |
151 | 0 | } |
152 | 0 | } |
153 | 0 | return; |
154 | 0 | } |
155 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
156 | 136 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
157 | 114 | { |
158 | 114 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
159 | 114 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
160 | 114 | float* const cp0 = cp + i[0] * cstride[0]; |
161 | 542 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
162 | 428 | { |
163 | 428 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
164 | 428 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
165 | 428 | float* cp1 = cp0 + i[1] * cstride[1]; |
166 | 2.11k | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
167 | 1.68k | { |
168 | 1.68k | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
169 | 1.68k | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
170 | 1.68k | if (adim[3] == 1) |
171 | 29 | for (x = 0; x < cdim[3]; x++) |
172 | 19 | cp1[x] = p * ap2[0] * bp2[x]; |
173 | 1.67k | else if (bdim[3] == 1) |
174 | 11.3k | for (x = 0; x < cdim[3]; x++) |
175 | 10.3k | cp1[x] = p * ap2[x] * bp2[0]; |
176 | 650 | else |
177 | 4.57k | for (x = 0; x < cdim[3]; x++) |
178 | 3.92k | cp1[x] = p * ap2[x] * bp2[x]; |
179 | 1.68k | cp1 += cstride[2]; |
180 | 1.68k | } |
181 | 428 | } |
182 | 114 | } |
183 | 22 | } |
184 | | |
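| | // CCV_NNC_MUL_FORWARD entry point: the scale p is taken from cmd.info.blas.a[0]. |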
185 | | static int _ccv_nnc_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
186 | 4.24k | { |
187 | 4.24k | assert(input_size == 2); |
188 | 4.24k | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]); |
189 | 4.24k | return CCV_NNC_EXEC_SUCCESS; |
190 | 4.24k | } |
191 | | |
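| | // CCV_NNC_MUL_BACKWARD entry point. With g = inputs[0] (the incoming gradient), a = inputs[1] |
| | // and b = inputs[2], outputs[0] receives p * g * b and outputs[1] receives p * g * a, summing |
| | // over any dimension that was broadcast in the forward pass; a NULL g is treated as all ones. |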
192 | | static int _ccv_nnc_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
193 | 4.12k | { |
194 | 4.12k | int gdim[CCV_NNC_MAX_DIM_ALLOC]; |
195 | 4.12k | int no_broadcasting = 1; |
196 | 4.12k | if (outputs[0]) |
197 | 2.11k | { |
198 | 2.11k | assert(input_size >= 3 && inputs[2]); |
199 | 2.11k | ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim); |
200 | 2.11k | ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim); |
201 | 2.11k | no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim)); |
202 | 2.11k | } |
203 | 4.12k | if (no_broadcasting && output_size > 1 && outputs[1]) |
204 | 4.11k | { |
205 | 4.11k | assert(inputs[1]); |
206 | 4.11k | ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim); |
207 | 4.11k | ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim); |
208 | 4.11k | no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim)); |
209 | 4.11k | } |
210 | 4.12k | if (no_broadcasting) |
211 | 4.11k | { |
212 | 4.11k | if (outputs[0]) |
213 | 2.10k | { |
214 | 2.10k | if (inputs[0] == 0) |
215 | 0 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[2], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
216 | 2.10k | else |
217 | 2.10k | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]); |
218 | 2.10k | } |
219 | 4.11k | if (output_size > 1 && outputs[1]) |
220 | 4.11k | { |
221 | 4.11k | if (inputs[0] == 0) |
222 | 0 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[1], 0, (ccv_nnc_tensor_view_t*)outputs[1]); |
223 | 4.11k | else |
224 | 4.11k | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]); |
225 | 4.11k | } |
226 | 4.11k | return CCV_NNC_EXEC_SUCCESS; |
227 | 4.11k | } |
228 | 13 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
229 | 13 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
230 | 13 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
231 | 13 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
232 | 13 | int i[CCV_NNC_MAX_DIM + 2]; |
233 | 13 | int x; |
234 | 13 | const float p = cmd.info.blas.a[0]; |
235 | | // Now handle the case where broadcasting is needed. |
236 | 13 | if (inputs[0] == 0) |
237 | 3 | { |
238 | 3 | if (outputs[0]) |
239 | 3 | { |
240 | 3 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
241 | 3 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
242 | 3 | ccv_nnc_tensor_view_get_dim(a, adim); |
243 | 3 | ccv_nnc_tensor_view_get_dim(b, bdim); |
244 | 3 | ccv_nnc_tensor_view_get_stride(a, astride); |
245 | 3 | ccv_nnc_tensor_view_get_stride(b, bstride); |
246 | 3 | ccv_nnc_tensor_zero(a); |
247 | 3 | float* const ap = a->data.f32; |
248 | 3 | float* const bp = b->data.f32; |
249 | 6 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
250 | 3 | { |
251 | 3 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
252 | 3 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
253 | 6 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
254 | 3 | { |
255 | 3 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
256 | 3 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
257 | 11 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
258 | 8 | { |
259 | 8 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
260 | 8 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
261 | 8 | if (adim[3] == 1) |
262 | 12 | for (x = 0; x < gdim[3]; x++) |
263 | 8 | ap2[0] += p * bp2[x]; |
264 | 4 | else if (bdim[3] == 1) |
265 | 0 | for (x = 0; x < gdim[3]; x++) |
266 | 0 | ap2[x] += p * bp2[0]; |
267 | 4 | else |
268 | 16 | for (x = 0; x < gdim[3]; x++) |
269 | 12 | ap2[x] += p * bp2[x]; |
270 | 8 | } |
271 | 3 | } |
272 | 3 | } |
273 | 3 | } |
274 | 3 | if (output_size > 1 && outputs[1]) |
275 | 3 | { |
276 | 3 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
277 | 3 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
278 | 3 | ccv_nnc_tensor_view_get_dim(a, adim); |
279 | 3 | ccv_nnc_tensor_view_get_dim(b, bdim); |
280 | 3 | ccv_nnc_tensor_view_get_stride(a, astride); |
281 | 3 | ccv_nnc_tensor_view_get_stride(b, bstride); |
282 | 3 | ccv_nnc_tensor_zero(a); |
283 | 3 | float* const ap = a->data.f32; |
284 | 3 | float* const bp = b->data.f32; |
285 | 6 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
286 | 3 | { |
287 | 3 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
288 | 3 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
289 | 6 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
290 | 3 | { |
291 | 3 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
292 | 3 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
293 | 11 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
294 | 8 | { |
295 | 8 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
296 | 8 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
297 | 8 | if (adim[3] == 1) |
298 | 0 | for (x = 0; x < gdim[3]; x++) |
299 | 0 | ap2[0] += p * bp2[x]; |
300 | 8 | else if (bdim[3] == 1) |
301 | 12 | for (x = 0; x < gdim[3]; x++) |
302 | 8 | ap2[x] += p * bp2[0]; |
303 | 4 | else |
304 | 16 | for (x = 0; x < gdim[3]; x++) |
305 | 12 | ap2[x] += p * bp2[x]; |
306 | 8 | } |
307 | 3 | } |
308 | 3 | } |
309 | 3 | } |
310 | 3 | return CCV_NNC_EXEC_SUCCESS; |
311 | 3 | } |
312 | 10 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
313 | 10 | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
314 | 10 | ccv_nnc_tensor_view_get_dim(g, gdim); |
315 | 10 | ccv_nnc_tensor_view_get_stride(g, gstride); |
316 | 10 | if (outputs[0]) |
317 | 9 | { |
318 | 9 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
319 | 9 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
320 | 9 | ccv_nnc_tensor_view_get_dim(a, adim); |
321 | 9 | ccv_nnc_tensor_view_get_dim(b, bdim); |
322 | 9 | ccv_nnc_tensor_view_get_stride(a, astride); |
323 | 9 | ccv_nnc_tensor_view_get_stride(b, bstride); |
324 | 9 | ccv_nnc_tensor_zero(a); |
325 | 9 | float* const ap = a->data.f32; |
326 | 9 | float* const bp = b->data.f32; |
327 | 9 | float* const gp = g->data.f32; |
328 | 61 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
329 | 52 | { |
330 | 52 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
331 | 52 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
332 | 52 | float* const gp0 = gp + i[0] * gstride[0]; |
333 | 250 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
334 | 198 | { |
335 | 198 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
336 | 198 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
337 | 198 | float* gp1 = gp0 + i[1] * gstride[1]; |
338 | 979 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
339 | 781 | { |
340 | 781 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
341 | 781 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
342 | 781 | if (adim[3] == 1) |
343 | 12 | for (x = 0; x < gdim[3]; x++) |
344 | 8 | ap2[0] += p * gp1[x] * bp2[x]; |
345 | 777 | else if (bdim[3] == 1) |
346 | 5.73k | for (x = 0; x < gdim[3]; x++) |
347 | 5.22k | ap2[x] += p * gp1[x] * bp2[0]; |
348 | 264 | else |
349 | 1.62k | for (x = 0; x < gdim[3]; x++) |
350 | 1.36k | ap2[x] += p * gp1[x] * bp2[x]; |
351 | 781 | gp1 += gstride[2]; |
352 | 781 | } |
353 | 198 | } |
354 | 52 | } |
355 | 9 | } |
356 | 10 | if (output_size > 1 && outputs[1]) |
357 | 10 | { |
358 | 10 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
359 | 10 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
360 | 10 | ccv_nnc_tensor_view_get_dim(a, adim); |
361 | 10 | ccv_nnc_tensor_view_get_dim(b, bdim); |
362 | 10 | ccv_nnc_tensor_view_get_stride(a, astride); |
363 | 10 | ccv_nnc_tensor_view_get_stride(b, bstride); |
364 | 10 | ccv_nnc_tensor_zero(a); |
365 | 10 | float* const ap = a->data.f32; |
366 | 10 | float* const bp = b->data.f32; |
367 | 10 | float* const gp = g->data.f32; |
368 | 63 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
369 | 53 | { |
370 | 53 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
371 | 53 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
372 | 53 | float* const gp0 = gp + i[0] * gstride[0]; |
373 | 252 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
374 | 199 | { |
375 | 199 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
376 | 199 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
377 | 199 | float* gp1 = gp0 + i[1] * gstride[1]; |
378 | 982 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
379 | 783 | { |
380 | 783 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
381 | 783 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
382 | 783 | if (adim[3] == 1) |
383 | 5.73k | for (x = 0; x < gdim[3]; x++) |
384 | 5.22k | ap2[0] += p * gp1[x] * bp2[x]; |
385 | 268 | else if (bdim[3] == 1) |
386 | 12 | for (x = 0; x < gdim[3]; x++) |
387 | 8 | ap2[x] += p * gp1[x] * bp2[0]; |
388 | 264 | else |
389 | 1.62k | for (x = 0; x < gdim[3]; x++) |
390 | 1.36k | ap2[x] += p * gp1[x] * bp2[x]; |
391 | 783 | gp1 += gstride[2]; |
392 | 783 | } |
393 | 199 | } |
394 | 53 | } |
395 | 10 | } |
396 | 10 | return CCV_NNC_EXEC_SUCCESS; |
397 | 13 | } |
398 | | |
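| | // The registrations below advertise NHWC/NCHW/CHWN layouts, 32-bit float data and CPU memory. |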
399 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
400 | 1 | { |
401 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
402 | 1 | registry->tensor_datatypes = CCV_32F; |
403 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
404 | 1 | registry->algorithms = 1; |
405 | 1 | registry->exec = _ccv_nnc_mul_forw; |
406 | 1 | } |
407 | | |
408 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
409 | 1 | { |
410 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
411 | 1 | registry->tensor_datatypes = CCV_32F; |
412 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
413 | 1 | registry->algorithms = 1; |
414 | 1 | registry->exec = _ccv_nnc_mul_back; |
415 | 1 | } |
416 | | |
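| | // CCV_NNC_SCALAR_MUL_FORWARD / BACKWARD simply scale the input by cmd.info.blas.a[0] |
| | // (the b == NULL path of _ccv_nnc_mul_forw_cpu_ref above). |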
417 | | static int _ccv_nnc_scalar_mul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
418 | 58 | { |
419 | 58 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
420 | 58 | return CCV_NNC_EXEC_SUCCESS; |
421 | 58 | } |
422 | | static int _ccv_nnc_scalar_mul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
423 | 21 | { |
424 | 21 | if (inputs[0]) |
425 | 21 | _ccv_nnc_mul_forw_cpu_ref(cmd.info.blas.a[0], (ccv_nnc_tensor_view_t*)inputs[0], 0, (ccv_nnc_tensor_view_t*)outputs[0]); |
426 | 0 | else |
427 | 0 | _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[0], cmd.info.blas.a[0]); |
428 | 21 | return CCV_NNC_EXEC_SUCCESS; |
429 | 21 | } |
430 | | |
431 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
432 | 1 | { |
433 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
434 | 1 | registry->tensor_datatypes = CCV_32F; |
435 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
436 | 1 | registry->algorithms = 1; |
437 | 1 | registry->exec = _ccv_nnc_scalar_mul_forw; |
438 | 1 | } |
439 | | |
440 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SCALAR_MUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
441 | 1 | { |
442 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
443 | 1 | registry->tensor_datatypes = CCV_32F; |
444 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
445 | 1 | registry->algorithms = 1; |
446 | 1 | registry->exec = _ccv_nnc_scalar_mul_back; |
447 | 1 | } |
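
For context, a minimal sketch of how this CPU reference backend is typically reached through the public NNC API, in the style of the ccv test suite. It assumes the CMD_MUL_FORWARD, CPU_TENSOR_NHWC and TENSOR_LIST convenience macros from nnc/ccv_nnc_easy.h and should be read as an illustration rather than the library's canonical example:

#include "ccv.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
	ccv_nnc_init(); // Register command backends, including CCV_NNC_BACKEND_CPU_REF.
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4), 0);
	int i;
	for (i = 0; i < 4; i++)
		a->data.f32[i] = i + 1, b->data.f32[i] = 4 - i;
	// c = 0.5 * a * b elementwise; on CPU this dispatches to _ccv_nnc_mul_forw above.
	ccv_nnc_cmd_exec(CMD_MUL_FORWARD(0.5), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	// c now holds {2, 3, 3, 2}.
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	return 0;
}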