/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_cmul_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | void _ccv_nnc_cmul_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
17 | 5 | { |
18 | 5 | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
19 | 5 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
20 | 5 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
21 | 5 | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
22 | 5 | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
23 | 5 | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
24 | 5 | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
25 | 5 | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
26 | 5 | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
27 | | // Assuming this is float 32. |
28 | 5 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
29 | 5 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
30 | 5 | ccv_nnc_tensor_view_get_dim(a, adim); |
31 | 5 | ccv_nnc_tensor_view_get_dim(b, bdim); |
32 | 5 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
33 | 5 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
34 | 5 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
35 | 5 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
36 | 5 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
37 | 5 | int x; |
38 | 5 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim) |
39 | 4 | { |
40 | 4 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
41 | 4 | assert(tensor_count % 2 == 0); |
42 | | // Super optimal case, just do one for-loop for the complex multiplication. |
43 | 214 | for (x = 0; x < tensor_count; x += 2) |
44 | 210 | { |
45 | 210 | const float a0 = a->data.f32[x]; |
46 | 210 | const float a1 = a->data.f32[x + 1]; |
47 | 210 | const float b0 = b->data.f32[x]; |
48 | 210 | const float b1 = b->data.f32[x + 1]; |
49 | 210 | c->data.f32[x] = a0 * b0 - a1 * b1; |
50 | 210 | c->data.f32[x + 1] = a0 * b1 + a1 * b0; |
51 | 210 | } |
52 | 4 | return; |
53 | 4 | } |
54 | 1 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM is not 2. |
55 | 1 | ccv_nnc_tensor_view_get_stride(a, astride); |
56 | 1 | ccv_nnc_tensor_view_get_stride(b, bstride); |
57 | 1 | ccv_nnc_tensor_view_get_stride(c, cstride); |
58 | 1 | int i[CCV_NNC_MAX_DIM + 2]; |
59 | 1 | float* const ap = a->data.f32; |
60 | 1 | float* const bp = b->data.f32; |
61 | 1 | float* const cp = c->data.f32; |
62 | 1 | const int count = cdim[2] * cdim[3]; |
63 | 1 | assert(count % 2 == 0); |
64 | 1 | if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2]) |
65 | 0 | { |
66 | | // Special casing if astride[2] (and bstride[2], cstride[2]) is the same as cdim[3], i.e. the innermost rows are contiguous. |
67 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
68 | 0 | { |
69 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
70 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
71 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
72 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
73 | 0 | { |
74 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
75 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
76 | 0 | for (x = 0; x < count; x += 2) |
77 | 0 | { |
78 | 0 | const float a0 = ap1[x]; |
79 | 0 | const float a1 = ap1[x + 1]; |
80 | 0 | const float b0 = bp1[x]; |
81 | 0 | const float b1 = bp1[x + 1]; |
82 | 0 | cp0[x] = a0 * b0 - a1 * b1; |
83 | 0 | cp0[x + 1] = a0 * b1 + a1 * b0; |
84 | 0 | } |
85 | 0 | cp0 += cstride[1]; |
86 | 0 | } |
87 | 0 | } |
88 | 0 | return; |
89 | 0 | } |
90 | 1 | assert(adim[3] == cdim[3]); |
91 | 1 | assert(bdim[3] == cdim[3]); |
92 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
93 | 2 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
94 | 1 | { |
95 | 1 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
96 | 1 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
97 | 1 | float* const cp0 = cp + i[0] * cstride[0]; |
98 | 6 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
99 | 5 | { |
100 | 5 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
101 | 5 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
102 | 5 | float* cp1 = cp0 + i[1] * cstride[1]; |
103 | 45 | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
104 | 40 | { |
105 | 40 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
106 | 40 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
107 | 2.60k | for (x = 0; x < cdim[3]; x += 2) |
108 | 2.56k | { |
109 | 2.56k | const float a0 = ap2[x]; |
110 | 2.56k | const float a1 = ap2[x + 1]; |
111 | 2.56k | const float b0 = bp2[x]; |
112 | 2.56k | const float b1 = bp2[x + 1]; |
113 | 2.56k | cp1[x] = a0 * b0 - a1 * b1; |
114 | 2.56k | cp1[x + 1] = a0 * b1 + a1 * b0; |
115 | 2.56k | } |
116 | 40 | cp1 += cstride[2]; |
117 | 40 | } |
118 | 5 | } |
119 | 1 | } |
120 | 1 | } |
121 | | |
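For a single interleaved (real, imaginary) float pair, the inner loops of _ccv_nnc_cmul_forw_cpu_ref above compute the textbook complex product (a restatement of the code, not an addition to it):

(a_0 + a_1 i)(b_0 + b_1 i) = (a_0 b_0 - a_1 b_1) + (a_0 b_1 + a_1 b_0) i

which is exactly c[x] = a0*b0 - a1*b1 and c[x + 1] = a0*b1 + a1*b0; the assert(tensor_count % 2 == 0) guards the even-length interleaved layout.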
122 | | void _ccv_nnc_cmul_conj_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
123 | 5 | { |
124 | 5 | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
125 | 5 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
126 | 5 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
127 | 5 | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
128 | 5 | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
129 | 5 | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
130 | 5 | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
131 | 5 | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
132 | 5 | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
133 | | // Assuming this is float 32. |
134 | 5 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
135 | 5 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
136 | 5 | ccv_nnc_tensor_view_get_dim(a, adim); |
137 | 5 | ccv_nnc_tensor_view_get_dim(b, bdim); |
138 | 5 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
139 | 5 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
140 | 5 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
141 | 5 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
142 | 5 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
143 | 5 | int x; |
144 | 5 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim) |
145 | 5 | { |
146 | 5 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
147 | 5 | assert(tensor_count % 2 == 0); |
148 | | // Super optimal case, just do one for-loop for the conjugate multiplication. |
149 | 410 | for (x = 0; x < tensor_count; x += 2) |
150 | 405 | { |
151 | 405 | const float a0 = a->data.f32[x]; |
152 | 405 | const float a1 = a->data.f32[x + 1]; |
153 | 405 | const float b0 = b->data.f32[x]; |
154 | 405 | const float b1 = b->data.f32[x + 1]; |
155 | 405 | c->data.f32[x] = a0 * b0 + a1 * b1; |
156 | 405 | c->data.f32[x + 1] = -a0 * b1 + a1 * b0; |
157 | 405 | } |
158 | 5 | return; |
159 | 5 | } |
160 | 0 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM is not 2. |
161 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
162 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
163 | 0 | ccv_nnc_tensor_view_get_stride(c, cstride); |
164 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
165 | 0 | float* const ap = a->data.f32; |
166 | 0 | float* const bp = b->data.f32; |
167 | 0 | float* const cp = c->data.f32; |
168 | 0 | const int count = cdim[2] * cdim[3]; |
169 | 0 | assert(count % 2 == 0); |
170 | 0 | if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2]) |
171 | 0 | { |
172 | | // Special casing if astride[2] (and bstride[2], cstride[2]) is the same as cdim[3], i.e. the innermost rows are contiguous. |
173 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
174 | 0 | { |
175 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
176 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
177 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
178 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
179 | 0 | { |
180 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
181 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
182 | 0 | for (x = 0; x < count; x += 2) |
183 | 0 | { |
184 | 0 | const float a0 = ap1[x]; |
185 | 0 | const float a1 = ap1[x + 1]; |
186 | 0 | const float b0 = bp1[x]; |
187 | 0 | const float b1 = bp1[x + 1]; |
188 | 0 | cp0[x] = a0 * b0 + a1 * b1; |
189 | 0 | cp0[x + 1] = -a0 * b1 + a1 * b0; |
190 | 0 | } |
191 | 0 | cp0 += cstride[1]; |
192 | 0 | } |
193 | 0 | } |
194 | 0 | return; |
195 | 0 | } |
196 | 0 | assert(adim[3] == cdim[3]); |
197 | 0 | assert(bdim[3] == cdim[3]); |
198 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
199 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
200 | 0 | { |
201 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
202 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
203 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
204 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
205 | 0 | { |
206 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
207 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
208 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
209 | 0 | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
210 | 0 | { |
211 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
212 | 0 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
213 | 0 | for (x = 0; x < cdim[3]; x += 2) |
214 | 0 | { |
215 | 0 | const float a0 = ap2[x]; |
216 | 0 | const float a1 = ap2[x + 1]; |
217 | 0 | const float b0 = bp2[x]; |
218 | 0 | const float b1 = bp2[x + 1]; |
219 | 0 | cp1[x] = a0 * b0 + a1 * b1; |
220 | 0 | cp1[x + 1] = -a0 * b1 + a1 * b0; |
221 | 0 | } |
222 | 0 | cp1 += cstride[2]; |
223 | 0 | } |
224 | 0 | } |
225 | 0 | } |
226 | 0 | } |
227 | | |
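_ccv_nnc_cmul_conj_forw_cpu_ref differs only in the sign pattern: it multiplies the first operand by the complex conjugate of the second,

(a_0 + a_1 i) \overline{(b_0 + b_1 i)} = (a_0 b_0 + a_1 b_1) + (a_1 b_0 - a_0 b_1) i

matching c[x] = a0*b0 + a1*b1 and c[x + 1] = -a0*b1 + a1*b0 in the loops above.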
228 | | void _ccv_nnc_conj_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const c) |
229 | 0 | { |
230 | 0 | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
231 | 0 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
232 | 0 | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
233 | 0 | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
234 | 0 | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
235 | | // Assuming this is float 32. |
236 | 0 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
237 | 0 | ccv_nnc_tensor_view_get_dim(a, adim); |
238 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
239 | 0 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
240 | 0 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
241 | 0 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
242 | 0 | int x; |
243 | 0 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim) |
244 | 0 | { |
245 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
246 | 0 | assert(tensor_count % 2 == 0); |
247 | | // Super optimal case, just do one for-loop for the conjugation. |
248 | 0 | for (x = 0; x < tensor_count; x += 2) |
249 | 0 | { |
250 | 0 | c->data.f32[x] = a->data.f32[x]; |
251 | 0 | c->data.f32[x + 1] = -a->data.f32[x + 1]; |
252 | 0 | } |
253 | 0 | return; |
254 | 0 | } |
255 | 0 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM is not 2. |
256 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
257 | 0 | ccv_nnc_tensor_view_get_stride(c, cstride); |
258 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
259 | 0 | float* const ap = a->data.f32; |
260 | 0 | float* const cp = c->data.f32; |
261 | 0 | const int count = cdim[2] * cdim[3]; |
262 | 0 | assert(count % 2 == 0); |
263 | 0 | if (astride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2]) |
264 | 0 | { |
265 | | // Special casing if astride[2] and cstride[2] are the same as cdim[3], i.e. the innermost rows are contiguous. |
266 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
267 | 0 | { |
268 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
269 | 0 | float* cp0 = cp + i[0] * cstride[0]; |
270 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
271 | 0 | { |
272 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
273 | 0 | for (x = 0; x < count; x += 2) |
274 | 0 | { |
275 | 0 | cp0[x] = ap1[x]; |
276 | 0 | cp0[x + 1] = -ap1[x + 1]; |
277 | 0 | } |
278 | 0 | cp0 += cstride[1]; |
279 | 0 | } |
280 | 0 | } |
281 | 0 | return; |
282 | 0 | } |
283 | 0 | assert(adim[3] == cdim[3]); |
284 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
285 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
286 | 0 | { |
287 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
288 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
289 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
290 | 0 | { |
291 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
292 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
293 | 0 | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
294 | 0 | { |
295 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
296 | 0 | for (x = 0; x < cdim[3]; x += 2) |
297 | 0 | { |
298 | 0 | cp1[x] = ap2[x]; |
299 | 0 | cp1[x + 1] = -ap2[x + 1]; |
300 | 0 | } |
301 | 0 | cp1 += cstride[2]; |
302 | 0 | } |
303 | 0 | } |
304 | 0 | } |
305 | 0 | } |
306 | | |
307 | | static int _ccv_nnc_cmul_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
308 | 5 | { |
309 | 5 | assert(input_size == 2); |
310 | 5 | _ccv_nnc_cmul_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]); |
311 | 5 | return CCV_NNC_EXEC_SUCCESS; |
312 | 5 | } |
313 | | |
314 | | static int _ccv_nnc_cmul_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
315 | 3 | { |
316 | 3 | int gdim[CCV_NNC_MAX_DIM_ALLOC]; |
317 | 3 | int no_broadcasting = 1; |
318 | 3 | if (outputs[0]) |
319 | 3 | { |
320 | 3 | assert(input_size >= 3 && inputs[2]); |
321 | 3 | ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim); |
322 | 3 | ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim); |
323 | 3 | no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[0], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[2], gdim)); |
324 | 3 | } |
325 | 3 | if (no_broadcasting && output_size > 1 && outputs[1]) |
326 | 2 | { |
327 | 2 | assert(inputs[1]); |
328 | 2 | ccv_nnc_tensor_view_get_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim); |
329 | 2 | ccv_nnc_tensor_view_get_broadcast_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim); |
330 | 2 | no_broadcasting = no_broadcasting && (ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)inputs[1], gdim) && ccv_nnc_tensor_view_check_dim((ccv_nnc_tensor_view_t*)outputs[1], gdim)); |
331 | 2 | } |
332 | | // We compute with the conjugation of the gradient output similar to PyTorch: https://pytorch.org/docs/stable/notes/autograd.html#autograd-for-complex-numbers |
333 | | // Note that in the absence of gradient output, we simply compute the conjugation of the other input. |
334 | 3 | if (no_broadcasting) |
335 | 3 | { |
336 | 3 | if (outputs[0]) |
337 | 3 | { |
338 | 3 | if (inputs[0] == 0) |
339 | 0 | _ccv_nnc_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]); |
340 | 3 | else |
341 | 3 | _ccv_nnc_cmul_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]); |
342 | 3 | } |
343 | 3 | if (output_size > 1 && outputs[1]) |
344 | 2 | { |
345 | 2 | if (inputs[0] == 0) |
346 | 0 | _ccv_nnc_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]); |
347 | 2 | else |
348 | 2 | _ccv_nnc_cmul_conj_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[1]); |
349 | 2 | } |
350 | 3 | return CCV_NNC_EXEC_SUCCESS; |
351 | 3 | } |
352 | 0 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
353 | 0 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
354 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
355 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
356 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
357 | 0 | int x; |
358 | | // Now the case we need broadcasting. |
359 | 0 | if (inputs[0] == 0) |
360 | 0 | { |
361 | 0 | if (outputs[0]) |
362 | 0 | { |
363 | 0 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
364 | 0 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
365 | 0 | ccv_nnc_tensor_view_get_dim(a, adim); |
366 | 0 | ccv_nnc_tensor_view_get_dim(b, bdim); |
367 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
368 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
369 | 0 | ccv_nnc_tensor_zero(a); |
370 | 0 | float* const ap = a->data.f32; |
371 | 0 | float* const bp = b->data.f32; |
372 | 0 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
373 | 0 | { |
374 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
375 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
376 | 0 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
377 | 0 | { |
378 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
379 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
380 | 0 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
381 | 0 | { |
382 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
383 | 0 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
384 | 0 | for (x = 0; x < gdim[3]; x++) |
385 | 0 | ap2[x] += bp2[x]; |
386 | 0 | } |
387 | 0 | } |
388 | 0 | } |
389 | 0 | } |
390 | 0 | if (output_size > 1 && outputs[1]) |
391 | 0 | { |
392 | 0 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
393 | 0 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
394 | 0 | ccv_nnc_tensor_view_get_dim(a, adim); |
395 | 0 | ccv_nnc_tensor_view_get_dim(b, bdim); |
396 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
397 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
398 | 0 | ccv_nnc_tensor_zero(a); |
399 | 0 | float* const ap = a->data.f32; |
400 | 0 | float* const bp = b->data.f32; |
401 | 0 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
402 | 0 | { |
403 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
404 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
405 | 0 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
406 | 0 | { |
407 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
408 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
409 | 0 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
410 | 0 | { |
411 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
412 | 0 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
413 | 0 | for (x = 0; x < gdim[3]; x++) |
414 | 0 | ap2[x] += bp2[x]; |
415 | 0 | } |
416 | 0 | } |
417 | 0 | } |
418 | 0 | } |
419 | 0 | return CCV_NNC_EXEC_SUCCESS; |
420 | 0 | } |
421 | 0 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
422 | 0 | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
423 | 0 | ccv_nnc_tensor_view_get_dim(g, gdim); |
424 | 0 | ccv_nnc_tensor_view_get_stride(g, gstride); |
425 | 0 | if (outputs[0]) |
426 | 0 | { |
427 | 0 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
428 | 0 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
429 | 0 | ccv_nnc_tensor_view_get_dim(a, adim); |
430 | 0 | ccv_nnc_tensor_view_get_dim(b, bdim); |
431 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
432 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
433 | 0 | ccv_nnc_tensor_zero(a); |
434 | 0 | float* const ap = a->data.f32; |
435 | 0 | float* const bp = b->data.f32; |
436 | 0 | float* const gp = g->data.f32; |
437 | 0 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
438 | 0 | { |
439 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
440 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
441 | 0 | float* const gp0 = gp + i[0] * gstride[0]; |
442 | 0 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
443 | 0 | { |
444 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
445 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
446 | 0 | float* gp1 = gp0 + i[1] * gstride[1]; |
447 | 0 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
448 | 0 | { |
449 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
450 | 0 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
451 | 0 | for (x = 0; x < gdim[3]; x += 2) |
452 | 0 | { |
453 | 0 | const float g0 = gp1[x]; |
454 | 0 | const float g1 = gp1[x + 1]; |
455 | 0 | const float b0 = bp2[x]; |
456 | 0 | const float b1 = bp2[x + 1]; |
457 | 0 | ap2[x] += g0 * b0 + g1 * b1; |
458 | 0 | ap2[x + 1] += -g0 * b1 + g1 * b0; |
459 | 0 | } |
460 | 0 | gp1 += gstride[2]; |
461 | 0 | } |
462 | 0 | } |
463 | 0 | } |
464 | 0 | } |
465 | 0 | if (output_size > 1 && outputs[1]) |
466 | 0 | { |
467 | 0 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[1]; |
468 | 0 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
469 | 0 | ccv_nnc_tensor_view_get_dim(a, adim); |
470 | 0 | ccv_nnc_tensor_view_get_dim(b, bdim); |
471 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
472 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
473 | 0 | ccv_nnc_tensor_zero(a); |
474 | 0 | float* const ap = a->data.f32; |
475 | 0 | float* const bp = b->data.f32; |
476 | 0 | float* const gp = g->data.f32; |
477 | 0 | for (i[0] = 0; i[0] < gdim[0]; i[0]++) |
478 | 0 | { |
479 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
480 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
481 | 0 | float* const gp0 = gp + i[0] * gstride[0]; |
482 | 0 | for (i[1] = 0; i[1] < gdim[1]; i[1]++) |
483 | 0 | { |
484 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
485 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
486 | 0 | float* gp1 = gp0 + i[1] * gstride[1]; |
487 | 0 | for (i[2] = 0; i[2] < gdim[2]; i[2]++) |
488 | 0 | { |
489 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
490 | 0 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
491 | 0 | for (x = 0; x < gdim[3]; x += 2) |
492 | 0 | { |
493 | 0 | const float g0 = gp1[x]; |
494 | 0 | const float g1 = gp1[x + 1]; |
495 | 0 | const float b0 = bp2[x]; |
496 | 0 | const float b1 = bp2[x + 1]; |
497 | 0 | ap2[x] += g0 * b0 + g1 * b1; |
498 | 0 | ap2[x + 1] += -g0 * b1 + g1 * b0; |
499 | 0 | } |
500 | 0 | gp1 += gstride[2]; |
501 | 0 | } |
502 | 0 | } |
503 | 0 | } |
504 | 0 | } |
505 | 0 | return CCV_NNC_EXEC_SUCCESS; |
506 | 0 | } |
507 | | |
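As the comment inside _ccv_nnc_cmul_back notes, the backward pass follows the conjugate convention of PyTorch's autograd for complex numbers: for c = a * b with incoming gradient g,

\partial L/\partial a = g \cdot \overline{b}, \quad \partial L/\partial b = g \cdot \overline{a}

which is why both gradient outputs reuse _ccv_nnc_cmul_conj_forw_cpu_ref (g against inputs[2] for outputs[0], g against inputs[1] for outputs[1]). When no incoming gradient is supplied (inputs[0] == 0), g is effectively 1 and the gradient reduces to the plain conjugation computed by _ccv_nnc_conj_forw_cpu_ref. In the broadcasting branches, the output is zeroed first and the per-pair products are accumulated with +=, so broadcast dimensions are summed rather than overwritten.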
508 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CMUL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
509 | 1 | { |
510 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
511 | 1 | registry->tensor_datatypes = CCV_32F; |
512 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
513 | 1 | registry->algorithms = 1; |
514 | 1 | registry->exec = _ccv_nnc_cmul_forw; |
515 | 1 | } |
516 | | |
517 | | REGISTER_COMMAND_BACKEND(CCV_NNC_CMUL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
518 | 1 | { |
519 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
520 | 1 | registry->tensor_datatypes = CCV_32F; |
521 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
522 | 1 | registry->algorithms = 1; |
523 | 1 | registry->exec = _ccv_nnc_cmul_back; |
524 | 1 | } |
525 | | |
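A minimal usage sketch of the forward backend registered above follows. It assumes the library's usual convenience macros (CPU_TENSOR_NHWC, TENSOR_LIST) and a CMD_CMUL_FORWARD() command macro following the standard naming convention; treat those names as assumptions rather than a verified API listing.

// Hypothetical sketch: multiply two length-4 complex vectors stored as
// interleaved (re, im) float pairs, i.e. 8 floats per tensor.
static void cmul_example(void)
{
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
	int i;
	for (i = 0; i < 8; i++)
		a->data.f32[i] = i + 1, b->data.f32[i] = 8 - i;
	// Dispatches to _ccv_nnc_cmul_forw through the CPU_REF backend registered above.
	ccv_nnc_cmd_exec(CMD_CMUL_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	// c now holds (a[2k] + a[2k+1] i) * (b[2k] + b[2k+1] i) for k = 0..3.
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
}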