/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/util/ccv_nnc_util_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | #include "../_ccv_nnc_cpu_ref.h" |
13 | | |
14 | | void _ccv_nnc_tensor_transfer_cpu_ref_u8(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b) |
15 | 0 | { |
16 | | // Assuming this is unsigned 8-bit data. |
17 | 0 | assert(a->info.datatype == b->info.datatype); |
18 | 0 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b)) |
19 | 0 | { |
20 | | // Super optimal case, just do memcpy. |
21 | 0 | memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype)); |
22 | 0 | return; |
23 | 0 | } |
24 | 0 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
25 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
26 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
27 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
28 | 0 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
29 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
30 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
31 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
32 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
33 | 0 | unsigned char* const ap = a->data.u8; |
34 | 0 | unsigned char* const bp = b->data.u8; |
35 | 0 | if (astride[2] == dim[3] && bstride[2] == dim[3]) |
36 | 0 | { |
37 | | // Special casing if astride[2] is the same as dim[3] (do memcpy for the last two dims) |
38 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
39 | 0 | { |
40 | 0 | unsigned char* ap0 = ap + i[0] * astride[0]; |
41 | 0 | unsigned char* bp0 = bp + i[0] * bstride[0]; |
42 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
43 | 0 | { |
44 | 0 | memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(unsigned char)); |
45 | 0 | ap0 += astride[1]; |
46 | 0 | bp0 += bstride[1]; |
47 | 0 | } |
48 | 0 | } |
49 | 0 | return; |
50 | 0 | } else if (astride[3] == 1 && bstride[3] == 1) { |
51 | | // The case the last dimension is packed. |
52 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
53 | 0 | { |
54 | 0 | unsigned char* const ap0 = ap + i[0] * astride[0]; |
55 | 0 | unsigned char* const bp0 = bp + i[0] * bstride[0]; |
56 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
57 | 0 | { |
58 | 0 | unsigned char* ap1 = ap0 + i[1] * astride[1]; |
59 | 0 | unsigned char* bp1 = bp0 + i[1] * bstride[1]; |
60 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
61 | 0 | { |
62 | 0 | memcpy(bp1, ap1, dim[3] * sizeof(unsigned char)); |
63 | 0 | ap1 += astride[2]; |
64 | 0 | bp1 += bstride[2]; |
65 | 0 | } |
66 | 0 | } |
67 | 0 | } |
68 | 0 | return; |
69 | 0 | } |
70 | | // Non-optimal case, need to do skip copy. |
71 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
72 | 0 | { |
73 | 0 | unsigned char* const ap0 = ap + i[0] * astride[0]; |
74 | 0 | unsigned char* const bp0 = bp + i[0] * bstride[0]; |
75 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
76 | 0 | { |
77 | 0 | unsigned char* ap1 = ap0 + i[1] * astride[1]; |
78 | 0 | unsigned char* bp1 = bp0 + i[1] * bstride[1]; |
79 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
80 | 0 | { |
81 | 0 | for (i[3] = 0; i[3] < dim[3]; i[3]++) |
82 | 0 | bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]]; |
83 | 0 | ap1 += astride[2]; |
84 | 0 | bp1 += bstride[2]; |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } |
88 | 0 | } |
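Note on the addressing used throughout this file: every kernel walks a 4-D tensor view in which element (i0, i1, i2, i3) lives at element offset i0*stride[0] + i1*stride[1] + i2*stride[2] + i3*stride[3] from the base pointer, and the fast paths above simply detect when the innermost one or two dimensions are contiguous so the inner loops collapse into a memcpy. A minimal sketch of that offset arithmetic (illustrative only, not part of the library; the kernels inline it incrementally via the ap0/ap1/bp0/bp1 pointers):

static inline size_t strided_offset_4d(const int i[4], const int stride[4])
{
	// Same arithmetic the loops above perform step by step with pointer bumps.
	return (size_t)i[0] * stride[0] + (size_t)i[1] * stride[1]
	     + (size_t)i[2] * stride[2] + (size_t)i[3] * stride[3];
}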
89 | | |
90 | | void _ccv_nnc_tensor_transfer_cpu_ref_f16(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b) |
91 | 0 | { |
92 | | // Assuming this is half precision (16-bit float). |
93 | 0 | assert(a->info.datatype == b->info.datatype); |
94 | 0 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b)) |
95 | 0 | { |
96 | | // Super optimal case, just do memcpy. |
97 | 0 | memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype)); |
98 | 0 | return; |
99 | 0 | } |
100 | 0 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
101 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
102 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
103 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
104 | 0 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
105 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
106 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
107 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
108 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
109 | 0 | ccv_float16_t* const ap = a->data.f16; |
110 | 0 | ccv_float16_t* const bp = b->data.f16; |
111 | 0 | if (astride[2] == dim[3] && bstride[2] == dim[3]) |
112 | 0 | { |
113 | | // Special casing if astride[2] is the same as dim[3] (do memcpy for the last two dims) |
114 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
115 | 0 | { |
116 | 0 | ccv_float16_t* ap0 = ap + i[0] * astride[0]; |
117 | 0 | ccv_float16_t* bp0 = bp + i[0] * bstride[0]; |
118 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
119 | 0 | { |
120 | 0 | memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(ccv_float16_t)); |
121 | 0 | ap0 += astride[1]; |
122 | 0 | bp0 += bstride[1]; |
123 | 0 | } |
124 | 0 | } |
125 | 0 | return; |
126 | 0 | } else if (astride[3] == 1 && bstride[3] == 1) { |
127 | | // The case the last dimension is packed. |
128 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
129 | 0 | { |
130 | 0 | ccv_float16_t* const ap0 = ap + i[0] * astride[0]; |
131 | 0 | ccv_float16_t* const bp0 = bp + i[0] * bstride[0]; |
132 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
133 | 0 | { |
134 | 0 | ccv_float16_t* ap1 = ap0 + i[1] * astride[1]; |
135 | 0 | ccv_float16_t* bp1 = bp0 + i[1] * bstride[1]; |
136 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
137 | 0 | { |
138 | 0 | memcpy(bp1, ap1, dim[3] * sizeof(ccv_float16_t)); |
139 | 0 | ap1 += astride[2]; |
140 | 0 | bp1 += bstride[2]; |
141 | 0 | } |
142 | 0 | } |
143 | 0 | } |
144 | 0 | return; |
145 | 0 | } |
146 | | // Non-optimal case, need to do skip copy. |
147 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
148 | 0 | { |
149 | 0 | ccv_float16_t* const ap0 = ap + i[0] * astride[0]; |
150 | 0 | ccv_float16_t* const bp0 = bp + i[0] * bstride[0]; |
151 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
152 | 0 | { |
153 | 0 | ccv_float16_t* ap1 = ap0 + i[1] * astride[1]; |
154 | 0 | ccv_float16_t* bp1 = bp0 + i[1] * bstride[1]; |
155 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
156 | 0 | { |
157 | 0 | for (i[3] = 0; i[3] < dim[3]; i[3]++) |
158 | 0 | bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]]; |
159 | 0 | ap1 += astride[2]; |
160 | 0 | bp1 += bstride[2]; |
161 | 0 | } |
162 | 0 | } |
163 | 0 | } |
164 | 0 | } |
165 | | |
166 | | void _ccv_nnc_tensor_transfer_cpu_ref_f32(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b) |
167 | 21.5k | { |
168 | | // Assuming this is float 32. |
169 | 21.5k | assert(a->info.datatype == b->info.datatype); |
170 | 21.5k | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b)) |
171 | 21.5k | { |
172 | | // Super optimal case, just do memcpy. |
173 | 21.5k | memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype)); |
174 | 21.5k | return; |
175 | 21.5k | } |
176 | 12 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
177 | 12 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
178 | 12 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
179 | 12 | ccv_nnc_tensor_view_get_dim(a, dim); |
180 | 12 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
181 | 12 | ccv_nnc_tensor_view_get_stride(a, astride); |
182 | 12 | ccv_nnc_tensor_view_get_stride(b, bstride); |
183 | 12 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
184 | 12 | int i[CCV_NNC_MAX_DIM + 2]; |
185 | 12 | float* const ap = a->data.f32; |
186 | 12 | float* const bp = b->data.f32; |
187 | 12 | if (astride[2] == dim[3] && bstride[2] == dim[3]) |
188 | 3 | { |
189 | | // Special casing if astride[2] is the same as dim[3] (do memcpy for the last two dims) |
190 | 6 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
191 | 3 | { |
192 | 3 | float* ap0 = ap + i[0] * astride[0]; |
193 | 3 | float* bp0 = bp + i[0] * bstride[0]; |
194 | 8 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
195 | 5 | { |
196 | 5 | memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(float)); |
197 | 5 | ap0 += astride[1]; |
198 | 5 | bp0 += bstride[1]; |
199 | 5 | } |
200 | 3 | } |
201 | 3 | return; |
202 | 9 | } else if (astride[3] == 1 && bstride[3] == 1) { |
203 | | // The case the last dimension is packed. |
204 | 21 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
205 | 13 | { |
206 | 13 | float* const ap0 = ap + i[0] * astride[0]; |
207 | 13 | float* const bp0 = bp + i[0] * bstride[0]; |
208 | 45 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
209 | 32 | { |
210 | 32 | float* ap1 = ap0 + i[1] * astride[1]; |
211 | 32 | float* bp1 = bp0 + i[1] * bstride[1]; |
212 | 167 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
213 | 135 | { |
214 | 135 | memcpy(bp1, ap1, dim[3] * sizeof(float)); |
215 | 135 | ap1 += astride[2]; |
216 | 135 | bp1 += bstride[2]; |
217 | 135 | } |
218 | 32 | } |
219 | 13 | } |
220 | 8 | return; |
221 | 8 | } |
222 | | // Non-optimal case, need to do skip copy. |
223 | 2 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
224 | 1 | { |
225 | 1 | float* const ap0 = ap + i[0] * astride[0]; |
226 | 1 | float* const bp0 = bp + i[0] * bstride[0]; |
227 | 3 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
228 | 2 | { |
229 | 2 | float* ap1 = ap0 + i[1] * astride[1]; |
230 | 2 | float* bp1 = bp0 + i[1] * bstride[1]; |
231 | 10 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
232 | 8 | { |
233 | 32 | for (i[3] = 0; i[3] < dim[3]; i[3]++) |
234 | 24 | bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]]; |
235 | 8 | ap1 += astride[2]; |
236 | 8 | bp1 += bstride[2]; |
237 | 8 | } |
238 | 2 | } |
239 | 1 | } |
240 | 1 | } |
241 | | |
242 | | void _ccv_nnc_tensor_transfer_cpu_ref_f64(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b) |
243 | 0 | { |
244 | | // Assuming this is double precision (64-bit float). |
245 | 0 | assert(a->info.datatype == b->info.datatype); |
246 | 0 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b)) |
247 | 0 | { |
248 | | // Super optimal case, just do memcpy. |
249 | 0 | memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype)); |
250 | 0 | return; |
251 | 0 | } |
252 | 0 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
253 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
254 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
255 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
256 | 0 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
257 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
258 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
259 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
260 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
261 | 0 | double* ap = a->data.f64; |
262 | 0 | double* bp = b->data.f64; |
263 | 0 | if (astride[2] == dim[3] && bstride[2] == dim[3]) |
264 | 0 | { |
265 | | // Special casing if astride[2] is the same as dim[3] (do memcpy for the last two dims) |
266 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
267 | 0 | { |
268 | 0 | double* ap0 = ap + i[0] * astride[0]; |
269 | 0 | double* bp0 = bp + i[0] * bstride[0]; |
270 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
271 | 0 | { |
272 | 0 | memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(double)); |
273 | 0 | ap0 += astride[1]; |
274 | 0 | bp0 += bstride[1]; |
275 | 0 | } |
276 | 0 | } |
277 | 0 | return; |
278 | 0 | } else if (astride[3] == 1 && bstride[3] == 1) { |
279 | | // The case the last dimension is packed. |
280 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
281 | 0 | { |
282 | 0 | double* const ap0 = ap + i[0] * astride[0]; |
283 | 0 | double* const bp0 = bp + i[0] * bstride[0]; |
284 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
285 | 0 | { |
286 | 0 | double* ap1 = ap0 + i[1] * astride[1]; |
287 | 0 | double* bp1 = bp0 + i[1] * bstride[1]; |
288 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
289 | 0 | { |
290 | 0 | memcpy(bp1, ap1, dim[3] * sizeof(double)); |
291 | 0 | ap1 += astride[2]; |
292 | 0 | bp1 += bstride[2]; |
293 | 0 | } |
294 | 0 | } |
295 | 0 | } |
296 | 0 | return; |
297 | 0 | } |
298 | | // Non-optimal case, need to do skip copy. |
299 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
300 | 0 | { |
301 | 0 | double* const ap0 = ap + i[0] * astride[0]; |
302 | 0 | double* const bp0 = bp + i[0] * bstride[0]; |
303 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
304 | 0 | { |
305 | 0 | double* ap1 = ap0 + i[1] * astride[1]; |
306 | 0 | double* bp1 = bp0 + i[1] * bstride[1]; |
307 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
308 | 0 | { |
309 | 0 | for (i[3] = 0; i[3] < dim[3]; i[3]++) |
310 | 0 | bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]]; |
311 | 0 | ap1 += astride[2]; |
312 | 0 | bp1 += bstride[2]; |
313 | 0 | } |
314 | 0 | } |
315 | 0 | } |
316 | 0 | } |
317 | | |
318 | | void _ccv_nnc_tensor_set_cpu_ref_f16(ccv_nnc_tensor_view_t* const a, const float b) |
319 | 0 | { |
320 | | // Assuming this is half precision, stored as a 16-bit short. |
321 | 0 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
322 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
323 | 0 | short h; |
324 | 0 | ccv_float_to_half_precision((float*)&b, (uint16_t*)&h, 1); |
325 | 0 | int x; |
326 | 0 | if (!CCV_IS_TENSOR_VIEW(a)) |
327 | 0 | { |
328 | | // Super optimal case, just do one for-loop to set the value. |
329 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
330 | 0 | for (x = 0; x < tensor_count; x++) |
331 | 0 | a->data.f16[x].v = h; |
332 | 0 | return; |
333 | 0 | } |
334 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
335 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
336 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
337 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
338 | 0 | short* const ap = (short*)a->data.f16; |
339 | 0 | const int count = dim[2] * dim[3]; |
340 | 0 | if (astride[2] == dim[3]) |
341 | 0 | { |
342 | | // Special casing if astride[2] is the same as dim[3] |
343 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
344 | 0 | { |
345 | 0 | short* ap0 = ap + i[0] * astride[0]; |
346 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
347 | 0 | { |
348 | 0 | for (x = 0; x < count; x++) |
349 | 0 | ap0[x] = h; |
350 | 0 | ap0 += astride[1]; |
351 | 0 | } |
352 | 0 | } |
353 | 0 | return; |
354 | 0 | } else if (astride[3] == 1) { |
355 | | // The case the last dimension is packed. |
356 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
357 | 0 | { |
358 | 0 | short* const ap0 = ap + i[0] * astride[0]; |
359 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
360 | 0 | { |
361 | 0 | short* ap1 = ap0 + i[1] * astride[1]; |
362 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
363 | 0 | { |
364 | 0 | for (x = 0; x < dim[3]; x++) |
365 | 0 | ap1[x] = h; |
366 | 0 | ap1 += astride[2]; |
367 | 0 | } |
368 | 0 | } |
369 | 0 | } |
370 | 0 | return; |
371 | 0 | } |
372 | | // Non-optimal case, need to do skip copy. |
373 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
374 | 0 | { |
375 | 0 | short* const ap0 = ap + i[0] * astride[0]; |
376 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
377 | 0 | { |
378 | 0 | short* ap1 = ap0 + i[1] * astride[1]; |
379 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
380 | 0 | { |
381 | 0 | for (x = 0; x < dim[3]; x++) |
382 | 0 | ap1[x * astride[3]] = h; |
383 | 0 | ap1 += astride[2]; |
384 | 0 | } |
385 | 0 | } |
386 | 0 | } |
387 | 0 | } |
388 | | |
389 | | void _ccv_nnc_tensor_set_cpu_ref_bf16(ccv_nnc_tensor_view_t* const a, const float b) |
390 | 0 | { |
391 | | // Assuming this is bfloat16, stored as a 16-bit short. |
392 | 0 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
393 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
394 | 0 | short h; |
395 | 0 | ccv_float_to_bfloat((float*)&b, (uint16_t*)&h, 1); |
396 | 0 | int x; |
397 | 0 | if (!CCV_IS_TENSOR_VIEW(a)) |
398 | 0 | { |
399 | | // Super optimal case, just do one for-loop to set the value. |
400 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
401 | 0 | for (x = 0; x < tensor_count; x++) |
402 | 0 | a->data.f16[x].v = h; |
403 | 0 | return; |
404 | 0 | } |
405 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
406 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
407 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
408 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
409 | 0 | short* const ap = (short*)a->data.f16; |
410 | 0 | const int count = dim[2] * dim[3]; |
411 | 0 | if (astride[2] == dim[3]) |
412 | 0 | { |
413 | | // Special casing if astride[2] is the same as dim[3] |
414 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
415 | 0 | { |
416 | 0 | short* ap0 = ap + i[0] * astride[0]; |
417 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
418 | 0 | { |
419 | 0 | for (x = 0; x < count; x++) |
420 | 0 | ap0[x] = h; |
421 | 0 | ap0 += astride[1]; |
422 | 0 | } |
423 | 0 | } |
424 | 0 | return; |
425 | 0 | } else if (astride[3] == 1) { |
426 | | // The case the last dimension is packed. |
427 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
428 | 0 | { |
429 | 0 | short* const ap0 = ap + i[0] * astride[0]; |
430 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
431 | 0 | { |
432 | 0 | short* ap1 = ap0 + i[1] * astride[1]; |
433 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
434 | 0 | { |
435 | 0 | for (x = 0; x < dim[3]; x++) |
436 | 0 | ap1[x] = h; |
437 | 0 | ap1 += astride[2]; |
438 | 0 | } |
439 | 0 | } |
440 | 0 | } |
441 | 0 | return; |
442 | 0 | } |
443 | | // Non-optimal case, need to do skip copy. |
444 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
445 | 0 | { |
446 | 0 | short* const ap0 = ap + i[0] * astride[0]; |
447 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
448 | 0 | { |
449 | 0 | short* ap1 = ap0 + i[1] * astride[1]; |
450 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
451 | 0 | { |
452 | 0 | for (x = 0; x < dim[3]; x++) |
453 | 0 | ap1[x * astride[3]] = h; |
454 | 0 | ap1 += astride[2]; |
455 | 0 | } |
456 | 0 | } |
457 | 0 | } |
458 | 0 | } |
459 | | |
460 | | void _ccv_nnc_tensor_set_cpu_ref_f32(ccv_nnc_tensor_view_t* const a, const float b) |
461 | 10.9k | { |
462 | | // Assuming this is float 32. |
463 | 10.9k | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
464 | 10.9k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
465 | 10.9k | int x; |
466 | 10.9k | if (!CCV_IS_TENSOR_VIEW(a)) |
467 | 10.9k | { |
468 | | // Super optimal case, just do one for-loop to set the value. |
469 | 10.9k | const int tensor_count = ccv_nnc_tensor_count(a->info); |
470 | 400k | for (x = 0; x < tensor_count; x++) |
471 | 389k | a->data.f32[x] = b; |
472 | 10.9k | return; |
473 | 10.9k | } |
474 | 10.9k | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
475 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
476 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
477 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
478 | 0 | float* const ap = a->data.f32; |
479 | 0 | const int count = dim[2] * dim[3]; |
480 | 0 | if (astride[2] == dim[3]) |
481 | 0 | { |
482 | | // Special casing if astride[2] is the same as dim[3] |
483 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
484 | 0 | { |
485 | 0 | float* ap0 = ap + i[0] * astride[0]; |
486 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
487 | 0 | { |
488 | 0 | for (x = 0; x < count; x++) |
489 | 0 | ap0[x] = b; |
490 | 0 | ap0 += astride[1]; |
491 | 0 | } |
492 | 0 | } |
493 | 0 | return; |
494 | 0 | } else if (astride[3] == 1) { |
495 | | // The case the last dimension is packed. |
496 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
497 | 0 | { |
498 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
499 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
500 | 0 | { |
501 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
502 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
503 | 0 | { |
504 | 0 | for (x = 0; x < dim[3]; x++) |
505 | 0 | ap1[x] = b; |
506 | 0 | ap1 += astride[2]; |
507 | 0 | } |
508 | 0 | } |
509 | 0 | } |
510 | 0 | return; |
511 | 0 | } |
512 | | // Non-optimal case, need to do skip copy. |
513 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
514 | 0 | { |
515 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
516 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
517 | 0 | { |
518 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
519 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
520 | 0 | { |
521 | 0 | for (x = 0; x < dim[3]; x++) |
522 | 0 | ap1[x * astride[3]] = b; |
523 | 0 | ap1 += astride[2]; |
524 | 0 | } |
525 | 0 | } |
526 | 0 | } |
527 | 0 | } |
528 | | |
529 | | void _ccv_nnc_tensor_set_cpu_ref_f64(ccv_nnc_tensor_view_t* const a, const double b) |
530 | 1 | { |
531 | | // Assuming this is double. |
532 | 1 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
533 | 1 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
534 | 1 | int x; |
535 | 1 | if (!CCV_IS_TENSOR_VIEW(a)) |
536 | 1 | { |
537 | | // Super optimal case, just do one for-loop to set the value. |
538 | 1 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
539 | 7.92k | for (x = 0; x < tensor_count; x++) |
540 | 7.92k | a->data.f64[x] = b; |
541 | 1 | return; |
542 | 1 | } |
543 | 1 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
544 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
545 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
546 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
547 | 0 | double* const ap = a->data.f64; |
548 | 0 | const int count = dim[2] * dim[3]; |
549 | 0 | if (astride[2] == dim[3]) |
550 | 0 | { |
551 | | // Special casing if astride[2] is the same as dim[3] |
552 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
553 | 0 | { |
554 | 0 | double* ap0 = ap + i[0] * astride[0]; |
555 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
556 | 0 | { |
557 | 0 | for (x = 0; x < count; x++) |
558 | 0 | ap0[x] = b; |
559 | 0 | ap0 += astride[1]; |
560 | 0 | } |
561 | 0 | } |
562 | 0 | return; |
563 | 0 | } else if (astride[3] == 1) { |
564 | | // The case the last dimension is packed. |
565 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
566 | 0 | { |
567 | 0 | double* const ap0 = ap + i[0] * astride[0]; |
568 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
569 | 0 | { |
570 | 0 | double* ap1 = ap0 + i[1] * astride[1]; |
571 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
572 | 0 | { |
573 | 0 | for (x = 0; x < dim[3]; x++) |
574 | 0 | ap1[x] = b; |
575 | 0 | ap1 += astride[2]; |
576 | 0 | } |
577 | 0 | } |
578 | 0 | } |
579 | 0 | return; |
580 | 0 | } |
581 | | // Non-optimal case, need to do skip copy. |
582 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
583 | 0 | { |
584 | 0 | double* const ap0 = ap + i[0] * astride[0]; |
585 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
586 | 0 | { |
587 | 0 | double* ap1 = ap0 + i[1] * astride[1]; |
588 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
589 | 0 | { |
590 | 0 | for (x = 0; x < dim[3]; x++) |
591 | 0 | ap1[x * astride[3]] = b; |
592 | 0 | ap1 += astride[2]; |
593 | 0 | } |
594 | 0 | } |
595 | 0 | } |
596 | 0 | } |
597 | | |
598 | | void _ccv_nnc_tensor_set_cpu_ref_i32(ccv_nnc_tensor_view_t* const a, const int b) |
599 | 4 | { |
600 | | // Assuming this is a 32-bit integer. |
601 | 4 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
602 | 4 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
603 | 4 | int x; |
604 | 4 | if (!CCV_IS_TENSOR_VIEW(a)) |
605 | 4 | { |
606 | | // Super optimal case, just do one for-loop to set the value. |
607 | 4 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
608 | 11 | for (x = 0; x < tensor_count; x++) |
609 | 7 | a->data.i32[x] = b; |
610 | 4 | return; |
611 | 4 | } |
612 | 4 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
613 | 0 | ccv_nnc_tensor_view_get_dim(a, dim); |
614 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
615 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
616 | 0 | int* const ap = a->data.i32; |
617 | 0 | const int count = dim[2] * dim[3]; |
618 | 0 | if (astride[2] == dim[3]) |
619 | 0 | { |
620 | | // Special casing if astride[2] is the same as dim[3] |
621 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
622 | 0 | { |
623 | 0 | int* ap0 = ap + i[0] * astride[0]; |
624 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
625 | 0 | { |
626 | 0 | for (x = 0; x < count; x++) |
627 | 0 | ap0[x] = b; |
628 | 0 | ap0 += astride[1]; |
629 | 0 | } |
630 | 0 | } |
631 | 0 | return; |
632 | 0 | } else if (astride[3] == 1) { |
633 | | // The case the last dimension is packed. |
634 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
635 | 0 | { |
636 | 0 | int* const ap0 = ap + i[0] * astride[0]; |
637 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
638 | 0 | { |
639 | 0 | int* ap1 = ap0 + i[1] * astride[1]; |
640 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
641 | 0 | { |
642 | 0 | for (x = 0; x < dim[3]; x++) |
643 | 0 | ap1[x] = b; |
644 | 0 | ap1 += astride[2]; |
645 | 0 | } |
646 | 0 | } |
647 | 0 | } |
648 | 0 | return; |
649 | 0 | } |
650 | | // Non-optimal case, need to do skip copy. |
651 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
652 | 0 | { |
653 | 0 | int* const ap0 = ap + i[0] * astride[0]; |
654 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
655 | 0 | { |
656 | 0 | int* ap1 = ap0 + i[1] * astride[1]; |
657 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
658 | 0 | { |
659 | 0 | for (x = 0; x < dim[3]; x++) |
660 | 0 | ap1[x * astride[3]] = b; |
661 | 0 | ap1 += astride[2]; |
662 | 0 | } |
663 | 0 | } |
664 | 0 | } |
665 | 0 | } |
666 | | |
667 | | static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
668 | 2.05k | { |
669 | 2.05k | int i; |
670 | 8.10k | for (i = 0; i < ccv_min(input_size, output_size); i++) |
671 | 6.04k | { |
672 | 6.04k | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i]; |
673 | 6.04k | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i]; |
674 | 6.04k | if (a != b) // Only do transfer if these are two different tensors. |
675 | 78 | { |
676 | 78 | assert(a->info.datatype == b->info.datatype); |
677 | 78 | if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF) |
678 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b); |
679 | 78 | else if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S) |
680 | 78 | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b); |
681 | 0 | else if (a->info.datatype == CCV_64F) |
682 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b); |
683 | 0 | else if (a->info.datatype == CCV_8U) |
684 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_u8(a, b); |
685 | 78 | } |
686 | 6.04k | } |
687 | 2.05k | return CCV_NNC_EXEC_SUCCESS; |
688 | 2.05k | } |
689 | | |
690 | | REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
691 | 1 | { |
692 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
693 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF; |
694 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
695 | 1 | registry->algorithms = 1; |
696 | 1 | registry->exec = _ccv_nnc_data_transfer; |
697 | 1 | } |
698 | | |
699 | | REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
700 | 1 | { |
701 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
702 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF; |
703 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
704 | 1 | registry->algorithms = 1; |
705 | 1 | registry->exec = _ccv_nnc_data_transfer; |
706 | 1 | } |
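For context, these CPU_REF kernels are reached through the generic command API rather than called directly. The sketch below is a hedged usage example; it assumes the convenience macros (CPU_TENSOR_NHWC, TENSOR_LIST, CMD_DATA_TRANSFER_FORWARD) from nnc/ccv_nnc_easy.h, which are not defined in this file:

// Sketch: copy one contiguous CPU tensor into another. With two non-view
// CCV_32F tensors this lands in the single-memcpy path of
// _ccv_nnc_tensor_transfer_cpu_ref_f32.
ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(x), TENSOR_LIST(y), 0);
ccv_nnc_tensor_free(x);
ccv_nnc_tensor_free(y);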
707 | | |
708 | | static int _ccv_nnc_set_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
709 | 11.0k | { |
710 | 11.0k | int i; |
711 | 11.0k | if (cmd.info.blas.a[0] == 0) |
712 | 310 | for (i = 0; i < output_size; i++) |
713 | 155 | ccv_nnc_tensor_zero(outputs[i]); |
714 | 10.8k | else |
715 | 21.7k | for (i = 0; i < output_size; i++) |
716 | 10.8k | if (outputs[i]->info.datatype == CCV_16F) |
717 | 0 | _ccv_nnc_tensor_set_cpu_ref_f16((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]); |
718 | 10.8k | else if (outputs[i]->info.datatype == CCV_16BF) |
719 | 0 | _ccv_nnc_tensor_set_cpu_ref_bf16((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]); |
720 | 10.8k | else if (outputs[i]->info.datatype == CCV_32F) |
721 | 10.8k | _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]); |
722 | 2 | else if (outputs[i]->info.datatype == CCV_64F) |
723 | 1 | _ccv_nnc_tensor_set_cpu_ref_f64((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]); |
724 | 1 | else if (outputs[i]->info.datatype == CCV_32S) |
725 | 1 | _ccv_nnc_tensor_set_cpu_ref_i32((ccv_nnc_tensor_view_t*)outputs[i], (int)cmd.info.blas.a[0]); |
726 | 0 | else |
727 | 0 | { assert(0); } |
728 | 11.0k | return CCV_NNC_EXEC_SUCCESS; |
729 | 11.0k | } |
730 | | |
731 | | static int _ccv_nnc_set_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
732 | 0 | { |
733 | 0 | int i; |
734 | 0 | for (i = 0; i < output_size; i++) |
735 | 0 | ccv_nnc_tensor_zero(outputs[i]); |
736 | 0 | return CCV_NNC_EXEC_SUCCESS; |
737 | 0 | } |
738 | | |
739 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
740 | 1 | { |
741 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
742 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF; |
743 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
744 | 1 | registry->algorithms = 1; |
745 | 1 | registry->exec = _ccv_nnc_set_forw; |
746 | 1 | } |
747 | | |
748 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SET_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
749 | 1 | { |
750 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
751 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF; |
752 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
753 | 1 | registry->algorithms = 1; |
754 | 1 | registry->exec = _ccv_nnc_set_back; |
755 | 1 | } |
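The set kernels are dispatched on the output datatype in _ccv_nnc_set_forw, and a value of 0 short-circuits to ccv_nnc_tensor_zero. A hedged usage sketch under the same macro assumptions as above:

// Sketch: fill a CCV_32F tensor with a constant; this dispatches to
// _ccv_nnc_tensor_set_cpu_ref_f32. SET takes no input tensors.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 4), 0);
ccv_nnc_cmd_exec(CMD_SET_FORWARD(1.5), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(a), 0);
ccv_nnc_tensor_free(a);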
756 | | |
757 | | static void _ccv_nnc_tensor_nhwc_nchw_f32(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b) |
758 | 10 | { |
759 | | // Assuming this is float 32. |
760 | 10 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
761 | 10 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
762 | 10 | int k; |
763 | | // In case it is Toll-free bridged matrix object (NHWC format is possible). |
764 | 10 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
765 | 10 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
766 | 10 | const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd; |
767 | 10 | assert(a_offset == 0 || a_offset == 1); |
768 | 10 | const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd; |
769 | 10 | assert(b_offset == 0 || b_offset == 1); |
770 | 10 | ccv_nnc_tensor_view_get_stride(a, astride); |
771 | 10 | ccv_nnc_tensor_view_get_stride(b, bstride); |
772 | | // Comparing N |
773 | 10 | assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1)); |
774 | 10 | const int n = (a_offset == 0 ? a->info.dim[0] : 1); |
775 | | // Comparing C |
776 | 10 | assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]); |
777 | 10 | const int c = a->info.dim[a_nd - 1]; |
778 | | // Comparing HW |
779 | 10 | int hw[CCV_NNC_MAX_DIM]; |
780 | 30 | for (k = 0; k < CCV_NNC_MAX_DIM; k++) |
781 | 20 | { |
782 | 20 | assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]); |
783 | 20 | hw[k] = a->info.dim[k + 1 - a_offset]; |
784 | 20 | } |
785 | 10 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
786 | 10 | int i[CCV_NNC_MAX_DIM + 2]; |
787 | 10 | float* const ap = a->data.f32; |
788 | 10 | float* const bp = b->data.f32; |
789 | | // Non-optimal case, need to do skip copy. |
790 | 117 | for (i[0] = 0; i[0] < n; i[0]++) |
791 | 107 | { |
792 | 107 | float* ap0 = ap + i[0] * astride[0]; |
793 | 107 | float* const bp0 = bp + i[0] * bstride[0]; |
794 | 1.17k | for (i[3] = 0; i[3] < c; i[3]++) |
795 | 1.06k | { |
796 | 1.06k | float* apu = ap0 + i[3]; |
797 | 1.06k | float* bp1 = bp0 + i[3] * bstride[1]; |
798 | 21.7k | for (i[1] = 0; i[1] < hw[0]; i[1]++) |
799 | 20.6k | { |
800 | 3.37M | for (i[2] = 0; i[2] < hw[1]; i[2]++) |
801 | 3.35M | bp1[i[2]] = apu[i[2] * astride[2]]; |
802 | 20.6k | apu += astride[1]; |
803 | 20.6k | bp1 += bstride[2]; |
804 | 20.6k | } |
805 | 1.06k | } |
806 | 107 | } |
807 | 10 | } |
808 | | |
809 | | static void _ccv_nnc_tensor_nchw_nhwc_f32(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b) |
810 | 9 | { |
811 | | // Assuming this is float 32. |
812 | 9 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
813 | 9 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
814 | 9 | int k; |
815 | | // In case it is Toll-free bridged matrix object (NHWC format is possible). |
816 | 9 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
817 | 9 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
818 | 9 | const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd; |
819 | 9 | assert(a_offset == 0 || a_offset == 1); |
820 | 9 | const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd; |
821 | 9 | assert(b_offset == 0 || b_offset == 1); |
822 | 9 | ccv_nnc_tensor_view_get_stride(a, astride); |
823 | 9 | ccv_nnc_tensor_view_get_stride(b, bstride); |
824 | | // Comparing N |
825 | 9 | assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1)); |
826 | 9 | const int n = (a_offset == 0 ? a->info.dim[0] : 1); |
827 | | // Comparing C |
828 | 9 | assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]); |
829 | 9 | const int c = a->info.dim[1 - a_offset]; |
830 | | // Comparing HW |
831 | 9 | int hw[CCV_NNC_MAX_DIM]; |
832 | 27 | for (k = 0; k < CCV_NNC_MAX_DIM; k++) |
833 | 18 | { |
834 | 18 | assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]); |
835 | 18 | hw[k] = a->info.dim[k + 2 - a_offset]; |
836 | 18 | } |
837 | 9 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
838 | 9 | int i[CCV_NNC_MAX_DIM + 2]; |
839 | 9 | float* const ap = a->data.f32; |
840 | 9 | float* const bp = b->data.f32; |
841 | | // Non-optimal case, need to do skip copy. |
842 | 20 | for (i[0] = 0; i[0] < n; i[0]++) |
843 | 11 | { |
844 | 11 | float* const ap0 = ap + i[0] * astride[0]; |
845 | 11 | float* const bp0 = bp + i[0] * bstride[0]; |
846 | 792 | for (i[3] = 0; i[3] < c; i[3]++) |
847 | 781 | { |
848 | 781 | float* bpu = bp0 + i[3]; |
849 | 781 | float* ap1 = ap0 + i[3] * astride[1]; |
850 | 20.1k | for (i[1] = 0; i[1] < hw[0]; i[1]++) |
851 | 19.3k | { |
852 | 5.04M | for (i[2] = 0; i[2] < hw[1]; i[2]++) |
853 | 5.02M | bpu[i[2] * bstride[2]] = ap1[i[2]]; |
854 | 19.3k | ap1 += astride[2]; |
855 | 19.3k | bpu += bstride[1]; |
856 | 19.3k | } |
857 | 781 | } |
858 | 11 | } |
859 | 9 | } |
860 | | |
861 | | static void _ccv_nnc_tensor_nhwc_nchw_f64(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b) |
862 | 1 | { |
863 | | // Assuming this is double precision (64-bit float). |
864 | 1 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
865 | 1 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
866 | 1 | int k; |
867 | | // In case it is Toll-free bridged matrix object (NHWC format is possible). |
868 | 1 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
869 | 1 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
870 | 1 | const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd; |
871 | 1 | assert(a_offset == 0 || a_offset == 1); |
872 | 1 | const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd; |
873 | 1 | assert(b_offset == 0 || b_offset == 1); |
874 | 1 | ccv_nnc_tensor_view_get_stride(a, astride); |
875 | 1 | ccv_nnc_tensor_view_get_stride(b, bstride); |
876 | | // Comparing N |
877 | 1 | assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1)); |
878 | 1 | const int n = (a_offset == 0 ? a->info.dim[0] : 1); |
879 | | // Comparing C |
880 | 1 | assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]); |
881 | 1 | const int c = a->info.dim[a_nd - 1]; |
882 | | // Comparing HW |
883 | 1 | int hw[CCV_NNC_MAX_DIM]; |
884 | 3 | for (k = 0; k < CCV_NNC_MAX_DIM; k++) |
885 | 2 | { |
886 | 2 | assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]); |
887 | 2 | hw[k] = a->info.dim[k + 1 - a_offset]; |
888 | 2 | } |
889 | 1 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
890 | 1 | int i[CCV_NNC_MAX_DIM + 2]; |
891 | 1 | double* const ap = a->data.f64; |
892 | 1 | double* const bp = b->data.f64; |
893 | | // Non-optimal case, need to do skip copy. |
894 | 12 | for (i[0] = 0; i[0] < n; i[0]++) |
895 | 11 | { |
896 | 11 | double* ap0 = ap + i[0] * astride[0]; |
897 | 11 | double* const bp0 = bp + i[0] * bstride[0]; |
898 | 99 | for (i[3] = 0; i[3] < c; i[3]++) |
899 | 88 | { |
900 | 88 | double* apu = ap0 + i[3]; |
901 | 88 | double* bp1 = bp0 + i[3] * bstride[1]; |
902 | 968 | for (i[1] = 0; i[1] < hw[0]; i[1]++) |
903 | 880 | { |
904 | 8.80k | for (i[2] = 0; i[2] < hw[1]; i[2]++) |
905 | 7.92k | bp1[i[2]] = apu[i[2] * astride[2]]; |
906 | 880 | apu += astride[1]; |
907 | 880 | bp1 += bstride[2]; |
908 | 880 | } |
909 | 88 | } |
910 | 11 | } |
911 | 1 | } |
912 | | |
913 | | static void _ccv_nnc_tensor_nchw_nhwc_f64(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b) |
914 | 0 | { |
915 | | // Assuming this is double precision (64-bit float). |
916 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
917 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
918 | 0 | int k; |
919 | | // In case it is Toll-free bridged matrix object (NHWC format is possible). |
920 | 0 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
921 | 0 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
922 | 0 | const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd; |
923 | 0 | assert(a_offset == 0 || a_offset == 1); |
924 | 0 | const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd; |
925 | 0 | assert(b_offset == 0 || b_offset == 1); |
926 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
927 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
928 | | // Comparing N |
929 | 0 | assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1)); |
930 | 0 | const int n = (a_offset == 0 ? a->info.dim[0] : 1); |
931 | | // Comparing C |
932 | 0 | assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]); |
933 | 0 | const int c = a->info.dim[1 - a_offset]; |
934 | | // Comparing HW |
935 | 0 | int hw[CCV_NNC_MAX_DIM]; |
936 | 0 | for (k = 0; k < CCV_NNC_MAX_DIM; k++) |
937 | 0 | { |
938 | 0 | assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]); |
939 | 0 | hw[k] = a->info.dim[k + 2 - a_offset]; |
940 | 0 | } |
941 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
942 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
943 | 0 | double* const ap = a->data.f64; |
944 | 0 | double* const bp = b->data.f64; |
945 | | // Non-optimal case, need to do skip copy. |
946 | 0 | for (i[0] = 0; i[0] < n; i[0]++) |
947 | 0 | { |
948 | 0 | double* const ap0 = ap + i[0] * astride[0]; |
949 | 0 | double* const bp0 = bp + i[0] * bstride[0]; |
950 | 0 | for (i[3] = 0; i[3] < c; i[3]++) |
951 | 0 | { |
952 | 0 | double* bpu = bp0 + i[3]; |
953 | 0 | double* ap1 = ap0 + i[3] * astride[1]; |
954 | 0 | for (i[1] = 0; i[1] < hw[0]; i[1]++) |
955 | 0 | { |
956 | 0 | for (i[2] = 0; i[2] < hw[1]; i[2]++) |
957 | 0 | bpu[i[2] * bstride[2]] = ap1[i[2]]; |
958 | 0 | ap1 += astride[2]; |
959 | 0 | bpu += bstride[1]; |
960 | 0 | } |
961 | 0 | } |
962 | 0 | } |
963 | 0 | } |
964 | | |
965 | | static void _ccv_nnc_tensor_nhwc_nchw_f16(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b) |
966 | 0 | { |
967 | | // Assuming this is half precision (16-bit float). |
968 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
969 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
970 | 0 | int k; |
971 | | // In case it is Toll-free bridged matrix object (NHWC format is possible). |
972 | 0 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
973 | 0 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
974 | 0 | const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd; |
975 | 0 | assert(a_offset == 0 || a_offset == 1); |
976 | 0 | const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd; |
977 | 0 | assert(b_offset == 0 || b_offset == 1); |
978 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
979 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
980 | | // Comparing N |
981 | 0 | assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1)); |
982 | 0 | const int n = (a_offset == 0 ? a->info.dim[0] : 1); |
983 | | // Comparing C |
984 | 0 | assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]); |
985 | 0 | const int c = a->info.dim[a_nd - 1]; |
986 | | // Comparing HW |
987 | 0 | int hw[CCV_NNC_MAX_DIM]; |
988 | 0 | for (k = 0; k < CCV_NNC_MAX_DIM; k++) |
989 | 0 | { |
990 | 0 | assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]); |
991 | 0 | hw[k] = a->info.dim[k + 1 - a_offset]; |
992 | 0 | } |
993 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
994 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
995 | 0 | ccv_float16_t* const ap = a->data.f16; |
996 | 0 | ccv_float16_t* const bp = b->data.f16; |
997 | | // Non-optimal case, need to do skip copy. |
998 | 0 | for (i[0] = 0; i[0] < n; i[0]++) |
999 | 0 | { |
1000 | 0 | ccv_float16_t* ap0 = ap + i[0] * astride[0]; |
1001 | 0 | ccv_float16_t* const bp0 = bp + i[0] * bstride[0]; |
1002 | 0 | for (i[3] = 0; i[3] < c; i[3]++) |
1003 | 0 | { |
1004 | 0 | ccv_float16_t* apu = ap0 + i[3]; |
1005 | 0 | ccv_float16_t* bp1 = bp0 + i[3] * bstride[1]; |
1006 | 0 | for (i[1] = 0; i[1] < hw[0]; i[1]++) |
1007 | 0 | { |
1008 | 0 | for (i[2] = 0; i[2] < hw[1]; i[2]++) |
1009 | 0 | bp1[i[2]] = apu[i[2] * astride[2]]; |
1010 | 0 | apu += astride[1]; |
1011 | 0 | bp1 += bstride[2]; |
1012 | 0 | } |
1013 | 0 | } |
1014 | 0 | } |
1015 | 0 | } |
1016 | | |
1017 | | static void _ccv_nnc_tensor_nchw_nhwc_f16(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b) |
1018 | 0 | { |
1019 | | // Assuming this is half precision (16-bit float). |
1020 | 0 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
1021 | 0 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
1022 | 0 | int k; |
1023 | | // In case it is Toll-free bridged matrix object (NHWC format is possible). |
1024 | 0 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
1025 | 0 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
1026 | 0 | const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd; |
1027 | 0 | assert(a_offset == 0 || a_offset == 1); |
1028 | 0 | const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd; |
1029 | 0 | assert(b_offset == 0 || b_offset == 1); |
1030 | 0 | ccv_nnc_tensor_view_get_stride(a, astride); |
1031 | 0 | ccv_nnc_tensor_view_get_stride(b, bstride); |
1032 | | // Comparing N |
1033 | 0 | assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1)); |
1034 | 0 | const int n = (a_offset == 0 ? a->info.dim[0] : 1); |
1035 | | // Comparing C |
1036 | 0 | assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]); |
1037 | 0 | const int c = a->info.dim[1 - a_offset]; |
1038 | | // Comparing HW |
1039 | 0 | int hw[CCV_NNC_MAX_DIM]; |
1040 | 0 | for (k = 0; k < CCV_NNC_MAX_DIM; k++) |
1041 | 0 | { |
1042 | 0 | assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]); |
1043 | 0 | hw[k] = a->info.dim[k + 2 - a_offset]; |
1044 | 0 | } |
1045 | 0 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
1046 | 0 | int i[CCV_NNC_MAX_DIM + 2]; |
1047 | 0 | ccv_float16_t* const ap = a->data.f16; |
1048 | 0 | ccv_float16_t* const bp = b->data.f16; |
1049 | | // Non-optimal case, need to do skip copy. |
1050 | 0 | for (i[0] = 0; i[0] < n; i[0]++) |
1051 | 0 | { |
1052 | 0 | ccv_float16_t* const ap0 = ap + i[0] * astride[0]; |
1053 | 0 | ccv_float16_t* const bp0 = bp + i[0] * bstride[0]; |
1054 | 0 | for (i[3] = 0; i[3] < c; i[3]++) |
1055 | 0 | { |
1056 | 0 | ccv_float16_t* bpu = bp0 + i[3]; |
1057 | 0 | ccv_float16_t* ap1 = ap0 + i[3] * astride[1]; |
1058 | 0 | for (i[1] = 0; i[1] < hw[0]; i[1]++) |
1059 | 0 | { |
1060 | 0 | for (i[2] = 0; i[2] < hw[1]; i[2]++) |
1061 | 0 | bpu[i[2] * bstride[2]] = ap1[i[2]]; |
1062 | 0 | ap1 += astride[2]; |
1063 | 0 | bpu += bstride[1]; |
1064 | 0 | } |
1065 | 0 | } |
1066 | 0 | } |
1067 | 0 | } |
1068 | | |
1069 | | static int _ccv_nnc_format_transform(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
1070 | 2.03k | { |
1071 | 2.03k | assert(output_size <= input_size); |
1072 | 2.03k | int i; |
1073 | 4.06k | for (i = 0; i < output_size; i++) |
1074 | 2.03k | { |
1075 | 2.03k | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i]; |
1076 | 2.03k | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i]; |
1077 | 2.03k | assert(a != b); // Cannot do inplace transform. |
1078 | 2.03k | assert(a->info.datatype == b->info.datatype); |
1079 | 2.03k | if (a->info.dim[0] == 0 || b->info.dim[0] == 0) |
1080 | 0 | continue; |
1081 | 2.03k | if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S) |
1082 | 2.03k | { |
1083 | 2.03k | if (a->info.format == b->info.format) { |
1084 | | // If it is the same, just do a normal data transfer. |
1085 | 2.01k | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b); |
1086 | 2.01k | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1087 | 10 | _ccv_nnc_tensor_nhwc_nchw_f32(a, b); |
1088 | 10 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1089 | 0 | assert(0); |
1090 | 9 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1091 | 9 | _ccv_nnc_tensor_nchw_nhwc_f32(a, b); |
1092 | 9 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1093 | 0 | assert(0); |
1094 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1095 | 0 | assert(0); |
1096 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1097 | 0 | assert(0); |
1098 | 0 | } |
1099 | 2.03k | } else if (1 a->info.datatype == CCV_64F1 ) { |
1100 | 1 | if (a->info.format == b->info.format) { |
1101 | | // If it is the same, just do a normal data transfer. |
1102 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b); |
1103 | 1 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1104 | 1 | _ccv_nnc_tensor_nhwc_nchw_f64(a, b); |
1105 | 1 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1106 | 0 | assert(0); |
1107 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1108 | 0 | _ccv_nnc_tensor_nchw_nhwc_f64(a, b); |
1109 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1110 | 0 | assert(0); |
1111 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1112 | 0 | assert(0); |
1113 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1114 | 0 | assert(0); |
1115 | 0 | } |
1116 | 1 | } else if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF) { |
1117 | 0 | if (a->info.format == b->info.format) { |
1118 | | // If it is the same, just do a normal data transfer. |
1119 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b); |
1120 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1121 | 0 | _ccv_nnc_tensor_nhwc_nchw_f16(a, b); |
1122 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1123 | 0 | assert(0); |
1124 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1125 | 0 | _ccv_nnc_tensor_nchw_nhwc_f16(a, b); |
1126 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1127 | 0 | assert(0); |
1128 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1129 | 0 | assert(0); |
1130 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1131 | 0 | assert(0); |
1132 | 0 | } |
1133 | 0 | } else if (a->info.datatype == CCV_8U) { |
1134 | 0 | if (a->info.format == b->info.format) { |
1135 | | // If it is the same, just do a normal data transfer. |
1136 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_u8(a, b); |
1137 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1138 | 0 | assert(0); |
1139 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1140 | 0 | assert(0); |
1141 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1142 | 0 | assert(0); |
1143 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) { |
1144 | 0 | assert(0); |
1145 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) { |
1146 | 0 | assert(0); |
1147 | 0 | } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) { |
1148 | 0 | assert(0); |
1149 | 0 | } |
1150 | 0 | } else { |
1151 | 0 | assert(0); |
1152 | 0 | } |
1153 | 2.03k | } |
1154 | 2.03k | return CCV_NNC_EXEC_SUCCESS; |
1155 | 2.03k | } |
1156 | | |
1157 | | REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1158 | 1 | { |
1159 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1160 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U | CCV_16BF; |
1161 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1162 | 1 | registry->algorithms = 1; |
1163 | 1 | registry->exec = _ccv_nnc_format_transform; |
1164 | 1 | } |
1165 | | |
1166 | | REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1167 | 1 | { |
1168 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1169 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U | CCV_16BF; |
1170 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1171 | 1 | registry->algorithms = 1; |
1172 | 1 | registry->exec = _ccv_nnc_format_transform; |
1173 | 1 | } |
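A hedged usage sketch for the format transform, under the same macro assumptions as the earlier examples (CPU_TENSOR_NHWC, CPU_TENSOR_NCHW and CMD_FORMAT_TRANSFORM_FORWARD are taken from nnc/ccv_nnc_easy.h):

// Sketch: repack an NHWC float tensor as NCHW; with CCV_32F inputs this
// dispatches to _ccv_nnc_tensor_nhwc_nchw_f32 above.
ccv_nnc_tensor_t* const nhwc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 8, 8, 3), 0);
ccv_nnc_tensor_t* const nchw = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 1, 3, 8, 8), 0);
ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(nhwc), TENSOR_LIST(nchw), 0);
ccv_nnc_tensor_free(nhwc);
ccv_nnc_tensor_free(nchw);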
1174 | | |
1175 | | static int _ccv_nnc_transpose(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
1176 | 55 | { |
1177 | 55 | assert(output_size <= input_size); |
1178 | 55 | int k; |
1179 | 110 | for (k = 0; k < output_size; k++55 ) |
1180 | 55 | { |
1181 | 55 | const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[k]; |
1182 | 55 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[k]; |
1183 | 55 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
1184 | 55 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
1185 | 55 | assert(a_nd == b_nd); |
1186 | 55 | assert(a_nd <= CCV_NNC_MAX_DIM + 2); // I can only handle maximum 4. |
1187 | 55 | assert(a_nd >= 2 && b_nd >= 2); // You cannot transpose if it is less than 2. |
1188 | 55 | assert(a->info.dim[cmd.info.transpose.axis[0]] == b->info.dim[cmd.info.transpose.axis[1]]); |
1189 | 55 | assert(a->info.dim[cmd.info.transpose.axis[1]] == b->info.dim[cmd.info.transpose.axis[0]]); |
1190 | 55 | int x; |
1191 | 271 | for (x = 0; x < a_nd; x++) |
1192 | 216 | if (x != cmd.info.transpose.axis[0] && x != cmd.info.transpose.axis[1]) |
1193 | 106 | { assert(a->info.dim[x] == b->info.dim[x]); } |
1194 | 55 | size_t astride[CCV_NNC_MAX_DIM + 2]; |
1195 | 55 | size_t bstride[CCV_NNC_MAX_DIM + 2]; |
1196 | 55 | int dim[CCV_NNC_MAX_DIM + 2]; |
1197 | 59 | for (x = b_nd; x < CCV_NNC_MAX_DIM + 2; x++) |
1198 | 4 | dim[x] = 1; |
1199 | 271 | for (x = 0; x < b_nd; x++) |
1200 | 216 | dim[x] = b->info.dim[x]; |
1201 | | // Don't use ccv_nnc_tensor_view_get_inc or get_dim because these will prefill beginning to 1: |
1202 | | // for example, if the dimension is [2, 4], it will fill to [1, 1, 2, 4] so the axis index will |
1203 | | // be messed up. |
1204 | 55 | if (CCV_IS_TENSOR_VIEW(a)) |
1205 | 6 | { |
1206 | 8 | for (x = a_nd; x < CCV_NNC_MAX_DIM + 2; x++) |
1207 | 2 | astride[x] = 1; |
1208 | 28 | for (x = 0; x < a_nd; x++) |
1209 | 22 | astride[x] = a->stride[x]; |
1210 | 49 | } else { |
1211 | 49 | const int* const adim = a->info.dim; |
1212 | 100 | for (x = a_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++) |
1213 | 51 | astride[x] = 1; |
1214 | 194 | for (x = a_nd - 2; x >= 0; x--) |
1215 | 145 | astride[x] = astride[x + 1] * adim[x + 1]; |
1216 | 49 | } |
1217 | 55 | if (CCV_IS_TENSOR_VIEW(b)) |
1218 | 6 | { |
1219 | 8 | for (x = b_nd; x < CCV_NNC_MAX_DIM + 2; x++) |
1220 | 2 | bstride[x] = 1; |
1221 | 28 | for (x = 0; x < b_nd; x++) |
1222 | 22 | bstride[x] = b->stride[x]; |
1223 | 49 | } else { |
1224 | 49 | const int* const bdim = b->info.dim; |
1225 | 100 | for (x = b_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++) |
1226 | 51 | bstride[x] = 1; |
1227 | 194 | for (x = b_nd - 2; x >= 0; x--) |
1228 | 145 | bstride[x] = bstride[x + 1] * bdim[x + 1]; |
1229 | 49 | } |
1230 | 55 | const float* const ap = a->data.f32; |
1231 | 55 | float* const bp = b->data.f32; |
1232 | 55 | int i[CCV_NNC_MAX_DIM + 2]; |
1233 | 55 | int j[CCV_NNC_MAX_DIM + 2] = { |
1234 | 55 | 0, 1, 2, 3 |
1235 | 55 | }; |
1236 | 55 | CCV_SWAP(j[cmd.info.transpose.axis[0]], j[cmd.info.transpose.axis[1]], x); |
1237 | 658 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
1238 | 603 | { |
1239 | 603 | float* const bp0 = bp + i[0] * bstride[0]; |
1240 | 28.4k | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
1241 | 27.8k | { |
1242 | 27.8k | float* const bp1 = bp0 + i[1] * bstride[1]; |
1243 | 578k | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
1244 | 550k | { |
1245 | 550k | float* const bp2 = bp1 + i[2] * bstride[2]; |
1246 | 43.3M | for (i[3] = 0; i[3] < dim[3]; i[3]++) |
1247 | 42.8M | bp2[i[3]] = ap[i[j[0]] * astride[0] + i[j[1]] * astride[1] + i[j[2]] * astride[2] + i[j[3]] * astride[3]]; |
1248 | 550k | } |
1249 | 27.8k | } |
1250 | 603 | } |
1251 | 55 | } |
1252 | 55 | return CCV_NNC_EXEC_SUCCESS; |
1253 | 55 | } |
1254 | | |
1255 | | REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1256 | 1 | { |
1257 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1258 | 1 | registry->tensor_datatypes = CCV_32F; |
1259 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1260 | 1 | registry->algorithms = 1; |
1261 | 1 | registry->exec = _ccv_nnc_transpose; |
1262 | 1 | } |
1263 | | |
1264 | | REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1265 | 1 | { |
1266 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1267 | 1 | registry->tensor_datatypes = CCV_32F; |
1268 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1269 | 1 | registry->algorithms = 1; |
1270 | 1 | registry->exec = _ccv_nnc_transpose; |
1271 | 1 | } |
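A hedged usage sketch for the transpose, assuming CMD_TRANSPOSE_FORWARD(axis_a, axis_b) from nnc/ccv_nnc_easy.h; the two axes end up in cmd.info.transpose.axis, which _ccv_nnc_transpose reads to build the permuted index j[]:

// Sketch: swap axes 1 and 2 of a 2x3x4 CCV_32F tensor into a 2x4x3 output.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 4), 0);
ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
ccv_nnc_tensor_free(a);
ccv_nnc_tensor_free(b);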
1272 | | |
1273 | | static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
1274 | 120k | { |
1275 | 120k | assert(output_size <= input_size); |
1276 | 120k | int i; |
1277 | 240k | for (i = 0; i < output_size; i++) |
1278 | 120k | { |
1279 | 120k | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i]; |
1280 | 120k | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i]; |
1281 | 120k | assert(a != b); // Cannot do inplace transform. |
1282 | 120k | assert(a->info.format == b->info.format); |
1283 | 120k | if (a->info.datatype == b->info.datatype) { |
1284 | | // If it is the same, just do a normal data transfer. |
1285 | 2 | if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF) |
1286 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b); |
1287 | 2 | else if (a->info.datatype == CCV_32F) |
1288 | 2 | _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b); |
1289 | 0 | else if (a->info.datatype == CCV_64F) |
1290 | 0 | _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b); |
1291 | 120k | } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16F) { |
1292 | 120k | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1293 | 120k | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1294 | 120k | const size_t tensor_count = ccv_nnc_tensor_count(a->info); |
1295 | 120k | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1296 | 120k | ccv_float_to_half_precision(a->data.f32, (uint16_t*)b->data.f16, tensor_count); |
1297 | 120k | } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_32F) { |
1298 | 201 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1299 | 201 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1300 | 201 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1301 | 201 | } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_32F) { |
1302 | 201 | ccv_half_precision_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count); |
1303 | 201 | } else if (4 a->info.datatype == CCV_64F4 && b->info.datatype == CCV_32F3 ) { |
1304 | 1 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1305 | 1 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1306 | 1 | const size_t tensor_count = ccv_nnc_tensor_count(a->info); |
1307 | 1 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1308 | 1 | int i; |
1309 | 129 | for (i = 0; i < tensor_count; i++) |
1310 | 128 | b->data.f32[i] = (float)a->data.f64[i]; |
1311 | 3 | } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_64F) { |
1312 | 1 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1313 | 1 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1314 | 1 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1315 | 1 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1316 | 129 | for (i = 0; i < tensor_count; i++) |
1317 | 128 | b->data.f64[i] = (double)a->data.f32[i]; |
1318 | 2 | } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16F) { |
1319 | 2 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1320 | 2 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1321 | 2 | const size_t tensor_count = ccv_nnc_tensor_count(a->info); |
1322 | 2 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1323 | 2 | ccv_double_to_half_precision(a->data.f64, (uint16_t*)b->data.f16, tensor_count); |
1324 | 2 | } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_64F) { |
1325 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1326 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1327 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1328 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1329 | 0 | ccv_half_precision_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count); |
1330 | 0 | } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_16BF) { |
1331 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1332 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1333 | 0 | const size_t tensor_count = ccv_nnc_tensor_count(a->info); |
1334 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1335 | 0 | ccv_half_precision_to_bfloat((uint16_t*)a->data.f16, (uint16_t*)b->data.f16, tensor_count); |
1336 | 0 | } else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_16F) { |
1337 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1338 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1339 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1340 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1341 | 0 | ccv_bfloat_to_half_precision((uint16_t*)a->data.f16, (uint16_t*)b->data.f16, tensor_count); |
1342 | 0 | } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16BF) { |
1343 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1344 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1345 | 0 | const size_t tensor_count = ccv_nnc_tensor_count(a->info); |
1346 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1347 | 0 | ccv_float_to_bfloat(a->data.f32, (uint16_t*)b->data.f16, tensor_count); |
1348 | 0 | } else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_32F) { |
1349 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1350 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1351 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1352 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1353 | 0 | ccv_bfloat_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count); |
1354 | 0 | } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16BF) { |
1355 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1356 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1357 | 0 | const size_t tensor_count = ccv_nnc_tensor_count(a->info); |
1358 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1359 | 0 | ccv_double_to_bfloat(a->data.f64, (uint16_t*)b->data.f16, tensor_count); |
1360 | 0 | } else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_64F) { |
1361 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(a)); |
1362 | 0 | assert(CCV_IS_TENSOR_CONTIGUOUS(b)); |
1363 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1364 | 0 | assert(tensor_count == ccv_nnc_tensor_count(b->info)); |
1365 | 0 | ccv_bfloat_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count); |
1366 | 0 | } |
1367 | 120k | } |
1368 | 120k | return CCV_NNC_EXEC_SUCCESS; |
1369 | 120k | } |
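// The contiguous branches above reduce to single calls into ccv's precision-conversion helpers. A minimal
// sketch of a CCV_32F -> CCV_16F -> CCV_32F round trip on plain arrays, assuming only the
// ccv_float_to_half_precision / ccv_half_precision_to_float declarations pulled in by ccv.h (the same
// calls used by the kernel above); illustrative only.
#include "ccv.h"
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	float in[4] = { 0.5f, -1.25f, 3.140625f, 65504.f };
	uint16_t half[4];
	float out[4];
	// Narrow to IEEE 754 binary16, then widen back, mirroring the CCV_32F <-> CCV_16F branches above.
	ccv_float_to_half_precision(in, half, 4);
	ccv_half_precision_to_float(half, out, 4);
	int i;
	for (i = 0; i < 4; i++)
		printf("%g -> %g\n", in[i], out[i]);
	return 0;
}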
1370 | | |
1371 | | REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1372 | 1 | { |
1373 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1374 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_16BF; |
1375 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1376 | 1 | registry->algorithms = 1; |
1377 | 1 | registry->exec = _ccv_nnc_datatype_conversion; |
1378 | 1 | } |
1379 | | |
1380 | | REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1381 | 1 | { |
1382 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1383 | 1 | registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_16BF; |
1384 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1385 | 1 | registry->algorithms = 1; |
1386 | 1 | registry->exec = _ccv_nnc_datatype_conversion; |
1387 | 1 | } |
1388 | | |
1389 | | static void _ccv_nnc_masked_fill_cpu_ref_f(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
1390 | 2 | { |
1391 | 2 | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
1392 | 2 | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
1393 | 2 | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
1394 | 2 | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
1395 | 2 | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
1396 | 2 | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
1397 | 2 | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
1398 | | // Assuming this is float 32. |
1399 | 2 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
1400 | 2 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
1401 | 2 | ccv_nnc_tensor_view_get_dim(a, adim); |
1402 | 2 | ccv_nnc_tensor_view_get_dim(b, bdim); |
1403 | 2 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
1404 | 2 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
1405 | 2 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
1406 | 2 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
1407 | 2 | int x; |
1408 | 2 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim) |
1409 | 0 | { |
1410 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1411 | | // Super optimal case, just do one for-loop. |
1412 | 0 | for (x = 0; x < tensor_count; x++) |
1413 | 0 | c->data.f32[x] = (b->data.f32[x] == p) ? q : a->data.f32[x]; |
1414 | 0 | return; |
1415 | 0 | } |
1416 | 2 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
1417 | 2 | ccv_nnc_tensor_view_get_stride(a, astride); |
1418 | 2 | ccv_nnc_tensor_view_get_stride(b, bstride); |
1419 | 2 | ccv_nnc_tensor_view_get_stride(c, cstride); |
1420 | 2 | int i[CCV_NNC_MAX_DIM + 2]; |
1421 | 2 | float* const ap = a->data.f32; |
1422 | 2 | float* const bp = b->data.f32; |
1423 | 2 | float* const cp = c->data.f32; |
1424 | 2 | const int count = cdim[2] * cdim[3]; |
1425 | 2 | if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2]) |
1426 | 2 | { |
1427 | | // Special casing if astride[2] / bstride[2] / cstride[2] match cdim[3] (last two dimensions packed). |
1428 | 4 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
1429 | 2 | { |
1430 | 2 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
1431 | 2 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
1432 | 2 | float* cp0 = cp + i[0] * cstride[0]; |
1433 | 14 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
1434 | 12 | { |
1435 | 12 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
1436 | 12 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
1437 | 252 | for (x = 0; x < count; x++) |
1438 | 240 | cp0[x] = (bp1[x] == p) ? q : ap1[x]; |
1439 | 12 | cp0 += cstride[1]; |
1440 | 12 | } |
1441 | 2 | } |
1442 | 2 | return; |
1443 | 2 | } |
1444 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
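// For example, a mask b with bdim = {1, 1, H, W} against an {N, C, H, W} input a keeps bp0 == bp and
// bp1 == bp0, so the same H x W mask is reapplied to every (i[0], i[1]) slice of a.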
1445 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
1446 | 0 | { |
1447 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
1448 | 0 | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
1449 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
1450 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
1451 | 0 | { |
1452 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
1453 | 0 | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
1454 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
1455 | 0 | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
1456 | 0 | { |
1457 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
1458 | 0 | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
1459 | 0 | if (adim[3] == 1) |
1460 | 0 | for (x = 0; x < cdim[3]; x++) |
1461 | 0 | cp1[x] = (bp2[x] == p) ? q : ap2[0]; |
1462 | 0 | else if (bdim[3] == 1) |
1463 | 0 | if (bp2[0] == p) |
1464 | 0 | for (x = 0; x < cdim[3]; x++) |
1465 | 0 | cp1[x] = q; |
1466 | 0 | else |
1467 | 0 | for (x = 0; x < cdim[3]; x++) |
1468 | 0 | cp1[x] = ap2[x]; |
1469 | 0 | else |
1470 | 0 | for (x = 0; x < cdim[3]; x++) |
1471 | 0 | cp1[x] = (bp2[x] == p) ? q : ap2[x]; |
1472 | 0 | cp1 += cstride[2]; |
1473 | 0 | } |
1474 | 0 | } |
1475 | 0 | } |
1476 | 0 | } |
1477 | | |
1478 | | static void _ccv_nnc_masked_fill_cpu_ref_s(const int p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c) |
1479 | 4 | { |
1480 | 4 | int cdim[CCV_NNC_MAX_DIM_ALLOC]; |
1481 | 4 | ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first. |
1482 | 4 | ccv_nnc_tensor_view_get_broadcast_dim(b, cdim); |
1483 | 4 | assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim)); |
1484 | 4 | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim)); |
1485 | 4 | const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim); |
1486 | 4 | const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim); |
1487 | | // Assuming a and c are float 32 and the mask b is int 32. |
1488 | 4 | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
1489 | 4 | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
1490 | 4 | ccv_nnc_tensor_view_get_dim(a, adim); |
1491 | 4 | ccv_nnc_tensor_view_get_dim(b, bdim); |
1492 | 4 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
1493 | 4 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
1494 | 4 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
1495 | 4 | assert(ccv_nnc_tensor_view_check_dim(c, cdim)); |
1496 | 4 | int x; |
1497 | 4 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim) |
1498 | 0 | { |
1499 | 0 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
1500 | | // Super optimal case, just do one for-loop. |
1501 | 0 | for (x = 0; x < tensor_count; x++) |
1502 | 0 | c->data.f32[x] = (b->data.i32[x] == p) ? q : a->data.f32[x]; |
1503 | 0 | return; |
1504 | 0 | } |
1505 | 4 | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
1506 | 4 | ccv_nnc_tensor_view_get_stride(a, astride); |
1507 | 4 | ccv_nnc_tensor_view_get_stride(b, bstride); |
1508 | 4 | ccv_nnc_tensor_view_get_stride(c, cstride); |
1509 | 4 | int i[CCV_NNC_MAX_DIM + 2]; |
1510 | 4 | float* const ap = a->data.f32; |
1511 | 4 | int* const bp = b->data.i32; |
1512 | 4 | float* const cp = c->data.f32; |
1513 | 4 | const int count = cdim[2] * cdim[3]; |
1514 | 4 | if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2]) |
1515 | 4 | { |
1516 | | // Special casing if astride[2] / bstride[2] / cstride[2] match cdim[3] (last two dimensions packed). |
1517 | 8 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
1518 | 4 | { |
1519 | 4 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
1520 | 4 | int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
1521 | 4 | float* cp0 = cp + i[0] * cstride[0]; |
1522 | 28 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
1523 | 24 | { |
1524 | 24 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
1525 | 24 | int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
1526 | 504 | for (x = 0; x < count; x++) |
1527 | 480 | cp0[x] = (bp1[x] == p) ? q : ap1[x]; |
1528 | 24 | cp0 += cstride[1]; |
1529 | 24 | } |
1530 | 4 | } |
1531 | 4 | return; |
1532 | 4 | } |
1533 | | // Non-optimal case, need to do skip copy and handle broadcasting. |
1534 | 0 | for (i[0] = 0; i[0] < cdim[0]; i[0]++) |
1535 | 0 | { |
1536 | 0 | float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0]; |
1537 | 0 | int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
1538 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
1539 | 0 | for (i[1] = 0; i[1] < cdim[1]; i[1]++) |
1540 | 0 | { |
1541 | 0 | float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1]; |
1542 | 0 | int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
1543 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
1544 | 0 | for (i[2] = 0; i[2] < cdim[2]; i[2]++) |
1545 | 0 | { |
1546 | 0 | float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2]; |
1547 | 0 | int* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
1548 | 0 | if (adim[3] == 1) |
1549 | 0 | for (x = 0; x < cdim[3]; x++) |
1550 | 0 | cp1[x] = (bp2[x] == p) ? q : ap2[0]; |
1551 | 0 | else if (bdim[3] == 1) |
1552 | 0 | if (bp2[0] == p) |
1553 | 0 | for (x = 0; x < cdim[3]; x++) |
1554 | 0 | cp1[x] = q; |
1555 | 0 | else |
1556 | 0 | for (x = 0; x < cdim[3]; x++) |
1557 | 0 | cp1[x] = ap2[x]; |
1558 | 0 | else |
1559 | 0 | for (x = 0; x < cdim[3]; x++) |
1560 | 0 | cp1[x] = (bp2[x] == p) ? q : ap2[x]; |
1561 | 0 | cp1 += cstride[2]; |
1562 | 0 | } |
1563 | 0 | } |
1564 | 0 | } |
1565 | 0 | } |
1566 | | |
1567 | | static int _ccv_nnc_masked_fill_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
1568 | 3 | { |
1569 | 3 | assert(input_size >= 2); |
1570 | 3 | assert(inputs[0]); |
1571 | 3 | assert(inputs[1]); |
1572 | 3 | assert(outputs[0]); |
1573 | 3 | if (inputs[1]->info.datatype == CCV_32F) |
1574 | 1 | _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]); |
1575 | 2 | else if (inputs[1]->info.datatype == CCV_32S) |
1576 | 2 | _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]); |
1577 | 3 | return CCV_NNC_EXEC_SUCCESS; |
1578 | 3 | } |
1579 | | |
1580 | | static int _ccv_nnc_masked_fill_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
1581 | 3 | { |
1582 | 3 | assert(input_size >= 3); |
1583 | 3 | if (inputs[2]->info.datatype == CCV_32F) |
1584 | 1 | _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]); |
1585 | 2 | else if (inputs[2]->info.datatype == CCV_32S) |
1586 | 2 | _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]); |
1587 | | // TODO: doesn't really support taking gradient on mask. |
1588 | | // if (output_size >= 2 && outputs[1]) |
1589 | 3 | return CCV_NNC_EXEC_SUCCESS; |
1590 | 3 | } |
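// Semantically, the forward pass fills positions where the mask equals cmd.info.blas.a[0] with
// cmd.info.blas.a[1], and the backward pass reuses the same kernel with the fill value forced to 0 so the
// gradient at masked positions is dropped. A minimal sketch of the contiguous fast path on hypothetical
// data (plain arrays, not the command API):
#include <stdio.h>

// out[i] = (mask[i] == p) ? q : in[i] -- the contiguous case of the reference kernel above.
static void masked_fill_f32(const float* const in, const float* const mask, float* const out, const int n, const float p, const float q)
{
	int i;
	for (i = 0; i < n; i++)
		out[i] = (mask[i] == p) ? q : in[i];
}

int main(void)
{
	const float in[4] = { 1, 2, 3, 4 };
	const float mask[4] = { 0, 1, 0, 1 }; // positions where mask == 0 will be filled
	float out[4];
	masked_fill_f32(in, mask, out, 4, 0, -1e9f); // forward-style: fill with a large negative value
	printf("forward:  %g %g %g %g\n", out[0], out[1], out[2], out[3]);
	masked_fill_f32(in, mask, out, 4, 0, 0); // backward-style: zero the incoming gradient where masked
	printf("backward: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
	return 0;
}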
1591 | | |
1592 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1593 | 1 | { |
1594 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1595 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
1596 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1597 | 1 | registry->algorithms = 1; |
1598 | 1 | registry->exec = _ccv_nnc_masked_fill_forw; |
1599 | 1 | } |
1600 | | |
1601 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
1602 | 1 | { |
1603 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN; |
1604 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
1605 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
1606 | 1 | registry->algorithms = 1; |
1607 | 1 | registry->exec = _ccv_nnc_masked_fill_back; |
1608 | 1 | } |