/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/compare/ccv_nnc_min_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_min_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 4 | { |
15 | 4 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0]; |
16 | 4 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[1]; |
17 | 4 | ccv_nnc_tensor_view_t* const c = (ccv_nnc_tensor_view_t*)outputs[0]; |
18 | | // Assuming this is float 32. |
19 | 4 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
20 | 4 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
21 | 4 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
22 | 4 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
23 | 4 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
24 | 4 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
25 | 4 | assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2); |
26 | 4 | ccv_nnc_tensor_view_get_dim(a, dim); |
27 | 4 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
28 | 4 | assert(ccv_nnc_tensor_view_check_dim(c, dim)); |
29 | 4 | int x; |
30 | 4 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c)) |
31 | 3 | { |
32 | | // Super optimal case, just do one for-loop for the element-wise min. |
33 | 3 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
34 | 1.02k | for (x = 0; x < tensor_count; x++) |
35 | 1.02k | c->data.f32[x] = ccv_min(a->data.f32[x], b->data.f32[x]); |
36 | 3 | return CCV_NNC_EXEC_SUCCESS; |
37 | 3 | } |
38 | 1 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM is not 2. |
39 | 1 | ccv_nnc_tensor_view_get_stride(a, astride); |
40 | 1 | ccv_nnc_tensor_view_get_stride(b, bstride); |
41 | 1 | ccv_nnc_tensor_view_get_stride(c, cstride); |
42 | 1 | int i[CCV_NNC_MAX_DIM + 2]; |
43 | 1 | float* const ap = a->data.f32; |
44 | 1 | float* const bp = b->data.f32; |
45 | 1 | float* const cp = c->data.f32; |
46 | 1 | const int count = dim[2] * dim[3]; |
47 | 1 | if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3]) |
48 | 1 | { |
49 | | // Special casing when the stride over dim[2] equals dim[3] for every tensor, so the inner two loops collapse into one contiguous run. |
50 | 2 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
51 | 1 | { |
52 | 1 | float* ap0 = ap + i[0] * astride[0]; |
53 | 1 | float* bp0 = bp + i[0] * bstride[0]; |
54 | 1 | float* cp0 = cp + i[0] * cstride[0]; |
55 | 3 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
56 | 2 | { |
57 | 14 | for (x = 0; x < count; x++) |
58 | 12 | cp0[x] = ccv_min(ap0[x], bp0[x]); |
59 | 2 | ap0 += astride[1]; |
60 | 2 | bp0 += bstride[1]; |
61 | 2 | cp0 += cstride[1]; |
62 | 2 | } |
63 | 1 | } |
64 | 1 | return CCV_NNC_EXEC_SUCCESS; |
65 | 1 | } |
66 | | // Non-optimal case, walk each dimension with its own stride. |
67 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
68 | 0 | { |
69 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
70 | 0 | float* const bp0 = bp + i[0] * bstride[0]; |
71 | 0 | float* const cp0 = cp + i[0] * cstride[0]; |
72 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
73 | 0 | { |
74 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
75 | 0 | float* bp1 = bp0 + i[1] * bstride[1]; |
76 | 0 | float* cp1 = cp0 + i[1] * cstride[1]; |
77 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
78 | 0 | { |
79 | 0 | for (x = 0; x < dim[3]; x++) |
80 | 0 | cp1[x] = ccv_min(ap1[x], bp1[x]); |
81 | 0 | ap1 += astride[2]; |
82 | 0 | bp1 += bstride[2]; |
83 | 0 | cp1 += cstride[2]; |
84 | 0 | } |
85 | 0 | } |
86 | 0 | } |
87 | 0 | return CCV_NNC_EXEC_SUCCESS; |
88 | 1 | } |
89 | | |
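For context, this CPU reference kernel is normally reached through the generic command API rather than called directly. Below is a minimal sketch (not part of the file above) of driving CCV_NNC_MIN_FORWARD on contiguous CPU tensors; it assumes the usual NNC easy-API helpers (CMD_MIN_FORWARD(), CPU_TENSOR_NHWC(), TENSOR_LIST(), ccv_nnc_no_hint), and the shape, values, and the function name min_forward_sketch are illustrative only.

#include <ccv.h>
#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

static void min_forward_sketch(void)
{
	ccv_nnc_init(); // registers command backends, including the CPU reference one above
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	int i;
	for (i = 0; i < 2 * 2 * 3; i++)
		a->data.f32[i] = i, b->data.f32[i] = 11 - i;
	// inputs[0] = a, inputs[1] = b; outputs[0] = c, the element-wise min.
	// Plain (non-view) tensors take the single flat loop at the top of _ccv_nnc_min_forw.
	ccv_nnc_cmd_exec(CMD_MIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
}
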
90 | | static int _ccv_nnc_min_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
91 | 5 | { |
92 | 5 | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
93 | 5 | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1]; |
94 | 5 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2]; |
95 | 5 | ccv_nnc_tensor_view_t* const ha = (ccv_nnc_tensor_view_t*)outputs[0]; |
96 | 5 | ccv_nnc_tensor_view_t* const hb = (ccv_nnc_tensor_view_t*)outputs[1]; |
97 | | // Assuming this is float 32. |
98 | 5 | int dim[CCV_NNC_MAX_DIM_ALLOC]; |
99 | 5 | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
100 | 5 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
101 | 5 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
102 | 5 | int hastride[CCV_NNC_MAX_DIM_ALLOC]; |
103 | 5 | int hbstride[CCV_NNC_MAX_DIM_ALLOC]; |
104 | 5 | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
105 | 5 | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
106 | 5 | assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2); |
107 | 5 | assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2); |
108 | 5 | ccv_nnc_tensor_view_get_dim(a, dim); |
109 | 5 | assert(ccv_nnc_tensor_view_check_dim(b, dim)); |
110 | 5 | assert(ccv_nnc_tensor_view_check_dim(ha, dim)); |
111 | 5 | assert(ccv_nnc_tensor_view_check_dim(hb, dim)); |
112 | 5 | if (g) |
113 | 3 | { |
114 | 3 | assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2); |
115 | 3 | assert(ccv_nnc_tensor_view_check_dim(g, dim)); |
116 | 3 | int x; |
117 | 3 | if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(ha) && !CCV_IS_TENSOR_VIEW(hb)) |
118 | 2 | { |
119 | | // Super optimal case, just do one for-loop to route the gradient. |
120 | 2 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
121 | 1.01k | for (x = 0; x < tensor_count; x++) |
122 | 1.01k | if (a->data.f32[x] < b->data.f32[x]) |
123 | 497 | { |
124 | 497 | ha->data.f32[x] = g->data.f32[x]; |
125 | 497 | hb->data.f32[x] = 0; |
126 | 515 | } else if (a->data.f32[x] > b->data.f32[x]) { |
127 | 512 | hb->data.f32[x] = g->data.f32[x]; |
128 | 512 | ha->data.f32[x] = 0; |
129 | 512 | } else |
130 | 3 | ha->data.f32[x] = hb->data.f32[x] = g->data.f32[x]; |
131 | 2 | return CCV_NNC_EXEC_SUCCESS; |
132 | 2 | } |
133 | 1 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM is not 2. |
134 | 1 | ccv_nnc_tensor_view_get_stride(g, gstride); |
135 | 1 | ccv_nnc_tensor_view_get_stride(a, astride); |
136 | 1 | ccv_nnc_tensor_view_get_stride(b, bstride); |
137 | 1 | ccv_nnc_tensor_view_get_stride(ha, hastride); |
138 | 1 | ccv_nnc_tensor_view_get_stride(hb, hbstride); |
139 | 1 | int i[CCV_NNC_MAX_DIM + 2]; |
140 | 1 | float* const gp = g->data.f32; |
141 | 1 | float* const ap = a->data.f32; |
142 | 1 | float* const bp = b->data.f32; |
143 | 1 | float* const hap = ha->data.f32; |
144 | 1 | float* const hbp = hb->data.f32; |
145 | 1 | const int count = dim[2] * dim[3]; |
146 | 1 | if (astride[2] == dim[3] && bstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3]) |
147 | 1 | { |
148 | | // Special casing when the stride over dim[2] equals dim[3] for every tensor, so the inner two loops collapse into one contiguous run. |
149 | 2 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
150 | 1 | { |
151 | 1 | float* gp0 = gp + i[0] * gstride[0]; |
152 | 1 | float* ap0 = ap + i[0] * astride[0]; |
153 | 1 | float* bp0 = bp + i[0] * bstride[0]; |
154 | 1 | float* hap0 = hap + i[0] * hastride[0]; |
155 | 1 | float* hbp0 = hbp + i[0] * hbstride[0]; |
156 | 3 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
157 | 2 | { |
158 | 14 | for (x = 0; x < count; x++) |
159 | 12 | if (ap0[x] < bp0[x]) { |
160 | 0 | hap0[x] = gp0[x]; |
161 | 0 | hbp0[x] = 0; |
162 | 12 | } else if (ap0[x] > bp0[x]) { |
163 | 9 | hbp0[x] = gp0[x]; |
164 | 9 | hap0[x] = 0; |
165 | 9 | } else |
166 | 3 | hap0[x] = hbp0[x] = gp0[x]; |
167 | 2 | gp0 += gstride[1]; |
168 | 2 | ap0 += astride[1]; |
169 | 2 | bp0 += bstride[1]; |
170 | 2 | hap0 += hastride[1]; |
171 | 2 | hbp0 += hbstride[1]; |
172 | 2 | } |
173 | 1 | } |
174 | 1 | return CCV_NNC_EXEC_SUCCESS; |
175 | 1 | } |
176 | | // Non-optimal case, walk each dimension with its own stride. |
177 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
178 | 0 | { |
179 | 0 | float* const gp0 = gp + i[0] * gstride[0]; |
180 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
181 | 0 | float* const bp0 = bp + i[0] * bstride[0]; |
182 | 0 | float* const hap0 = hap + i[0] * hastride[0]; |
183 | 0 | float* const hbp0 = hbp + i[0] * hbstride[0]; |
184 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
185 | 0 | { |
186 | 0 | float* gp1 = gp0 + i[1] * gstride[1]; |
187 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
188 | 0 | float* bp1 = bp0 + i[1] * bstride[1]; |
189 | 0 | float* hap1 = hap0 + i[1] * hastride[1]; |
190 | 0 | float* hbp1 = hbp0 + i[1] * hbstride[1]; |
191 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
192 | 0 | { |
193 | 0 | for (x = 0; x < dim[3]; x++) |
194 | 0 | if (ap1[x] < bp1[x]) { |
195 | 0 | hap1[x] = gp1[x]; |
196 | 0 | hbp1[x] = 0; |
197 | 0 | } else if (ap1[x] > bp1[x]) { |
198 | 0 | hbp1[x] = gp1[x]; |
199 | 0 | hap1[x] = 0; |
200 | 0 | } else |
201 | 0 | hap1[x] = hbp1[x] = gp1[x]; |
202 | 0 | gp1 += gstride[2]; |
203 | 0 | ap1 += astride[2]; |
204 | 0 | bp1 += bstride[2]; |
205 | 0 | hap1 += hastride[2]; |
206 | 0 | hbp1 += hbstride[2]; |
207 | 0 | } |
208 | 0 | } |
209 | 0 | } |
210 | 2 | } else { |
211 | 2 | int x; |
212 | 2 | if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(ha) && !CCV_IS_TENSOR_VIEW(hb)) |
213 | 1 | { |
214 | | // Super optimal case, just do one for-loop to write the selection mask. |
215 | 1 | const int tensor_count = ccv_nnc_tensor_count(a->info); |
216 | 13 | for (x = 0; x < tensor_count; x++) |
217 | 12 | if (a->data.f32[x] < b->data.f32[x]) { |
218 | 0 | ha->data.f32[x] = 1; |
219 | 0 | hb->data.f32[x] = 0; |
220 | 12 | } else if (a->data.f32[x] > b->data.f32[x]) { |
221 | 9 | ha->data.f32[x] = 0; |
222 | 9 | hb->data.f32[x] = 1; |
223 | 9 | } else |
224 | 3 | ha->data.f32[x] = hb->data.f32[x] = 1; |
225 | 1 | return CCV_NNC_EXEC_SUCCESS; |
226 | 1 | } |
227 | 1 | assert(CCV_NNC_MAX_DIM == 2); // This logic needs to change if CCV_NNC_MAX_DIM is not 2. |
228 | 1 | ccv_nnc_tensor_view_get_stride(a, astride); |
229 | 1 | ccv_nnc_tensor_view_get_stride(b, bstride); |
230 | 1 | ccv_nnc_tensor_view_get_stride(ha, hastride); |
231 | 1 | ccv_nnc_tensor_view_get_stride(hb, hbstride); |
232 | 1 | int i[CCV_NNC_MAX_DIM + 2]; |
233 | 1 | float* const ap = a->data.f32; |
234 | 1 | float* const bp = b->data.f32; |
235 | 1 | float* const hap = ha->data.f32; |
236 | 1 | float* const hbp = hb->data.f32; |
237 | 1 | const int count = dim[2] * dim[3]; |
238 | 1 | if (astride[2] == dim[3] && bstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3]) |
239 | 1 | { |
240 | | // Special casing when the stride over dim[2] equals dim[3] for every tensor, so the inner two loops collapse into one contiguous run. |
241 | 2 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
242 | 1 | { |
243 | 1 | float* ap0 = ap + i[0] * astride[0]; |
244 | 1 | float* bp0 = bp + i[0] * bstride[0]; |
245 | 1 | float* hap0 = hap + i[0] * hastride[0]; |
246 | 1 | float* hbp0 = hbp + i[0] * hbstride[0]; |
247 | 3 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
248 | 2 | { |
249 | 14 | for (x = 0; x < count; x++) |
250 | 12 | if (ap0[x] < bp0[x]) { |
251 | 0 | hap0[x] = 1; |
252 | 0 | hbp0[x] = 0; |
253 | 12 | } else if (ap0[x] > bp0[x]) { |
254 | 9 | hap0[x] = 0; |
255 | 9 | hbp0[x] = 1; |
256 | 9 | } else |
257 | 3 | hap0[x] = hbp0[x] = 1; |
258 | 2 | ap0 += astride[1]; |
259 | 2 | bp0 += bstride[1]; |
260 | 2 | hap0 += hastride[1]; |
261 | 2 | hbp0 += hbstride[1]; |
262 | 2 | } |
263 | 1 | } |
264 | 1 | return CCV_NNC_EXEC_SUCCESS; |
265 | 1 | } |
266 | | // Non-optimal case, walk each dimension with its own stride. |
267 | 0 | for (i[0] = 0; i[0] < dim[0]; i[0]++) |
268 | 0 | { |
269 | 0 | float* const ap0 = ap + i[0] * astride[0]; |
270 | 0 | float* const bp0 = bp + i[0] * bstride[0]; |
271 | 0 | float* const hap0 = hap + i[0] * hastride[0]; |
272 | 0 | float* const hbp0 = hbp + i[0] * hbstride[0]; |
273 | 0 | for (i[1] = 0; i[1] < dim[1]; i[1]++) |
274 | 0 | { |
275 | 0 | float* ap1 = ap0 + i[1] * astride[1]; |
276 | 0 | float* bp1 = bp0 + i[1] * bstride[1]; |
277 | 0 | float* hap1 = hap0 + i[1] * hastride[1]; |
278 | 0 | float* hbp1 = hbp0 + i[1] * hbstride[1]; |
279 | 0 | for (i[2] = 0; i[2] < dim[2]; i[2]++) |
280 | 0 | { |
281 | 0 | for (x = 0; x < dim[3]; x++) |
282 | 0 | if (ap1[x] < bp1[x]) { |
283 | 0 | hap1[x] = 1; |
284 | 0 | hbp1[x] = 0; |
285 | 0 | } else if (ap1[x] > bp1[x]) { |
286 | 0 | hap1[x] = 0; |
287 | 0 | hbp1[x] = 1; |
288 | 0 | } else |
289 | 0 | hap1[x] = hbp1[x] = 1; |
290 | 0 | ap1 += astride[2]; |
291 | 0 | bp1 += bstride[2]; |
292 | 0 | hap1 += hastride[2]; |
293 | 0 | hbp1 += hbstride[2]; |
294 | 0 | } |
295 | 0 | } |
296 | 0 | } |
297 | 0 | } |
298 | 0 | return CCV_NNC_EXEC_SUCCESS; |
299 | 5 | } |
300 | | |
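A companion sketch for the backward kernel (again not part of this file, same assumed easy-API helpers, with CMD_MIN_BACKWARD() and the shapes, values, and function name min_backward_sketch being illustrative): the inputs are the incoming gradient g followed by the original operands a and b, and the outputs ha and hb receive g wherever the corresponding operand was the smaller one, with ties copying g into both. When g is omitted (passed as 0), the kernel writes a 0/1 selection mask instead, as in the else branch above.

static void min_backward_sketch(void)
{
	ccv_nnc_init();
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
	int i;
	for (i = 0; i < 2 * 2 * 3; i++)
		a->data.f32[i] = i, b->data.f32[i] = 11 - i, g->data.f32[i] = 1;
	// inputs: g, a, b; outputs: ha, hb. Where a < b the gradient flows to ha,
	// where a > b it flows to hb, and where a == b it is copied to both.
	ccv_nnc_cmd_exec(CMD_MIN_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(ha, hb), 0);
	// Without an incoming gradient the kernel writes the 0/1 selection mask instead.
	ccv_nnc_cmd_exec(CMD_MIN_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a, b), TENSOR_LIST(ha, hb), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(g);
	ccv_nnc_tensor_free(ha);
	ccv_nnc_tensor_free(hb);
}
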
301 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MIN_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
302 | 1 | { |
303 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
304 | 1 | registry->tensor_datatypes = CCV_32F; |
305 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
306 | 1 | registry->algorithms = 1; |
307 | 1 | registry->exec = _ccv_nnc_min_forw; |
308 | 1 | } |
309 | | |
310 | | REGISTER_COMMAND_BACKEND(CCV_NNC_MIN_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
311 | 1 | { |
312 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
313 | 1 | registry->tensor_datatypes = CCV_32F; |
314 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
315 | 1 | registry->algorithms = 1; |
316 | 1 | registry->exec = _ccv_nnc_min_back; |
317 | 1 | } |