/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/nms/ccv_nnc_nms_cpu_ref.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
// A 5-float box record laid out contiguously as [score, x, y, w, h], so an
// m x 5 float matrix can be reinterpreted as an array of float5 for sorting.
typedef struct {
	float v[5];
} float5;
// Comparator: sort descending by score (v[0]).
#define less_than(a, b, aux) ((a).v[0] > (b).v[0])
// Swap two float5 records and mirror the swap in the parallel index array
// `aux`, so aux keeps tracking each record's original row position.
#define swap_func(a, b, array, aux, t) do { \
	(t) = (a); \
	(a) = (b); \
	(b) = (t); \
	int _t = aux[&(a) - array]; \
	aux[&(a) - array] = aux[&(b) - array]; \
	aux[&(b) - array] = _t; \
} while (0)
// Instantiates _ccv_nnc_nms_sortby_f5_32f(float5* array, int n, int* aux):
// quicksort of float5 records descending by score, permuting aux alongside.
CCV_IMPLEMENT_QSORT_EX(_ccv_nnc_nms_sortby_f5_32f, float5, less_than, swap_func, int*)
#undef less_than
#undef swap_func
28 | | |
29 | | static int _ccv_nnc_nms_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
30 | 5 | { |
31 | 5 | assert(input_size == 1); |
32 | 5 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
33 | 5 | assert(output_size == 2); |
34 | 5 | ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0]; |
35 | 5 | ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[1]; |
36 | 5 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
37 | 5 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
38 | 5 | const int c_nd = ccv_nnc_tensor_nd(c->info.dim); |
39 | 5 | assert(a_nd == b_nd); |
40 | 5 | int i; |
41 | 15 | for (i = 0; i < a_nd; i++10 ) |
42 | 10 | { assert(a->info.dim[i] == b->info.dim[i]); } |
43 | 5 | const int n = a_nd >= 3 ? a->info.dim[0]0 : 1; |
44 | 5 | const int aninc = CCV_IS_TENSOR_VIEW(a) ? (1 a_nd >= 31 ? a->stride[0]0 : 01 ) : (4 a_nd >= 34 ? a->info.dim[1] * a->info.dim[2]0 : 04 ); |
45 | 5 | const int bninc = CCV_IS_TENSOR_VIEW(b) ? (1 b_nd >= 31 ? b->stride[0]0 : 01 ) : (4 b_nd >= 34 ? b->info.dim[1] * b->info.dim[2]0 : 04 ); |
46 | 5 | const int cninc = CCV_IS_TENSOR_VIEW(c) ? (0 c_nd >= 20 ? c->stride[0]0 : 00 ) : (c_nd >= 2 ? c->info.dim[1]0 : 0); |
47 | 5 | const int m = a_nd >= 3 ? a->info.dim[1]0 : a->info.dim[0]; |
48 | 5 | if (c_nd == 1) |
49 | 5 | { assert(m == c->info.dim[0]); } |
50 | 0 | else |
51 | 0 | { assert(c_nd == 2 && n == c->info.dim[0] && m == c->info.dim[1]); } |
52 | 5 | const int aminc = CCV_IS_TENSOR_VIEW(a) ? a->stride[a_nd - 2]1 : a->info.dim[a_nd - 1]4 ; |
53 | 5 | const int bminc = CCV_IS_TENSOR_VIEW(b) ? b->stride[b_nd - 2]1 : b->info.dim[b_nd - 1]4 ; |
54 | 5 | const int d = a_nd <= 1 ? 10 : a->info.dim[a_nd - 1]; |
55 | 5 | const float iou_threshold = cmd.info.nms.iou_threshold; |
56 | 5 | if (d == 5 && aminc == 5 && aminc == bminc4 ) // If it is 5, we can use our quick sort implementation. |
57 | 4 | { |
58 | 4 | parallel_for(i, n) |
59 | 4 | { |
60 | 4 | int x, y; |
61 | 4 | const float* const ap = a->data.f32 + i * aninc; |
62 | 4 | float* const bp = b->data.f32 + i * bninc; |
63 | 4 | int* const cp = c->data.i32 + i * cninc; |
64 | 1.03k | for (x = 0; x < m; x++1.03k ) |
65 | 1.03k | cp[x] = x; |
66 | 5.15k | for (x = 0; x < m * d; x++5.15k ) |
67 | 5.15k | bp[x] = ap[x]; |
68 | 4 | _ccv_nnc_nms_sortby_f5_32f((float5*)bp, m, cp); |
69 | 1.03k | for (x = 0; x < m; x++1.03k ) |
70 | 1.03k | { |
71 | 1.03k | float v = bp[x * 5]; |
72 | 1.03k | if (v == -FLT_MAX) // Suppressed. |
73 | 510 | continue; |
74 | 520 | const float area1 = bp[x * 5 + 3] * bp[x * 5 + 4]; |
75 | 250k | for (y = x + 1; y < m; y++250k ) |
76 | 250k | { |
77 | 250k | const float u = bp[y * 5]; |
78 | 250k | if (u == -FLT_MAX) // Suppressed. |
79 | 0 | continue; |
80 | 250k | const float area2 = bp[y * 5 + 3] * bp[y * 5 + 4]; |
81 | 250k | const float xdiff = ccv_max(0, ccv_min(bp[x * 5 + 1] + bp[x * 5 + 3], bp[y * 5 + 1] + bp[y * 5 + 3]) - ccv_max(bp[x * 5 + 1], bp[y * 5 + 1])); |
82 | 250k | const float ydiff = ccv_max(0, ccv_min(bp[x * 5 + 2] + bp[x * 5 + 4], bp[y * 5 + 2] + bp[y * 5 + 4]) - ccv_max(bp[x * 5 + 2], bp[y * 5 + 2])); |
83 | 250k | const float intersection = xdiff * ydiff; |
84 | 250k | const float iou = intersection / (area1 + area2 - intersection); |
85 | 250k | if (iou >= iou_threshold) |
86 | 510 | bp[y * 5] = -FLT_MAX; |
87 | 250k | } |
88 | 520 | } |
89 | | // Move these values up and move suppressed to the end. |
90 | 1.03k | for (x = 0, y = 0; x < m; x++1.03k ) |
91 | 1.03k | if (bp[x * 5] != -FLT_MAX) |
92 | 520 | { |
93 | 520 | int j; |
94 | 520 | if (x != y) |
95 | 507 | { |
96 | 3.04k | for (j = 0; j < 5; j++2.53k ) |
97 | 2.53k | bp[y * 5 + j] = bp[x * 5 + j]; |
98 | 507 | cp[y] = cp[x]; |
99 | 507 | } |
100 | 520 | ++y; |
101 | 520 | } |
102 | 514 | for (x = y; x < m; x++510 ) |
103 | 510 | cp[x] = -1, bp[x * 5] = -FLT_MAX; |
104 | 4 | } parallel_endfor |
105 | 4 | } else { |
106 | | // Otherwise, fall to use selection sort. |
107 | 1 | parallel_for(i, n) |
108 | 1 | { |
109 | 1 | int x, y; |
110 | 1 | const float* const ap = a->data.f32 + i * aninc; |
111 | 1 | float* const bp = b->data.f32 + i * bninc; |
112 | 1 | int* const cp = c->data.i32 + i * cninc; |
113 | 11 | for (x = 0; x < m; x++10 ) |
114 | 10 | cp[x] = x; |
115 | 11 | for (x = 0; x < m; x++10 ) |
116 | 60 | for (y = 0; 10 y < d; y++50 ) |
117 | 50 | bp[x * bminc + y] = ap[x * aminc + y]; |
118 | 11 | for (x = 0; x < m; x++10 ) |
119 | 10 | { |
120 | 10 | float v = bp[x * bminc]; |
121 | 10 | int k = x; |
122 | 55 | for (y = x + 1; y < m; y++45 ) |
123 | 45 | { |
124 | 45 | const float u = bp[y * bminc]; |
125 | 45 | if (u > v) |
126 | 25 | k = y, v = u; |
127 | 45 | } |
128 | 60 | for (y = 0; y < d; y++50 ) |
129 | 50 | { |
130 | 50 | const float t = bp[k * bminc + y]; |
131 | 50 | bp[k * bminc + y] = bp[x * bminc + y]; |
132 | 50 | bp[x * bminc + y] = t; |
133 | 50 | const int u = cp[k]; |
134 | 50 | cp[k] = cp[x]; |
135 | 50 | cp[x] = u; |
136 | 50 | } |
137 | 10 | } |
138 | 11 | for (x = 0; x < m; x++10 ) |
139 | 10 | { |
140 | 10 | float v = bp[x * bminc]; |
141 | 10 | if (v == -FLT_MAX) // Suppressed. |
142 | 0 | continue; |
143 | 10 | const float area1 = bp[x * bminc + 3] * bp[x * bminc + 4]; |
144 | 55 | for (y = x + 1; y < m; y++45 ) |
145 | 45 | { |
146 | 45 | const float u = bp[y * bminc]; |
147 | 45 | if (u == -FLT_MAX) // Suppressed. |
148 | 0 | continue; |
149 | 45 | const float area2 = bp[y * bminc + 3] * bp[y * bminc + 4]; |
150 | 45 | const float xdiff = ccv_max(0, ccv_min(bp[x * bminc + 1] + bp[x * bminc + 3], bp[y * bminc + 1] + bp[y * bminc + 3]) - ccv_max(bp[x * bminc + 1], bp[y * bminc + 1])); |
151 | 45 | const float ydiff = ccv_max(0, ccv_min(bp[x * bminc + 2] + bp[x * bminc + 4], bp[y * bminc + 2] + bp[y * bminc + 4]) - ccv_max(bp[x * bminc + 2], bp[y * bminc + 2])); |
152 | 45 | const float intersection = xdiff * ydiff; |
153 | 45 | const float iou = intersection / (area1 + area2 - intersection); |
154 | 45 | if (iou >= iou_threshold) |
155 | 0 | bp[y * bminc] = -FLT_MAX; |
156 | 45 | } |
157 | 10 | } |
158 | 11 | for (x = 0, y = 0; x < m; x++10 ) |
159 | 10 | if (bp[x * bminc] != -FLT_MAX) |
160 | 10 | { |
161 | 10 | int j; |
162 | 10 | if (x != y) |
163 | 0 | { |
164 | 0 | for (j = 0; j < 5; j++) |
165 | 0 | bp[y * bminc + j] = bp[x * bminc + j]; |
166 | 0 | cp[y] = cp[x]; |
167 | 0 | } |
168 | 10 | ++y; |
169 | 10 | } |
170 | 1 | for (x = y; x < m; x++0 ) |
171 | 0 | cp[x] = -1, bp[x * bminc] = -FLT_MAX; |
172 | 1 | } parallel_endfor |
173 | 1 | } |
174 | 5 | return CCV_NNC_EXEC_SUCCESS; |
175 | 5 | } |
176 | | |
// Backward non-maximum suppression: scatter incoming gradient rows back to
// their original (pre-sort) box positions.
//
// inputs[0]  (a): gradient w.r.t. the forward output b, i.e. rows are in the
//                 sorted/compacted order produced by the forward pass.
// inputs[4]  (c): the int32 index tensor saved by the forward pass; c[x] is
//                 the original row of the x-th surviving box, -1 past the end.
// outputs[0] (b): gradient w.r.t. the forward input a; rows belonging to
//                 suppressed boxes are left zero.
static int _ccv_nnc_nms_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 5);
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	const ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[4];
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	const int c_nd = ccv_nnc_tensor_nd(c->info.dim);
	assert(a_nd == b_nd);
	int i;
	for (i = 0; i < a_nd; i++)
		{ assert(a->info.dim[i] == b->info.dim[i]); }
	// Batch count and per-batch element increments (views honor strides),
	// mirroring the layout logic of the forward pass.
	const int n = a_nd >= 3 ? a->info.dim[0] : 1;
	const int aninc = CCV_IS_TENSOR_VIEW(a) ? (a_nd >= 3 ? a->stride[0] : 0) : (a_nd >= 3 ? a->info.dim[1] * a->info.dim[2] : 0);
	const int bninc = CCV_IS_TENSOR_VIEW(b) ? (b_nd >= 3 ? b->stride[0] : 0) : (b_nd >= 3 ? b->info.dim[1] * b->info.dim[2] : 0);
	const int cninc = CCV_IS_TENSOR_VIEW(c) ? (c_nd >= 2 ? c->stride[0] : 0) : (c_nd >= 2 ? c->info.dim[1] : 0);
	const int m = a_nd >= 3 ? a->info.dim[1] : a->info.dim[0]; // Number of boxes per batch item.
	if (c_nd == 1)
		{ assert(m == c->info.dim[0]); }
	else
		{ assert(c_nd == 2 && n == c->info.dim[0] && m == c->info.dim[1]); }
	// Per-row increments and record width.
	const int aminc = CCV_IS_TENSOR_VIEW(a) ? a->stride[a_nd - 2] : a->info.dim[a_nd - 1];
	const int bminc = CCV_IS_TENSOR_VIEW(b) ? b->stride[b_nd - 2] : b->info.dim[b_nd - 1];
	const int d = a_nd <= 1 ? 1 : a->info.dim[a_nd - 1];
	parallel_for(i, n) {
		int x, y;
		const float* const ap = a->data.f32 + i * aninc;
		const int* const cp = c->data.i32 + i * cninc;
		float* const bp = b->data.f32 + i * bninc;
		// Zero the whole output first; suppressed boxes get no gradient.
		for (x = 0; x < m; x++)
			for (y = 0; y < d; y++)
				bp[x * bminc + y] = 0;
		// Route each surviving row's gradient back to its original position.
		// The index list is -1-terminated after the last survivor.
		for (x = 0; x < m; x++)
		{
			const int k = cp[x];
			if (k < 0)
				break;
			for (y = 0; y < d; y++)
				bp[k * bminc + y] = ap[x * aminc + y];
		}
	} parallel_endfor
	return CCV_NNC_EXEC_SUCCESS;
}
223 | | |
224 | | REGISTER_COMMAND_BACKEND(CCV_NNC_NMS_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
225 | 1 | { |
226 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
227 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
228 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
229 | 1 | registry->algorithms = 1; |
230 | 1 | registry->exec = _ccv_nnc_nms_forw; |
231 | 1 | } |
232 | | |
233 | | REGISTER_COMMAND_BACKEND(CCV_NNC_NMS_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
234 | 1 | { |
235 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
236 | 1 | registry->tensor_datatypes = CCV_32F | CCV_32S; |
237 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
238 | 1 | registry->algorithms = 1; |
239 | 1 | registry->exec = _ccv_nnc_nms_back; |
240 | 1 | } |