/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/roi/ccv_nnc_roi_align_cpu_ref.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
// Per-sample bilinear interpolation coefficients along one axis (y or x).
typedef struct {
	int i0, i1, mute; // i0/i1: clamped lower/upper source indices; mute != 0 when the sample's whole 2-tap support lies outside the feature map.
	float r; // Fractional weight toward i1 (1 - r goes to i0).
} roi_align_coeffs_t;
17 | | |
18 | | static void _ccv_nnc_bilinear_coeffs(ccv_nnc_stream_context_t* const stream_context, const int h, const int w, const float roi_y, const float roi_x, const float roi_h, const float roi_w, const int pool_h, const int pool_w, int* const bin_h_ref, int* const bin_w_ref, roi_align_coeffs_t** const y_coeffs_ref, roi_align_coeffs_t** const x_coeffs_ref, int** const bin_h_at_y_ref, int** const bin_w_at_x_ref, int* const start_h_ref, int* const start_w_ref, int* const end_h_ref, int* const end_w_ref) |
19 | 16 | { |
20 | 16 | const int bin_h = (int)ceilf(roi_h / pool_h); // How many bins in each point of the pool. We slightly sampling at higher resolution (due to ceiling) with bilinear interpolation. |
21 | 16 | const int bin_w = (int)ceilf(roi_w / pool_w); |
22 | 16 | const int bin_pool_h = bin_h * pool_h; // Before averaging, what's the size of the region in integral term. |
23 | 16 | const int bin_pool_w = bin_w * pool_w; |
24 | 16 | const float scale_y = roi_h / bin_pool_h; // The scale to multiply back to get original coordinate. |
25 | 16 | const float scale_x = roi_w / bin_pool_w; |
26 | 16 | int x, y, i, j; |
27 | 16 | roi_align_coeffs_t* const y_coeffs = (roi_align_coeffs_t*)ccv_nnc_stream_context_get_workspace(stream_context, sizeof(roi_align_coeffs_t) * (bin_pool_h + bin_pool_w) + sizeof(int) * (pool_h + pool_w), CCV_TENSOR_CPU_MEMORY); |
28 | 16 | roi_align_coeffs_t* const x_coeffs = y_coeffs + bin_pool_h; |
29 | 16 | int* const bin_h_at_y = (int*)(x_coeffs + bin_pool_w); |
30 | 16 | int* const bin_w_at_x = bin_h_at_y + pool_h; |
31 | 80 | for (i = 0; i < pool_h; i++64 ) |
32 | 64 | { |
33 | 64 | const int pi = i * bin_h; |
34 | 64 | int count = 0; |
35 | 456 | for (y = 0; y < bin_h; y++392 ) |
36 | 392 | { |
37 | 392 | const float ay = roi_y + (y + pi + 0.5) * scale_y - 0.5; |
38 | 392 | const int iy = (int)floorf(ay); |
39 | 392 | const float ry = ay - iy; |
40 | 392 | const int iy0 = ccv_clamp(iy, 0, h - 1); |
41 | 392 | const int iy1 = ccv_clamp(iy + 1, 0, h - 1); |
42 | 392 | y_coeffs[pi + y].i0 = iy0; |
43 | 392 | y_coeffs[pi + y].i1 = iy1; |
44 | 392 | y_coeffs[pi + y].r = ry; |
45 | 392 | const int mute = (iy + 1 < 0 || iy > h - 1); |
46 | 392 | y_coeffs[pi + y].mute = mute; |
47 | 392 | if (!mute) |
48 | 392 | ++count; |
49 | 392 | } |
50 | 64 | bin_h_at_y[i] = count; |
51 | 64 | } |
52 | 16 | int start_h = pool_h; |
53 | 32 | for (i = 0; start_h == pool_h && i < pool_h16 ; i++16 ) |
54 | 16 | if (bin_h_at_y[i] > 0) |
55 | 16 | start_h = i; |
56 | 16 | int end_h = 0; |
57 | 32 | for (i = pool_h - 1; end_h == 0 && i >= 016 ; i--16 ) |
58 | 16 | if (bin_h_at_y[i] > 0) |
59 | 16 | end_h = i + 1; |
60 | 80 | for (j = 0; j < pool_w; j++64 ) |
61 | 64 | { |
62 | 64 | const int pj = j * bin_w; |
63 | 64 | int count = 0; |
64 | 528 | for (x = 0; x < bin_w; x++464 ) |
65 | 464 | { |
66 | 464 | const float ax = roi_x + (x + pj + 0.5) * scale_x - 0.5; |
67 | 464 | const int ix = (int)floorf(ax); |
68 | 464 | const float rx = ax - ix; |
69 | 464 | const int ix0 = ccv_clamp(ix, 0, w - 1); |
70 | 464 | const int ix1 = ccv_clamp(ix + 1, 0, w - 1); |
71 | 464 | x_coeffs[pj + x].i0 = ix0; |
72 | 464 | x_coeffs[pj + x].i1 = ix1; |
73 | 464 | x_coeffs[pj + x].r = rx; |
74 | 464 | const int mute = (ix + 1 < 0 || ix > w - 1); |
75 | 464 | x_coeffs[pj + x].mute = mute; |
76 | 464 | if (!mute) |
77 | 464 | ++count; |
78 | 464 | } |
79 | 64 | bin_w_at_x[j] = count; |
80 | 64 | } |
81 | 16 | int start_w = pool_w; |
82 | 32 | for (j = 0; start_w == pool_w && j < pool_w16 ; j++16 ) |
83 | 16 | if (bin_w_at_x[j] > 0) |
84 | 16 | start_w = j; |
85 | 16 | int end_w = 0; |
86 | 32 | for (j = pool_w - 1; end_w == 0 && j >= 016 ; j--16 ) |
87 | 16 | if (bin_w_at_x[j] > 0) |
88 | 16 | end_w = j + 1; |
89 | 16 | *bin_h_ref = bin_h; |
90 | 16 | *bin_w_ref = bin_w; |
91 | 16 | *y_coeffs_ref = y_coeffs; |
92 | 16 | *x_coeffs_ref = x_coeffs; |
93 | 16 | *bin_h_at_y_ref = bin_h_at_y; |
94 | 16 | *bin_w_at_x_ref = bin_w_at_x; |
95 | 16 | *start_h_ref = start_h; |
96 | 16 | *start_w_ref = start_w; |
97 | 16 | *end_h_ref = end_h; |
98 | 16 | *end_w_ref = end_w; |
99 | 16 | } |
100 | | |
101 | | static int _ccv_nnc_roi_align_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
102 | 7 | { |
103 | 7 | assert(input_size == 2); |
104 | 7 | const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0]; |
105 | 7 | assert(output_size == 1); |
106 | 7 | const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[1]; |
107 | 7 | ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0]; |
108 | 7 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
109 | 7 | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
110 | 7 | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim4 : a->info.dim + 13 ; |
111 | 7 | const int h = adim[0]; |
112 | 7 | const int w = adim[1]; |
113 | 7 | const int c_nd = ccv_nnc_tensor_nd(c->info.dim); |
114 | 7 | assert(c_nd == CCV_NNC_MAX_DIM + 1 || c_nd == CCV_NNC_MAX_DIM + 2); |
115 | 7 | const int* cdim = (c_nd == CCV_NNC_MAX_DIM + 1) ? c->info.dim4 : c->info.dim + 13 ; |
116 | 7 | const int pool_h = cdim[0]; |
117 | 7 | const int pool_w = cdim[1]; |
118 | 7 | assert(cdim[2] == adim[2]); |
119 | 7 | const int ch = cdim[2]; |
120 | 7 | const float* const ap = a->data.f32; |
121 | 7 | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
122 | 7 | ccv_nnc_tensor_view_get_stride(a, astride); |
123 | 7 | const float* const bp = b->data.f32; |
124 | 7 | float* cp = c->data.f32; |
125 | 7 | int cstride[CCV_NNC_MAX_DIM_ALLOC]; |
126 | 7 | ccv_nnc_tensor_view_get_stride(c, cstride); |
127 | 7 | const int a_n = ccv_nnc_tensor_get_n(a->info); |
128 | 7 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
129 | 7 | assert(b_nd == 1 || b_nd == 2); |
130 | 7 | const int b_n = b_nd == 1 ? 16 : b->info.dim[0]1 ; |
131 | 7 | const int c_n = ccv_nnc_tensor_get_n(c->info); |
132 | 7 | assert(c_n == ccv_max(a_n, b_n)); |
133 | 7 | const int aninc = a_nd == CCV_NNC_MAX_DIM + 1 ? 04 : astride[0]3 ; |
134 | 7 | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
135 | 7 | ccv_nnc_tensor_view_get_stride(b, bstride); |
136 | 7 | const int bninc = b_nd == 1 ? 06 : bstride[1 CCV_NNC_MAX_DIM1 + 2 - b_nd]; |
137 | 7 | const int cninc = c_nd == CCV_NNC_MAX_DIM + 1 ? 04 : cstride[0]3 ; |
138 | 7 | ccv_nnc_tensor_zero(c); |
139 | 7 | int bin_h, bin_w; |
140 | 7 | roi_align_coeffs_t* y_coeffs; |
141 | 7 | roi_align_coeffs_t* x_coeffs; |
142 | 7 | int* bin_h_at_y; |
143 | 7 | int* bin_w_at_x; |
144 | 7 | int start_h, start_w, end_h, end_w; |
145 | 7 | int n; |
146 | 17 | for (n = 0; n < c_n; n++10 ) |
147 | 10 | { |
148 | 10 | const float* const apn = ap + (n % a_n) * aninc; |
149 | 10 | float* cpn = cp + n * cninc; |
150 | 10 | const float roi_x = bp[(n % b_n) * bninc] * w; // These assumed it is real-coordinate, with range between 0 to w - 1. |
151 | 10 | const float roi_y = bp[(n % b_n) * bninc + 1] * h; |
152 | 10 | const float roi_w = bp[(n % b_n) * bninc + 2] * w; |
153 | 10 | const float roi_h = bp[(n % b_n) * bninc + 3] * h; |
154 | | // Re-compute the offsets if b changes or it is the first time. |
155 | 10 | if ((b_n == 1 && n == 08 ) || b_n > 14 ) |
156 | 8 | _ccv_nnc_bilinear_coeffs(stream_context, h, w, roi_y, roi_x, roi_h, roi_w, pool_h, pool_w, &bin_h, &bin_w, &y_coeffs, &x_coeffs, &bin_h_at_y, &bin_w_at_x, &start_h, &start_w, &end_h, &end_w); |
157 | 10 | int i, j, x, y, k; |
158 | 50 | for (i = start_h; i < end_h; i++40 ) |
159 | 40 | { |
160 | 40 | const int pi = i * bin_h; |
161 | 40 | const int bin_hz = bin_h_at_y[i]; |
162 | 200 | for (j = start_w; j < end_w; j++160 ) |
163 | 160 | { |
164 | 160 | const int pj = j * bin_w; |
165 | 160 | const int bin_wz = bin_w_at_x[j]; |
166 | 160 | const float inv = 1.0 / (bin_hz * bin_wz); |
167 | 160 | float* const cpz = cpn + j * cstride[CCV_NNC_MAX_DIM]; |
168 | 1.12k | for (y = 0; y < bin_h; y++960 ) |
169 | 960 | { |
170 | 960 | if (y_coeffs[pi + y].mute) |
171 | 0 | continue; |
172 | 960 | const float ry = y_coeffs[pi + y].r; |
173 | 960 | const int iy0 = y_coeffs[pi + y].i0; |
174 | 960 | const int iy1 = y_coeffs[pi + y].i1; |
175 | 8.25k | for (x = 0; x < bin_w; x++7.29k ) |
176 | 7.29k | { |
177 | 7.29k | if (x_coeffs[pj + x].mute) |
178 | 0 | continue; |
179 | 7.29k | const float rx = x_coeffs[pj + x].r; |
180 | 7.29k | const int ix0 = x_coeffs[pj + x].i0; |
181 | 7.29k | const int ix1 = x_coeffs[pj + x].i1; |
182 | 7.29k | const float c00 = (1 - ry) * (1 - rx); |
183 | 7.29k | const float c01 = (1 - ry) * rx; |
184 | 7.29k | const float c10 = ry * (1 - rx); |
185 | 7.29k | const float c11 = ry * rx; |
186 | 7.29k | const float* const ap00 = apn + iy0 * astride[CCV_NNC_MAX_DIM - 1] + ix0 * astride[CCV_NNC_MAX_DIM]; |
187 | 7.29k | const float* const ap01 = apn + iy0 * astride[CCV_NNC_MAX_DIM - 1] + ix1 * astride[CCV_NNC_MAX_DIM]; |
188 | 7.29k | const float* const ap10 = apn + iy1 * astride[CCV_NNC_MAX_DIM - 1] + ix0 * astride[CCV_NNC_MAX_DIM]; |
189 | 7.29k | const float* const ap11 = apn + iy1 * astride[CCV_NNC_MAX_DIM - 1] + ix1 * astride[CCV_NNC_MAX_DIM]; |
190 | 797k | for (k = 0; k < ch; k++789k ) |
191 | 789k | cpz[k] += ap00[k] * c00 + ap01[k] * c01 + ap10[k] * c10 + ap11[k] * c11; |
192 | 7.29k | } |
193 | 960 | } |
194 | 12.6k | for (k = 0; k < ch; k++12.4k ) |
195 | 12.4k | cpz[k] *= inv; |
196 | 160 | } |
197 | 40 | cpn += cstride[CCV_NNC_MAX_DIM - 1]; |
198 | 40 | } |
199 | 10 | } |
200 | 7 | return CCV_NNC_EXEC_SUCCESS; |
201 | 7 | } |
202 | | |
// RoI align backward pass (CPU reference, NHWC float32).
// inputs[0] (g): gradient w.r.t. the pooled output (pool_h x pool_w x ch).
// inputs[2] (b): the same normalized (x, y, w, h) RoI tensor as the forward
// pass. outputs[0] (o): gradient w.r.t. the input feature map, accumulated by
// scattering each pooled-cell gradient back through the bilinear weights.
// Batch dims broadcast the same way as forward: g_n == max(o_n, b_n).
static int _ccv_nnc_roi_align_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 3);
	const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* o = (ccv_nnc_tensor_view_t*)outputs[0];
	const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
	assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
	// Skip the batch dimension (when present) to reach the HWC dims.
	const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1;
	const int pool_h = gdim[0];
	const int pool_w = gdim[1];
	const int o_nd = ccv_nnc_tensor_nd(o->info.dim);
	assert(o_nd == CCV_NNC_MAX_DIM + 1 || o_nd == CCV_NNC_MAX_DIM + 2);
	const int* odim = (o_nd == CCV_NNC_MAX_DIM + 1) ? o->info.dim : o->info.dim + 1;
	const int h = odim[0];
	const int w = odim[1];
	assert(gdim[2] == odim[2]); // Channel counts must match.
	const int ch = gdim[2];
	float* gp = g->data.f32;
	int gstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(g, gstride);
	float* op = o->data.f32;
	int ostride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(o, ostride);
	const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
	const float* const bp = b->data.f32;
	const int o_n = ccv_nnc_tensor_get_n(o->info);
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	assert(b_nd == 1 || b_nd == 2);
	const int b_n = b_nd == 1 ? 1 : b->info.dim[0];
	const int g_n = ccv_nnc_tensor_get_n(g->info);
	assert(g_n == ccv_max(o_n, b_n));
	// Per-batch element increments; 0 when the tensor has no batch dimension.
	const int oninc = o_nd == CCV_NNC_MAX_DIM + 1 ? 0 : ostride[0];
	int bstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(b, bstride);
	const int bninc = b_nd == 1 ? 0 : bstride[CCV_NNC_MAX_DIM + 2 - b_nd];
	const int gninc = g_nd == CCV_NNC_MAX_DIM + 1 ? 0 : gstride[0];
	int bin_h, bin_w;
	roi_align_coeffs_t* y_coeffs;
	roi_align_coeffs_t* x_coeffs;
	int* bin_h_at_y;
	int* bin_w_at_x;
	int start_h, start_w, end_h, end_w;
	int n;
	ccv_nnc_tensor_zero(o); // Gradients accumulate below; start from zero.
	for (n = 0; n < g_n; n++)
	{
		const float roi_x = bp[(n % b_n) * bninc] * w; // These assumed it is real-coordinate, with range between 0 to w - 1.
		const float roi_y = bp[(n % b_n) * bninc + 1] * h;
		const float roi_w = bp[(n % b_n) * bninc + 2] * w;
		const float roi_h = bp[(n % b_n) * bninc + 3] * h;
		// Re-compute the offsets if b changes or it is the first time.
		if ((b_n == 1 && n == 0) || b_n > 1)
			_ccv_nnc_bilinear_coeffs(stream_context, h, w, roi_y, roi_x, roi_h, roi_w, pool_h, pool_w, &bin_h, &bin_w, &y_coeffs, &x_coeffs, &bin_h_at_y, &bin_w_at_x, &start_h, &start_w, &end_h, &end_w);
		const float* gpn = gp + n * gninc;
		float* const opn = op + (n % o_n) * oninc;
		int x, y, i, j, k;
		// Unlike forward, iterate the full pool: muted samples are skipped in
		// the inner loops, so fully-muted cells contribute nothing.
		for (i = 0; i < pool_h; i++)
		{
			const int pi = i * bin_h;
			const int bin_hz = bin_h_at_y[i];
			for (j = 0; j < pool_w; j++)
			{
				const int pj = j * bin_w;
				const int bin_wz = bin_w_at_x[j];
				// Same normalization as forward. If bin_hz * bin_wz == 0 this is
				// inf, but then every sample is muted and inv is never used.
				const float inv = 1.0 / (bin_hz * bin_wz);
				const float* const gpz = gpn + j * gstride[CCV_NNC_MAX_DIM];
				for (y = 0; y < bin_h; y++)
				{
					if (y_coeffs[pi + y].mute)
						continue;
					const float ry = y_coeffs[pi + y].r;
					const int iy0 = y_coeffs[pi + y].i0;
					const int iy1 = y_coeffs[pi + y].i1;
					for (x = 0; x < bin_w; x++)
					{
						if (x_coeffs[pj + x].mute)
							continue;
						const float rx = x_coeffs[pj + x].r;
						const int ix0 = x_coeffs[pj + x].i0;
						const int ix1 = x_coeffs[pj + x].i1;
						// Bilinear weights for the 2x2 source neighborhood.
						const float c00 = (1 - ry) * (1 - rx);
						const float c01 = (1 - ry) * rx;
						const float c10 = ry * (1 - rx);
						const float c11 = ry * rx;
						float* const op00 = opn + iy0 * ostride[CCV_NNC_MAX_DIM - 1] + ix0 * ostride[CCV_NNC_MAX_DIM];
						float* const op01 = opn + iy0 * ostride[CCV_NNC_MAX_DIM - 1] + ix1 * ostride[CCV_NNC_MAX_DIM];
						float* const op10 = opn + iy1 * ostride[CCV_NNC_MAX_DIM - 1] + ix0 * ostride[CCV_NNC_MAX_DIM];
						float* const op11 = opn + iy1 * ostride[CCV_NNC_MAX_DIM - 1] + ix1 * ostride[CCV_NNC_MAX_DIM];
						// Scatter the averaged gradient to the four source taps.
						for (k = 0; k < ch; k++)
						{
							op00[k] += gpz[k] * c00 * inv;
							op01[k] += gpz[k] * c01 * inv;
							op10[k] += gpz[k] * c10 * inv;
							op11[k] += gpz[k] * c11 * inv;
						}
					}
				}
			}
			gpn += gstride[CCV_NNC_MAX_DIM - 1];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
307 | | |
308 | | REGISTER_COMMAND_BACKEND(CCV_NNC_ROI_ALIGN_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
309 | 1 | { |
310 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC; |
311 | 1 | registry->tensor_datatypes = CCV_32F; |
312 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
313 | 1 | registry->algorithms = 1; |
314 | 1 | registry->exec = _ccv_nnc_roi_align_forw; |
315 | 1 | } |
316 | | |
317 | | REGISTER_COMMAND_BACKEND(CCV_NNC_ROI_ALIGN_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
318 | 1 | { |
319 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC; |
320 | 1 | registry->tensor_datatypes = CCV_32F; |
321 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
322 | 1 | registry->algorithms = 1; |
323 | 1 | registry->exec = _ccv_nnc_roi_align_back; |
324 | 1 | } |