/home/liu/actions-runner/_work/ccv/ccv/lib/ccv_numeric.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include <complex.h> |
4 | | #ifdef HAVE_FFTW3 |
5 | | #include <pthread.h> |
6 | | #include <fftw3.h> |
7 | | #else |
8 | | #include "3rdparty/kissfft/kiss_fftndr.h" |
9 | | #include "3rdparty/kissfft/kissf_fftndr.h" |
10 | | #endif |
11 | | |
12 | | const ccv_minimize_param_t ccv_minimize_default_params = { |
13 | | .interp = 0.1, |
14 | | .extrap = 3.0, |
15 | | .max_iter = 20, |
16 | | .ratio = 10.0, |
17 | | .sig = 0.1, |
18 | | .rho = 0.05, |
19 | | }; |
20 | | |
21 | | void ccv_invert(ccv_matrix_t* a, ccv_matrix_t** b, int type) |
22 | 0 | { |
23 | 0 | } |
24 | | |
25 | | void ccv_solve(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type) |
26 | 0 | { |
27 | 0 | } |
28 | | |
29 | | void ccv_eigen(ccv_dense_matrix_t* a, ccv_dense_matrix_t** vector, ccv_dense_matrix_t** lambda, int type, double epsilon) |
30 | 1 | { |
31 | 1 | ccv_declare_derived_signature(vsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(vector)"), a->sig, CCV_EOF_SIGN); |
32 | 1 | ccv_declare_derived_signature(lsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(lambda)"), a->sig, CCV_EOF_SIGN); |
33 | 1 | assert(CCV_GET_CHANNEL(a->type) == 1); |
34 | 1 | type = (type == 0) ? CCV_GET_DATA_TYPE(a->type) | CCV_C1 : CCV_GET_DATA_TYPE0 (type) | CCV_C10 ; |
35 | | // as of now, this function only support real symmetric matrix |
36 | 1 | ccv_dense_matrix_t* dvector = *vector = ccv_dense_matrix_renew(*vector, a->rows, a->cols, CCV_32F | CCV_64F | CCV_C1, type, vsig); |
37 | 1 | ccv_dense_matrix_t* dlambda = *lambda = ccv_dense_matrix_renew(*lambda, 1, a->cols, CCV_32F | CCV_64F | CCV_C1, type, lsig); |
38 | 1 | assert(CCV_GET_DATA_TYPE(dvector->type) == CCV_GET_DATA_TYPE(dlambda->type)); |
39 | 1 | ccv_object_return_if_cached(, dvector, dlambda); |
40 | 1 | double* ja = (double*)ccmalloc(sizeof(double) * a->rows * a->cols); |
41 | 1 | int i, j; |
42 | 1 | unsigned char* aptr = a->data.u8; |
43 | 1 | assert(a->rows > 0 && a->cols > 0); |
44 | 1 | #define for_block(_, _for_get) \ |
45 | 5 | for (i = 0; 1 i < a->rows; i++4 ) \ |
46 | 4 | { \ |
47 | 20 | for (j = 0; j < a->cols; j++16 ) \ |
48 | 16 | ja[i * a->cols + j] = _for_get4 (aptr, j); \ |
49 | 4 | aptr += a->step; \ |
50 | 4 | } |
51 | 1 | ccv_matrix_getter(a->type, for_block); |
52 | 1 | #undef for_block |
53 | 1 | ccv_zero(dvector); |
54 | 1 | ccv_zero(dlambda); |
55 | 1 | unsigned char* dvptr = dvector->data.u8; |
56 | 1 | #define for_block(_, _for_set) \ |
57 | 5 | for (i = 0; 1 i < a->cols; i++4 ) \ |
58 | 1 | _for_set(dvptr, i * a->cols + i, 1); |
59 | 1 | ccv_matrix_setter(dvector->type, for_block); |
60 | 1 | #undef for_block |
61 | 1 | double accuracy = 0; |
62 | 17 | for (i = 0; i < a->rows * a->cols; i++16 ) |
63 | 16 | accuracy += ja[i]; |
64 | 1 | accuracy = sqrt(2 * accuracy); |
65 | 1 | int p, q; |
66 | 1 | unsigned char* dlptr = dlambda->data.u8; |
67 | 1 | int flag = 1; |
68 | 1 | assert(a->rows == a->cols); |
69 | 1 | #define for_block(_, _for_set, _for_get) \ |
70 | 39 | do 1 { \ |
71 | 39 | if (!flag) \ |
72 | 39 | accuracy = accuracy * 0.524 ; \ |
73 | 39 | flag = 0; \ |
74 | 147 | for (p = 0; p < a->rows; p++108 ) \ |
75 | 122 | { \ |
76 | 303 | for (q = p + 1; q < a->cols; q++181 ) \ |
77 | 195 | if (fabs(ja[p * a->cols + q]) > accuracy) \ |
78 | 195 | { \ |
79 | 14 | double x = -ja[p * a->cols + q]; \ |
80 | 14 | double y = (ja[q * a->cols + q] - ja[p * a->cols + p]) * 0.5; \ |
81 | 14 | double omega = (x == 0 && y == 00 ) ? 10 : x / sqrt(x * x + y * y); \ |
82 | 14 | if (y < 0) \ |
83 | 14 | omega = -omega12 ; \ |
84 | 14 | double sn = 1.0 + sqrt(1.0 - omega * omega); \ |
85 | 14 | sn = omega / sqrt(2 * sn); \ |
86 | 14 | double cn = sqrt(1.0 - sn * sn); \ |
87 | 14 | double fpp = ja[p * a->cols + p]; \ |
88 | 14 | double fpq = ja[p * a->cols + q]; \ |
89 | 14 | double fqq = ja[q * a->cols + q]; \ |
90 | 14 | ja[p * a->cols + p] = fpp * cn * cn + fqq * sn * sn + fpq * omega; \ |
91 | 14 | ja[q * a->cols + q] = fpp * sn * sn + fqq * cn * cn - fpq * omega; \ |
92 | 14 | ja[p * a->cols + q] = ja[q * a->cols + p] = 0; \ |
93 | 70 | for (i = 0; i < a->cols; i++56 ) \ |
94 | 56 | if (i != q && i != p42 ) \ |
95 | 56 | { \ |
96 | 28 | fpp = ja[p * a->cols + i]; \ |
97 | 28 | fqq = ja[q * a->cols + i]; \ |
98 | 28 | ja[p * a->cols + i] = fpp * cn + fqq * sn; \ |
99 | 28 | ja[q * a->cols + i] = -fpp * sn + fqq * cn; \ |
100 | 28 | } \ |
101 | 70 | for (i = 0; i < a->rows; i++56 ) \ |
102 | 56 | if (i != q && i != p42 ) \ |
103 | 56 | { \ |
104 | 28 | fpp = ja[i * a->cols + p]; \ |
105 | 28 | fqq = ja[i * a->cols + q]; \ |
106 | 28 | ja[i * a->cols + p] = fpp * cn + fqq * sn; \ |
107 | 28 | ja[i * a->cols + q] = -fpp * sn + fqq * cn; \ |
108 | 28 | } \ |
109 | 70 | for (i = 0; i < a->cols; i++56 ) \ |
110 | 56 | { \ |
111 | 56 | fpp = _for_get(dvptr, p * a->cols + i); \ |
112 | 56 | fqq = _for_get(dvptr, q * a->cols + i); \ |
113 | 56 | _for_set(dvptr, p * a->cols + i, fpp * cn + fqq * sn); \ |
114 | 56 | _for_set(dvptr, q * a->cols + i, -fpp * sn + fqq * cn); \ |
115 | 56 | } \ |
116 | 70 | for (i = 0; i < a->cols; i++56 ) \ |
117 | 14 | _for_set(dlptr, i, ja[i * a->cols + i]); \ |
118 | 14 | flag = 1; \ |
119 | 14 | break; \ |
120 | 14 | } \ |
121 | 122 | if (flag) \ |
122 | 122 | break14 ; \ |
123 | 122 | } \ |
124 | 39 | } while (accuracy > epsilon); |
125 | 1 | ccv_matrix_setter_getter(dvector->type, for_block); |
126 | 1 | #undef for_block |
127 | 1 | ccfree(ja); |
128 | 1 | } |
129 | | |
130 | | void ccv_minimize(ccv_dense_matrix_t* x, int length, double red, ccv_minimize_f func, ccv_minimize_param_t params, void* data) |
131 | 1 | { |
132 | 1 | ccv_dense_matrix_t* df0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0); |
133 | 1 | ccv_zero(df0); |
134 | 1 | ccv_dense_matrix_t* df3 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0); |
135 | 1 | ccv_zero(df3); |
136 | 1 | ccv_dense_matrix_t* dF0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0); |
137 | 1 | ccv_zero(dF0); |
138 | 1 | ccv_dense_matrix_t* s = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0); |
139 | 1 | ccv_zero(s); |
140 | 1 | ccv_dense_matrix_t* x0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0); |
141 | 1 | ccv_zero(x0); |
142 | 1 | ccv_dense_matrix_t* xn = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0); |
143 | 1 | ccv_zero(xn); |
144 | | |
145 | 1 | double F0 = 0, f0 = 0, f1 = 0, f2 = 0, f3 = 0, f4 = 0; |
146 | 1 | double x1 = 0, x2 = 0, x3 = 0, x4 = 0; |
147 | 1 | double d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; |
148 | 1 | double A = 0, B = 0; |
149 | 1 | int ls_failed = 0; |
150 | | |
151 | 1 | int i, j, k; |
152 | 1 | func(x, &f0, df0, data); |
153 | 1 | d0 = 0; |
154 | 1 | unsigned char* df0p = df0->data.u8; |
155 | 1 | unsigned char* sp = s->data.u8; |
156 | 2 | for (i = 0; i < x->rows; i++1 ) |
157 | 1 | { |
158 | 3 | for (j = 0; j < x->cols; j++2 ) |
159 | 2 | { |
160 | 2 | double ss = ccv_get_value(x->type, df0p, j); |
161 | 2 | ccv_set_value(x->type, sp, j, -ss, 0); |
162 | 2 | d0 += -ss * ss; |
163 | 2 | } |
164 | 1 | df0p += x->step; |
165 | 1 | sp += x->step; |
166 | 1 | } |
167 | 1 | x3 = red / (1.0 - d0); |
168 | 1 | int l = (length > 0) ? length : -length0 ; |
169 | 1 | int ls = (length > 0) ? 1 : 00 ; |
170 | 1 | int eh = (length > 0) ? 0 : 10 ; |
171 | 22 | for (k = 0; k < l;) |
172 | 22 | { |
173 | 22 | k += ls; |
174 | 22 | memcpy(x0->data.u8, x->data.u8, x->rows * x->step); |
175 | 22 | memcpy(dF0->data.u8, df0->data.u8, x->rows * x->step); |
176 | 22 | F0 = f0; |
177 | 22 | int m = ccv_min(params.max_iter, (length > 0) ? params.max_iter : l - k); |
178 | 22 | for (;;) |
179 | 80 | { |
180 | 80 | x2 = 0; |
181 | 80 | f3 = f2 = f0; |
182 | 80 | d2 = d0; |
183 | 80 | memcpy(df3->data.u8, df0->data.u8, x->rows * x->step); |
184 | 80 | while (m > 0) |
185 | 80 | { |
186 | 80 | m--; |
187 | 80 | k += eh; |
188 | 80 | unsigned char* sp = s->data.u8; |
189 | 80 | unsigned char* xp = x->data.u8; |
190 | 80 | unsigned char* xnp = xn->data.u8; |
191 | 160 | for (i = 0; i < x->rows; i++80 ) |
192 | 80 | { |
193 | 240 | for (j = 0; j < x->cols; j++160 ) |
194 | 160 | ccv_set_value(x->type, xnp, j, x3 * ccv_get_value(x->type, sp, j) + ccv_get_value(x->type, xp, j), 0); |
195 | 80 | sp += x->step; |
196 | 80 | xp += x->step; |
197 | 80 | xnp += x->step; |
198 | 80 | } |
199 | 80 | if (func(xn, &f3, df3, data) == 0) |
200 | 80 | break; |
201 | 0 | else |
202 | 0 | x3 = (x2 + x3) * 0.5; |
203 | 80 | } |
204 | 80 | if (f3 < F0) |
205 | 27 | { |
206 | 27 | memcpy(x0->data.u8, xn->data.u8, x->rows * x->step); |
207 | 27 | memcpy(dF0->data.u8, df3->data.u8, x->rows * x->step); |
208 | 27 | F0 = f3; |
209 | 27 | } |
210 | 80 | d3 = 0; |
211 | 80 | unsigned char* df3p = df3->data.u8; |
212 | 80 | unsigned char* sp = s->data.u8; |
213 | 160 | for (i = 0; i < x->rows; i++80 ) |
214 | 80 | { |
215 | 240 | for (j = 0; j < x->cols; j++160 ) |
216 | 160 | d3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, sp, j); |
217 | 80 | df3p += x->step; |
218 | 80 | sp += x->step; |
219 | 80 | } |
220 | 80 | if ((d3 > params.sig * d0) || (f3 > f0 + x3 * params.rho * d0)61 || (m <= 0)60 ) |
221 | 22 | break; |
222 | 58 | x1 = x2; f1 = f2; d1 = d2; |
223 | 58 | x2 = x3; f2 = f3; d2 = d3; |
224 | 58 | A = 6.0 * (f1 - f2) + 3.0 * (d2 + d1) * (x2 - x1); |
225 | 58 | B = 3.0 * (f2 - f1) - (2.0 * d1 + d2) * (x2 - x1); |
226 | 58 | x3 = B * B - A * d1 * (x2 - x1); |
227 | 58 | if (x3 < 0) |
228 | 8 | x3 = x2 * params.extrap; |
229 | 50 | else { |
230 | 50 | x3 = x1 - d1 * (x2 - x1) * (x2 - x1) / (B + sqrt(x3)); |
231 | 50 | if (x3 < 0) |
232 | 0 | x3 = x2 * params.extrap; |
233 | 50 | else { |
234 | 50 | if (x3 > x2 * params.extrap) |
235 | 8 | x3 = x2 * params.extrap; |
236 | 42 | else if (x3 < x2 + params.interp * (x2 - x1)) |
237 | 38 | x3 = x2 + params.interp * (x2 - x1); |
238 | 50 | } |
239 | 50 | } |
240 | 58 | } |
241 | 88 | while (22 ((fabs(d3) > -params.sig * d0) || (f3 > f0 + x3 * params.rho * d0)18 ) && (m > 0)70 ) |
242 | 66 | { |
243 | 66 | if ((d3 > 1e-8) || (f3 > f0 + x3 * params.rho * d0)52 ) |
244 | 38 | { |
245 | 38 | x4 = x3; f4 = f3; d4 = d3; |
246 | 38 | } else { |
247 | 28 | x2 = x3; f2 = f3; d2 = d3; |
248 | 28 | } |
249 | 66 | if (f4 > f0) |
250 | 43 | x3 = x2 - (0.5 * d2 * (x4 - x2) * (x4 - x2)) / (f4 - f2 - d2 * (x4 - x2)); |
251 | 23 | else { |
252 | 23 | A = 6.0 * (f2 - f4) / (x4 - x2) + 3.0 * (d4 + d2); |
253 | 23 | B = 3.0 * (f4 - f2) - (2.0 * d2 + d4) * (x4 - x2); |
254 | 23 | x3 = B * B - A * d2 * (x4 - x2) * (x4 - x2); |
255 | 23 | x3 = (x3 < 0) ? (x2 + x4) * 0.50 : x2 + (sqrt(x3) - B) / A; |
256 | 23 | } |
257 | 66 | x3 = ccv_max(ccv_min(x3, x4 - params.interp * (x4 - x2)), x2 + params.interp * (x4 - x2)); |
258 | 66 | sp = s->data.u8; |
259 | 66 | unsigned char* xp = x->data.u8; |
260 | 66 | unsigned char* xnp = xn->data.u8; |
261 | 132 | for (i = 0; i < x->rows; i++66 ) |
262 | 66 | { |
263 | 198 | for (j = 0; j < x->cols; j++132 ) |
264 | 132 | ccv_set_value(x->type, xnp, j, x3 * ccv_get_value(x->type, sp, j) + ccv_get_value(x->type, xp, j), 0); |
265 | 66 | sp += x->step; |
266 | 66 | xp += x->step; |
267 | 66 | xnp += x->step; |
268 | 66 | } |
269 | 66 | func(xn, &f3, df3, data); |
270 | 66 | if (f3 < F0) |
271 | 24 | { |
272 | 24 | memcpy(x0->data.u8, xn->data.u8, x->rows * x->step); |
273 | 24 | memcpy(dF0->data.u8, df3->data.u8, x->rows * x->step); |
274 | 24 | F0 = f3; |
275 | 24 | } |
276 | 66 | m--; |
277 | 66 | k += eh; |
278 | 66 | d3 = 0; |
279 | 66 | sp = s->data.u8; |
280 | 66 | unsigned char* df3p = df3->data.u8; |
281 | 132 | for (i = 0; i < x->rows; i++66 ) |
282 | 66 | { |
283 | 198 | for (j = 0; j < x->cols; j++132 ) |
284 | 132 | d3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, sp, j); |
285 | 66 | df3p += x->step; |
286 | 66 | sp += x->step; |
287 | 66 | } |
288 | 66 | } |
289 | 22 | if ((fabs(d3) < -params.sig * d0) && (f3 < f0 + x3 * params.rho * d0)18 ) |
290 | 18 | { |
291 | 18 | memcpy(x->data.u8, xn->data.u8, x->rows * x->step); |
292 | 18 | f0 = f3; |
293 | 18 | double df0_df3 = 0; |
294 | 18 | double df3_df3 = 0; |
295 | 18 | double df0_df0 = 0; |
296 | 18 | unsigned char* df0p = df0->data.u8; |
297 | 18 | unsigned char* df3p = df3->data.u8; |
298 | 36 | for (i = 0; i < x->rows; i++18 ) |
299 | 18 | { |
300 | 54 | for (j = 0; j < x->cols; j++36 ) |
301 | 36 | { |
302 | 36 | df0_df0 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, df0p, j); |
303 | 36 | df0_df3 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, df3p, j); |
304 | 36 | df3_df3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, df3p, j); |
305 | 36 | } |
306 | 18 | df0p += x->step; |
307 | 18 | df3p += x->step; |
308 | 18 | } |
309 | 18 | double slr = (df3_df3 - df0_df3) / df0_df0; |
310 | 18 | df3p = df3->data.u8; |
311 | 18 | unsigned char* sp = s->data.u8; |
312 | 36 | for (i = 0; i < x->rows; i++18 ) |
313 | 18 | { |
314 | 54 | for (j = 0; j < x->cols; j++36 ) |
315 | 36 | ccv_set_value(x->type, sp, j, slr * ccv_get_value(x->type, sp, j) - ccv_get_value(x->type, df3p, j), 0); |
316 | 18 | df3p += x->step; |
317 | 18 | sp += x->step; |
318 | 18 | } |
319 | 18 | memcpy(df0->data.u8, df3->data.u8, x->rows * x->step); |
320 | 18 | d3 = d0; |
321 | 18 | d0 = 0; |
322 | 18 | df0p = df0->data.u8; |
323 | 18 | sp = s->data.u8; |
324 | 36 | for (i = 0; i < x->rows; i++18 ) |
325 | 18 | { |
326 | 54 | for (j = 0; j < x->cols; j++36 ) |
327 | 36 | { |
328 | 36 | d0 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, sp, j); |
329 | 36 | } |
330 | 18 | df0p += x->step; |
331 | 18 | sp += x->step; |
332 | 18 | } |
333 | 18 | if (d0 > 0) |
334 | 1 | { |
335 | 1 | d0 = 0; |
336 | 1 | df0p = df0->data.u8; |
337 | 1 | sp = s->data.u8; |
338 | 2 | for (i = 0; i < x->rows; i++1 ) |
339 | 1 | { |
340 | 3 | for (j = 0; j < x->cols; j++2 ) |
341 | 2 | { |
342 | 2 | double ss = ccv_get_value(x->type, df0p, j); |
343 | 2 | ccv_set_value(x->type, sp, j, -ss, 0); |
344 | 2 | d0 += -ss * ss; |
345 | 2 | } |
346 | 1 | df0p += x->step; |
347 | 1 | sp += x->step; |
348 | 1 | } |
349 | 1 | } |
350 | 18 | x3 = x3 * ccv_min(params.ratio, d3 / (d0 - 1e-8)); |
351 | 18 | ls_failed = 0; |
352 | 18 | } else { |
353 | 4 | memcpy(x->data.u8, x0->data.u8, x->rows * x->step); |
354 | 4 | memcpy(df0->data.u8, dF0->data.u8, x->rows * x->step); |
355 | 4 | f0 = F0; |
356 | 4 | if (ls_failed) |
357 | 1 | break; |
358 | 3 | d0 = 0; |
359 | 3 | unsigned char* df0p = df0->data.u8; |
360 | 3 | unsigned char* sp = s->data.u8; |
361 | 6 | for (i = 0; i < x->rows; i++3 ) |
362 | 3 | { |
363 | 9 | for (j = 0; j < x->cols; j++6 ) |
364 | 6 | { |
365 | 6 | double ss = ccv_get_value(x->type, df0p, j); |
366 | 6 | ccv_set_value(x->type, sp, j, -ss, 0); |
367 | 6 | d0 += -ss * ss; |
368 | 6 | } |
369 | 3 | df0p += x->step; |
370 | 3 | sp += x->step; |
371 | 3 | } |
372 | 3 | x3 = red / (1.0 - d0); |
373 | 3 | ls_failed = 1; |
374 | 3 | } |
375 | 22 | } |
376 | 1 | ccv_matrix_free(s); |
377 | 1 | ccv_matrix_free(x0); |
378 | 1 | ccv_matrix_free(xn); |
379 | 1 | ccv_matrix_free(dF0); |
380 | 1 | ccv_matrix_free(df0); |
381 | 1 | ccv_matrix_free(df3); |
382 | 1 | } |
383 | | |
384 | | #ifdef HAVE_FFTW3 |
385 | | /* optimal FFT size table is adopted from OpenCV */ |
386 | | static const int _ccv_optimal_fft_size[] = { |
387 | | 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40, 45, 48, |
388 | | 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, 150, 160, |
389 | | 162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288, 300, 320, 324, 360, 375, |
390 | | 384, 400, 405, 432, 450, 480, 486, 500, 512, 540, 576, 600, 625, 640, 648, 675, 720, |
391 | | 729, 750, 768, 800, 810, 864, 900, 960, 972, 1000, 1024, 1080, 1125, 1152, 1200, |
392 | | 1215, 1250, 1280, 1296, 1350, 1440, 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, |
393 | | 1920, 1944, 2000, 2025, 2048, 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, |
394 | | 2700, 2880, 2916, 3000, 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, |
395 | | 3888, 4000, 4050, 4096, 4320, 4374, 4500, 4608, 4800, 4860, 5000, 5120, 5184, 5400, |
396 | | 5625, 5760, 5832, 6000, 6075, 6144, 6250, 6400, 6480, 6561, 6750, 6912, 7200, 7290, |
397 | | 7500, 7680, 7776, 8000, 8100, 8192, 8640, 8748, 9000, 9216, 9375, 9600, 9720, 10000, |
398 | | 10125, 10240, 10368, 10800, 10935, 11250, 11520, 11664, 12000, 12150, 12288, 12500, |
399 | | 12800, 12960, 13122, 13500, 13824, 14400, 14580, 15000, 15360, 15552, 15625, 16000, |
400 | | 16200, 16384, 16875, 17280, 17496, 18000, 18225, 18432, 18750, 19200, 19440, 19683, |
401 | | 20000, 20250, 20480, 20736, 21600, 21870, 22500, 23040, 23328, 24000, 24300, 24576, |
402 | | 25000, 25600, 25920, 26244, 27000, 27648, 28125, 28800, 29160, 30000, 30375, 30720, |
403 | | 31104, 31250, 32000, 32400, 32768, 32805, 33750, 34560, 34992, 36000, 36450, 36864, |
404 | | 37500, 38400, 38880, 39366, 40000, 40500, 40960, 41472, 43200, 43740, 45000, 46080, |
405 | | 46656, 46875, 48000, 48600, 49152, 50000, 50625, 51200, 51840, 52488, 54000, 54675, |
406 | | 55296, 56250, 57600, 58320, 59049, 60000, 60750, 61440, 62208, 62500, 64000, 64800, |
407 | | 65536, 65610, 67500, 69120, 69984, 72000, 72900, 73728, 75000, 76800, 77760, 78125, |
408 | | 78732, 80000, 81000, 81920, 82944, 84375, 86400, 87480, 90000, 91125, 92160, 93312, |
409 | | 93750, 96000, 97200, 98304, 98415, 100000, 101250, 102400, 103680, 104976, 108000, |
410 | | 109350, 110592, 112500, 115200, 116640, 118098, 120000, 121500, 122880, 124416, 125000, |
411 | | 128000, 129600, 131072, 131220, 135000, 138240, 139968, 140625, 144000, 145800, 147456, |
412 | | 150000, 151875, 153600, 155520, 156250, 157464, 160000, 162000, 163840, 164025, 165888, |
413 | | 168750, 172800, 174960, 177147, 180000, 182250, 184320, 186624, 187500, 192000, 194400, |
414 | | 196608, 196830, 200000, 202500, 204800, 207360, 209952, 216000, 218700, 221184, 225000, |
415 | | 230400, 233280, 234375, 236196, 240000, 243000, 245760, 248832, 250000, 253125, 256000, |
416 | | 259200, 262144, 262440, 270000, 273375, 276480, 279936, 281250, 288000, 291600, 294912, |
417 | | 295245, 300000, 303750, 307200, 311040, 312500, 314928, 320000, 324000, 327680, 328050, |
418 | | 331776, 337500, 345600, 349920, 354294, 360000, 364500, 368640, 373248, 375000, 384000, |
419 | | 388800, 390625, 393216, 393660, 400000, 405000, 409600, 414720, 419904, 421875, 432000, |
420 | | 437400, 442368, 450000, 455625, 460800, 466560, 468750, 472392, 480000, 486000, 491520, |
421 | | 492075, 497664, 500000, 506250, 512000, 518400, 524288, 524880, 531441, 540000, 546750, |
422 | | 552960, 559872, 562500, 576000, 583200, 589824, 590490, 600000, 607500, 614400, 622080, |
423 | | 625000, 629856, 640000, 648000, 655360, 656100, 663552, 675000, 691200, 699840, 703125, |
424 | | 708588, 720000, 729000, 737280, 746496, 750000, 759375, 768000, 777600, 781250, 786432, |
425 | | 787320, 800000, 810000, 819200, 820125, 829440, 839808, 843750, 864000, 874800, 884736, |
426 | | 885735, 900000, 911250, 921600, 933120, 937500, 944784, 960000, 972000, 983040, 984150, |
427 | | 995328, 1000000, 1012500, 1024000, 1036800, 1048576, 1049760, 1062882, 1080000, 1093500, |
428 | | 1105920, 1119744, 1125000, 1152000, 1166400, 1171875, 1179648, 1180980, 1200000, |
429 | | 1215000, 1228800, 1244160, 1250000, 1259712, 1265625, 1280000, 1296000, 1310720, |
430 | | 1312200, 1327104, 1350000, 1366875, 1382400, 1399680, 1406250, 1417176, 1440000, |
431 | | 1458000, 1474560, 1476225, 1492992, 1500000, 1518750, 1536000, 1555200, 1562500, |
432 | | 1572864, 1574640, 1594323, 1600000, 1620000, 1638400, 1640250, 1658880, 1679616, |
433 | | 1687500, 1728000, 1749600, 1769472, 1771470, 1800000, 1822500, 1843200, 1866240, |
434 | | 1875000, 1889568, 1920000, 1944000, 1953125, 1966080, 1968300, 1990656, 2000000, |
435 | | 2025000, 2048000, 2073600, 2097152, 2099520, 2109375, 2125764, 2160000, 2187000, |
436 | | 2211840, 2239488, 2250000, 2278125, 2304000, 2332800, 2343750, 2359296, 2361960, |
437 | | 2400000, 2430000, 2457600, 2460375, 2488320, 2500000, 2519424, 2531250, 2560000, |
438 | | 2592000, 2621440, 2624400, 2654208, 2657205, 2700000, 2733750, 2764800, 2799360, |
439 | | 2812500, 2834352, 2880000, 2916000, 2949120, 2952450, 2985984, 3000000, 3037500, |
440 | | 3072000, 3110400, 3125000, 3145728, 3149280, 3188646, 3200000, 3240000, 3276800, |
441 | | 3280500, 3317760, 3359232, 3375000, 3456000, 3499200, 3515625, 3538944, 3542940, |
442 | | 3600000, 3645000, 3686400, 3732480, 3750000, 3779136, 3796875, 3840000, 3888000, |
443 | | 3906250, 3932160, 3936600, 3981312, 4000000, 4050000, 4096000, 4100625, 4147200, |
444 | | 4194304, 4199040, 4218750, 4251528, 4320000, 4374000, 4423680, 4428675, 4478976, |
445 | | 4500000, 4556250, 4608000, 4665600, 4687500, 4718592, 4723920, 4782969, 4800000, |
446 | | 4860000, 4915200, 4920750, 4976640, 5000000, 5038848, 5062500, 5120000, 5184000, |
447 | | 5242880, 5248800, 5308416, 5314410, 5400000, 5467500, 5529600, 5598720, 5625000, |
448 | | 5668704, 5760000, 5832000, 5859375, 5898240, 5904900, 5971968, 6000000, 6075000, |
449 | | 6144000, 6220800, 6250000, 6291456, 6298560, 6328125, 6377292, 6400000, 6480000, |
450 | | 6553600, 6561000, 6635520, 6718464, 6750000, 6834375, 6912000, 6998400, 7031250, |
451 | | 7077888, 7085880, 7200000, 7290000, 7372800, 7381125, 7464960, 7500000, 7558272, |
452 | | 7593750, 7680000, 7776000, 7812500, 7864320, 7873200, 7962624, 7971615, 8000000, |
453 | | 8100000, 8192000, 8201250, 8294400, 8388608, 8398080, 8437500, 8503056, 8640000, |
454 | | 8748000, 8847360, 8857350, 8957952, 9000000, 9112500, 9216000, 9331200, 9375000, |
455 | | 9437184, 9447840, 9565938, 9600000, 9720000, 9765625, 9830400, 9841500, 9953280, |
456 | | 10000000, 10077696, 10125000, 10240000, 10368000, 10485760, 10497600, 10546875, 10616832, |
457 | | 10628820, 10800000, 10935000, 11059200, 11197440, 11250000, 11337408, 11390625, 11520000, |
458 | | 11664000, 11718750, 11796480, 11809800, 11943936, 12000000, 12150000, 12288000, 12301875, |
459 | | 12441600, 12500000, 12582912, 12597120, 12656250, 12754584, 12800000, 12960000, 13107200, |
460 | | 13122000, 13271040, 13286025, 13436928, 13500000, 13668750, 13824000, 13996800, 14062500, |
461 | | 14155776, 14171760, 14400000, 14580000, 14745600, 14762250, 14929920, 15000000, 15116544, |
462 | | 15187500, 15360000, 15552000, 15625000, 15728640, 15746400, 15925248, 15943230, 16000000, |
463 | | 16200000, 16384000, 16402500, 16588800, 16777216, 16796160, 16875000, 17006112, 17280000, |
464 | | 17496000, 17578125, 17694720, 17714700, 17915904, 18000000, 18225000, 18432000, 18662400, |
465 | | 18750000, 18874368, 18895680, 18984375, 19131876, 19200000, 19440000, 19531250, 19660800, |
466 | | 19683000, 19906560, 20000000, 20155392, 20250000, 20480000, 20503125, 20736000, 20971520, |
467 | | 20995200, 21093750, 21233664, 21257640, 21600000, 21870000, 22118400, 22143375, 22394880, |
468 | | 22500000, 22674816, 22781250, 23040000, 23328000, 23437500, 23592960, 23619600, 23887872, |
469 | | 23914845, 24000000, 24300000, 24576000, 24603750, 24883200, 25000000, 25165824, 25194240, |
470 | | 25312500, 25509168, 25600000, 25920000, 26214400, 26244000, 26542080, 26572050, 26873856, |
471 | | 27000000, 27337500, 27648000, 27993600, 28125000, 28311552, 28343520, 28800000, 29160000, |
472 | | 29296875, 29491200, 29524500, 29859840, 30000000, 30233088, 30375000, 30720000, 31104000, |
473 | | 31250000, 31457280, 31492800, 31640625, 31850496, 31886460, 32000000, 32400000, 32768000, |
474 | | 32805000, 33177600, 33554432, 33592320, 33750000, 34012224, 34171875, 34560000, 34992000, |
475 | | 35156250, 35389440, 35429400, 35831808, 36000000, 36450000, 36864000, 36905625, 37324800, |
476 | | 37500000, 37748736, 37791360, 37968750, 38263752, 38400000, 38880000, 39062500, 39321600, |
477 | | 39366000, 39813120, 39858075, 40000000, 40310784, 40500000, 40960000, 41006250, 41472000, |
478 | | 41943040, 41990400, 42187500, 42467328, 42515280, 43200000, 43740000, 44236800, 44286750, |
479 | | 44789760, 45000000, 45349632, 45562500, 46080000, 46656000, 46875000, 47185920, 47239200, |
480 | | 47775744, 47829690, 48000000, 48600000, 48828125, 49152000, 49207500, 49766400, 50000000, |
481 | | 50331648, 50388480, 50625000, 51018336, 51200000, 51840000, 52428800, 52488000, 52734375, |
482 | | 53084160, 53144100, 53747712, 54000000, 54675000, 55296000, 55987200, 56250000, 56623104, |
483 | | 56687040, 56953125, 57600000, 58320000, 58593750, 58982400, 59049000, 59719680, 60000000, |
484 | | 60466176, 60750000, 61440000, 61509375, 62208000, 62500000, 62914560, 62985600, 63281250, |
485 | | 63700992, 63772920, 64000000, 64800000, 65536000, 65610000, 66355200, 66430125, 67108864, |
486 | | 67184640, 67500000, 68024448, 68343750, 69120000, 69984000, 70312500, 70778880, 70858800, |
487 | | 71663616, 72000000, 72900000, 73728000, 73811250, 74649600, 75000000, 75497472, 75582720, |
488 | | 75937500, 76527504, 76800000, 77760000, 78125000, 78643200, 78732000, 79626240, 79716150, |
489 | | 80000000, 80621568, 81000000, 81920000, 82012500, 82944000, 83886080, 83980800, 84375000, |
490 | | 84934656, 85030560, 86400000, 87480000, 87890625, 88473600, 88573500, 89579520, 90000000, |
491 | | 90699264, 91125000, 92160000, 93312000, 93750000, 94371840, 94478400, 94921875, 95551488, |
492 | | 95659380, 96000000, 97200000, 97656250, 98304000, 98415000, 99532800, 100000000, |
493 | | 100663296, 100776960, 101250000, 102036672, 102400000, 102515625, 103680000, 104857600, |
494 | | 104976000, 105468750, 106168320, 106288200, 107495424, 108000000, 109350000, 110592000, |
495 | | 110716875, 111974400, 112500000, 113246208, 113374080, 113906250, 115200000, 116640000, |
496 | | 117187500, 117964800, 118098000, 119439360, 119574225, 120000000, 120932352, 121500000, |
497 | | 122880000, 123018750, 124416000, 125000000, 125829120, 125971200, 126562500, 127401984, |
498 | | 127545840, 128000000, 129600000, 131072000, 131220000, 132710400, 132860250, 134217728, |
499 | | 134369280, 135000000, 136048896, 136687500, 138240000, 139968000, 140625000, 141557760, |
500 | | 141717600, 143327232, 144000000, 145800000, 146484375, 147456000, 147622500, 149299200, |
501 | | 150000000, 150994944, 151165440, 151875000, 153055008, 153600000, 155520000, 156250000, |
502 | | 157286400, 157464000, 158203125, 159252480, 159432300, 160000000, 161243136, 162000000, |
503 | | 163840000, 164025000, 165888000, 167772160, 167961600, 168750000, 169869312, 170061120, |
504 | | 170859375, 172800000, 174960000, 175781250, 176947200, 177147000, 179159040, 180000000, |
505 | | 181398528, 182250000, 184320000, 184528125, 186624000, 187500000, 188743680, 188956800, |
506 | | 189843750, 191102976, 191318760, 192000000, 194400000, 195312500, 196608000, 196830000, |
507 | | 199065600, 199290375, 200000000, 201326592, 201553920, 202500000, 204073344, 204800000, |
508 | | 205031250, 207360000, 209715200, 209952000, 210937500, 212336640, 212576400, 214990848, |
509 | | 216000000, 218700000, 221184000, 221433750, 223948800, 225000000, 226492416, 226748160, |
510 | | 227812500, 230400000, 233280000, 234375000, 235929600, 236196000, 238878720, 239148450, |
511 | | 240000000, 241864704, 243000000, 244140625, 245760000, 246037500, 248832000, 250000000, |
512 | | 251658240, 251942400, 253125000, 254803968, 255091680, 256000000, 259200000, 262144000, |
513 | | 262440000, 263671875, 265420800, 265720500, 268435456, 268738560, 270000000, 272097792, |
514 | | 273375000, 276480000, 279936000, 281250000, 283115520, 283435200, 284765625, 286654464, |
515 | | 288000000, 291600000, 292968750, 294912000, 295245000, 298598400, 300000000, 301989888, |
516 | | 302330880, 303750000, 306110016, 307200000, 307546875, 311040000, 312500000, 314572800, |
517 | | 314928000, 316406250, 318504960, 318864600, 320000000, 322486272, 324000000, 327680000, |
518 | | 328050000, 331776000, 332150625, 335544320, 335923200, 337500000, 339738624, 340122240, |
519 | | 341718750, 345600000, 349920000, 351562500, 353894400, 354294000, 358318080, 360000000, |
520 | | 362797056, 364500000, 368640000, 369056250, 373248000, 375000000, 377487360, 377913600, |
521 | | 379687500, 382205952, 382637520, 384000000, 388800000, 390625000, 393216000, 393660000, |
522 | | 398131200, 398580750, 400000000, 402653184, 403107840, 405000000, 408146688, 409600000, |
523 | | 410062500, 414720000, 419430400, 419904000, 421875000, 424673280, 425152800, 429981696, |
524 | | 432000000, 437400000, 439453125, 442368000, 442867500, 447897600, 450000000, 452984832, |
525 | | 453496320, 455625000, 460800000, 466560000, 468750000, 471859200, 472392000, 474609375, |
526 | | 477757440, 478296900, 480000000, 483729408, 486000000, 488281250, 491520000, 492075000, |
527 | | 497664000, 500000000, 503316480, 503884800, 506250000, 509607936, 510183360, 512000000, |
528 | | 512578125, 518400000, 524288000, 524880000, 527343750, 530841600, 531441000, 536870912, |
529 | | 537477120, 540000000, 544195584, 546750000, 552960000, 553584375, 559872000, 562500000, |
530 | | 566231040, 566870400, 569531250, 573308928, 576000000, 583200000, 585937500, 589824000, |
531 | | 590490000, 597196800, 597871125, 600000000, 603979776, 604661760, 607500000, 612220032, |
532 | | 614400000, 615093750, 622080000, 625000000, 629145600, 629856000, 632812500, 637009920, |
533 | | 637729200, 640000000, 644972544, 648000000, 655360000, 656100000, 663552000, 664301250, |
534 | | 671088640, 671846400, 675000000, 679477248, 680244480, 683437500, 691200000, 699840000, |
535 | | 703125000, 707788800, 708588000, 716636160, 720000000, 725594112, 729000000, 732421875, |
536 | | 737280000, 738112500, 746496000, 750000000, 754974720, 755827200, 759375000, 764411904, |
537 | | 765275040, 768000000, 777600000, 781250000, 786432000, 787320000, 791015625, 796262400, |
538 | | 797161500, 800000000, 805306368, 806215680, 810000000, 816293376, 819200000, 820125000, |
539 | | 829440000, 838860800, 839808000, 843750000, 849346560, 850305600, 854296875, 859963392, |
540 | | 864000000, 874800000, 878906250, 884736000, 885735000, 895795200, 900000000, 905969664, |
541 | | 906992640, 911250000, 921600000, 922640625, 933120000, 937500000, 943718400, 944784000, |
542 | | 949218750, 955514880, 956593800, 960000000, 967458816, 972000000, 976562500, 983040000, |
543 | | 984150000, 995328000, 996451875, 1000000000, 1006632960, 1007769600, 1012500000, |
544 | | 1019215872, 1020366720, 1024000000, 1025156250, 1036800000, 1048576000, 1049760000, |
545 | | 1054687500, 1061683200, 1062882000, 1073741824, 1074954240, 1080000000, 1088391168, |
546 | | 1093500000, 1105920000, 1107168750, 1119744000, 1125000000, 1132462080, 1133740800, |
547 | | 1139062500, 1146617856, 1152000000, 1166400000, 1171875000, 1179648000, 1180980000, |
548 | | 1194393600, 1195742250, 1200000000, 1207959552, 1209323520, 1215000000, 1220703125, |
549 | | 1224440064, 1228800000, 1230187500, 1244160000, 1250000000, 1258291200, 1259712000, |
550 | | 1265625000, 1274019840, 1275458400, 1280000000, 1289945088, 1296000000, 1310720000, |
551 | | 1312200000, 1318359375, 1327104000, 1328602500, 1342177280, 1343692800, 1350000000, |
552 | | 1358954496, 1360488960, 1366875000, 1382400000, 1399680000, 1406250000, 1415577600, |
553 | | 1417176000, 1423828125, 1433272320, 1440000000, 1451188224, 1458000000, 1464843750, |
554 | | 1474560000, 1476225000, 1492992000, 1500000000, 1509949440, 1511654400, 1518750000, |
555 | | 1528823808, 1530550080, 1536000000, 1537734375, 1555200000, 1562500000, 1572864000, |
556 | | 1574640000, 1582031250, 1592524800, 1594323000, 1600000000, 1610612736, 1612431360, |
557 | | 1620000000, 1632586752, 1638400000, 1640250000, 1658880000, 1660753125, 1677721600, |
558 | | 1679616000, 1687500000, 1698693120, 1700611200, 1708593750, 1719926784, 1728000000, |
559 | | 1749600000, 1757812500, 1769472000, 1771470000, 1791590400, 1800000000, 1811939328, |
560 | | 1813985280, 1822500000, 1843200000, 1845281250, 1866240000, 1875000000, 1887436800, |
561 | | 1889568000, 1898437500, 1911029760, 1913187600, 1920000000, 1934917632, 1944000000, |
562 | | 1953125000, 1966080000, 1968300000, 1990656000, 1992903750, 2000000000, 2013265920, |
563 | | 2015539200, 2025000000, 2038431744, 2040733440, 2048000000, 2050312500, 2073600000, |
564 | | 2097152000, 2099520000, 2109375000, 2123366400, 2125764000 |
565 | | }; |
566 | | |
567 | | static int _ccv_get_optimal_fft_size(int size) |
568 | 12 | { |
569 | 12 | int a = 0, b = sizeof(_ccv_optimal_fft_size) / sizeof(_ccv_optimal_fft_size[0]) - 1; |
570 | 12 | if((unsigned)size >= (unsigned)_ccv_optimal_fft_size[b]) |
571 | 0 | return -1; |
572 | 142 | while(12 a < b) |
573 | 130 | { |
574 | 130 | int c = (a + b) >> 1; |
575 | 130 | if(size <= _ccv_optimal_fft_size[c]) |
576 | 98 | b = c; |
577 | 32 | else |
578 | 32 | a = c + 1; |
579 | 130 | } |
580 | 12 | return _ccv_optimal_fft_size[b]; |
581 | 12 | } |
582 | | |
583 | | static pthread_mutex_t fftw_plan_mutex = PTHREAD_MUTEX_INITIALIZER; |
584 | | |
585 | | static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern) |
586 | 6 | { |
587 | 6 | int ch = CCV_GET_CHANNEL(a->type); |
588 | 6 | int fft_type = (CCV_GET_DATA_TYPE(d->type) == CCV_8U || CCV_GET_DATA_TYPE(d->type) == CCV_32F) ? CCV_32F5 : CCV_64F1 ; |
589 | 6 | int rows = ccv_min(a->rows + b->rows - 1, _ccv_get_optimal_fft_size(b->rows * 3)); |
590 | 6 | int cols = ccv_min(a->cols + b->cols - 1, _ccv_get_optimal_fft_size(b->cols * 3)); |
591 | 6 | int cols_2c = 2 * (cols / 2 + 1); |
592 | 6 | void* fftw_a; |
593 | 6 | void* fftw_b; |
594 | 6 | void* fftw_d; |
595 | 6 | fftw_plan p, pinv; |
596 | 6 | fftwf_plan pf, pinvf; |
597 | 6 | if (fft_type == CCV_32F) |
598 | 5 | { |
599 | 5 | fftw_a = fftwf_malloc(rows * cols_2c * ch * sizeof(float)); |
600 | 5 | fftw_b = fftwf_malloc(rows * cols_2c * ch * sizeof(float)); |
601 | 5 | fftw_d = fftwf_malloc(rows * cols_2c * ch * sizeof(float)); |
602 | 5 | pthread_mutex_lock(&fftw_plan_mutex); |
603 | 5 | if (ch == 1) |
604 | 3 | { |
605 | 3 | pf = fftwf_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE); |
606 | 3 | pinvf = fftwf_plan_dft_c2r_2d(rows, cols, 0, 0, FFTW_ESTIMATE); |
607 | 3 | } else { |
608 | 2 | int ndim[] = {rows, cols}; |
609 | 2 | pf = fftwf_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); |
610 | 2 | pinvf = fftwf_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); |
611 | 2 | } |
612 | 5 | pthread_mutex_unlock(&fftw_plan_mutex); |
613 | 5 | } else { |
614 | 1 | fftw_a = fftw_malloc(rows * cols_2c * ch * sizeof(double)); |
615 | 1 | fftw_b = fftw_malloc(rows * cols_2c * ch * sizeof(double)); |
616 | 1 | fftw_d = fftw_malloc(rows * cols_2c * ch * sizeof(double)); |
617 | 1 | pthread_mutex_lock(&fftw_plan_mutex); |
618 | 1 | if (ch == 1) |
619 | 0 | { |
620 | 0 | p = fftw_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE); |
621 | 0 | pinv = fftw_plan_dft_c2r_2d(rows, cols, 0, 0, FFTW_ESTIMATE); |
622 | 1 | } else { |
623 | 1 | int ndim[] = {rows, cols}; |
624 | 1 | p = fftw_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); |
625 | 1 | pinv = fftw_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); |
626 | 1 | } |
627 | 1 | pthread_mutex_unlock(&fftw_plan_mutex); |
628 | 1 | } |
629 | 6 | memset(fftw_b, 0, rows * cols_2c * ch * CCV_GET_DATA_TYPE_SIZE(fft_type)); |
630 | | |
631 | 6 | int i, j, k; |
632 | 6 | unsigned char* m_ptr = b->data.u8; |
633 | | // to flip matrix b is crucial, this problem only shows when I changed to a more sophisticated test case |
634 | 6 | #define for_block(_for_type, _for_get) \ |
635 | 6 | _for_type* fftw_ptr = (_for_type*)fftw_b + (b->rows - 1) * cols_2c * ch; \ |
636 | 416 | for (i = 0; i < b->rows; i++410 ) \ |
637 | 410 | { \ |
638 | 33.1k | for (j = 0; j < b->cols; j++32.7k ) \ |
639 | 130k | for (k = 0; 32.7k k < ch; k++97.7k ) \ |
640 | 97.7k | fftw_ptr[(b->cols - 1 - j) * ch + k] = _for_get410 (m_ptr, j * ch + k); \ |
641 | 410 | fftw_ptr -= cols_2c * ch; \ |
642 | 410 | m_ptr += b->step; \ |
643 | 410 | } |
644 | 6 | ccv_matrix_typeof(fft_type, ccv_matrix_getter, b->type, for_block); |
645 | 6 | #undef for_block |
646 | 6 | if (fft_type == CCV_32F) |
647 | 5 | fftwf_execute_dft_r2c(pf, (float*)fftw_b, (fftwf_complex*)fftw_b); |
648 | 1 | else |
649 | 1 | fftw_execute_dft_r2c(p, (double*)fftw_b, (fftw_complex*)fftw_b); |
650 | | /* why a->cols + cols - 2 * (b->cols & ~1) ? |
651 | | * what we really want is ceiling((a->cols - (b->cols & ~1)) / (cols - (b->cols & ~1))) |
652 | | * in this case, we strip out paddings on the left/right, and compute how many tiles |
653 | | * we need. It then be interpreted in the above integer division form */ |
654 | 6 | int tile_x = ccv_max(1, (a->cols + cols - 2 * (b->cols & ~1)) / (cols - (b->cols & ~1))); |
655 | 6 | int tile_y = ccv_max(1, (a->rows + rows - 2 * (b->rows & ~1)) / (rows - (b->rows & ~1))); |
656 | 6 | int brows2 = b->rows / 2; |
657 | 6 | int bcols2 = b->cols / 2; |
658 | 6 | #define for_block(_for_type, _cpx_type, _for_set, _for_get) \ |
659 | 6 | _for_type scale = 1.0 / (rows * cols); \ |
660 | 14 | for (i = 0; i < tile_y; i++8 ) \ |
661 | 27 | for (j = 0; 8 j < tile_x; j++19 ) \ |
662 | 19 | { \ |
663 | 19 | int x, y; \ |
664 | 19 | memset(fftw_a, 0, rows * cols_2c * ch * sizeof(_for_type)); \ |
665 | 19 | int iy = ccv_min(i * (rows - (b->rows & ~1)), ccv_max(a->rows - rows, 0)); \ |
666 | 19 | int ix = ccv_min(j * (cols - (b->cols & ~1)), ccv_max(a->cols - cols, 0)); \ |
667 | 19 | _for_type* fftw_ptr = (_for_type*)fftw_a; \ |
668 | 19 | int end_y = ccv_min(rows, a->rows - iy); \ |
669 | 19 | int end_x = ccv_min(cols, a->cols - ix); \ |
670 | 19 | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(a, iy, ix, 0); \ |
671 | 5.48k | for (y = 0; y < end_y; y++5.46k ) \ |
672 | 5.46k | { \ |
673 | 4.57M | for (x = 0; x < end_x * ch; x++4.56M ) \ |
674 | 4.56M | fftw_ptr[x] = _for_get5.46k (m_ptr, x); \ |
675 | 5.46k | fftw_ptr += cols_2c * ch; \ |
676 | 5.46k | m_ptr += a->step; \ |
677 | 5.46k | } \ |
678 | 19 | _cpx_type* fftw_ac = (_cpx_type*)fftw_a; \ |
679 | 19 | _cpx_type* fftw_bc = (_cpx_type*)fftw_b; \ |
680 | 19 | _cpx_type* fftw_dc = (_cpx_type*)fftw_d; \ |
681 | 19 | fft_execute_dft_r2c((_for_type*)fftw_a, (_cpx_type*)fftw_ac); \ |
682 | 2.44M | for (x = 0; x < rows * ch * (cols / 2 + 1); x++2.44M ) \ |
683 | 2.44M | fftw_dc[x] = (fftw_ac[x] * fftw_bc[x]) * scale; \ |
684 | 19 | fft_execute_dft_c2r((_cpx_type*)fftw_dc, (_for_type*)fftw_d); \ |
685 | 19 | fftw_ptr = (_for_type*)fftw_d + ((1 + (i > 0)) * brows2 * cols_2c + (1 + (j > 0)) * bcols2) * ch; \ |
686 | 19 | end_y = ccv_min(d->rows - (iy + (i > 0) * brows2), \ |
687 | 19 | (rows - (b->rows & ~1)) + (i == 0) * brows2); \ |
688 | 19 | end_x = ccv_min(d->cols - (ix + (j > 0) * bcols2), \ |
689 | 19 | (cols - (b->cols & ~1)) + (j == 0) * bcols2); \ |
690 | 19 | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2, 0); \ |
691 | 4.58k | for (y = 0; y < end_y; y++4.56k ) \ |
692 | 4.56k | { \ |
693 | 2.71M | for (x = 0; x < end_x * ch; x++2.71M ) \ |
694 | 4.56k | _for_set(m_ptr, x, fftw_ptr[x]); \ |
695 | 4.56k | m_ptr += d->step; \ |
696 | 4.56k | fftw_ptr += cols_2c * ch; \ |
697 | 4.56k | } \ |
698 | 19 | int end_tile_y, end_tile_x; \ |
699 | | /* handle edge cases: */ \ |
700 | 19 | if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows13 ) \ |
701 | 19 | { \ |
702 | 6 | end_tile_y = ccv_min(brows2, d->rows - (iy + (i > 0) * brows2 + end_y)); \ |
703 | 6 | fftw_ptr = (_for_type*)fftw_d + (1 + (j > 0)) * bcols2 * ch; \ |
704 | 6 | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2, 0); \ |
705 | 306 | for (y = 0; y < end_tile_y; y++300 ) \ |
706 | 300 | { \ |
707 | 204k | for (x = 0; x < end_x * ch; x++204k ) \ |
708 | 300 | _for_set(m_ptr, x, fftw_ptr[x]); \ |
709 | 300 | m_ptr += d->step; \ |
710 | 300 | fftw_ptr += cols_2c * ch; \ |
711 | 300 | } \ |
712 | 6 | } \ |
713 | 19 | if (j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols8 ) \ |
714 | 19 | { \ |
715 | 6 | end_tile_x = ccv_min(bcols2, d->cols - (ix + (j > 0) * bcols2 + end_x)); \ |
716 | 6 | fftw_ptr = (_for_type*)fftw_d + (1 + (i > 0)) * brows2 * cols_2c * ch; \ |
717 | 6 | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2 + end_x, 0); \ |
718 | 1.39k | for (y = 0; y < end_y; y++1.38k ) \ |
719 | 1.38k | { \ |
720 | 187k | for (x = 0; x < end_tile_x * ch; x++185k ) \ |
721 | 1.38k | _for_set(m_ptr, x, fftw_ptr[x]); \ |
722 | 1.38k | m_ptr += d->step; \ |
723 | 1.38k | fftw_ptr += cols_2c * ch; \ |
724 | 1.38k | } \ |
725 | 6 | } \ |
726 | 19 | if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows13 && \ |
727 | 19 | j + 1 == tile_x6 && end_x + ix + (j > 0) * bcols2 < d->cols2 ) \ |
728 | 19 | { \ |
729 | 2 | fftw_ptr = (_for_type*)fftw_d; \ |
730 | 2 | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2 + end_x, 0); \ |
731 | 102 | for (y = 0; y < end_tile_y; y++100 ) \ |
732 | 100 | { \ |
733 | 15.1k | for (x = 0; x < end_tile_x * ch; x++15.0k ) \ |
734 | 100 | _for_set(m_ptr, x, fftw_ptr[x]); \ |
735 | 100 | m_ptr += d->step; \ |
736 | 100 | fftw_ptr += cols_2c * ch; \ |
737 | 100 | } \ |
738 | 2 | } \ |
739 | 19 | } |
740 | 6 | if (fft_type == CCV_32F) |
741 | 5 | { |
742 | 15 | #define fft_execute_dft_r2c(r, c) fftwf_execute_dft_r2c(pf, r, c) |
743 | 15 | #define fft_execute_dft_c2r(c, r) fftwf_execute_dft_c2r(pinvf, c, r) |
744 | 5 | ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, float, fftwf_complex); |
745 | 5 | #undef fft_execute_dft_r2c |
746 | 5 | #undef fft_execute_dft_c2r |
747 | 5 | pthread_mutex_lock(&fftw_plan_mutex); |
748 | 5 | fftwf_destroy_plan(pf); |
749 | 5 | fftwf_destroy_plan(pinvf); |
750 | 5 | pthread_mutex_unlock(&fftw_plan_mutex); |
751 | 5 | fftwf_free(fftw_a); |
752 | 5 | fftwf_free(fftw_b); |
753 | 5 | fftwf_free(fftw_d); |
754 | 5 | } else { |
755 | 4 | #define fft_execute_dft_r2c(r, c) fftw_execute_dft_r2c(p, r, c) |
756 | 4 | #define fft_execute_dft_c2r(c, r) fftw_execute_dft_c2r(pinv, c, r) |
757 | 1 | ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, double, fftw_complex); |
758 | 1 | #undef fft_execute_dft_r2c |
759 | 1 | #undef fft_execute_dft_c2r |
760 | 1 | pthread_mutex_lock(&fftw_plan_mutex); |
761 | 1 | fftw_destroy_plan(p); |
762 | 1 | fftw_destroy_plan(pinv); |
763 | 1 | pthread_mutex_unlock(&fftw_plan_mutex); |
764 | 1 | fftw_free(fftw_a); |
765 | 1 | fftw_free(fftw_b); |
766 | 1 | fftw_free(fftw_d); |
767 | 1 | } |
768 | 6 | #undef for_block |
769 | 6 | } |
770 | | #else |
771 | | static void _ccv_filter_kissfft(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern) |
772 | | { |
773 | | int ch = CCV_GET_CHANNEL(a->type); |
774 | | int fft_type = (CCV_GET_DATA_TYPE(d->type) == CCV_8U || CCV_GET_DATA_TYPE(d->type) == CCV_32F) ? CCV_32F : CCV_64F; |
775 | | int rows = ((ccv_min(a->rows + b->rows - 1, kiss_fftr_next_fast_size_real(b->rows * 3)) + 1) >> 1) << 1; |
776 | | int cols = ((ccv_min(a->cols + b->cols - 1, kiss_fftr_next_fast_size_real(b->cols * 3)) + 1) >> 1) << 1; |
777 | | int ndim[] = {rows, cols}; |
778 | | void* kiss_a; |
779 | | void* kiss_b; |
780 | | void* kiss_d; |
781 | | void* kiss_ac; |
782 | | void* kiss_bc; |
783 | | void* kiss_dc; |
784 | | kiss_fftndr_cfg p; |
785 | | kiss_fftndr_cfg pinv; |
786 | | kissf_fftndr_cfg pf; |
787 | | kissf_fftndr_cfg pinvf; |
788 | | if (fft_type == CCV_32F) |
789 | | { |
790 | | pf = kissf_fftndr_alloc(ndim, 2, 0, 0, 0); |
791 | | pinvf = kissf_fftndr_alloc(ndim, 2, 1, 0, 0); |
792 | | kiss_a = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar)); |
793 | | kiss_b = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar)); |
794 | | kiss_d = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar)); |
795 | | kiss_ac = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx)); |
796 | | kiss_bc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx)); |
797 | | kiss_dc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx)); |
798 | | memset(kiss_b, 0, rows * cols * ch * sizeof(kissf_fft_scalar)); |
799 | | } else { |
800 | | p = kiss_fftndr_alloc(ndim, 2, 0, 0, 0); |
801 | | pinv = kiss_fftndr_alloc(ndim, 2, 1, 0, 0); |
802 | | kiss_a = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar)); |
803 | | kiss_b = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar)); |
804 | | kiss_d = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar)); |
805 | | kiss_ac = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx)); |
806 | | kiss_bc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx)); |
807 | | kiss_dc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx)); |
808 | | memset(kiss_b, 0, rows * cols * ch * sizeof(kiss_fft_scalar)); |
809 | | } |
810 | | int nch = rows * cols, nchc = rows * (cols / 2 + 1); |
811 | | int i, j, k; |
812 | | unsigned char* m_ptr = b->data.u8; |
813 | | // to flip matrix b is crucial, this problem only shows when I changed to a more sophisticated test case |
814 | | #define for_block(_for_type, _for_get) \ |
815 | | _for_type* kiss_ptr = (_for_type*)kiss_b + (b->rows - 1) * cols; \ |
816 | | for (i = 0; i < b->rows; i++) \ |
817 | | { \ |
818 | | for (j = 0; j < b->cols; j++) \ |
819 | | for (k = 0; k < ch; k++) \ |
820 | | kiss_ptr[k * nch + b->cols - 1 - j] = _for_get(m_ptr, j * ch + k); \ |
821 | | kiss_ptr -= cols; \ |
822 | | m_ptr += b->step; \ |
823 | | } |
824 | | ccv_matrix_typeof(fft_type, ccv_matrix_getter, b->type, for_block); |
825 | | #undef for_block |
826 | | if (fft_type == CCV_32F) |
827 | | for (k = 0; k < ch; k++) |
828 | | kissf_fftndr(pf, (kissf_fft_scalar*)kiss_b + nch * k, (kissf_fft_cpx*)kiss_bc + nchc * k); |
829 | | else |
830 | | for (k = 0; k < ch; k++) |
831 | | kiss_fftndr(p, (kiss_fft_scalar*)kiss_b + nch * k, (kiss_fft_cpx*)kiss_bc + nchc * k); |
832 | | /* why a->cols + cols - 2 * (b->cols & ~1) ? |
833 | | * what we really want is ceiling((a->cols - (b->cols & ~1)) / (cols - (b->cols & ~1))) |
834 | | * in this case, we strip out paddings on the left/right, and compute how many tiles |
835 | | * we need. It then be interpreted in the above integer division form */ |
836 | | int tile_x = ccv_max(1, (a->cols + cols - 2 * (b->cols & ~1)) / (cols - (b->cols & ~1))); |
837 | | int tile_y = ccv_max(1, (a->rows + rows - 2 * (b->rows & ~1)) / (rows - (b->rows & ~1))); |
838 | | int brows2 = b->rows / 2; |
839 | | int bcols2 = b->cols / 2; |
840 | | #define for_block(_for_type, _cpx_type, _for_set, _for_get) \ |
841 | | _for_type scale = 1.0 / (rows * cols); \ |
842 | | for (i = 0; i < tile_y; i++) \ |
843 | | for (j = 0; j < tile_x; j++) \ |
844 | | { \ |
845 | | int x, y; \ |
846 | | memset(kiss_a, 0, rows * cols * ch * sizeof(_for_type)); \ |
847 | | int iy = ccv_min(i * (rows - (b->rows & ~1)), ccv_max(a->rows - rows, 0)); \ |
848 | | int ix = ccv_min(j * (cols - (b->cols & ~1)), ccv_max(a->cols - cols, 0)); \ |
849 | | _for_type* kiss_ptr = (_for_type*)kiss_a; \ |
850 | | int end_y = ccv_min(rows, a->rows - iy); \ |
851 | | int end_x = ccv_min(cols, a->cols - ix); \ |
852 | | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(a, iy, ix, 0); \ |
853 | | for (y = 0; y < end_y; y++) \ |
854 | | { \ |
855 | | for (x = 0; x < end_x; x++) \ |
856 | | for (k = 0; k < ch; k++) \ |
857 | | kiss_ptr[k * nch + x] = _for_get(m_ptr, x * ch + k); \ |
858 | | kiss_ptr += cols; \ |
859 | | m_ptr += a->step; \ |
860 | | } \ |
861 | | for (k = 0; k < ch; k++) \ |
862 | | fft_ndr((_for_type*)kiss_a + nch * k, (_cpx_type*)kiss_ac + nchc * k); \ |
863 | | _cpx_type* fft_ac = (_cpx_type*)kiss_ac; \ |
864 | | _cpx_type* fft_bc = (_cpx_type*)kiss_bc; \ |
865 | | _cpx_type* fft_dc = (_cpx_type*)kiss_dc; \ |
866 | | for (x = 0; x < rows * ch * (cols / 2 + 1); x++) \ |
867 | | { \ |
868 | | fft_dc[x].r = (fft_ac[x].r * fft_bc[x].r - fft_ac[x].i * fft_bc[x].i) * scale; \ |
869 | | fft_dc[x].i = (fft_ac[x].i * fft_bc[x].r + fft_ac[x].r * fft_bc[x].i) * scale; \ |
870 | | } \ |
871 | | for (k = 0; k < ch; k++) \ |
872 | | fft_ndri((_cpx_type*)kiss_dc + nchc * k, (_for_type*)kiss_d + nch * k); \ |
873 | | kiss_ptr = (_for_type*)kiss_d + (1 + (i > 0)) * brows2 * cols + (1 + (j > 0)) * bcols2; \ |
874 | | end_y = ccv_min(d->rows - (iy + (i > 0) * brows2), \ |
875 | | (rows - (b->rows & ~1)) + (i == 0) * brows2); \ |
876 | | end_x = ccv_min(d->cols - (ix + (j > 0) * bcols2), \ |
877 | | (cols - (b->cols & ~1)) + (j == 0) * bcols2); \ |
878 | | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2, 0); \ |
879 | | for (y = 0; y < end_y; y++) \ |
880 | | { \ |
881 | | for (x = 0; x < end_x; x++) \ |
882 | | for (k = 0; k < ch; k++) \ |
883 | | _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \ |
884 | | m_ptr += d->step; \ |
885 | | kiss_ptr += cols; \ |
886 | | } \ |
887 | | int end_tile_y, end_tile_x; \ |
888 | | /* handle edge cases: */ \ |
889 | | if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows) \ |
890 | | { \ |
891 | | end_tile_y = ccv_min(brows2, d->rows - (iy + (i > 0) * brows2 + end_y)); \ |
892 | | kiss_ptr = (_for_type*)kiss_d + (1 + (j > 0)) * bcols2; \ |
893 | | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2, 0); \ |
894 | | for (y = 0; y < end_tile_y; y++) \ |
895 | | { \ |
896 | | for (x = 0; x < end_x; x++) \ |
897 | | for (k = 0; k < ch; k++) \ |
898 | | _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \ |
899 | | m_ptr += d->step; \ |
900 | | kiss_ptr += cols; \ |
901 | | } \ |
902 | | } \ |
903 | | if (j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols) \ |
904 | | { \ |
905 | | end_tile_x = ccv_min(bcols2, d->cols - (ix + (j > 0) * bcols2 + end_x)); \ |
906 | | kiss_ptr = (_for_type*)kiss_d + (1 + (i > 0)) * brows2 * cols; \ |
907 | | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2 + end_x, 0); \ |
908 | | for (y = 0; y < end_y; y++) \ |
909 | | { \ |
910 | | for (x = 0; x < end_tile_x; x++) \ |
911 | | for (k = 0; k < ch; k++) \ |
912 | | _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \ |
913 | | m_ptr += d->step; \ |
914 | | kiss_ptr += cols; \ |
915 | | } \ |
916 | | } \ |
917 | | if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows && \ |
918 | | j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols) \ |
919 | | { \ |
920 | | kiss_ptr = (_for_type*)kiss_d; \ |
921 | | m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2 + end_x, 0); \ |
922 | | for (y = 0; y < end_tile_y; y++) \ |
923 | | { \ |
924 | | for (x = 0; x < end_tile_x; x++) \ |
925 | | for (k = 0; k < ch; k++) \ |
926 | | _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \ |
927 | | m_ptr += d->step; \ |
928 | | kiss_ptr += cols; \ |
929 | | } \ |
930 | | } \ |
931 | | } |
932 | | if (fft_type == CCV_32F) |
933 | | { |
934 | | #define fft_ndr(r, c) kissf_fftndr(pf, r, c) |
935 | | #define fft_ndri(c, r) kissf_fftndri(pinvf, c, r) |
936 | | ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, kissf_fft_scalar, kissf_fft_cpx); |
937 | | #undef fft_ndr |
938 | | #undef fft_ndri |
939 | | kissf_fft_free(pf); |
940 | | kissf_fft_free(pinvf); |
941 | | } else { |
942 | | #define fft_ndr(r, c) kiss_fftndr(p, r, c) |
943 | | #define fft_ndri(c, r) kiss_fftndri(pinv, c, r) |
944 | | ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, kiss_fft_scalar, kiss_fft_cpx); |
945 | | #undef fft_ndr |
946 | | #undef fft_ndri |
947 | | kiss_fft_free(p); |
948 | | kiss_fft_free(pinv); |
949 | | } |
950 | | #undef for_block |
951 | | ccfree(kiss_dc); |
952 | | ccfree(kiss_bc); |
953 | | ccfree(kiss_ac); |
954 | | ccfree(kiss_d); |
955 | | ccfree(kiss_b); |
956 | | ccfree(kiss_a); |
957 | | } |
958 | | #endif |
959 | | |
960 | | static void _ccv_filter_direct_8u(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern) |
961 | 0 | { |
962 | 0 | int i, j, y, x, k; |
963 | 0 | int nz = b->rows * b->cols; |
964 | 0 | int* coeff = (int*)ccmalloc(nz * sizeof(int)); |
965 | 0 | int* cx = (int*)ccmalloc(nz * sizeof(int)); |
966 | 0 | int* cy = (int*)ccmalloc(nz * sizeof(int)); |
967 | 0 | int scale = 1 << 14; |
968 | 0 | nz = 0; |
969 | 0 | for (i = 0; i < b->rows; i++) |
970 | 0 | for (j = 0; j < b->cols; j++) |
971 | 0 | { |
972 | 0 | coeff[nz] = (int)(ccv_get_dense_matrix_cell_value(b, i, j, 0) * scale + 0.5); |
973 | 0 | if (coeff[nz] == 0) |
974 | 0 | continue; |
975 | 0 | cy[nz] = i; |
976 | 0 | cx[nz] = j; |
977 | 0 | nz++; |
978 | 0 | } |
979 | 0 | ccv_dense_matrix_t* pa = ccv_dense_matrix_new(a->rows + b->rows / 2 * 2, a->cols + b->cols / 2 * 2, CCV_8U | CCV_C1, 0, 0); |
980 | | /* the padding pattern is different from FFT: |aa{BORDER}|abcd|{BORDER}dd| */ |
981 | 0 | for (i = 0; i < pa->rows; i++) |
982 | 0 | for (j = 0; j < pa->cols; j++) |
983 | 0 | pa->data.u8[i * pa->step + j] = a->data.u8[ccv_clamp(i - b->rows / 2, 0, a->rows - 1) * a->step + ccv_clamp(j - b->cols / 2, 0, a->cols - 1)]; |
984 | 0 | unsigned char* m_ptr = d->data.u8; |
985 | 0 | unsigned char* a_ptr = pa->data.u8; |
986 | | /* 0.5 denote the overhead for indexing x and y */ |
987 | 0 | if (nz < b->rows * b->cols * 0.75) |
988 | 0 | { |
989 | 0 | for (i = 0; i < d->rows; i++) |
990 | 0 | { |
991 | 0 | for (j = 0; j < d->cols; j++) |
992 | 0 | { |
993 | 0 | int z = 0; |
994 | 0 | for (k = 0; k < nz; k++) |
995 | 0 | z += a_ptr[cy[k] * pa->step + j + cx[k]] * coeff[k]; |
996 | 0 | m_ptr[j] = ccv_clamp(z >> 14, 0, 255); |
997 | 0 | } |
998 | 0 | m_ptr += d->step; |
999 | 0 | a_ptr += pa->step; |
1000 | 0 | } |
1001 | 0 | } else { |
1002 | 0 | k = 0; |
1003 | 0 | for (i = 0; i < b->rows; i++) |
1004 | 0 | for (j = 0; j < b->cols; j++) |
1005 | 0 | { |
1006 | 0 | coeff[k] = (int)(ccv_get_dense_matrix_cell_value(b, i, j, 0) * scale + 0.5); |
1007 | 0 | k++; |
1008 | 0 | } |
1009 | 0 | for (i = 0; i < d->rows; i++) |
1010 | 0 | { |
1011 | 0 | for (j = 0; j < d->cols; j++) |
1012 | 0 | { |
1013 | 0 | int* c_ptr = coeff; |
1014 | 0 | int z = 0; |
1015 | 0 | for (y = 0; y < b->rows; y++) |
1016 | 0 | { |
1017 | 0 | int iyx = y * pa->step; |
1018 | 0 | for (x = 0; x < b->cols; x++) |
1019 | 0 | { |
1020 | 0 | z += a_ptr[iyx + j + x] * c_ptr[0]; |
1021 | 0 | c_ptr++; |
1022 | 0 | } |
1023 | 0 | } |
1024 | 0 | m_ptr[j] = ccv_clamp(z >> 14, 0, 255); |
1025 | 0 | } |
1026 | 0 | m_ptr += d->step; |
1027 | 0 | a_ptr += pa->step; |
1028 | 0 | } |
1029 | 0 | } |
1030 | 0 | ccv_matrix_free(pa); |
1031 | 0 | ccfree(coeff); |
1032 | 0 | ccfree(cx); |
1033 | 0 | ccfree(cy); |
1034 | 0 | } |
1035 | | |
1036 | | void ccv_filter(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t** d, int type, int padding_pattern) |
1037 | 6 | { |
1038 | 6 | ccv_declare_derived_signature(sig, a->sig != 0 && b->sig != 0, ccv_sign_with_literal("ccv_filter"), a->sig, b->sig, CCV_EOF_SIGN); |
1039 | 6 | type = (type == 0) ? CCV_GET_DATA_TYPE3 (a->type) | 3 CCV_GET_CHANNEL3 (a->type) : CCV_GET_DATA_TYPE3 (type) | 3 CCV_GET_CHANNEL3 (a->type); |
1040 | 6 | ccv_dense_matrix_t* dd = *d = ccv_dense_matrix_renew(*d, a->rows, a->cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), type, sig); |
1041 | 6 | ccv_object_return_if_cached(, dd); |
1042 | | |
1043 | | /* 15 is the constant to indicate the high cost of FFT (even with O(nlog(m)) for |
1044 | | * integer image. |
1045 | | * NOTE: FFT has time complexity of O(nlog(n)), however, for convolution, it |
1046 | | * is not the case. Convolving one image (a) to a kernel (b), can be done by |
1047 | | * dividing image a to several blocks proportional to (b). Thus, one don't need |
1048 | | * to do FFT for the whole image. The image can be divided to n/m part, and |
1049 | | * the FFT itself is O(mlog(m)), so, the convolution process has time complexity |
1050 | | * of O(nlog(m)) */ |
1051 | 6 | if ((b->rows * b->cols < (log((double)(b->rows * b->cols)) + 1) * 15) && (a->type & CCV_8U)1 ) |
1052 | 0 | { |
1053 | 0 | _ccv_filter_direct_8u(a, b, dd, padding_pattern); |
1054 | 6 | } else { |
1055 | 6 | #ifdef HAVE_FFTW3 |
1056 | 6 | _ccv_filter_fftw(a, b, dd, padding_pattern); |
1057 | | #else |
1058 | | _ccv_filter_kissfft(a, b, dd, padding_pattern); |
1059 | | #endif |
1060 | 6 | } |
1061 | 6 | } |
1062 | | |
1063 | | void ccv_filter_kernel(ccv_dense_matrix_t* x, ccv_filter_kernel_f func, void* data) |
1064 | 2 | { |
1065 | 2 | int i, j, k, ch = CCV_GET_CHANNEL(x->type); |
1066 | 2 | unsigned char* m_ptr = x->data.u8; |
1067 | 2 | double rows_2 = (x->rows - 1) * 0.5; |
1068 | 2 | double cols_2 = (x->cols - 1) * 0.5; |
1069 | 2 | #define for_block(_, _for_set) \ |
1070 | 203 | for (i = 0; 2 i < x->rows; i++201 ) \ |
1071 | 201 | { \ |
1072 | 20.4k | for (j = 0; j < x->cols; j++20.2k ) \ |
1073 | 20.2k | { \ |
1074 | 20.2k | double result = func(j - cols_2, i - rows_2, data); \ |
1075 | 80.8k | for (k = 0; k < ch; k++60.6k ) \ |
1076 | 20.2k | _for_set(m_ptr, j * ch + k, result); \ |
1077 | 20.2k | } \ |
1078 | 201 | m_ptr += x->step; \ |
1079 | 201 | } |
1080 | 2 | ccv_matrix_setter(x->type, for_block); |
1081 | 2 | #undef for_block |
1082 | 2 | ccv_make_matrix_immutable(x); |
1083 | 2 | } |
1084 | | |
1085 | | void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_dense_matrix_t** x, int x_type, ccv_dense_matrix_t** y, int y_type, double dx, double dy, double dxx, double dyy, int flag) |
1086 | 2 | { |
1087 | 2 | assert(!(flag & CCV_L2_NORM) && (flag & CCV_GSEDT)); |
1088 | 2 | ccv_declare_derived_signature(sig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN); |
1089 | 2 | type = (CCV_GET_DATA_TYPE(type) == CCV_64F || CCV_GET_DATA_TYPE(a->type) == CCV_64F || CCV_GET_DATA_TYPE(a->type) == CCV_64S) ? CCV_GET_CHANNEL0 (a->type) | CCV_64F0 : CCV_GET_CHANNEL(a->type) | CCV_32F; |
1090 | 2 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), type, sig); |
1091 | 2 | ccv_dense_matrix_t* mx = 0; |
1092 | 2 | ccv_dense_matrix_t* my = 0; |
1093 | 2 | if (x != 0) |
1094 | 0 | { |
1095 | 0 | ccv_declare_derived_signature(xsig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform_x(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN); |
1096 | 0 | mx = *x = ccv_dense_matrix_renew(*x, a->rows, a->cols, CCV_32S | CCV_C1, CCV_32S | CCV_C1, xsig); |
1097 | 0 | } |
1098 | 2 | if (y != 0) |
1099 | 0 | { |
1100 | 0 | ccv_declare_derived_signature(ysig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform_y(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN); |
1101 | 0 | my = *y = ccv_dense_matrix_renew(*y, a->rows, a->cols, CCV_32S | CCV_C1, CCV_32S | CCV_C1, ysig); |
1102 | 0 | } |
1103 | 2 | ccv_object_return_if_cached(, db, mx, my); |
1104 | 2 | ccv_revive_object_if_cached(db, mx, my); |
1105 | 2 | int i, j, k; |
1106 | 2 | unsigned char* a_ptr = a->data.u8; |
1107 | 2 | unsigned char* b_ptr = db->data.u8; |
1108 | 2 | int* v = (int*)alloca(sizeof(int) * ccv_max(db->rows, db->cols)); |
1109 | 2 | unsigned char* c_ptr = (unsigned char*)alloca(CCV_GET_DATA_TYPE_SIZE(db->type) * db->rows); |
1110 | 2 | int* x_ptr = mx ? mx->data.i320 : 0; |
1111 | 2 | int* y_ptr = my ? my->data.i320 : 0; |
1112 | 2 | #define for_block(_for_max, _for_type_b, _for_set_b, _for_get_b, _for_get_a) \ |
1113 | 2 | _for_type_b _dx = dx, _dy = dy, _dxx = dxx, _dyy = dyy; \ |
1114 | 2 | _for_type_b* z = (_for_type_b*)alloca(sizeof(_for_type_b) * (ccv_max(db->rows, db->cols) + 1)); \ |
1115 | 2 | if (_dxx > 1e-6) \ |
1116 | 2 | { \ |
1117 | 652 | for (i = 0; i < a->rows; i++650 ) \ |
1118 | 650 | { \ |
1119 | 650 | k = 0; \ |
1120 | 650 | v[0] = 0; \ |
1121 | 650 | z[0] = (_for_type_b)-_for_max; \ |
1122 | 650 | z[1] = (_for_type_b)_for_max; \ |
1123 | 260k | for (j = 1; j < a->cols; j++259k ) \ |
1124 | 259k | { \ |
1125 | 259k | _for_type_b s; \ |
1126 | 259k | for (;;) \ |
1127 | 270k | { \ |
1128 | 270k | assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ |
1129 | 270k | s = ((SGN _for_get_a(a_ptr, j) + _dxx * j * j - _dx * j) - (SGN _for_get_a(a_ptr, v[k]) + _dxx * v[k] * v[k] - _dx * v[k])) / (2.0 * _dxx * (j - v[k])); \ |
1130 | 270k | if (s > z[k]) break259k ; \ |
1131 | 270k | --k; \ |
1132 | 11.4k | } \ |
1133 | 259k | ++k; \ |
1134 | 259k | assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ |
1135 | 259k | v[k] = j; \ |
1136 | 259k | z[k] = s; \ |
1137 | 259k | z[k + 1] = (_for_type_b)_for_max; \ |
1138 | 259k | } \ |
1139 | 650 | assert(z[k + 1] >= a->cols - 1); \ |
1140 | 650 | k = 0; \ |
1141 | 650 | if (mx) \ |
1142 | 650 | { \ |
1143 | 0 | for (j = 0; j < a->cols; j++) \ |
1144 | 0 | { \ |
1145 | 0 | while (z[k + 1] < j) \ |
1146 | 0 | { \ |
1147 | 0 | assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ |
1148 | 0 | ++k; \ |
1149 | 0 | } \ |
1150 | 0 | _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k])); \ |
1151 | 0 | x_ptr[j] = j - v[k]; \ |
1152 | 0 | } \ |
1153 | 0 | x_ptr += mx->cols; \ |
1154 | 650 | } else { \ |
1155 | 260k | for (j = 0; j < a->cols; j++260k ) \ |
1156 | 260k | { \ |
1157 | 507k | while (z[k + 1] < j) \ |
1158 | 260k | { \ |
1159 | 247k | assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ |
1160 | 247k | ++k; \ |
1161 | 247k | } \ |
1162 | 260k | _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k])); \ |
1163 | 260k | } \ |
1164 | 650 | } \ |
1165 | 650 | a_ptr += a->step; \ |
1166 | 650 | b_ptr += db->step; \ |
1167 | 650 | } \ |
1168 | 2 | } else { /* above algorithm cannot handle dxx == 0 properly, below is special casing for that */ \ |
1169 | 0 | assert(mx == 0); \ |
1170 | 0 | for (i = 0; i < a->rows; i++) \ |
1171 | 0 | { \ |
1172 | 0 | for (j = 0; j < a->cols; j++) \ |
1173 | 0 | _for_set_b(b_ptr, j, SGN _for_get_a(a_ptr, j)); \ |
1174 | 0 | for (j = 1; j < a->cols; j++) \ |
1175 | 0 | _for_set_b(b_ptr, j, ccv_min(_for_get_b(b_ptr, j), _for_get_b(b_ptr, j - 1) + _dx)); \ |
1176 | 0 | for (j = a->cols - 2; j >= 0; j--) \ |
1177 | 0 | _for_set_b(b_ptr, j, ccv_min(_for_get_b(b_ptr, j), _for_get_b(b_ptr, j + 1) - _dx)); \ |
1178 | 0 | a_ptr += a->step; \ |
1179 | 0 | b_ptr += db->step; \ |
1180 | 0 | } \ |
1181 | 0 | } \ |
1182 | 2 | b_ptr = db->data.u8; \ |
1183 | 2 | if (_dyy > 1e-6) \ |
1184 | 2 | { \ |
1185 | 802 | for (j = 0; j < db->cols; j++800 ) \ |
1186 | 800 | { \ |
1187 | 260k | for (i = 0; i < db->rows; i++260k ) \ |
1188 | 800 | _for_set_b(c_ptr, i, _for_get_b(b_ptr + i * db->step, j)); \ |
1189 | 800 | k = 0; \ |
1190 | 800 | v[0] = 0; \ |
1191 | 800 | z[0] = (_for_type_b)-_for_max; \ |
1192 | 800 | z[1] = (_for_type_b)_for_max; \ |
1193 | 260k | for (i = 1; i < db->rows; i++259k ) \ |
1194 | 259k | { \ |
1195 | 259k | _for_type_b s; \ |
1196 | 259k | for (;;) \ |
1197 | 274k | { \ |
1198 | 274k | assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ |
1199 | 274k | s = ((_for_get_b(c_ptr, i) + _dyy * i * i - _dy * i) - (_for_get_b(c_ptr, v[k]) + _dyy * v[k] * v[k] - _dy * v[k])) / (2.0 * _dyy * (i - v[k])); \ |
1200 | 274k | if (s > z[k]) break259k ; \ |
1201 | 274k | --k; \ |
1202 | 15.6k | } \ |
1203 | 259k | ++k; \ |
1204 | 259k | assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ |
1205 | 259k | v[k] = i; \ |
1206 | 259k | z[k] = s; \ |
1207 | 259k | z[k + 1] = (_for_type_b)_for_max; \ |
1208 | 259k | } \ |
1209 | 800 | assert(z[k + 1] >= db->rows - 1); \ |
1210 | 800 | k = 0; \ |
1211 | 800 | if (my) \ |
1212 | 800 | { \ |
1213 | 0 | for (i = 0; i < db->rows; i++) \ |
1214 | 0 | { \ |
1215 | 0 | while (z[k + 1] < i) \ |
1216 | 0 | { \ |
1217 | 0 | assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ |
1218 | 0 | ++k; \ |
1219 | 0 | } \ |
1220 | 0 | _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k])); \ |
1221 | 0 | y_ptr[i * my->cols] = i - v[k]; \ |
1222 | 0 | } \ |
1223 | 0 | ++y_ptr; \ |
1224 | 800 | } else { \ |
1225 | 260k | for (i = 0; i < db->rows; i++260k ) \ |
1226 | 260k | { \ |
1227 | 503k | while (z[k + 1] < i) \ |
1228 | 260k | { \ |
1229 | 243k | assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ |
1230 | 243k | ++k; \ |
1231 | 243k | } \ |
1232 | 260k | _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k])); \ |
1233 | 260k | } \ |
1234 | 800 | } \ |
1235 | 800 | } \ |
1236 | 2 | } else { \ |
1237 | 0 | assert(my == 0); \ |
1238 | 0 | for (j = 0; j < db->cols; j++) \ |
1239 | 0 | { \ |
1240 | 0 | for (i = 1; i < db->rows; i++) \ |
1241 | 0 | _for_set_b(b_ptr + i * db->step, j, ccv_min(_for_get_b(b_ptr + i * db->step, j), _for_get_b(b_ptr + (i - 1) * db->step, j) + _dy)); \ |
1242 | 0 | for (i = db->rows - 2; i >= 0; i--) \ |
1243 | 0 | _for_set_b(b_ptr + i * db->step, j, ccv_min(_for_get_b(b_ptr + i * db->step, j), _for_get_b(b_ptr + (i + 1) * db->step, j) - _dy)); \ |
1244 | 0 | } \ |
1245 | 0 | } |
1246 | 2 | if (flag & CCV_NEGATIVE) |
1247 | 1 | { |
1248 | 261k | #define SGN - |
1249 | 1 | if (db->type & CCV_64F) |
1250 | 0 | { |
1251 | 0 | ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, DBL_MAX); |
1252 | 1 | } else { |
1253 | 1 | ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, FLT_MAX); |
1254 | 1 | } |
1255 | 1 | #undef SGN |
1256 | 1 | } else { |
1257 | 280k | #define SGN + |
1258 | 1 | if (db->type & CCV_64F) |
1259 | 0 | { |
1260 | 0 | ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, DBL_MAX); |
1261 | 1 | } else { |
1262 | 1 | ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, FLT_MAX); |
1263 | 1 | } |
1264 | 1 | #undef SGN |
1265 | 1 | } |
1266 | 2 | #undef for_block |
1267 | 2 | } |
1268 | | |
1269 | | inline static double _kmeans1d_cost(double* cumsum, double* cumsum2, int i, int j) |
1270 | 189M | { |
1271 | 189M | if (j < i) |
1272 | 15.6M | return 0; |
1273 | 173M | double mu = (cumsum[j + 1] - cumsum[i]) / (j - i + 1); |
1274 | 173M | double result = cumsum2[j + 1] - cumsum2[i]; |
1275 | 173M | result += (j - i + 1) * (mu * mu); |
1276 | 173M | result -= (2 * mu) * (cumsum[j + 1] - cumsum[i]); |
1277 | 173M | return result; |
1278 | 189M | } |
1279 | | |
1280 | | inline static double _kmeans1d_lookup(double* D, double* cumsum, double* cumsum2, int i, int j) |
1281 | 189M | { |
1282 | 189M | const int col = i < j - 1 ? i7.98M : j - 1181M ; |
1283 | 189M | return (col >= 0 ? D[col]188M : 0575k ) + _kmeans1d_cost(cumsum, cumsum2, j, i); |
1284 | 189M | } |
1285 | | |
1286 | | static void _smawk2(int row_start, int row_stride, int row_size, int* cols, int col_size, int* reserved, double* D, double* cumsum, double* cumsum2, int* result) |
1287 | 298k | { |
1288 | 298k | if (row_size == 0) |
1289 | 35.8k | return; |
1290 | 262k | int* _cols = cols + col_size; |
1291 | 262k | int _col_size = 0; |
1292 | 262k | int i; |
1293 | 19.4M | for (i = 0; i < col_size; i++19.2M ) |
1294 | 19.2M | { |
1295 | 19.2M | const int col = cols[i]; |
1296 | 19.2M | for (;;) |
1297 | 28.7M | { |
1298 | 28.7M | if (_col_size == 0) |
1299 | 485k | break; |
1300 | 28.2M | const int row = row_start + row_stride * (_col_size - 1); |
1301 | 28.2M | if (_kmeans1d_lookup(D, cumsum, cumsum2, row, col) >= _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[_col_size - 1])) |
1302 | 18.7M | break; |
1303 | 9.50M | --_col_size; |
1304 | 9.50M | } |
1305 | 19.2M | if (_col_size < row_size) |
1306 | 19.1M | { |
1307 | 19.1M | _cols[_col_size] = col; |
1308 | 19.1M | ++_col_size; |
1309 | 19.1M | } |
1310 | 19.2M | } |
1311 | 262k | _smawk2(row_start + row_stride, row_stride * 2, row_size / 2, _cols, _col_size, reserved, D, cumsum, cumsum2, result); |
1312 | | // Build the reverse lookup table. |
1313 | 9.86M | for (i = 0; i < _col_size; i++9.60M ) |
1314 | 9.60M | reserved[_cols[i]] = i; |
1315 | 262k | int start = 0; |
1316 | 5.09M | for (i = 0; i < row_size; i += 24.82M ) { |
1317 | 4.82M | const int row = row_start + i * row_stride; |
1318 | 4.82M | int stop = _col_size - 1; |
1319 | 4.82M | if (i < row_size - 1) |
1320 | 4.77M | { |
1321 | 4.77M | const int argmin = result[row_start + (i + 1) * row_stride]; |
1322 | 4.77M | stop = reserved[argmin]; |
1323 | 4.77M | } |
1324 | 4.82M | int argmin = _cols[start]; |
1325 | 4.82M | double min = _kmeans1d_lookup(D, cumsum, cumsum2, row, argmin); |
1326 | 4.82M | int c; |
1327 | 14.1M | for (c = start + 1; c <= stop; c++9.28M ) |
1328 | 9.28M | { |
1329 | 9.28M | double value = _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[c]); |
1330 | 9.28M | if (value < min) { |
1331 | 4.66M | argmin = _cols[c]; |
1332 | 4.66M | min = value; |
1333 | 4.66M | } |
1334 | 9.28M | } |
1335 | 4.82M | result[row] = argmin; |
1336 | 4.82M | start = stop; |
1337 | 4.82M | } |
1338 | 262k | } |
1339 | | |
1340 | | static void _smawk1(int row_start, int row_stride, int row_size, int* cols, int col_size, int* reserved, double* D, double* cumsum, double* cumsum2, int* result) |
1341 | 35.8k | { |
1342 | 35.8k | if (row_size == 0) |
1343 | 0 | return; |
1344 | 35.8k | int* _cols = cols; |
1345 | 35.8k | int _col_size = 0; |
1346 | 35.8k | int i; |
1347 | 19.3M | for (i = 0; i < col_size; i++19.3M ) |
1348 | 19.3M | { |
1349 | 19.3M | const int col = i; |
1350 | 19.3M | for (;;) |
1351 | 28.4M | { |
1352 | 28.4M | if (_col_size == 0) |
1353 | 37.1k | break; |
1354 | 28.3M | const int row = row_start + row_stride * (_col_size - 1); |
1355 | 28.3M | if (_kmeans1d_lookup(D, cumsum, cumsum2, row, col) >= _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[_col_size - 1])) |
1356 | 19.2M | break; |
1357 | 9.08M | --_col_size; |
1358 | 9.08M | } |
1359 | 19.3M | if (_col_size < row_size) |
1360 | 18.7M | { |
1361 | 18.7M | _cols[_col_size] = col; |
1362 | 18.7M | ++_col_size; |
1363 | 18.7M | } |
1364 | 19.3M | } |
1365 | 35.8k | _smawk2(row_start + row_stride, row_stride * 2, row_size / 2, _cols, _col_size, reserved, D, cumsum, cumsum2, result); |
1366 | | // Build the reverse lookup table. |
1367 | 9.69M | for (i = 0; i < _col_size; i++9.65M ) |
1368 | 9.65M | reserved[_cols[i]] = i; |
1369 | 35.8k | int start = 0; |
1370 | 4.86M | for (i = 0; i < row_size; i += 24.83M ) { |
1371 | 4.83M | const int row = row_start + i * row_stride; |
1372 | 4.83M | int stop = _col_size - 1; |
1373 | 4.83M | if (i < row_size - 1) |
1374 | 4.82M | { |
1375 | 4.82M | const int argmin = result[row_start + (i + 1) * row_stride]; |
1376 | 4.82M | stop = reserved[argmin]; |
1377 | 4.82M | } |
1378 | 4.83M | int argmin = _cols[start]; |
1379 | 4.83M | double min = _kmeans1d_lookup(D, cumsum, cumsum2, row, argmin); |
1380 | 4.83M | int c; |
1381 | 14.3M | for (c = start + 1; c <= stop; c++9.49M ) |
1382 | 9.49M | { |
1383 | 9.49M | double value = _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[c]); |
1384 | 9.49M | if (value < min) { |
1385 | 4.98M | argmin = _cols[c]; |
1386 | 4.98M | min = value; |
1387 | 4.98M | } |
1388 | 9.49M | } |
1389 | 4.83M | result[row] = argmin; |
1390 | 4.83M | start = stop; |
1391 | 4.83M | } |
1392 | 35.8k | } |
1393 | | |
1394 | | static void _smawk0(int row_start, int row_stride, int row_size, int* cols, int col_size, int* reserved, double* D, double* cumsum, double* cumsum2, int* result) |
1395 | 35.8k | { |
1396 | 35.8k | if (row_size == 0) |
1397 | 0 | return; |
1398 | 35.8k | _smawk1(row_start + row_stride, row_stride * 2, row_size / 2, cols, col_size, reserved, D, cumsum, cumsum2, result); |
1399 | | // Build the reverse lookup table. |
1400 | 35.8k | int start = 0; |
1401 | 35.8k | int i; |
1402 | 9.69M | for (i = 0; i < row_size; i += 29.66M ) { |
1403 | 9.66M | const int row = row_start + i * row_stride; |
1404 | 9.66M | int stop = col_size - 1; |
1405 | 9.66M | if (i < row_size - 1) |
1406 | 9.65M | { |
1407 | 9.65M | const int argmin = result[row_start + (i + 1) * row_stride]; |
1408 | 9.65M | stop = argmin; |
1409 | 9.65M | } |
1410 | 9.66M | int argmin = start; |
1411 | 9.66M | double min = _kmeans1d_lookup(D, cumsum, cumsum2, row, argmin); |
1412 | 9.66M | int c; |
1413 | 28.1M | for (c = start + 1; c <= stop; c++18.5M ) |
1414 | 18.5M | { |
1415 | 18.5M | double value = _kmeans1d_lookup(D, cumsum, cumsum2, row, c); |
1416 | 18.5M | if (value < min) { |
1417 | 9.72M | argmin = c; |
1418 | 9.72M | min = value; |
1419 | 9.72M | } |
1420 | 18.5M | } |
1421 | 9.66M | result[row] = argmin; |
1422 | 9.66M | start = stop; |
1423 | 9.66M | } |
1424 | 35.8k | } |
1425 | | |
1426 | | static void smawk(const int n, int* cols, double* D, double* cumsum, double* cumsum2, int* result) |
1427 | 35.8k | { |
1428 | 35.8k | _smawk0(0, 1, n, cols + n, n, cols, D, cumsum, cumsum2, result); |
1429 | 35.8k | } |
1430 | | |
1431 | | typedef struct { |
1432 | | double value; |
1433 | | int index; |
1434 | | } ccv_kmeans1d_undo_t; |
1435 | | |
1436 | | #undef more_than |
1437 | 2.04M | #define less_than(s1, s2, aux) ((s1).value < (s2).value) |
1438 | 2.04M | static CCV_IMPLEMENT_QSORT(_ccv_kmeans1d_undo_qsort, ccv_kmeans1d_undo_t, less_than) |
1439 | | #undef less_than |
1440 | | |
1441 | | void ccv_kmeans1d(const ccv_dense_matrix_t* const a, const int k, int* const clusters, double* const centroids) |
1442 | 742 | { |
1443 | 742 | assert(k > 1); |
1444 | 742 | assert(CCV_GET_CHANNEL(a->type) == CCV_C1); |
1445 | 742 | const int n = a->rows * a->cols; |
1446 | 742 | ccv_kmeans1d_undo_t* const sorted_undos = ccmalloc(sizeof(ccv_kmeans1d_undo_t) * n); |
1447 | 742 | int i; |
1448 | 742 | if (CCV_GET_DATA_TYPE(a->type) == CCV_16F) |
1449 | 252 | { |
1450 | 252 | float* f = (float*)sorted_undos; |
1451 | 252 | ccv_half_precision_to_float((uint16_t*)a->data.f16, (float*)f, n); |
1452 | 76.5k | for (i = n - 1; i >= 0; i--76.2k ) |
1453 | 76.2k | { |
1454 | 76.2k | sorted_undos[i].value = f[i]; |
1455 | 76.2k | sorted_undos[i].index = i; |
1456 | 76.2k | } |
1457 | 490 | } else if (CCV_GET_DATA_TYPE(a->type) == CCV_32F) { |
1458 | 62.5k | for (i = 0; i < n; i++62.2k ) |
1459 | 62.2k | { |
1460 | 62.2k | sorted_undos[i].value = a->data.f32[i]; |
1461 | 62.2k | sorted_undos[i].index = i; |
1462 | 62.2k | } |
1463 | 246 | } else if (244 CCV_GET_DATA_TYPE244 (a->type) == CCV_64F244 ) { |
1464 | 57.0k | for (i = 0; i < n; i++56.7k ) |
1465 | 56.7k | { |
1466 | 56.7k | sorted_undos[i].value = a->data.f64[i]; |
1467 | 56.7k | sorted_undos[i].index = i; |
1468 | 56.7k | } |
1469 | 244 | } |
1470 | 742 | _ccv_kmeans1d_undo_qsort(sorted_undos, n, 0); |
1471 | 742 | double* cc = ccmalloc(sizeof(double) * 2 * (n + 1)); |
1472 | 742 | cc[0] = 0; |
1473 | 742 | cc[n + 1] = 0; |
1474 | 742 | double cumsum = 0, cumsum2 = 0; |
1475 | 196k | for (i = 0; i < n; i++195k ) |
1476 | 195k | { |
1477 | 195k | cumsum += sorted_undos[i].value; |
1478 | 195k | cumsum2 += sorted_undos[i].value * sorted_undos[i].value; |
1479 | 195k | cc[i + 1] = cumsum; |
1480 | 195k | cc[i + 1 + n + 1] = cumsum2; |
1481 | 195k | } |
1482 | 742 | double* D = ccmalloc(sizeof(double) * 2 * n); |
1483 | 742 | int* T = ccmalloc(sizeof(int) * n * k); |
1484 | 196k | for (i = 0; i < n; i++195k ) |
1485 | 195k | { |
1486 | 195k | D[i] = _kmeans1d_cost(cc, cc + n + 1, 0, i); |
1487 | 195k | T[i] = 0; |
1488 | 195k | } |
1489 | 742 | int k_; |
1490 | 742 | int* cols = ccmalloc(sizeof(int) * n * 2); |
1491 | 36.5k | for (k_ = 1; k_ < k; k_++35.8k ) |
1492 | 35.8k | { |
1493 | 35.8k | double* lastD = D + ((k_ - 1) % 2) * n; |
1494 | 35.8k | int* const cT = T + k_ * n; |
1495 | 35.8k | smawk(n, cols, lastD, cc, cc + n + 1, cT); |
1496 | 35.8k | double* nextD = D + (k_ % 2) * n; |
1497 | 19.3M | for (i = 0; i < n; i++19.3M ) |
1498 | 19.3M | { |
1499 | 19.3M | const int argmin = cT[i]; |
1500 | 19.3M | nextD[i] = _kmeans1d_lookup(lastD, cc, cc + n + 1, i, argmin); |
1501 | 19.3M | } |
1502 | 35.8k | } |
1503 | 742 | ccfree(cc); |
1504 | 742 | ccfree(D); |
1505 | 742 | int t = n; |
1506 | 742 | k_ = k - 1; |
1507 | 742 | int n_ = n - 1; |
1508 | 36.1k | do { |
1509 | 36.1k | int t_ = t; |
1510 | 36.1k | t = T[k_ * n + n_]; |
1511 | 36.1k | double centroid = 0.0; |
1512 | 231k | for (i = t; i < t_; i++195k ) |
1513 | 195k | { |
1514 | 195k | cols[i] = k_; |
1515 | 195k | centroid += (sorted_undos[i].value - centroid) / (i - t + 1); |
1516 | 195k | } |
1517 | 36.1k | centroids[k_] = centroid; |
1518 | 36.1k | k_ -= 1; |
1519 | 36.1k | n_ = t - 1; |
1520 | 36.1k | } while (t > 0); |
1521 | 196k | for (i = 0; i < n; i++195k ) |
1522 | 195k | clusters[sorted_undos[i].index] = cols[i]; |
1523 | 742 | ccfree(cols); |
1524 | 742 | ccfree(sorted_undos); |
1525 | 742 | ccfree(T); |
1526 | 742 | } |