Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/ccv_numeric.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include <complex.h>
4
#ifdef HAVE_FFTW3
5
#include <pthread.h>
6
#include <fftw3.h>
7
#else
8
#include "3rdparty/kissfft/kiss_fftndr.h"
9
#include "3rdparty/kissfft/kissf_fftndr.h"
10
#endif
11
12
const ccv_minimize_param_t ccv_minimize_default_params = {
13
  .interp = 0.1,
14
  .extrap = 3.0,
15
  .max_iter = 20,
16
  .ratio = 10.0,
17
  .sig = 0.1,
18
  .rho = 0.05,
19
};
20
21
void ccv_invert(ccv_matrix_t* a, ccv_matrix_t** b, int type)
22
0
{
23
0
}
24
25
void ccv_solve(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type)
26
0
{
27
0
}
28
29
void ccv_eigen(ccv_dense_matrix_t* a, ccv_dense_matrix_t** vector, ccv_dense_matrix_t** lambda, int type, double epsilon)
30
1
{
31
1
  ccv_declare_derived_signature(vsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(vector)"), a->sig, CCV_EOF_SIGN);
32
1
  ccv_declare_derived_signature(lsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(lambda)"), a->sig, CCV_EOF_SIGN);
33
1
  assert(CCV_GET_CHANNEL(a->type) == 1);
34
1
  type = (type == 0) ? CCV_GET_DATA_TYPE(a->type) | CCV_C1 : 
CCV_GET_DATA_TYPE0
(type) | CCV_C10
;
35
  // as of now, this function only support real symmetric matrix
36
1
  ccv_dense_matrix_t* dvector = *vector = ccv_dense_matrix_renew(*vector, a->rows, a->cols, CCV_32F | CCV_64F | CCV_C1, type, vsig);
37
1
  ccv_dense_matrix_t* dlambda = *lambda = ccv_dense_matrix_renew(*lambda, 1, a->cols, CCV_32F | CCV_64F | CCV_C1, type, lsig);
38
1
  assert(CCV_GET_DATA_TYPE(dvector->type) == CCV_GET_DATA_TYPE(dlambda->type));
39
1
  ccv_object_return_if_cached(, dvector, dlambda);
40
1
  double* ja = (double*)ccmalloc(sizeof(double) * a->rows * a->cols);
41
1
  int i, j;
42
1
  unsigned char* aptr = a->data.u8;
43
1
  assert(a->rows > 0 && a->cols > 0);
44
1
#define for_block(_, _for_get) \
45
5
  
for (i = 0; 1
i < a->rows;
i++4
) \
46
4
  { \
47
20
    for (j = 0; j < a->cols; 
j++16
) \
48
16
      ja[i * a->cols + j] = 
_for_get4
(aptr, j); \
49
4
    aptr += a->step; \
50
4
  }
51
1
  ccv_matrix_getter(a->type, for_block);
52
1
#undef for_block
53
1
  ccv_zero(dvector);
54
1
  ccv_zero(dlambda);
55
1
  unsigned char* dvptr = dvector->data.u8;
56
1
#define for_block(_, _for_set) \
57
5
  
for (i = 0; 1
i < a->cols;
i++4
) \
58
1
    _for_set(dvptr, i * a->cols + i, 1);
59
1
  ccv_matrix_setter(dvector->type, for_block);
60
1
#undef for_block
61
1
  double accuracy = 0;
62
17
  for (i = 0; i < a->rows * a->cols; 
i++16
)
63
16
    accuracy += ja[i];
64
1
  accuracy = sqrt(2 * accuracy);
65
1
  int p, q;
66
1
  unsigned char* dlptr = dlambda->data.u8;
67
1
  int flag = 1;
68
1
  assert(a->rows == a->cols);
69
1
#define for_block(_, _for_set, _for_get) \
70
39
  
do 1
{ \
71
39
    if (!flag) \
72
39
      
accuracy = accuracy * 0.524
; \
73
39
    flag = 0; \
74
147
    for (p = 0; p < a->rows; 
p++108
) \
75
122
    { \
76
303
      for (q = p + 1; q < a->cols; 
q++181
) \
77
195
        if (fabs(ja[p * a->cols + q]) > accuracy) \
78
195
        { \
79
14
          double x = -ja[p * a->cols + q]; \
80
14
          double y = (ja[q * a->cols + q] - ja[p * a->cols + p]) * 0.5; \
81
14
          double omega = (x == 0 && 
y == 00
) ?
10
: x / sqrt(x * x + y * y); \
82
14
          if (y < 0) \
83
14
            
omega = -omega12
; \
84
14
          double sn = 1.0 + sqrt(1.0 - omega * omega); \
85
14
          sn = omega / sqrt(2 * sn); \
86
14
          double cn = sqrt(1.0 - sn * sn); \
87
14
          double fpp = ja[p * a->cols + p]; \
88
14
          double fpq = ja[p * a->cols + q]; \
89
14
          double fqq = ja[q * a->cols + q]; \
90
14
          ja[p * a->cols + p] = fpp * cn * cn + fqq * sn * sn + fpq * omega; \
91
14
          ja[q * a->cols + q] = fpp * sn * sn + fqq * cn * cn - fpq * omega; \
92
14
          ja[p * a->cols + q] = ja[q * a->cols + p] = 0; \
93
70
          for (i = 0; i < a->cols; 
i++56
) \
94
56
            if (i != q && 
i != p42
) \
95
56
            { \
96
28
              fpp = ja[p * a->cols + i]; \
97
28
              fqq = ja[q * a->cols + i]; \
98
28
              ja[p * a->cols + i] = fpp * cn + fqq * sn; \
99
28
              ja[q * a->cols + i] = -fpp * sn + fqq * cn; \
100
28
            } \
101
70
          for (i = 0; i < a->rows; 
i++56
) \
102
56
            if (i != q && 
i != p42
) \
103
56
            { \
104
28
              fpp = ja[i * a->cols + p]; \
105
28
              fqq = ja[i * a->cols + q]; \
106
28
              ja[i * a->cols + p] = fpp * cn + fqq * sn; \
107
28
              ja[i * a->cols + q] = -fpp * sn + fqq * cn; \
108
28
            } \
109
70
          for (i = 0; i < a->cols; 
i++56
) \
110
56
          { \
111
56
            fpp = _for_get(dvptr, p * a->cols + i); \
112
56
            fqq = _for_get(dvptr, q * a->cols + i); \
113
56
            _for_set(dvptr, p * a->cols + i, fpp * cn + fqq * sn); \
114
56
            _for_set(dvptr, q * a->cols + i, -fpp * sn + fqq * cn); \
115
56
          } \
116
70
          for (i = 0; i < a->cols; 
i++56
) \
117
14
            _for_set(dlptr, i, ja[i * a->cols + i]); \
118
14
          flag = 1; \
119
14
          break; \
120
14
        } \
121
122
      if (flag) \
122
122
        
break14
; \
123
122
    } \
124
39
  } while (accuracy > epsilon);
125
1
  ccv_matrix_setter_getter(dvector->type, for_block);
126
1
#undef for_block
127
1
  ccfree(ja);
128
1
}
129
130
void ccv_minimize(ccv_dense_matrix_t* x, int length, double red, ccv_minimize_f func, ccv_minimize_param_t params, void* data)
131
1
{
132
1
  ccv_dense_matrix_t* df0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
133
1
  ccv_zero(df0);
134
1
  ccv_dense_matrix_t* df3 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
135
1
  ccv_zero(df3);
136
1
  ccv_dense_matrix_t* dF0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
137
1
  ccv_zero(dF0);
138
1
  ccv_dense_matrix_t* s = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
139
1
  ccv_zero(s);
140
1
  ccv_dense_matrix_t* x0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
141
1
  ccv_zero(x0);
142
1
  ccv_dense_matrix_t* xn = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
143
1
  ccv_zero(xn);
144
  
145
1
  double F0 = 0, f0 = 0, f1 = 0, f2 = 0, f3 = 0, f4 = 0;
146
1
  double x1 = 0, x2 = 0, x3 = 0, x4 = 0;
147
1
  double d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0;
148
1
  double A = 0, B = 0;
149
1
  int ls_failed = 0;
150
151
1
  int i, j, k;
152
1
  func(x, &f0, df0, data);
153
1
  d0 = 0;
154
1
  unsigned char* df0p = df0->data.u8;
155
1
  unsigned char* sp = s->data.u8;
156
2
  for (i = 0; i < x->rows; 
i++1
)
157
1
  {
158
3
    for (j = 0; j < x->cols; 
j++2
)
159
2
    {
160
2
      double ss = ccv_get_value(x->type, df0p, j);
161
2
      ccv_set_value(x->type, sp, j, -ss, 0);
162
2
      d0 += -ss * ss;
163
2
    }
164
1
    df0p += x->step;
165
1
    sp += x->step;
166
1
  }
167
1
  x3 = red / (1.0 - d0);
168
1
  int l = (length > 0) ? length : 
-length0
;
169
1
  int ls = (length > 0) ? 1 : 
00
;
170
1
  int eh = (length > 0) ? 0 : 
10
;
171
22
  for (k = 0; k < l;)
172
22
  {
173
22
    k += ls;
174
22
    memcpy(x0->data.u8, x->data.u8, x->rows * x->step);
175
22
    memcpy(dF0->data.u8, df0->data.u8, x->rows * x->step);
176
22
    F0 = f0;
177
22
    int m = ccv_min(params.max_iter, (length > 0) ? params.max_iter : l - k);
178
22
    for (;;)
179
80
    {
180
80
      x2 = 0;
181
80
      f3 = f2 = f0;
182
80
      d2 = d0;
183
80
      memcpy(df3->data.u8, df0->data.u8, x->rows * x->step);
184
80
      while (m > 0)
185
80
      {
186
80
        m--;
187
80
        k += eh;
188
80
        unsigned char* sp = s->data.u8;
189
80
        unsigned char* xp = x->data.u8;
190
80
        unsigned char* xnp = xn->data.u8;
191
160
        for (i = 0; i < x->rows; 
i++80
)
192
80
        {
193
240
          for (j = 0; j < x->cols; 
j++160
)
194
160
            ccv_set_value(x->type, xnp, j, x3 * ccv_get_value(x->type, sp, j) + ccv_get_value(x->type, xp, j), 0);
195
80
          sp += x->step;
196
80
          xp += x->step;
197
80
          xnp += x->step;
198
80
        }
199
80
        if (func(xn, &f3, df3, data) == 0)
200
80
          break;
201
0
        else
202
0
          x3 = (x2 + x3) * 0.5;
203
80
      }
204
80
      if (f3 < F0)
205
27
      {
206
27
        memcpy(x0->data.u8, xn->data.u8, x->rows * x->step);
207
27
        memcpy(dF0->data.u8, df3->data.u8, x->rows * x->step);
208
27
        F0 = f3;
209
27
      }
210
80
      d3 = 0;
211
80
      unsigned char* df3p = df3->data.u8;
212
80
      unsigned char* sp = s->data.u8;
213
160
      for (i = 0; i < x->rows; 
i++80
)
214
80
      {
215
240
        for (j = 0; j < x->cols; 
j++160
)
216
160
          d3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, sp, j);
217
80
        df3p += x->step;
218
80
        sp += x->step;
219
80
      }
220
80
      if ((d3 > params.sig * d0) || 
(f3 > f0 + x3 * params.rho * d0)61
||
(m <= 0)60
)
221
22
        break;
222
58
      x1 = x2; f1 = f2; d1 = d2;
223
58
      x2 = x3; f2 = f3; d2 = d3;
224
58
      A = 6.0 * (f1 - f2) + 3.0 * (d2 + d1) * (x2 - x1);
225
58
      B = 3.0 * (f2 - f1) - (2.0 * d1 + d2) * (x2 - x1);
226
58
      x3 = B * B - A * d1 * (x2 - x1);
227
58
      if (x3 < 0)
228
8
        x3 = x2 * params.extrap;
229
50
      else {
230
50
        x3 = x1 - d1 * (x2 - x1) * (x2 - x1) / (B + sqrt(x3));
231
50
        if (x3 < 0)
232
0
          x3 = x2 * params.extrap;
233
50
        else {
234
50
          if (x3 > x2 * params.extrap)
235
8
            x3 = x2 * params.extrap;
236
42
          else if (x3 < x2 + params.interp * (x2 - x1))
237
38
            x3 = x2 + params.interp * (x2 - x1);
238
50
        }
239
50
      }
240
58
    }
241
88
    
while (22
((fabs(d3) > -params.sig * d0) ||
(f3 > f0 + x3 * params.rho * d0)18
) &&
(m > 0)70
)
242
66
    {
243
66
      if ((d3 > 1e-8) || 
(f3 > f0 + x3 * params.rho * d0)52
)
244
38
      {
245
38
        x4 = x3; f4 = f3; d4 = d3;
246
38
      } else {
247
28
        x2 = x3; f2 = f3; d2 = d3;
248
28
      }
249
66
      if (f4 > f0)
250
43
        x3 = x2 - (0.5 * d2 * (x4 - x2) * (x4 - x2)) / (f4 - f2 - d2 * (x4 - x2));
251
23
      else {
252
23
        A = 6.0 * (f2 - f4) / (x4 - x2) + 3.0 * (d4 + d2);
253
23
        B = 3.0 * (f4 - f2) - (2.0 * d2 + d4) * (x4 - x2);
254
23
        x3 = B * B - A * d2 * (x4 - x2) * (x4 - x2);
255
23
        x3 = (x3 < 0) ? 
(x2 + x4) * 0.50
: x2 + (sqrt(x3) - B) / A;
256
23
      }
257
66
      x3 = ccv_max(ccv_min(x3, x4 - params.interp * (x4 - x2)), x2 + params.interp * (x4 - x2));
258
66
      sp = s->data.u8;
259
66
      unsigned char* xp = x->data.u8;
260
66
      unsigned char* xnp = xn->data.u8;
261
132
      for (i = 0; i < x->rows; 
i++66
)
262
66
      {
263
198
        for (j = 0; j < x->cols; 
j++132
)
264
132
          ccv_set_value(x->type, xnp, j, x3 * ccv_get_value(x->type, sp, j) + ccv_get_value(x->type, xp, j), 0);
265
66
        sp += x->step;
266
66
        xp += x->step;
267
66
        xnp += x->step;
268
66
      }
269
66
      func(xn, &f3, df3, data);
270
66
      if (f3 < F0)
271
24
      {
272
24
        memcpy(x0->data.u8, xn->data.u8, x->rows * x->step);
273
24
        memcpy(dF0->data.u8, df3->data.u8, x->rows * x->step);
274
24
        F0 = f3;
275
24
      }
276
66
      m--;
277
66
      k += eh;
278
66
      d3 = 0;
279
66
      sp = s->data.u8;
280
66
      unsigned char* df3p = df3->data.u8;
281
132
      for (i = 0; i < x->rows; 
i++66
)
282
66
      {
283
198
        for (j = 0; j < x->cols; 
j++132
)
284
132
          d3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, sp, j);
285
66
        df3p += x->step;
286
66
        sp += x->step;
287
66
      }
288
66
    }
289
22
    if ((fabs(d3) < -params.sig * d0) && 
(f3 < f0 + x3 * params.rho * d0)18
)
290
18
    {
291
18
      memcpy(x->data.u8, xn->data.u8, x->rows * x->step);
292
18
      f0 = f3;
293
18
      double df0_df3 = 0;
294
18
      double df3_df3 = 0;
295
18
      double df0_df0 = 0;
296
18
      unsigned char* df0p = df0->data.u8;
297
18
      unsigned char* df3p = df3->data.u8;
298
36
      for (i = 0; i < x->rows; 
i++18
)
299
18
      {
300
54
        for (j = 0; j < x->cols; 
j++36
)
301
36
        {
302
36
          df0_df0 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, df0p, j);
303
36
          df0_df3 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, df3p, j);
304
36
          df3_df3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, df3p, j);
305
36
        }
306
18
        df0p += x->step;
307
18
        df3p += x->step;
308
18
      }
309
18
      double slr = (df3_df3 - df0_df3) / df0_df0;
310
18
      df3p = df3->data.u8;
311
18
      unsigned char* sp = s->data.u8;
312
36
      for (i = 0; i < x->rows; 
i++18
)
313
18
      {
314
54
        for (j = 0; j < x->cols; 
j++36
)
315
36
          ccv_set_value(x->type, sp, j, slr * ccv_get_value(x->type, sp, j) - ccv_get_value(x->type, df3p, j), 0);
316
18
        df3p += x->step;
317
18
        sp += x->step;
318
18
      }
319
18
      memcpy(df0->data.u8, df3->data.u8, x->rows * x->step);
320
18
      d3 = d0;
321
18
      d0 = 0;
322
18
      df0p = df0->data.u8;
323
18
      sp = s->data.u8;
324
36
      for (i = 0; i < x->rows; 
i++18
)
325
18
      {
326
54
        for (j = 0; j < x->cols; 
j++36
)
327
36
        {
328
36
          d0 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, sp, j);
329
36
        }
330
18
        df0p += x->step;
331
18
        sp += x->step;
332
18
      }
333
18
      if (d0 > 0)
334
1
      {
335
1
        d0 = 0;
336
1
        df0p = df0->data.u8;
337
1
        sp = s->data.u8;
338
2
        for (i = 0; i < x->rows; 
i++1
)
339
1
        {
340
3
          for (j = 0; j < x->cols; 
j++2
)
341
2
          {
342
2
            double ss = ccv_get_value(x->type, df0p, j);
343
2
            ccv_set_value(x->type, sp, j, -ss, 0);
344
2
            d0 += -ss * ss;
345
2
          }
346
1
          df0p += x->step;
347
1
          sp += x->step;
348
1
        }
349
1
      }
350
18
      x3 = x3 * ccv_min(params.ratio, d3 / (d0 - 1e-8));
351
18
      ls_failed = 0;
352
18
    } else {
353
4
      memcpy(x->data.u8, x0->data.u8, x->rows * x->step);
354
4
      memcpy(df0->data.u8, dF0->data.u8, x->rows * x->step);
355
4
      f0 = F0;
356
4
      if (ls_failed)
357
1
        break;
358
3
      d0 = 0;
359
3
      unsigned char* df0p = df0->data.u8;
360
3
      unsigned char* sp = s->data.u8;
361
6
      for (i = 0; i < x->rows; 
i++3
)
362
3
      {
363
9
        for (j = 0; j < x->cols; 
j++6
)
364
6
        {
365
6
          double ss = ccv_get_value(x->type, df0p, j);
366
6
          ccv_set_value(x->type, sp, j, -ss, 0);
367
6
          d0 += -ss * ss;
368
6
        }
369
3
        df0p += x->step;
370
3
        sp += x->step;
371
3
      }
372
3
      x3 = red / (1.0 - d0);
373
3
      ls_failed = 1;
374
3
    }
375
22
  }
376
1
  ccv_matrix_free(s);
377
1
  ccv_matrix_free(x0);
378
1
  ccv_matrix_free(xn);
379
1
  ccv_matrix_free(dF0);
380
1
  ccv_matrix_free(df0);
381
1
  ccv_matrix_free(df3);
382
1
}
383
384
#ifdef HAVE_FFTW3
385
/* optimal FFT size table is adopted from OpenCV */
386
static const int _ccv_optimal_fft_size[] = {
387
  1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40, 45, 48, 
388
  50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, 150, 160, 
389
  162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288, 300, 320, 324, 360, 375, 
390
  384, 400, 405, 432, 450, 480, 486, 500, 512, 540, 576, 600, 625, 640, 648, 675, 720, 
391
  729, 750, 768, 800, 810, 864, 900, 960, 972, 1000, 1024, 1080, 1125, 1152, 1200, 
392
  1215, 1250, 1280, 1296, 1350, 1440, 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 
393
  1920, 1944, 2000, 2025, 2048, 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 
394
  2700, 2880, 2916, 3000, 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 
395
  3888, 4000, 4050, 4096, 4320, 4374, 4500, 4608, 4800, 4860, 5000, 5120, 5184, 5400, 
396
  5625, 5760, 5832, 6000, 6075, 6144, 6250, 6400, 6480, 6561, 6750, 6912, 7200, 7290, 
397
  7500, 7680, 7776, 8000, 8100, 8192, 8640, 8748, 9000, 9216, 9375, 9600, 9720, 10000, 
398
  10125, 10240, 10368, 10800, 10935, 11250, 11520, 11664, 12000, 12150, 12288, 12500, 
399
  12800, 12960, 13122, 13500, 13824, 14400, 14580, 15000, 15360, 15552, 15625, 16000, 
400
  16200, 16384, 16875, 17280, 17496, 18000, 18225, 18432, 18750, 19200, 19440, 19683, 
401
  20000, 20250, 20480, 20736, 21600, 21870, 22500, 23040, 23328, 24000, 24300, 24576, 
402
  25000, 25600, 25920, 26244, 27000, 27648, 28125, 28800, 29160, 30000, 30375, 30720, 
403
  31104, 31250, 32000, 32400, 32768, 32805, 33750, 34560, 34992, 36000, 36450, 36864, 
404
  37500, 38400, 38880, 39366, 40000, 40500, 40960, 41472, 43200, 43740, 45000, 46080, 
405
  46656, 46875, 48000, 48600, 49152, 50000, 50625, 51200, 51840, 52488, 54000, 54675, 
406
  55296, 56250, 57600, 58320, 59049, 60000, 60750, 61440, 62208, 62500, 64000, 64800, 
407
  65536, 65610, 67500, 69120, 69984, 72000, 72900, 73728, 75000, 76800, 77760, 78125, 
408
  78732, 80000, 81000, 81920, 82944, 84375, 86400, 87480, 90000, 91125, 92160, 93312, 
409
  93750, 96000, 97200, 98304, 98415, 100000, 101250, 102400, 103680, 104976, 108000, 
410
  109350, 110592, 112500, 115200, 116640, 118098, 120000, 121500, 122880, 124416, 125000, 
411
  128000, 129600, 131072, 131220, 135000, 138240, 139968, 140625, 144000, 145800, 147456, 
412
  150000, 151875, 153600, 155520, 156250, 157464, 160000, 162000, 163840, 164025, 165888, 
413
  168750, 172800, 174960, 177147, 180000, 182250, 184320, 186624, 187500, 192000, 194400, 
414
  196608, 196830, 200000, 202500, 204800, 207360, 209952, 216000, 218700, 221184, 225000, 
415
  230400, 233280, 234375, 236196, 240000, 243000, 245760, 248832, 250000, 253125, 256000, 
416
  259200, 262144, 262440, 270000, 273375, 276480, 279936, 281250, 288000, 291600, 294912, 
417
  295245, 300000, 303750, 307200, 311040, 312500, 314928, 320000, 324000, 327680, 328050, 
418
  331776, 337500, 345600, 349920, 354294, 360000, 364500, 368640, 373248, 375000, 384000, 
419
  388800, 390625, 393216, 393660, 400000, 405000, 409600, 414720, 419904, 421875, 432000, 
420
  437400, 442368, 450000, 455625, 460800, 466560, 468750, 472392, 480000, 486000, 491520, 
421
  492075, 497664, 500000, 506250, 512000, 518400, 524288, 524880, 531441, 540000, 546750, 
422
  552960, 559872, 562500, 576000, 583200, 589824, 590490, 600000, 607500, 614400, 622080, 
423
  625000, 629856, 640000, 648000, 655360, 656100, 663552, 675000, 691200, 699840, 703125, 
424
  708588, 720000, 729000, 737280, 746496, 750000, 759375, 768000, 777600, 781250, 786432, 
425
  787320, 800000, 810000, 819200, 820125, 829440, 839808, 843750, 864000, 874800, 884736, 
426
  885735, 900000, 911250, 921600, 933120, 937500, 944784, 960000, 972000, 983040, 984150, 
427
  995328, 1000000, 1012500, 1024000, 1036800, 1048576, 1049760, 1062882, 1080000, 1093500, 
428
  1105920, 1119744, 1125000, 1152000, 1166400, 1171875, 1179648, 1180980, 1200000, 
429
  1215000, 1228800, 1244160, 1250000, 1259712, 1265625, 1280000, 1296000, 1310720, 
430
  1312200, 1327104, 1350000, 1366875, 1382400, 1399680, 1406250, 1417176, 1440000, 
431
  1458000, 1474560, 1476225, 1492992, 1500000, 1518750, 1536000, 1555200, 1562500, 
432
  1572864, 1574640, 1594323, 1600000, 1620000, 1638400, 1640250, 1658880, 1679616, 
433
  1687500, 1728000, 1749600, 1769472, 1771470, 1800000, 1822500, 1843200, 1866240, 
434
  1875000, 1889568, 1920000, 1944000, 1953125, 1966080, 1968300, 1990656, 2000000, 
435
  2025000, 2048000, 2073600, 2097152, 2099520, 2109375, 2125764, 2160000, 2187000, 
436
  2211840, 2239488, 2250000, 2278125, 2304000, 2332800, 2343750, 2359296, 2361960, 
437
  2400000, 2430000, 2457600, 2460375, 2488320, 2500000, 2519424, 2531250, 2560000, 
438
  2592000, 2621440, 2624400, 2654208, 2657205, 2700000, 2733750, 2764800, 2799360, 
439
  2812500, 2834352, 2880000, 2916000, 2949120, 2952450, 2985984, 3000000, 3037500, 
440
  3072000, 3110400, 3125000, 3145728, 3149280, 3188646, 3200000, 3240000, 3276800, 
441
  3280500, 3317760, 3359232, 3375000, 3456000, 3499200, 3515625, 3538944, 3542940, 
442
  3600000, 3645000, 3686400, 3732480, 3750000, 3779136, 3796875, 3840000, 3888000, 
443
  3906250, 3932160, 3936600, 3981312, 4000000, 4050000, 4096000, 4100625, 4147200, 
444
  4194304, 4199040, 4218750, 4251528, 4320000, 4374000, 4423680, 4428675, 4478976, 
445
  4500000, 4556250, 4608000, 4665600, 4687500, 4718592, 4723920, 4782969, 4800000, 
446
  4860000, 4915200, 4920750, 4976640, 5000000, 5038848, 5062500, 5120000, 5184000, 
447
  5242880, 5248800, 5308416, 5314410, 5400000, 5467500, 5529600, 5598720, 5625000, 
448
  5668704, 5760000, 5832000, 5859375, 5898240, 5904900, 5971968, 6000000, 6075000, 
449
  6144000, 6220800, 6250000, 6291456, 6298560, 6328125, 6377292, 6400000, 6480000, 
450
  6553600, 6561000, 6635520, 6718464, 6750000, 6834375, 6912000, 6998400, 7031250, 
451
  7077888, 7085880, 7200000, 7290000, 7372800, 7381125, 7464960, 7500000, 7558272, 
452
  7593750, 7680000, 7776000, 7812500, 7864320, 7873200, 7962624, 7971615, 8000000, 
453
  8100000, 8192000, 8201250, 8294400, 8388608, 8398080, 8437500, 8503056, 8640000, 
454
  8748000, 8847360, 8857350, 8957952, 9000000, 9112500, 9216000, 9331200, 9375000, 
455
  9437184, 9447840, 9565938, 9600000, 9720000, 9765625, 9830400, 9841500, 9953280, 
456
  10000000, 10077696, 10125000, 10240000, 10368000, 10485760, 10497600, 10546875, 10616832, 
457
  10628820, 10800000, 10935000, 11059200, 11197440, 11250000, 11337408, 11390625, 11520000, 
458
  11664000, 11718750, 11796480, 11809800, 11943936, 12000000, 12150000, 12288000, 12301875, 
459
  12441600, 12500000, 12582912, 12597120, 12656250, 12754584, 12800000, 12960000, 13107200, 
460
  13122000, 13271040, 13286025, 13436928, 13500000, 13668750, 13824000, 13996800, 14062500, 
461
  14155776, 14171760, 14400000, 14580000, 14745600, 14762250, 14929920, 15000000, 15116544, 
462
  15187500, 15360000, 15552000, 15625000, 15728640, 15746400, 15925248, 15943230, 16000000, 
463
  16200000, 16384000, 16402500, 16588800, 16777216, 16796160, 16875000, 17006112, 17280000, 
464
  17496000, 17578125, 17694720, 17714700, 17915904, 18000000, 18225000, 18432000, 18662400, 
465
  18750000, 18874368, 18895680, 18984375, 19131876, 19200000, 19440000, 19531250, 19660800, 
466
  19683000, 19906560, 20000000, 20155392, 20250000, 20480000, 20503125, 20736000, 20971520, 
467
  20995200, 21093750, 21233664, 21257640, 21600000, 21870000, 22118400, 22143375, 22394880, 
468
  22500000, 22674816, 22781250, 23040000, 23328000, 23437500, 23592960, 23619600, 23887872, 
469
  23914845, 24000000, 24300000, 24576000, 24603750, 24883200, 25000000, 25165824, 25194240, 
470
  25312500, 25509168, 25600000, 25920000, 26214400, 26244000, 26542080, 26572050, 26873856, 
471
  27000000, 27337500, 27648000, 27993600, 28125000, 28311552, 28343520, 28800000, 29160000, 
472
  29296875, 29491200, 29524500, 29859840, 30000000, 30233088, 30375000, 30720000, 31104000, 
473
  31250000, 31457280, 31492800, 31640625, 31850496, 31886460, 32000000, 32400000, 32768000, 
474
  32805000, 33177600, 33554432, 33592320, 33750000, 34012224, 34171875, 34560000, 34992000, 
475
  35156250, 35389440, 35429400, 35831808, 36000000, 36450000, 36864000, 36905625, 37324800, 
476
  37500000, 37748736, 37791360, 37968750, 38263752, 38400000, 38880000, 39062500, 39321600, 
477
  39366000, 39813120, 39858075, 40000000, 40310784, 40500000, 40960000, 41006250, 41472000, 
478
  41943040, 41990400, 42187500, 42467328, 42515280, 43200000, 43740000, 44236800, 44286750, 
479
  44789760, 45000000, 45349632, 45562500, 46080000, 46656000, 46875000, 47185920, 47239200, 
480
  47775744, 47829690, 48000000, 48600000, 48828125, 49152000, 49207500, 49766400, 50000000, 
481
  50331648, 50388480, 50625000, 51018336, 51200000, 51840000, 52428800, 52488000, 52734375, 
482
  53084160, 53144100, 53747712, 54000000, 54675000, 55296000, 55987200, 56250000, 56623104, 
483
  56687040, 56953125, 57600000, 58320000, 58593750, 58982400, 59049000, 59719680, 60000000, 
484
  60466176, 60750000, 61440000, 61509375, 62208000, 62500000, 62914560, 62985600, 63281250, 
485
  63700992, 63772920, 64000000, 64800000, 65536000, 65610000, 66355200, 66430125, 67108864, 
486
  67184640, 67500000, 68024448, 68343750, 69120000, 69984000, 70312500, 70778880, 70858800, 
487
  71663616, 72000000, 72900000, 73728000, 73811250, 74649600, 75000000, 75497472, 75582720, 
488
  75937500, 76527504, 76800000, 77760000, 78125000, 78643200, 78732000, 79626240, 79716150, 
489
  80000000, 80621568, 81000000, 81920000, 82012500, 82944000, 83886080, 83980800, 84375000, 
490
  84934656, 85030560, 86400000, 87480000, 87890625, 88473600, 88573500, 89579520, 90000000, 
491
  90699264, 91125000, 92160000, 93312000, 93750000, 94371840, 94478400, 94921875, 95551488, 
492
  95659380, 96000000, 97200000, 97656250, 98304000, 98415000, 99532800, 100000000, 
493
  100663296, 100776960, 101250000, 102036672, 102400000, 102515625, 103680000, 104857600, 
494
  104976000, 105468750, 106168320, 106288200, 107495424, 108000000, 109350000, 110592000, 
495
  110716875, 111974400, 112500000, 113246208, 113374080, 113906250, 115200000, 116640000, 
496
  117187500, 117964800, 118098000, 119439360, 119574225, 120000000, 120932352, 121500000, 
497
  122880000, 123018750, 124416000, 125000000, 125829120, 125971200, 126562500, 127401984, 
498
  127545840, 128000000, 129600000, 131072000, 131220000, 132710400, 132860250, 134217728, 
499
  134369280, 135000000, 136048896, 136687500, 138240000, 139968000, 140625000, 141557760, 
500
  141717600, 143327232, 144000000, 145800000, 146484375, 147456000, 147622500, 149299200, 
501
  150000000, 150994944, 151165440, 151875000, 153055008, 153600000, 155520000, 156250000, 
502
  157286400, 157464000, 158203125, 159252480, 159432300, 160000000, 161243136, 162000000, 
503
  163840000, 164025000, 165888000, 167772160, 167961600, 168750000, 169869312, 170061120, 
504
  170859375, 172800000, 174960000, 175781250, 176947200, 177147000, 179159040, 180000000, 
505
  181398528, 182250000, 184320000, 184528125, 186624000, 187500000, 188743680, 188956800, 
506
  189843750, 191102976, 191318760, 192000000, 194400000, 195312500, 196608000, 196830000, 
507
  199065600, 199290375, 200000000, 201326592, 201553920, 202500000, 204073344, 204800000, 
508
  205031250, 207360000, 209715200, 209952000, 210937500, 212336640, 212576400, 214990848, 
509
  216000000, 218700000, 221184000, 221433750, 223948800, 225000000, 226492416, 226748160, 
510
  227812500, 230400000, 233280000, 234375000, 235929600, 236196000, 238878720, 239148450, 
511
  240000000, 241864704, 243000000, 244140625, 245760000, 246037500, 248832000, 250000000, 
512
  251658240, 251942400, 253125000, 254803968, 255091680, 256000000, 259200000, 262144000, 
513
  262440000, 263671875, 265420800, 265720500, 268435456, 268738560, 270000000, 272097792, 
514
  273375000, 276480000, 279936000, 281250000, 283115520, 283435200, 284765625, 286654464, 
515
  288000000, 291600000, 292968750, 294912000, 295245000, 298598400, 300000000, 301989888, 
516
  302330880, 303750000, 306110016, 307200000, 307546875, 311040000, 312500000, 314572800, 
517
  314928000, 316406250, 318504960, 318864600, 320000000, 322486272, 324000000, 327680000, 
518
  328050000, 331776000, 332150625, 335544320, 335923200, 337500000, 339738624, 340122240, 
519
  341718750, 345600000, 349920000, 351562500, 353894400, 354294000, 358318080, 360000000, 
520
  362797056, 364500000, 368640000, 369056250, 373248000, 375000000, 377487360, 377913600, 
521
  379687500, 382205952, 382637520, 384000000, 388800000, 390625000, 393216000, 393660000, 
522
  398131200, 398580750, 400000000, 402653184, 403107840, 405000000, 408146688, 409600000, 
523
  410062500, 414720000, 419430400, 419904000, 421875000, 424673280, 425152800, 429981696, 
524
  432000000, 437400000, 439453125, 442368000, 442867500, 447897600, 450000000, 452984832, 
525
  453496320, 455625000, 460800000, 466560000, 468750000, 471859200, 472392000, 474609375, 
526
  477757440, 478296900, 480000000, 483729408, 486000000, 488281250, 491520000, 492075000, 
527
  497664000, 500000000, 503316480, 503884800, 506250000, 509607936, 510183360, 512000000, 
528
  512578125, 518400000, 524288000, 524880000, 527343750, 530841600, 531441000, 536870912, 
529
  537477120, 540000000, 544195584, 546750000, 552960000, 553584375, 559872000, 562500000, 
530
  566231040, 566870400, 569531250, 573308928, 576000000, 583200000, 585937500, 589824000, 
531
  590490000, 597196800, 597871125, 600000000, 603979776, 604661760, 607500000, 612220032, 
532
  614400000, 615093750, 622080000, 625000000, 629145600, 629856000, 632812500, 637009920, 
533
  637729200, 640000000, 644972544, 648000000, 655360000, 656100000, 663552000, 664301250, 
534
  671088640, 671846400, 675000000, 679477248, 680244480, 683437500, 691200000, 699840000, 
535
  703125000, 707788800, 708588000, 716636160, 720000000, 725594112, 729000000, 732421875, 
536
  737280000, 738112500, 746496000, 750000000, 754974720, 755827200, 759375000, 764411904, 
537
  765275040, 768000000, 777600000, 781250000, 786432000, 787320000, 791015625, 796262400, 
538
  797161500, 800000000, 805306368, 806215680, 810000000, 816293376, 819200000, 820125000, 
539
  829440000, 838860800, 839808000, 843750000, 849346560, 850305600, 854296875, 859963392, 
540
  864000000, 874800000, 878906250, 884736000, 885735000, 895795200, 900000000, 905969664, 
541
  906992640, 911250000, 921600000, 922640625, 933120000, 937500000, 943718400, 944784000, 
542
  949218750, 955514880, 956593800, 960000000, 967458816, 972000000, 976562500, 983040000, 
543
  984150000, 995328000, 996451875, 1000000000, 1006632960, 1007769600, 1012500000, 
544
  1019215872, 1020366720, 1024000000, 1025156250, 1036800000, 1048576000, 1049760000, 
545
  1054687500, 1061683200, 1062882000, 1073741824, 1074954240, 1080000000, 1088391168, 
546
  1093500000, 1105920000, 1107168750, 1119744000, 1125000000, 1132462080, 1133740800, 
547
  1139062500, 1146617856, 1152000000, 1166400000, 1171875000, 1179648000, 1180980000, 
548
  1194393600, 1195742250, 1200000000, 1207959552, 1209323520, 1215000000, 1220703125, 
549
  1224440064, 1228800000, 1230187500, 1244160000, 1250000000, 1258291200, 1259712000, 
550
  1265625000, 1274019840, 1275458400, 1280000000, 1289945088, 1296000000, 1310720000, 
551
  1312200000, 1318359375, 1327104000, 1328602500, 1342177280, 1343692800, 1350000000, 
552
  1358954496, 1360488960, 1366875000, 1382400000, 1399680000, 1406250000, 1415577600, 
553
  1417176000, 1423828125, 1433272320, 1440000000, 1451188224, 1458000000, 1464843750, 
554
  1474560000, 1476225000, 1492992000, 1500000000, 1509949440, 1511654400, 1518750000, 
555
  1528823808, 1530550080, 1536000000, 1537734375, 1555200000, 1562500000, 1572864000, 
556
  1574640000, 1582031250, 1592524800, 1594323000, 1600000000, 1610612736, 1612431360, 
557
  1620000000, 1632586752, 1638400000, 1640250000, 1658880000, 1660753125, 1677721600, 
558
  1679616000, 1687500000, 1698693120, 1700611200, 1708593750, 1719926784, 1728000000, 
559
  1749600000, 1757812500, 1769472000, 1771470000, 1791590400, 1800000000, 1811939328, 
560
  1813985280, 1822500000, 1843200000, 1845281250, 1866240000, 1875000000, 1887436800, 
561
  1889568000, 1898437500, 1911029760, 1913187600, 1920000000, 1934917632, 1944000000, 
562
  1953125000, 1966080000, 1968300000, 1990656000, 1992903750, 2000000000, 2013265920, 
563
  2015539200, 2025000000, 2038431744, 2040733440, 2048000000, 2050312500, 2073600000, 
564
  2097152000, 2099520000, 2109375000, 2123366400, 2125764000
565
};
566
567
static int _ccv_get_optimal_fft_size(int size)
568
12
{
569
12
  int a = 0, b = sizeof(_ccv_optimal_fft_size) / sizeof(_ccv_optimal_fft_size[0]) - 1;
570
12
    if((unsigned)size >= (unsigned)_ccv_optimal_fft_size[b])
571
0
    return -1;
572
142
  
while(12
a < b)
573
130
  {
574
130
    int c = (a + b) >> 1;
575
130
    if(size <= _ccv_optimal_fft_size[c])
576
98
      b = c;
577
32
    else
578
32
      a = c + 1;
579
130
    }
580
12
    return _ccv_optimal_fft_size[b];
581
12
}
582
583
static pthread_mutex_t fftw_plan_mutex = PTHREAD_MUTEX_INITIALIZER;
584
585
static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern)
586
6
{
587
6
  int ch = CCV_GET_CHANNEL(a->type);
588
6
  int fft_type = (CCV_GET_DATA_TYPE(d->type) == CCV_8U || CCV_GET_DATA_TYPE(d->type) == CCV_32F) ? 
CCV_32F5
:
CCV_64F1
;
589
6
  int rows = ccv_min(a->rows + b->rows - 1, _ccv_get_optimal_fft_size(b->rows * 3));
590
6
  int cols = ccv_min(a->cols + b->cols - 1, _ccv_get_optimal_fft_size(b->cols * 3));
591
6
  int cols_2c = 2 * (cols / 2 + 1);
592
6
  void* fftw_a;
593
6
  void* fftw_b;
594
6
  void* fftw_d;
595
6
  fftw_plan p, pinv;
596
6
  fftwf_plan pf, pinvf;
597
6
  if (fft_type == CCV_32F)
598
5
  {
599
5
    fftw_a = fftwf_malloc(rows * cols_2c * ch * sizeof(float));
600
5
    fftw_b = fftwf_malloc(rows * cols_2c * ch * sizeof(float));
601
5
    fftw_d = fftwf_malloc(rows * cols_2c * ch * sizeof(float));
602
5
    pthread_mutex_lock(&fftw_plan_mutex);
603
5
    if (ch == 1)
604
3
    {
605
3
      pf = fftwf_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
606
3
      pinvf = fftwf_plan_dft_c2r_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
607
3
    } else {
608
2
      int ndim[] = {rows, cols};
609
2
      pf = fftwf_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
610
2
      pinvf = fftwf_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
611
2
    }
612
5
    pthread_mutex_unlock(&fftw_plan_mutex);
613
5
  } else {
614
1
    fftw_a = fftw_malloc(rows * cols_2c * ch * sizeof(double));
615
1
    fftw_b = fftw_malloc(rows * cols_2c * ch * sizeof(double));
616
1
    fftw_d = fftw_malloc(rows * cols_2c * ch * sizeof(double));
617
1
    pthread_mutex_lock(&fftw_plan_mutex);
618
1
    if (ch == 1)
619
0
    {
620
0
      p = fftw_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
621
0
      pinv = fftw_plan_dft_c2r_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
622
1
    } else {
623
1
      int ndim[] = {rows, cols};
624
1
      p = fftw_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
625
1
      pinv = fftw_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
626
1
    }
627
1
    pthread_mutex_unlock(&fftw_plan_mutex);
628
1
  }
629
6
  memset(fftw_b, 0, rows * cols_2c * ch * CCV_GET_DATA_TYPE_SIZE(fft_type));
630
631
6
  int i, j, k;
632
6
  unsigned char* m_ptr = b->data.u8;
633
  // to flip matrix b is crucial, this problem only shows when I changed to a more sophisticated test case
634
6
#define for_block(_for_type, _for_get) \
635
6
  _for_type* fftw_ptr = (_for_type*)fftw_b + (b->rows - 1) * cols_2c * ch; \
636
416
  for (i = 0; i < b->rows; 
i++410
) \
637
410
  { \
638
33.1k
    for (j = 0; j < b->cols; 
j++32.7k
) \
639
130k
      
for (k = 0; 32.7k
k < ch;
k++97.7k
) \
640
97.7k
        fftw_ptr[(b->cols - 1 - j) * ch + k] = 
_for_get410
(m_ptr, j * ch + k); \
641
410
    fftw_ptr -= cols_2c * ch; \
642
410
    m_ptr += b->step; \
643
410
  }
644
6
  ccv_matrix_typeof(fft_type, ccv_matrix_getter, b->type, for_block);
645
6
#undef for_block
646
6
  if (fft_type == CCV_32F)
647
5
    fftwf_execute_dft_r2c(pf, (float*)fftw_b, (fftwf_complex*)fftw_b);
648
1
  else
649
1
    fftw_execute_dft_r2c(p, (double*)fftw_b, (fftw_complex*)fftw_b);
650
  /* why a->cols + cols - 2 * (b->cols & ~1) ?
651
   * what we really want is ceiling((a->cols - (b->cols & ~1)) / (cols - (b->cols & ~1)))
652
   * in this case, we strip out paddings on the left/right, and compute how many tiles
653
   * we need. It then be interpreted in the above integer division form */
654
6
  int tile_x = ccv_max(1, (a->cols + cols - 2 * (b->cols & ~1)) / (cols - (b->cols & ~1)));
655
6
  int tile_y = ccv_max(1, (a->rows + rows - 2 * (b->rows & ~1)) / (rows - (b->rows & ~1)));
656
6
  int brows2 = b->rows / 2;
657
6
  int bcols2 = b->cols / 2;
658
6
#define for_block(_for_type, _cpx_type, _for_set, _for_get) \
659
6
  _for_type scale = 1.0 / (rows * cols); \
660
14
  for (i = 0; i < tile_y; 
i++8
) \
661
27
    
for (j = 0; 8
j < tile_x;
j++19
) \
662
19
    { \
663
19
      int x, y; \
664
19
      memset(fftw_a, 0, rows * cols_2c * ch * sizeof(_for_type)); \
665
19
      int iy = ccv_min(i * (rows - (b->rows & ~1)), ccv_max(a->rows - rows, 0)); \
666
19
      int ix = ccv_min(j * (cols - (b->cols & ~1)), ccv_max(a->cols - cols, 0)); \
667
19
      _for_type* fftw_ptr = (_for_type*)fftw_a; \
668
19
      int end_y = ccv_min(rows, a->rows - iy); \
669
19
      int end_x = ccv_min(cols, a->cols - ix); \
670
19
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(a, iy, ix, 0); \
671
5.48k
      for (y = 0; y < end_y; 
y++5.46k
) \
672
5.46k
      { \
673
4.57M
        for (x = 0; x < end_x * ch; 
x++4.56M
) \
674
4.56M
          fftw_ptr[x] = 
_for_get5.46k
(m_ptr, x); \
675
5.46k
        fftw_ptr += cols_2c * ch; \
676
5.46k
        m_ptr += a->step; \
677
5.46k
      } \
678
19
      _cpx_type* fftw_ac = (_cpx_type*)fftw_a; \
679
19
      _cpx_type* fftw_bc = (_cpx_type*)fftw_b; \
680
19
      _cpx_type* fftw_dc = (_cpx_type*)fftw_d; \
681
19
      fft_execute_dft_r2c((_for_type*)fftw_a, (_cpx_type*)fftw_ac); \
682
2.44M
      for (x = 0; x < rows * ch * (cols / 2 + 1); 
x++2.44M
) \
683
2.44M
        fftw_dc[x] = (fftw_ac[x] * fftw_bc[x]) * scale; \
684
19
      fft_execute_dft_c2r((_cpx_type*)fftw_dc, (_for_type*)fftw_d); \
685
19
      fftw_ptr = (_for_type*)fftw_d + ((1 + (i > 0)) * brows2 * cols_2c + (1 + (j > 0)) * bcols2) * ch; \
686
19
      end_y = ccv_min(d->rows - (iy + (i > 0) * brows2), \
687
19
              (rows - (b->rows & ~1)) + (i == 0) * brows2); \
688
19
      end_x = ccv_min(d->cols - (ix + (j > 0) * bcols2), \
689
19
              (cols - (b->cols & ~1)) + (j == 0) * bcols2); \
690
19
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2, 0); \
691
4.58k
      for (y = 0; y < end_y; 
y++4.56k
) \
692
4.56k
      { \
693
2.71M
        for (x = 0; x < end_x * ch; 
x++2.71M
) \
694
4.56k
          _for_set(m_ptr, x, fftw_ptr[x]); \
695
4.56k
        m_ptr += d->step; \
696
4.56k
        fftw_ptr += cols_2c * ch; \
697
4.56k
      } \
698
19
      int end_tile_y, end_tile_x; \
699
      /* handle edge cases: */ \
700
19
      if (i + 1 == tile_y && 
end_y + iy + (i > 0) * brows2 < d->rows13
) \
701
19
      { \
702
6
        end_tile_y = ccv_min(brows2, d->rows - (iy + (i > 0) * brows2 + end_y)); \
703
6
        fftw_ptr = (_for_type*)fftw_d + (1 + (j > 0)) * bcols2 * ch; \
704
6
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2, 0); \
705
306
        for (y = 0; y < end_tile_y; 
y++300
) \
706
300
        { \
707
204k
          for (x = 0; x < end_x * ch; 
x++204k
) \
708
300
            _for_set(m_ptr, x, fftw_ptr[x]); \
709
300
          m_ptr += d->step; \
710
300
          fftw_ptr += cols_2c * ch; \
711
300
        } \
712
6
      } \
713
19
      if (j + 1 == tile_x && 
end_x + ix + (j > 0) * bcols2 < d->cols8
) \
714
19
      { \
715
6
        end_tile_x = ccv_min(bcols2, d->cols - (ix + (j > 0) * bcols2 + end_x)); \
716
6
        fftw_ptr = (_for_type*)fftw_d + (1 + (i > 0)) * brows2 * cols_2c * ch; \
717
6
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2 + end_x, 0); \
718
1.39k
        for (y = 0; y < end_y; 
y++1.38k
) \
719
1.38k
        { \
720
187k
          for (x = 0; x < end_tile_x * ch; 
x++185k
) \
721
1.38k
            _for_set(m_ptr, x, fftw_ptr[x]); \
722
1.38k
          m_ptr += d->step; \
723
1.38k
          fftw_ptr += cols_2c * ch; \
724
1.38k
        } \
725
6
      } \
726
19
      if (i + 1 == tile_y && 
end_y + iy + (i > 0) * brows2 < d->rows13
&& \
727
19
        
j + 1 == tile_x6
&&
end_x + ix + (j > 0) * bcols2 < d->cols2
) \
728
19
      { \
729
2
        fftw_ptr = (_for_type*)fftw_d; \
730
2
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2 + end_x, 0); \
731
102
        for (y = 0; y < end_tile_y; 
y++100
) \
732
100
        { \
733
15.1k
          for (x = 0; x < end_tile_x * ch; 
x++15.0k
) \
734
100
            _for_set(m_ptr, x, fftw_ptr[x]); \
735
100
          m_ptr += d->step; \
736
100
          fftw_ptr += cols_2c * ch; \
737
100
        } \
738
2
      } \
739
19
    }
740
6
  if (fft_type == CCV_32F)
741
5
  {
742
15
#define fft_execute_dft_r2c(r, c) fftwf_execute_dft_r2c(pf, r, c)
743
15
#define fft_execute_dft_c2r(c, r) fftwf_execute_dft_c2r(pinvf, c, r)
744
5
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, float, fftwf_complex);
745
5
#undef fft_execute_dft_r2c
746
5
#undef fft_execute_dft_c2r
747
5
    pthread_mutex_lock(&fftw_plan_mutex);
748
5
    fftwf_destroy_plan(pf);
749
5
    fftwf_destroy_plan(pinvf);
750
5
    pthread_mutex_unlock(&fftw_plan_mutex);
751
5
    fftwf_free(fftw_a);
752
5
    fftwf_free(fftw_b);
753
5
    fftwf_free(fftw_d);
754
5
  } else {
755
4
#define fft_execute_dft_r2c(r, c) fftw_execute_dft_r2c(p, r, c)
756
4
#define fft_execute_dft_c2r(c, r) fftw_execute_dft_c2r(pinv, c, r)
757
1
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, double, fftw_complex);
758
1
#undef fft_execute_dft_r2c
759
1
#undef fft_execute_dft_c2r
760
1
    pthread_mutex_lock(&fftw_plan_mutex);
761
1
    fftw_destroy_plan(p);
762
1
    fftw_destroy_plan(pinv);
763
1
    pthread_mutex_unlock(&fftw_plan_mutex);
764
1
    fftw_free(fftw_a);
765
1
    fftw_free(fftw_b);
766
1
    fftw_free(fftw_d);
767
1
  }
768
6
#undef for_block
769
6
}
770
#else
771
static void _ccv_filter_kissfft(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern)
772
{
773
  int ch = CCV_GET_CHANNEL(a->type);
774
  int fft_type = (CCV_GET_DATA_TYPE(d->type) == CCV_8U || CCV_GET_DATA_TYPE(d->type) == CCV_32F) ? CCV_32F : CCV_64F;
775
  int rows = ((ccv_min(a->rows + b->rows - 1, kiss_fftr_next_fast_size_real(b->rows * 3)) + 1) >> 1) << 1;
776
  int cols = ((ccv_min(a->cols + b->cols - 1, kiss_fftr_next_fast_size_real(b->cols * 3)) + 1) >> 1) << 1;
777
  int ndim[] = {rows, cols};
778
  void* kiss_a;
779
  void* kiss_b;
780
  void* kiss_d;
781
  void* kiss_ac;
782
  void* kiss_bc;
783
  void* kiss_dc;
784
  kiss_fftndr_cfg p;
785
  kiss_fftndr_cfg pinv;
786
  kissf_fftndr_cfg pf;
787
  kissf_fftndr_cfg pinvf;
788
  if (fft_type == CCV_32F)
789
  {
790
    pf = kissf_fftndr_alloc(ndim, 2, 0, 0, 0);
791
    pinvf = kissf_fftndr_alloc(ndim, 2, 1, 0, 0);
792
    kiss_a = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar));
793
    kiss_b = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar));
794
    kiss_d = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar));
795
    kiss_ac = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx));
796
    kiss_bc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx));
797
    kiss_dc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx));
798
    memset(kiss_b, 0, rows * cols * ch * sizeof(kissf_fft_scalar));
799
  } else {
800
    p = kiss_fftndr_alloc(ndim, 2, 0, 0, 0);
801
    pinv = kiss_fftndr_alloc(ndim, 2, 1, 0, 0);
802
    kiss_a = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar));
803
    kiss_b = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar));
804
    kiss_d = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar));
805
    kiss_ac = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx));
806
    kiss_bc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx));
807
    kiss_dc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx));
808
    memset(kiss_b, 0, rows * cols * ch * sizeof(kiss_fft_scalar));
809
  }
810
  int nch = rows * cols, nchc = rows * (cols / 2 + 1);
811
  int i, j, k;
812
  unsigned char* m_ptr = b->data.u8;
813
  // to flip matrix b is crucial, this problem only shows when I changed to a more sophisticated test case
814
#define for_block(_for_type, _for_get) \
815
  _for_type* kiss_ptr = (_for_type*)kiss_b + (b->rows - 1) * cols; \
816
  for (i = 0; i < b->rows; i++) \
817
  { \
818
    for (j = 0; j < b->cols; j++) \
819
      for (k = 0; k < ch; k++) \
820
        kiss_ptr[k * nch + b->cols - 1 - j] = _for_get(m_ptr, j * ch + k); \
821
    kiss_ptr -= cols; \
822
    m_ptr += b->step; \
823
  }
824
  ccv_matrix_typeof(fft_type, ccv_matrix_getter, b->type, for_block);
825
#undef for_block
826
  if (fft_type == CCV_32F)
827
    for (k = 0; k < ch; k++)
828
      kissf_fftndr(pf, (kissf_fft_scalar*)kiss_b + nch * k, (kissf_fft_cpx*)kiss_bc + nchc * k);
829
  else
830
    for (k = 0; k < ch; k++)
831
      kiss_fftndr(p, (kiss_fft_scalar*)kiss_b + nch * k, (kiss_fft_cpx*)kiss_bc + nchc * k);
832
  /* why a->cols + cols - 2 * (b->cols & ~1) ?
833
   * what we really want is ceiling((a->cols - (b->cols & ~1)) / (cols - (b->cols & ~1)))
834
   * in this case, we strip out paddings on the left/right, and compute how many tiles
835
   * we need. It then be interpreted in the above integer division form */
836
  int tile_x = ccv_max(1, (a->cols + cols - 2 * (b->cols & ~1)) / (cols - (b->cols & ~1)));
837
  int tile_y = ccv_max(1, (a->rows + rows - 2 * (b->rows & ~1)) / (rows - (b->rows & ~1)));
838
  int brows2 = b->rows / 2;
839
  int bcols2 = b->cols / 2;
840
#define for_block(_for_type, _cpx_type, _for_set, _for_get) \
841
  _for_type scale = 1.0 / (rows * cols); \
842
  for (i = 0; i < tile_y; i++) \
843
    for (j = 0; j < tile_x; j++) \
844
    { \
845
      int x, y; \
846
      memset(kiss_a, 0, rows * cols * ch * sizeof(_for_type)); \
847
      int iy = ccv_min(i * (rows - (b->rows & ~1)), ccv_max(a->rows - rows, 0)); \
848
      int ix = ccv_min(j * (cols - (b->cols & ~1)), ccv_max(a->cols - cols, 0)); \
849
      _for_type* kiss_ptr = (_for_type*)kiss_a; \
850
      int end_y = ccv_min(rows, a->rows - iy); \
851
      int end_x = ccv_min(cols, a->cols - ix); \
852
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(a, iy, ix, 0); \
853
      for (y = 0; y < end_y; y++) \
854
      { \
855
        for (x = 0; x < end_x; x++) \
856
          for (k = 0; k < ch; k++) \
857
            kiss_ptr[k * nch + x] = _for_get(m_ptr, x * ch + k); \
858
        kiss_ptr += cols; \
859
        m_ptr += a->step; \
860
      } \
861
      for (k = 0; k < ch; k++) \
862
        fft_ndr((_for_type*)kiss_a + nch * k, (_cpx_type*)kiss_ac + nchc * k); \
863
      _cpx_type* fft_ac = (_cpx_type*)kiss_ac; \
864
      _cpx_type* fft_bc = (_cpx_type*)kiss_bc; \
865
      _cpx_type* fft_dc = (_cpx_type*)kiss_dc; \
866
      for (x = 0; x < rows * ch * (cols / 2 + 1); x++) \
867
      { \
868
        fft_dc[x].r = (fft_ac[x].r * fft_bc[x].r - fft_ac[x].i * fft_bc[x].i) * scale; \
869
        fft_dc[x].i = (fft_ac[x].i * fft_bc[x].r + fft_ac[x].r * fft_bc[x].i) * scale; \
870
      } \
871
      for (k = 0; k < ch; k++) \
872
        fft_ndri((_cpx_type*)kiss_dc + nchc * k, (_for_type*)kiss_d + nch * k); \
873
      kiss_ptr = (_for_type*)kiss_d + (1 + (i > 0)) * brows2 * cols + (1 + (j > 0)) * bcols2; \
874
      end_y = ccv_min(d->rows - (iy + (i > 0) * brows2), \
875
              (rows - (b->rows & ~1)) + (i == 0) * brows2); \
876
      end_x = ccv_min(d->cols - (ix + (j > 0) * bcols2), \
877
              (cols - (b->cols & ~1)) + (j == 0) * bcols2); \
878
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2, 0); \
879
      for (y = 0; y < end_y; y++) \
880
      { \
881
        for (x = 0; x < end_x; x++) \
882
          for (k = 0; k < ch; k++) \
883
            _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \
884
        m_ptr += d->step; \
885
        kiss_ptr += cols; \
886
      } \
887
      int end_tile_y, end_tile_x; \
888
      /* handle edge cases: */ \
889
      if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows) \
890
      { \
891
        end_tile_y = ccv_min(brows2, d->rows - (iy + (i > 0) * brows2 + end_y)); \
892
        kiss_ptr = (_for_type*)kiss_d + (1 + (j > 0)) * bcols2; \
893
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2, 0); \
894
        for (y = 0; y < end_tile_y; y++) \
895
        { \
896
          for (x = 0; x < end_x; x++) \
897
            for (k = 0; k < ch; k++) \
898
              _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \
899
          m_ptr += d->step; \
900
          kiss_ptr += cols; \
901
        } \
902
      } \
903
      if (j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols) \
904
      { \
905
        end_tile_x = ccv_min(bcols2, d->cols - (ix + (j > 0) * bcols2 + end_x)); \
906
        kiss_ptr = (_for_type*)kiss_d + (1 + (i > 0)) * brows2 * cols; \
907
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2 + end_x, 0); \
908
        for (y = 0; y < end_y; y++) \
909
        { \
910
          for (x = 0; x < end_tile_x; x++) \
911
            for (k = 0; k < ch; k++) \
912
              _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \
913
          m_ptr += d->step; \
914
          kiss_ptr += cols; \
915
        } \
916
      } \
917
      if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows && \
918
        j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols) \
919
      { \
920
        kiss_ptr = (_for_type*)kiss_d; \
921
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2 + end_x, 0); \
922
        for (y = 0; y < end_tile_y; y++) \
923
        { \
924
          for (x = 0; x < end_tile_x; x++) \
925
            for (k = 0; k < ch; k++) \
926
              _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x]); \
927
          m_ptr += d->step; \
928
          kiss_ptr += cols; \
929
        } \
930
      } \
931
    }
932
  if (fft_type == CCV_32F)
933
  {
934
#define fft_ndr(r, c) kissf_fftndr(pf, r, c)
935
#define fft_ndri(c, r) kissf_fftndri(pinvf, c, r)
936
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, kissf_fft_scalar, kissf_fft_cpx);
937
#undef fft_ndr
938
#undef fft_ndri
939
    kissf_fft_free(pf);
940
    kissf_fft_free(pinvf);
941
  } else {
942
#define fft_ndr(r, c) kiss_fftndr(p, r, c)
943
#define fft_ndri(c, r) kiss_fftndri(pinv, c, r)
944
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, kiss_fft_scalar, kiss_fft_cpx);
945
#undef fft_ndr
946
#undef fft_ndri
947
    kiss_fft_free(p);
948
    kiss_fft_free(pinv);
949
  }
950
#undef for_block
951
  ccfree(kiss_dc);
952
  ccfree(kiss_bc);
953
  ccfree(kiss_ac);
954
  ccfree(kiss_d);
955
  ccfree(kiss_b);
956
  ccfree(kiss_a);
957
}
958
#endif
959
960
static void _ccv_filter_direct_8u(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern)
961
0
{
962
0
  int i, j, y, x, k;
963
0
  int nz = b->rows * b->cols;
964
0
  int* coeff = (int*)ccmalloc(nz * sizeof(int));
965
0
  int* cx = (int*)ccmalloc(nz * sizeof(int));
966
0
  int* cy = (int*)ccmalloc(nz * sizeof(int));
967
0
  int scale = 1 << 14;
968
0
  nz = 0;
969
0
  for (i = 0; i < b->rows; i++)
970
0
    for (j = 0; j < b->cols; j++)
971
0
    {
972
0
      coeff[nz] = (int)(ccv_get_dense_matrix_cell_value(b, i, j, 0) * scale + 0.5);
973
0
      if (coeff[nz] == 0)
974
0
        continue;
975
0
      cy[nz] = i;
976
0
      cx[nz] = j;
977
0
      nz++;
978
0
    }
979
0
  ccv_dense_matrix_t* pa = ccv_dense_matrix_new(a->rows + b->rows / 2 * 2, a->cols + b->cols / 2 * 2, CCV_8U | CCV_C1, 0, 0);
980
  /* the padding pattern is different from FFT: |aa{BORDER}|abcd|{BORDER}dd| */
981
0
  for (i = 0; i < pa->rows; i++)
982
0
    for (j = 0; j < pa->cols; j++)
983
0
      pa->data.u8[i * pa->step + j] = a->data.u8[ccv_clamp(i - b->rows / 2, 0, a->rows - 1) * a->step + ccv_clamp(j - b->cols / 2, 0, a->cols - 1)];
984
0
  unsigned char* m_ptr = d->data.u8;
985
0
  unsigned char* a_ptr = pa->data.u8;
986
  /* 0.5 denote the overhead for indexing x and y */
987
0
  if (nz < b->rows * b->cols * 0.75)
988
0
  {
989
0
    for (i = 0; i < d->rows; i++)
990
0
    {
991
0
      for (j = 0; j < d->cols; j++)
992
0
      {
993
0
        int z = 0;
994
0
        for (k = 0; k < nz; k++)
995
0
          z += a_ptr[cy[k] * pa->step + j + cx[k]] * coeff[k];
996
0
        m_ptr[j] = ccv_clamp(z >> 14, 0, 255);
997
0
      }
998
0
      m_ptr += d->step;
999
0
      a_ptr += pa->step;
1000
0
    }
1001
0
  } else {
1002
0
    k = 0;
1003
0
    for (i = 0; i < b->rows; i++)
1004
0
      for (j = 0; j < b->cols; j++)
1005
0
      {
1006
0
        coeff[k] = (int)(ccv_get_dense_matrix_cell_value(b, i, j, 0) * scale + 0.5);
1007
0
        k++;
1008
0
      }
1009
0
    for (i = 0; i < d->rows; i++)
1010
0
    {
1011
0
      for (j = 0; j < d->cols; j++)
1012
0
      {
1013
0
        int* c_ptr = coeff;
1014
0
        int z = 0;
1015
0
        for (y = 0; y < b->rows; y++)
1016
0
        {
1017
0
          int iyx = y * pa->step;
1018
0
          for (x = 0; x < b->cols; x++)
1019
0
          {
1020
0
            z += a_ptr[iyx + j + x] * c_ptr[0];
1021
0
            c_ptr++;
1022
0
          }
1023
0
        }
1024
0
        m_ptr[j] = ccv_clamp(z >> 14, 0, 255);
1025
0
      }
1026
0
      m_ptr += d->step;
1027
0
      a_ptr += pa->step;
1028
0
    }
1029
0
  }
1030
0
  ccv_matrix_free(pa);
1031
0
  ccfree(coeff);
1032
0
  ccfree(cx);
1033
0
  ccfree(cy);
1034
0
}
1035
1036
void ccv_filter(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t** d, int type, int padding_pattern)
1037
6
{
1038
6
  ccv_declare_derived_signature(sig, a->sig != 0 && b->sig != 0, ccv_sign_with_literal("ccv_filter"), a->sig, b->sig, CCV_EOF_SIGN);
1039
6
  type = (type == 0) ? 
CCV_GET_DATA_TYPE3
(a->type) | 3
CCV_GET_CHANNEL3
(a->type) :
CCV_GET_DATA_TYPE3
(type) | 3
CCV_GET_CHANNEL3
(a->type);
1040
6
  ccv_dense_matrix_t* dd = *d = ccv_dense_matrix_renew(*d, a->rows, a->cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), type, sig);
1041
6
  ccv_object_return_if_cached(, dd);
1042
1043
  /* 15 is the constant to indicate the high cost of FFT (even with O(nlog(m)) for
1044
   * integer image.
1045
   * NOTE: FFT has time complexity of O(nlog(n)), however, for convolution, it
1046
   * is not the case. Convolving one image (a) to a kernel (b), can be done by
1047
   * dividing image a to several blocks proportional to (b). Thus, one don't need
1048
   * to do FFT for the whole image. The image can be divided to n/m part, and
1049
   * the FFT itself is O(mlog(m)), so, the convolution process has time complexity
1050
   * of O(nlog(m)) */
1051
6
  if ((b->rows * b->cols < (log((double)(b->rows * b->cols)) + 1) * 15) && 
(a->type & CCV_8U)1
)
1052
0
  {
1053
0
    _ccv_filter_direct_8u(a, b, dd, padding_pattern);
1054
6
  } else {
1055
6
#ifdef HAVE_FFTW3
1056
6
    _ccv_filter_fftw(a, b, dd, padding_pattern);
1057
#else
1058
    _ccv_filter_kissfft(a, b, dd, padding_pattern);
1059
#endif
1060
6
  }
1061
6
}
1062
1063
void ccv_filter_kernel(ccv_dense_matrix_t* x, ccv_filter_kernel_f func, void* data)
1064
2
{
1065
2
  int i, j, k, ch = CCV_GET_CHANNEL(x->type);
1066
2
  unsigned char* m_ptr = x->data.u8;
1067
2
  double rows_2 = (x->rows - 1) * 0.5;
1068
2
  double cols_2 = (x->cols - 1) * 0.5;
1069
2
#define for_block(_, _for_set) \
1070
203
  
for (i = 0; 2
i < x->rows;
i++201
) \
1071
201
  { \
1072
20.4k
    for (j = 0; j < x->cols; 
j++20.2k
) \
1073
20.2k
    { \
1074
20.2k
      double result = func(j - cols_2, i - rows_2, data); \
1075
80.8k
      for (k = 0; k < ch; 
k++60.6k
) \
1076
20.2k
        _for_set(m_ptr, j * ch + k, result); \
1077
20.2k
    } \
1078
201
    m_ptr += x->step; \
1079
201
  }
1080
2
  ccv_matrix_setter(x->type, for_block);
1081
2
#undef for_block
1082
2
  ccv_make_matrix_immutable(x);
1083
2
}
1084
1085
void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_dense_matrix_t** x, int x_type, ccv_dense_matrix_t** y, int y_type, double dx, double dy, double dxx, double dyy, int flag)
1086
2
{
1087
2
  assert(!(flag & CCV_L2_NORM) && (flag & CCV_GSEDT));
1088
2
  ccv_declare_derived_signature(sig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN);
1089
2
  type = (CCV_GET_DATA_TYPE(type) == CCV_64F || CCV_GET_DATA_TYPE(a->type) == CCV_64F || CCV_GET_DATA_TYPE(a->type) == CCV_64S) ? 
CCV_GET_CHANNEL0
(a->type) | CCV_64F0
: CCV_GET_CHANNEL(a->type) | CCV_32F;
1090
2
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), type, sig);
1091
2
  ccv_dense_matrix_t* mx = 0;
1092
2
  ccv_dense_matrix_t* my = 0;
1093
2
  if (x != 0)
1094
0
  {
1095
0
    ccv_declare_derived_signature(xsig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform_x(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN);
1096
0
    mx = *x = ccv_dense_matrix_renew(*x, a->rows, a->cols, CCV_32S | CCV_C1, CCV_32S | CCV_C1, xsig);
1097
0
  }
1098
2
  if (y != 0)
1099
0
  {
1100
0
    ccv_declare_derived_signature(ysig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform_y(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN);
1101
0
    my = *y = ccv_dense_matrix_renew(*y, a->rows, a->cols, CCV_32S | CCV_C1, CCV_32S | CCV_C1, ysig);
1102
0
  }
1103
2
  ccv_object_return_if_cached(, db, mx, my);
1104
2
  ccv_revive_object_if_cached(db, mx, my);
1105
2
  int i, j, k;
1106
2
  unsigned char* a_ptr = a->data.u8;
1107
2
  unsigned char* b_ptr = db->data.u8;
1108
2
  int* v = (int*)alloca(sizeof(int) * ccv_max(db->rows, db->cols));
1109
2
  unsigned char* c_ptr = (unsigned char*)alloca(CCV_GET_DATA_TYPE_SIZE(db->type) * db->rows);
1110
2
  int* x_ptr = mx ? 
mx->data.i320
: 0;
1111
2
  int* y_ptr = my ? 
my->data.i320
: 0;
1112
2
#define for_block(_for_max, _for_type_b, _for_set_b, _for_get_b, _for_get_a) \
1113
2
  _for_type_b _dx = dx, _dy = dy, _dxx = dxx, _dyy = dyy; \
1114
2
  _for_type_b* z = (_for_type_b*)alloca(sizeof(_for_type_b) * (ccv_max(db->rows, db->cols) + 1)); \
1115
2
  if (_dxx > 1e-6) \
1116
2
  { \
1117
652
    for (i = 0; i < a->rows; 
i++650
) \
1118
650
    { \
1119
650
      k = 0; \
1120
650
      v[0] = 0; \
1121
650
      z[0] = (_for_type_b)-_for_max; \
1122
650
      z[1] = (_for_type_b)_for_max; \
1123
260k
      for (j = 1; j < a->cols; 
j++259k
) \
1124
259k
      { \
1125
259k
        _for_type_b s; \
1126
259k
        for (;;) \
1127
270k
        { \
1128
270k
          assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1129
270k
          s = ((SGN _for_get_a(a_ptr, j) + _dxx * j * j - _dx * j) - (SGN _for_get_a(a_ptr, v[k]) + _dxx * v[k] * v[k] - _dx * v[k])) / (2.0 * _dxx * (j - v[k])); \
1130
270k
          if (s > z[k]) 
break259k
; \
1131
270k
          --k; \
1132
11.4k
        } \
1133
259k
        ++k; \
1134
259k
        assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1135
259k
        v[k] = j; \
1136
259k
        z[k] = s; \
1137
259k
        z[k + 1] = (_for_type_b)_for_max; \
1138
259k
      } \
1139
650
      assert(z[k + 1] >= a->cols - 1); \
1140
650
      k = 0; \
1141
650
      if (mx) \
1142
650
      { \
1143
0
        for (j = 0; j < a->cols; j++) \
1144
0
        { \
1145
0
          while (z[k + 1] < j) \
1146
0
          { \
1147
0
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1148
0
            ++k; \
1149
0
          } \
1150
0
          _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k])); \
1151
0
          x_ptr[j] = j - v[k]; \
1152
0
        } \
1153
0
        x_ptr += mx->cols; \
1154
650
      } else { \
1155
260k
        for (j = 0; j < a->cols; 
j++260k
) \
1156
260k
        { \
1157
507k
          while (z[k + 1] < j) \
1158
260k
          { \
1159
247k
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1160
247k
            ++k; \
1161
247k
          } \
1162
260k
          _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k])); \
1163
260k
        } \
1164
650
      } \
1165
650
      a_ptr += a->step; \
1166
650
      b_ptr += db->step; \
1167
650
    } \
1168
2
  } else { /* above algorithm cannot handle dxx == 0 properly, below is special casing for that */ \
1169
0
    assert(mx == 0); \
1170
0
    for (i = 0; i < a->rows; i++) \
1171
0
    { \
1172
0
      for (j = 0; j < a->cols; j++) \
1173
0
        _for_set_b(b_ptr, j, SGN _for_get_a(a_ptr, j)); \
1174
0
      for (j = 1; j < a->cols; j++) \
1175
0
        _for_set_b(b_ptr, j, ccv_min(_for_get_b(b_ptr, j), _for_get_b(b_ptr, j - 1) + _dx)); \
1176
0
      for (j = a->cols - 2; j >= 0; j--) \
1177
0
        _for_set_b(b_ptr, j, ccv_min(_for_get_b(b_ptr, j), _for_get_b(b_ptr, j + 1) - _dx)); \
1178
0
      a_ptr += a->step; \
1179
0
      b_ptr += db->step; \
1180
0
    } \
1181
0
  } \
1182
2
  b_ptr = db->data.u8; \
1183
2
  if (_dyy > 1e-6) \
1184
2
  { \
1185
802
    for (j = 0; j < db->cols; 
j++800
) \
1186
800
    { \
1187
260k
      for (i = 0; i < db->rows; 
i++260k
) \
1188
800
        _for_set_b(c_ptr, i, _for_get_b(b_ptr + i * db->step, j)); \
1189
800
      k = 0; \
1190
800
      v[0] = 0; \
1191
800
      z[0] = (_for_type_b)-_for_max; \
1192
800
      z[1] = (_for_type_b)_for_max; \
1193
260k
      for (i = 1; i < db->rows; 
i++259k
) \
1194
259k
      { \
1195
259k
        _for_type_b s; \
1196
259k
        for (;;) \
1197
274k
        { \
1198
274k
          assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1199
274k
          s = ((_for_get_b(c_ptr, i) + _dyy * i * i - _dy * i) - (_for_get_b(c_ptr, v[k]) + _dyy * v[k] * v[k] - _dy * v[k])) / (2.0 * _dyy * (i - v[k])); \
1200
274k
          if (s > z[k]) 
break259k
; \
1201
274k
          --k; \
1202
15.6k
        } \
1203
259k
        ++k; \
1204
259k
        assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1205
259k
        v[k] = i; \
1206
259k
        z[k] = s; \
1207
259k
        z[k + 1] = (_for_type_b)_for_max; \
1208
259k
      } \
1209
800
      assert(z[k + 1] >= db->rows - 1); \
1210
800
      k = 0; \
1211
800
      if (my) \
1212
800
      { \
1213
0
        for (i = 0; i < db->rows; i++) \
1214
0
        { \
1215
0
          while (z[k + 1] < i) \
1216
0
          { \
1217
0
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1218
0
            ++k; \
1219
0
          } \
1220
0
          _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k])); \
1221
0
          y_ptr[i * my->cols] = i - v[k]; \
1222
0
        } \
1223
0
        ++y_ptr; \
1224
800
      } else { \
1225
260k
        for (i = 0; i < db->rows; 
i++260k
) \
1226
260k
        { \
1227
503k
          while (z[k + 1] < i) \
1228
260k
          { \
1229
243k
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1230
243k
            ++k; \
1231
243k
          } \
1232
260k
          _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k])); \
1233
260k
        } \
1234
800
      } \
1235
800
    } \
1236
2
  } else { \
1237
0
    assert(my == 0); \
1238
0
    for (j = 0; j < db->cols; j++) \
1239
0
    { \
1240
0
      for (i = 1; i < db->rows; i++) \
1241
0
        _for_set_b(b_ptr + i * db->step, j, ccv_min(_for_get_b(b_ptr + i * db->step, j), _for_get_b(b_ptr + (i - 1) * db->step, j) + _dy)); \
1242
0
      for (i = db->rows - 2; i >= 0; i--) \
1243
0
        _for_set_b(b_ptr + i * db->step, j, ccv_min(_for_get_b(b_ptr + i * db->step, j), _for_get_b(b_ptr + (i + 1) * db->step, j) - _dy)); \
1244
0
    } \
1245
0
  }
1246
2
  if (flag & CCV_NEGATIVE)
1247
1
  {
1248
261k
#define SGN -
1249
1
    if (db->type & CCV_64F)
1250
0
    {
1251
0
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, DBL_MAX);
1252
1
    } else {
1253
1
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, FLT_MAX);
1254
1
    }
1255
1
#undef SGN
1256
1
  } else {
1257
280k
#define SGN +
1258
1
    if (db->type & CCV_64F)
1259
0
    {
1260
0
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, DBL_MAX);
1261
1
    } else {
1262
1
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, FLT_MAX);
1263
1
    }
1264
1
#undef SGN
1265
1
  }
1266
2
#undef for_block
1267
2
}
1268
1269
inline static double _kmeans1d_cost(double* cumsum, double* cumsum2, int i, int j)
1270
189M
{
1271
189M
  if (j < i)
1272
15.6M
    return 0;
1273
173M
  double mu = (cumsum[j + 1] - cumsum[i]) / (j - i + 1);
1274
173M
  double result = cumsum2[j + 1] - cumsum2[i];
1275
173M
  result += (j - i + 1) * (mu * mu);
1276
173M
  result -= (2 * mu) * (cumsum[j + 1] - cumsum[i]);
1277
173M
  return result;
1278
189M
}
1279
1280
inline static double _kmeans1d_lookup(double* D, double* cumsum, double* cumsum2, int i, int j)
1281
189M
{
1282
189M
  const int col = i < j - 1 ? 
i7.98M
:
j - 1181M
;
1283
189M
  return (col >= 0 ? 
D[col]188M
:
0575k
) + _kmeans1d_cost(cumsum, cumsum2, j, i);
1284
189M
}
1285
1286
static void _smawk2(int row_start, int row_stride, int row_size, int* cols, int col_size, int* reserved, double* D, double* cumsum, double* cumsum2, int* result)
1287
298k
{
1288
298k
  if (row_size == 0)
1289
35.8k
    return;
1290
262k
  int* _cols = cols + col_size;
1291
262k
  int _col_size = 0;
1292
262k
  int i;
1293
19.4M
  for (i = 0; i < col_size; 
i++19.2M
)
1294
19.2M
  {
1295
19.2M
    const int col = cols[i];
1296
19.2M
    for (;;)
1297
28.7M
    {
1298
28.7M
      if (_col_size == 0)
1299
485k
        break;
1300
28.2M
      const int row = row_start + row_stride * (_col_size - 1);
1301
28.2M
      if (_kmeans1d_lookup(D, cumsum, cumsum2, row, col) >= _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[_col_size - 1]))
1302
18.7M
        break;
1303
9.50M
      --_col_size;
1304
9.50M
    }
1305
19.2M
    if (_col_size < row_size)
1306
19.1M
    {
1307
19.1M
      _cols[_col_size] = col;
1308
19.1M
      ++_col_size;
1309
19.1M
    }
1310
19.2M
  }
1311
262k
  _smawk2(row_start + row_stride, row_stride * 2, row_size / 2, _cols, _col_size, reserved, D, cumsum, cumsum2, result);
1312
  // Build the reverse lookup table.
1313
9.86M
  for (i = 0; i < _col_size; 
i++9.60M
)
1314
9.60M
    reserved[_cols[i]] = i;
1315
262k
    int start = 0;
1316
5.09M
    for (i = 0; i < row_size; 
i += 24.82M
) {
1317
4.82M
        const int row = row_start + i * row_stride;
1318
4.82M
        int stop = _col_size - 1;
1319
4.82M
        if (i < row_size - 1)
1320
4.77M
    {
1321
4.77M
      const int argmin = result[row_start + (i + 1) * row_stride];
1322
4.77M
            stop = reserved[argmin];
1323
4.77M
    }
1324
4.82M
        int argmin = _cols[start];
1325
4.82M
        double min = _kmeans1d_lookup(D, cumsum, cumsum2, row, argmin);
1326
4.82M
    int c;
1327
14.1M
        for (c = start + 1; c <= stop; 
c++9.28M
)
1328
9.28M
    {
1329
9.28M
            double value = _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[c]);
1330
9.28M
            if (value < min) {
1331
4.66M
                argmin = _cols[c];
1332
4.66M
                min = value;
1333
4.66M
            }
1334
9.28M
        }
1335
4.82M
        result[row] = argmin;
1336
4.82M
        start = stop;
1337
4.82M
    }
1338
262k
}
1339
1340
static void _smawk1(int row_start, int row_stride, int row_size, int* cols, int col_size, int* reserved, double* D, double* cumsum, double* cumsum2, int* result)
1341
35.8k
{
1342
35.8k
  if (row_size == 0)
1343
0
    return;
1344
35.8k
  int* _cols = cols;
1345
35.8k
  int _col_size = 0;
1346
35.8k
  int i;
1347
19.3M
  for (i = 0; i < col_size; 
i++19.3M
)
1348
19.3M
  {
1349
19.3M
    const int col = i;
1350
19.3M
    for (;;)
1351
28.4M
    {
1352
28.4M
      if (_col_size == 0)
1353
37.1k
        break;
1354
28.3M
      const int row = row_start + row_stride * (_col_size - 1);
1355
28.3M
      if (_kmeans1d_lookup(D, cumsum, cumsum2, row, col) >= _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[_col_size - 1]))
1356
19.2M
        break;
1357
9.08M
      --_col_size;
1358
9.08M
    }
1359
19.3M
    if (_col_size < row_size)
1360
18.7M
    {
1361
18.7M
      _cols[_col_size] = col;
1362
18.7M
      ++_col_size;
1363
18.7M
    }
1364
19.3M
  }
1365
35.8k
  _smawk2(row_start + row_stride, row_stride * 2, row_size / 2, _cols, _col_size, reserved, D, cumsum, cumsum2, result);
1366
  // Build the reverse lookup table.
1367
9.69M
  for (i = 0; i < _col_size; 
i++9.65M
)
1368
9.65M
    reserved[_cols[i]] = i;
1369
35.8k
    int start = 0;
1370
4.86M
    for (i = 0; i < row_size; 
i += 24.83M
) {
1371
4.83M
        const int row = row_start + i * row_stride;
1372
4.83M
        int stop = _col_size - 1;
1373
4.83M
        if (i < row_size - 1)
1374
4.82M
    {
1375
4.82M
      const int argmin = result[row_start + (i + 1) * row_stride];
1376
4.82M
      stop = reserved[argmin];
1377
4.82M
    }
1378
4.83M
        int argmin = _cols[start];
1379
4.83M
        double min = _kmeans1d_lookup(D, cumsum, cumsum2, row, argmin);
1380
4.83M
    int c;
1381
14.3M
        for (c = start + 1; c <= stop; 
c++9.49M
)
1382
9.49M
    {
1383
9.49M
            double value = _kmeans1d_lookup(D, cumsum, cumsum2, row, _cols[c]);
1384
9.49M
            if (value < min) {
1385
4.98M
                argmin = _cols[c];
1386
4.98M
                min = value;
1387
4.98M
            }
1388
9.49M
        }
1389
4.83M
        result[row] = argmin;
1390
4.83M
        start = stop;
1391
4.83M
    }
1392
35.8k
}
1393
1394
static void _smawk0(int row_start, int row_stride, int row_size, int* cols, int col_size, int* reserved, double* D, double* cumsum, double* cumsum2, int* result)
1395
35.8k
{
1396
35.8k
  if (row_size == 0)
1397
0
    return;
1398
35.8k
  _smawk1(row_start + row_stride, row_stride * 2, row_size / 2, cols, col_size, reserved, D, cumsum, cumsum2, result);
1399
  // Build the reverse lookup table.
1400
35.8k
    int start = 0;
1401
35.8k
  int i;
1402
9.69M
    for (i = 0; i < row_size; 
i += 29.66M
) {
1403
9.66M
        const int row = row_start + i * row_stride;
1404
9.66M
        int stop = col_size - 1;
1405
9.66M
        if (i < row_size - 1)
1406
9.65M
    {
1407
9.65M
      const int argmin = result[row_start + (i + 1) * row_stride];
1408
9.65M
      stop = argmin;
1409
9.65M
    }
1410
9.66M
        int argmin = start;
1411
9.66M
        double min = _kmeans1d_lookup(D, cumsum, cumsum2, row, argmin);
1412
9.66M
    int c;
1413
28.1M
        for (c = start + 1; c <= stop; 
c++18.5M
)
1414
18.5M
    {
1415
18.5M
            double value = _kmeans1d_lookup(D, cumsum, cumsum2, row, c);
1416
18.5M
            if (value < min) {
1417
9.72M
                argmin = c;
1418
9.72M
                min = value;
1419
9.72M
            }
1420
18.5M
        }
1421
9.66M
        result[row] = argmin;
1422
9.66M
        start = stop;
1423
9.66M
    }
1424
35.8k
}
1425
1426
static void smawk(const int n, int* cols, double* D, double* cumsum, double* cumsum2, int* result)
1427
35.8k
{
1428
35.8k
  _smawk0(0, 1, n, cols + n, n, cols, D, cumsum, cumsum2, result);
1429
35.8k
}
1430
1431
typedef struct {
1432
  double value;
1433
  int index;
1434
} ccv_kmeans1d_undo_t;
1435
1436
#undef more_than
1437
2.04M
#define less_than(s1, s2, aux) ((s1).value < (s2).value)
1438
2.04M
static CCV_IMPLEMENT_QSORT(_ccv_kmeans1d_undo_qsort, ccv_kmeans1d_undo_t, less_than)
1439
#undef less_than
1440
1441
void ccv_kmeans1d(const ccv_dense_matrix_t* const a, const int k, int* const clusters, double* const centroids)
1442
742
{
1443
742
  assert(k > 1);
1444
742
  assert(CCV_GET_CHANNEL(a->type) == CCV_C1);
1445
742
  const int n = a->rows * a->cols;
1446
742
  ccv_kmeans1d_undo_t* const sorted_undos = ccmalloc(sizeof(ccv_kmeans1d_undo_t) * n);
1447
742
  int i;
1448
742
  if (CCV_GET_DATA_TYPE(a->type) == CCV_16F)
1449
252
  {
1450
252
    float* f = (float*)sorted_undos;
1451
252
    ccv_half_precision_to_float((uint16_t*)a->data.f16, (float*)f, n);
1452
76.5k
    for (i = n - 1; i >= 0; 
i--76.2k
)
1453
76.2k
    {
1454
76.2k
      sorted_undos[i].value = f[i];
1455
76.2k
      sorted_undos[i].index = i;
1456
76.2k
    }
1457
490
  } else if (CCV_GET_DATA_TYPE(a->type) == CCV_32F) {
1458
62.5k
    for (i = 0; i < n; 
i++62.2k
)
1459
62.2k
    {
1460
62.2k
      sorted_undos[i].value = a->data.f32[i];
1461
62.2k
      sorted_undos[i].index = i;
1462
62.2k
    }
1463
246
  } else 
if (244
CCV_GET_DATA_TYPE244
(a->type) == CCV_64F244
) {
1464
57.0k
    for (i = 0; i < n; 
i++56.7k
)
1465
56.7k
    {
1466
56.7k
      sorted_undos[i].value = a->data.f64[i];
1467
56.7k
      sorted_undos[i].index = i;
1468
56.7k
    }
1469
244
  }
1470
742
  _ccv_kmeans1d_undo_qsort(sorted_undos, n, 0);
1471
742
  double* cc = ccmalloc(sizeof(double) * 2 * (n + 1));
1472
742
  cc[0] = 0;
1473
742
  cc[n + 1] = 0;
1474
742
  double cumsum = 0, cumsum2 = 0;
1475
196k
  for (i = 0; i < n; 
i++195k
)
1476
195k
  {
1477
195k
    cumsum += sorted_undos[i].value;
1478
195k
    cumsum2 += sorted_undos[i].value * sorted_undos[i].value;
1479
195k
    cc[i + 1] = cumsum;
1480
195k
    cc[i + 1 + n + 1] = cumsum2;
1481
195k
  }
1482
742
  double* D = ccmalloc(sizeof(double) * 2 * n);
1483
742
  int* T = ccmalloc(sizeof(int) * n * k);
1484
196k
  for (i = 0; i < n; 
i++195k
)
1485
195k
  {
1486
195k
    D[i] = _kmeans1d_cost(cc, cc + n + 1, 0, i);
1487
195k
    T[i] = 0;
1488
195k
  }
1489
742
  int k_;
1490
742
  int* cols = ccmalloc(sizeof(int) * n * 2);
1491
36.5k
  for (k_ = 1; k_ < k; 
k_++35.8k
)
1492
35.8k
  {
1493
35.8k
    double* lastD = D + ((k_ - 1) % 2) * n;
1494
35.8k
    int* const cT = T + k_ * n;
1495
35.8k
    smawk(n, cols, lastD, cc, cc + n + 1, cT);
1496
35.8k
    double* nextD = D + (k_ % 2) * n;
1497
19.3M
    for (i = 0; i < n; 
i++19.3M
)
1498
19.3M
    {
1499
19.3M
      const int argmin = cT[i];
1500
19.3M
      nextD[i] = _kmeans1d_lookup(lastD, cc, cc + n + 1, i, argmin);
1501
19.3M
    }
1502
35.8k
  }
1503
742
  ccfree(cc);
1504
742
  ccfree(D);
1505
742
  int t = n;
1506
742
  k_ = k - 1;
1507
742
  int n_ = n - 1;
1508
36.1k
  do {
1509
36.1k
    int t_ = t;
1510
36.1k
    t = T[k_ * n + n_];
1511
36.1k
    double centroid = 0.0;
1512
231k
    for (i = t; i < t_; 
i++195k
)
1513
195k
    {
1514
195k
      cols[i] = k_;
1515
195k
      centroid += (sorted_undos[i].value - centroid) / (i - t + 1);
1516
195k
    }
1517
36.1k
    centroids[k_] = centroid;
1518
36.1k
    k_ -= 1;
1519
36.1k
    n_ = t - 1;
1520
36.1k
  } while (t > 0);
1521
196k
  for (i = 0; i < n; 
i++195k
)
1522
195k
    clusters[sorted_undos[i].index] = cols[i];
1523
742
  ccfree(cols);
1524
742
  ccfree(sorted_undos);
1525
742
  ccfree(T);
1526
742
}