Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/ccv_numeric.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include <complex.h>
4
#ifdef HAVE_FFTW3
5
#include <pthread.h>
6
#include <fftw3.h>
7
#else
8
#include "3rdparty/kissfft/kiss_fftndr.h"
9
#include "3rdparty/kissfft/kissf_fftndr.h"
10
#endif
11
12
const ccv_minimize_param_t ccv_minimize_default_params = {
13
  .interp = 0.1,
14
  .extrap = 3.0,
15
  .max_iter = 20,
16
  .ratio = 10.0,
17
  .sig = 0.1,
18
  .rho = 0.05,
19
};
20
21
void ccv_invert(ccv_matrix_t* a, ccv_matrix_t** b, int type)
22
0
{
23
0
}
24
25
void ccv_solve(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type)
26
0
{
27
0
}
28
29
void ccv_eigen(ccv_dense_matrix_t* a, ccv_dense_matrix_t** vector, ccv_dense_matrix_t** lambda, int type, double epsilon)
30
1
{
31
1
  ccv_declare_derived_signature(vsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(vector)"), a->sig, CCV_EOF_SIGN);
32
1
  ccv_declare_derived_signature(lsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(lambda)"), a->sig, CCV_EOF_SIGN);
33
1
  assert(CCV_GET_CHANNEL(a->type) == 1);
34
1
  type = (type == 0) ? CCV_GET_DATA_TYPE(a->type) | CCV_C1 : 
CCV_GET_DATA_TYPE0
(type) | CCV_C10
;
35
1
  // as of now, this function only support real symmetric matrix
36
1
  ccv_dense_matrix_t* dvector = *vector = ccv_dense_matrix_renew(*vector, a->rows, a->cols, CCV_32F | CCV_64F | CCV_C1, type, vsig);
37
1
  ccv_dense_matrix_t* dlambda = *lambda = ccv_dense_matrix_renew(*lambda, 1, a->cols, CCV_32F | CCV_64F | CCV_C1, type, lsig);
38
1
  assert(CCV_GET_DATA_TYPE(dvector->type) == CCV_GET_DATA_TYPE(dlambda->type));
39
1
  ccv_object_return_if_cached(, dvector, dlambda);
40
1
  double* ja = (double*)ccmalloc(sizeof(double) * a->rows * a->cols);
41
1
  int i, j;
42
1
  unsigned char* aptr = a->data.u8;
43
1
  assert(a->rows > 0 && a->cols > 0);
44
1
#define for_block(_, _for_get) \
45
5
  
for (i = 0; 1
i < a->rows;
i++4
) \
46
4
  { \
47
20
    for (j = 0; j < a->cols; 
j++16
) \
48
16
      ja[i * a->cols + j] = _for_get(aptr, j, 0); \
49
4
    aptr += a->step; \
50
4
  }
51
1
  ccv_matrix_getter(a->type, for_block);
52
1
#undef for_block
53
1
  ccv_zero(dvector);
54
1
  ccv_zero(dlambda);
55
1
  unsigned char* dvptr = dvector->data.u8;
56
1
#define for_block(_, _for_set) \
57
5
  
for (i = 0; 1
i < a->cols;
i++4
) \
58
4
    _for_set(dvptr, i * a->cols + i, 1, 0);
59
1
  ccv_matrix_setter(dvector->type, for_block);
60
1
#undef for_block
61
1
  double accuracy = 0;
62
17
  for (i = 0; i < a->rows * a->cols; 
i++16
)
63
16
    accuracy += ja[i];
64
1
  accuracy = sqrt(2 * accuracy);
65
1
  int p, q;
66
1
  unsigned char* dlptr = dlambda->data.u8;
67
1
  int flag = 1;
68
1
  assert(a->rows == a->cols);
69
1
#define for_block(_, _for_set, _for_get) \
70
39
  
do 1
{ \
71
39
    if (!flag) \
72
39
      
accuracy = accuracy * 0.524
; \
73
39
    flag = 0; \
74
147
    for (p = 0; p < a->rows; 
p++108
) \
75
122
    { \
76
303
      for (q = p + 1; q < a->cols; 
q++181
) \
77
195
        if (fabs(ja[p * a->cols + q]) > accuracy) \
78
195
        { \
79
14
          double x = -ja[p * a->cols + q]; \
80
14
          double y = (ja[q * a->cols + q] - ja[p * a->cols + p]) * 0.5; \
81
14
          double omega = (x == 0 && 
y == 00
) ?
10
: x / sqrt(x * x + y * y); \
82
14
          if (y < 0) \
83
14
            
omega = -omega12
; \
84
14
          double sn = 1.0 + sqrt(1.0 - omega * omega); \
85
14
          sn = omega / sqrt(2 * sn); \
86
14
          double cn = sqrt(1.0 - sn * sn); \
87
14
          double fpp = ja[p * a->cols + p]; \
88
14
          double fpq = ja[p * a->cols + q]; \
89
14
          double fqq = ja[q * a->cols + q]; \
90
14
          ja[p * a->cols + p] = fpp * cn * cn + fqq * sn * sn + fpq * omega; \
91
14
          ja[q * a->cols + q] = fpp * sn * sn + fqq * cn * cn - fpq * omega; \
92
14
          ja[p * a->cols + q] = ja[q * a->cols + p] = 0; \
93
70
          for (i = 0; i < a->cols; 
i++56
) \
94
56
            if (i != q && 
i != p42
) \
95
56
            { \
96
28
              fpp = ja[p * a->cols + i]; \
97
28
              fqq = ja[q * a->cols + i]; \
98
28
              ja[p * a->cols + i] = fpp * cn + fqq * sn; \
99
28
              ja[q * a->cols + i] = -fpp * sn + fqq * cn; \
100
28
            } \
101
70
          for (i = 0; i < a->rows; 
i++56
) \
102
56
            if (i != q && 
i != p42
) \
103
56
            { \
104
28
              fpp = ja[i * a->cols + p]; \
105
28
              fqq = ja[i * a->cols + q]; \
106
28
              ja[i * a->cols + p] = fpp * cn + fqq * sn; \
107
28
              ja[i * a->cols + q] = -fpp * sn + fqq * cn; \
108
28
            } \
109
70
          for (i = 0; i < a->cols; 
i++56
) \
110
56
          { \
111
56
            fpp = _for_get(dvptr, p * a->cols + i, 0); \
112
56
            fqq = _for_get(dvptr, q * a->cols + i, 0); \
113
56
            _for_set(dvptr, p * a->cols + i, fpp * cn + fqq * sn, 0); \
114
56
            _for_set(dvptr, q * a->cols + i, -fpp * sn + fqq * cn, 0); \
115
56
          } \
116
70
          for (i = 0; i < a->cols; 
i++56
) \
117
56
            _for_set(dlptr, i, ja[i * a->cols + i], 0); \
118
14
          flag = 1; \
119
14
          break; \
120
14
        } \
121
122
      if (flag) \
122
122
        
break14
; \
123
122
    } \
124
39
  } while (accuracy > epsilon);
125
1
  ccv_matrix_setter_getter(dvector->type, for_block);
126
1
#undef for_block
127
1
  ccfree(ja);
128
1
}
129
130
void ccv_minimize(ccv_dense_matrix_t* x, int length, double red, ccv_minimize_f func, ccv_minimize_param_t params, void* data)
131
1
{
132
1
  ccv_dense_matrix_t* df0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
133
1
  ccv_zero(df0);
134
1
  ccv_dense_matrix_t* df3 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
135
1
  ccv_zero(df3);
136
1
  ccv_dense_matrix_t* dF0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
137
1
  ccv_zero(dF0);
138
1
  ccv_dense_matrix_t* s = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
139
1
  ccv_zero(s);
140
1
  ccv_dense_matrix_t* x0 = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
141
1
  ccv_zero(x0);
142
1
  ccv_dense_matrix_t* xn = ccv_dense_matrix_new(x->rows, x->cols, x->type, 0, 0);
143
1
  ccv_zero(xn);
144
1
  
145
1
  double F0 = 0, f0 = 0, f1 = 0, f2 = 0, f3 = 0, f4 = 0;
146
1
  double x1 = 0, x2 = 0, x3 = 0, x4 = 0;
147
1
  double d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0;
148
1
  double A = 0, B = 0;
149
1
  int ls_failed = 0;
150
1
151
1
  int i, j, k;
152
1
  func(x, &f0, df0, data);
153
1
  d0 = 0;
154
1
  unsigned char* df0p = df0->data.u8;
155
1
  unsigned char* sp = s->data.u8;
156
2
  for (i = 0; i < x->rows; 
i++1
)
157
1
  {
158
3
    for (j = 0; j < x->cols; 
j++2
)
159
2
    {
160
2
      double ss = ccv_get_value(x->type, df0p, j);
161
2
      ccv_set_value(x->type, sp, j, -ss, 0);
162
2
      d0 += -ss * ss;
163
2
    }
164
1
    df0p += x->step;
165
1
    sp += x->step;
166
1
  }
167
1
  x3 = red / (1.0 - d0);
168
1
  int l = (length > 0) ? length : 
-length0
;
169
1
  int ls = (length > 0) ? 1 : 
00
;
170
1
  int eh = (length > 0) ? 0 : 
10
;
171
22
  for (k = 0; k < l;)
172
22
  {
173
22
    k += ls;
174
22
    memcpy(x0->data.u8, x->data.u8, x->rows * x->step);
175
22
    memcpy(dF0->data.u8, df0->data.u8, x->rows * x->step);
176
22
    F0 = f0;
177
22
    int m = ccv_min(params.max_iter, (length > 0) ? params.max_iter : l - k);
178
22
    for (;;)
179
80
    {
180
80
      x2 = 0;
181
80
      f3 = f2 = f0;
182
80
      d2 = d0;
183
80
      memcpy(df3->data.u8, df0->data.u8, x->rows * x->step);
184
80
      while (m > 0)
185
80
      {
186
80
        m--;
187
80
        k += eh;
188
80
        unsigned char* sp = s->data.u8;
189
80
        unsigned char* xp = x->data.u8;
190
80
        unsigned char* xnp = xn->data.u8;
191
160
        for (i = 0; i < x->rows; 
i++80
)
192
80
        {
193
240
          for (j = 0; j < x->cols; 
j++160
)
194
160
            ccv_set_value(x->type, xnp, j, x3 * ccv_get_value(x->type, sp, j) + ccv_get_value(x->type, xp, j), 0);
195
80
          sp += x->step;
196
80
          xp += x->step;
197
80
          xnp += x->step;
198
80
        }
199
80
        if (func(xn, &f3, df3, data) == 0)
200
80
          break;
201
0
        else
202
0
          x3 = (x2 + x3) * 0.5;
203
80
      }
204
80
      if (f3 < F0)
205
27
      {
206
27
        memcpy(x0->data.u8, xn->data.u8, x->rows * x->step);
207
27
        memcpy(dF0->data.u8, df3->data.u8, x->rows * x->step);
208
27
        F0 = f3;
209
27
      }
210
80
      d3 = 0;
211
80
      unsigned char* df3p = df3->data.u8;
212
80
      unsigned char* sp = s->data.u8;
213
160
      for (i = 0; i < x->rows; 
i++80
)
214
80
      {
215
240
        for (j = 0; j < x->cols; 
j++160
)
216
160
          d3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, sp, j);
217
80
        df3p += x->step;
218
80
        sp += x->step;
219
80
      }
220
80
      if ((d3 > params.sig * d0) || 
(f3 > f0 + x3 * params.rho * d0)61
||
(m <= 0)60
)
221
22
        break;
222
58
      x1 = x2; f1 = f2; d1 = d2;
223
58
      x2 = x3; f2 = f3; d2 = d3;
224
58
      A = 6.0 * (f1 - f2) + 3.0 * (d2 + d1) * (x2 - x1);
225
58
      B = 3.0 * (f2 - f1) - (2.0 * d1 + d2) * (x2 - x1);
226
58
      x3 = B * B - A * d1 * (x2 - x1);
227
58
      if (x3 < 0)
228
8
        x3 = x2 * params.extrap;
229
50
      else {
230
50
        x3 = x1 - d1 * (x2 - x1) * (x2 - x1) / (B + sqrt(x3));
231
50
        if (x3 < 0)
232
0
          x3 = x2 * params.extrap;
233
50
        else {
234
50
          if (x3 > x2 * params.extrap)
235
8
            x3 = x2 * params.extrap;
236
42
          else if (x3 < x2 + params.interp * (x2 - x1))
237
38
            x3 = x2 + params.interp * (x2 - x1);
238
50
        }
239
50
      }
240
58
    }
241
88
    
while (22
((fabs(d3) > -params.sig * d0) ||
(f3 > f0 + x3 * params.rho * d0)18
) &&
(m > 0)70
)
242
66
    {
243
66
      if ((d3 > 1e-8) || 
(f3 > f0 + x3 * params.rho * d0)52
)
244
38
      {
245
38
        x4 = x3; f4 = f3; d4 = d3;
246
38
      } else {
247
28
        x2 = x3; f2 = f3; d2 = d3;
248
28
      }
249
66
      if (f4 > f0)
250
43
        x3 = x2 - (0.5 * d2 * (x4 - x2) * (x4 - x2)) / (f4 - f2 - d2 * (x4 - x2));
251
23
      else {
252
23
        A = 6.0 * (f2 - f4) / (x4 - x2) + 3.0 * (d4 + d2);
253
23
        B = 3.0 * (f4 - f2) - (2.0 * d2 + d4) * (x4 - x2);
254
23
        x3 = B * B - A * d2 * (x4 - x2) * (x4 - x2);
255
23
        x3 = (x3 < 0) ? 
(x2 + x4) * 0.50
: x2 + (sqrt(x3) - B) / A;
256
23
      }
257
66
      x3 = ccv_max(ccv_min(x3, x4 - params.interp * (x4 - x2)), x2 + params.interp * (x4 - x2));
258
66
      sp = s->data.u8;
259
66
      unsigned char* xp = x->data.u8;
260
66
      unsigned char* xnp = xn->data.u8;
261
132
      for (i = 0; i < x->rows; 
i++66
)
262
66
      {
263
198
        for (j = 0; j < x->cols; 
j++132
)
264
132
          ccv_set_value(x->type, xnp, j, x3 * ccv_get_value(x->type, sp, j) + ccv_get_value(x->type, xp, j), 0);
265
66
        sp += x->step;
266
66
        xp += x->step;
267
66
        xnp += x->step;
268
66
      }
269
66
      func(xn, &f3, df3, data);
270
66
      if (f3 < F0)
271
24
      {
272
24
        memcpy(x0->data.u8, xn->data.u8, x->rows * x->step);
273
24
        memcpy(dF0->data.u8, df3->data.u8, x->rows * x->step);
274
24
        F0 = f3;
275
24
      }
276
66
      m--;
277
66
      k += eh;
278
66
      d3 = 0;
279
66
      sp = s->data.u8;
280
66
      unsigned char* df3p = df3->data.u8;
281
132
      for (i = 0; i < x->rows; 
i++66
)
282
66
      {
283
198
        for (j = 0; j < x->cols; 
j++132
)
284
132
          d3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, sp, j);
285
66
        df3p += x->step;
286
66
        sp += x->step;
287
66
      }
288
66
    }
289
22
    if ((fabs(d3) < -params.sig * d0) && 
(f3 < f0 + x3 * params.rho * d0)18
)
290
18
    {
291
18
      memcpy(x->data.u8, xn->data.u8, x->rows * x->step);
292
18
      f0 = f3;
293
18
      double df0_df3 = 0;
294
18
      double df3_df3 = 0;
295
18
      double df0_df0 = 0;
296
18
      unsigned char* df0p = df0->data.u8;
297
18
      unsigned char* df3p = df3->data.u8;
298
36
      for (i = 0; i < x->rows; 
i++18
)
299
18
      {
300
54
        for (j = 0; j < x->cols; 
j++36
)
301
36
        {
302
36
          df0_df0 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, df0p, j);
303
36
          df0_df3 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, df3p, j);
304
36
          df3_df3 += ccv_get_value(x->type, df3p, j) * ccv_get_value(x->type, df3p, j);
305
36
        }
306
18
        df0p += x->step;
307
18
        df3p += x->step;
308
18
      }
309
18
      double slr = (df3_df3 - df0_df3) / df0_df0;
310
18
      df3p = df3->data.u8;
311
18
      unsigned char* sp = s->data.u8;
312
36
      for (i = 0; i < x->rows; 
i++18
)
313
18
      {
314
54
        for (j = 0; j < x->cols; 
j++36
)
315
36
          ccv_set_value(x->type, sp, j, slr * ccv_get_value(x->type, sp, j) - ccv_get_value(x->type, df3p, j), 0);
316
18
        df3p += x->step;
317
18
        sp += x->step;
318
18
      }
319
18
      memcpy(df0->data.u8, df3->data.u8, x->rows * x->step);
320
18
      d3 = d0;
321
18
      d0 = 0;
322
18
      df0p = df0->data.u8;
323
18
      sp = s->data.u8;
324
36
      for (i = 0; i < x->rows; 
i++18
)
325
18
      {
326
54
        for (j = 0; j < x->cols; 
j++36
)
327
36
        {
328
36
          d0 += ccv_get_value(x->type, df0p, j) * ccv_get_value(x->type, sp, j);
329
36
        }
330
18
        df0p += x->step;
331
18
        sp += x->step;
332
18
      }
333
18
      if (d0 > 0)
334
1
      {
335
1
        d0 = 0;
336
1
        df0p = df0->data.u8;
337
1
        sp = s->data.u8;
338
2
        for (i = 0; i < x->rows; 
i++1
)
339
1
        {
340
3
          for (j = 0; j < x->cols; 
j++2
)
341
2
          {
342
2
            double ss = ccv_get_value(x->type, df0p, j);
343
2
            ccv_set_value(x->type, sp, j, -ss, 0);
344
2
            d0 += -ss * ss;
345
2
          }
346
1
          df0p += x->step;
347
1
          sp += x->step;
348
1
        }
349
1
      }
350
18
      x3 = x3 * ccv_min(params.ratio, d3 / (d0 - 1e-8));
351
18
      ls_failed = 0;
352
18
    } else {
353
4
      memcpy(x->data.u8, x0->data.u8, x->rows * x->step);
354
4
      memcpy(df0->data.u8, dF0->data.u8, x->rows * x->step);
355
4
      f0 = F0;
356
4
      if (ls_failed)
357
1
        break;
358
3
      d0 = 0;
359
3
      unsigned char* df0p = df0->data.u8;
360
3
      unsigned char* sp = s->data.u8;
361
6
      for (i = 0; i < x->rows; 
i++3
)
362
3
      {
363
9
        for (j = 0; j < x->cols; 
j++6
)
364
6
        {
365
6
          double ss = ccv_get_value(x->type, df0p, j);
366
6
          ccv_set_value(x->type, sp, j, -ss, 0);
367
6
          d0 += -ss * ss;
368
6
        }
369
3
        df0p += x->step;
370
3
        sp += x->step;
371
3
      }
372
3
      x3 = red / (1.0 - d0);
373
3
      ls_failed = 1;
374
3
    }
375
22
  }
376
1
  ccv_matrix_free(s);
377
1
  ccv_matrix_free(x0);
378
1
  ccv_matrix_free(xn);
379
1
  ccv_matrix_free(dF0);
380
1
  ccv_matrix_free(df0);
381
1
  ccv_matrix_free(df3);
382
1
}
383
384
#ifdef HAVE_FFTW3
385
/* optimal FFT size table is adopted from OpenCV */
386
static const int _ccv_optimal_fft_size[] = {
387
  1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, 40, 45, 48, 
388
  50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, 150, 160, 
389
  162, 180, 192, 200, 216, 225, 240, 243, 250, 256, 270, 288, 300, 320, 324, 360, 375, 
390
  384, 400, 405, 432, 450, 480, 486, 500, 512, 540, 576, 600, 625, 640, 648, 675, 720, 
391
  729, 750, 768, 800, 810, 864, 900, 960, 972, 1000, 1024, 1080, 1125, 1152, 1200, 
392
  1215, 1250, 1280, 1296, 1350, 1440, 1458, 1500, 1536, 1600, 1620, 1728, 1800, 1875, 
393
  1920, 1944, 2000, 2025, 2048, 2160, 2187, 2250, 2304, 2400, 2430, 2500, 2560, 2592, 
394
  2700, 2880, 2916, 3000, 3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 
395
  3888, 4000, 4050, 4096, 4320, 4374, 4500, 4608, 4800, 4860, 5000, 5120, 5184, 5400, 
396
  5625, 5760, 5832, 6000, 6075, 6144, 6250, 6400, 6480, 6561, 6750, 6912, 7200, 7290, 
397
  7500, 7680, 7776, 8000, 8100, 8192, 8640, 8748, 9000, 9216, 9375, 9600, 9720, 10000, 
398
  10125, 10240, 10368, 10800, 10935, 11250, 11520, 11664, 12000, 12150, 12288, 12500, 
399
  12800, 12960, 13122, 13500, 13824, 14400, 14580, 15000, 15360, 15552, 15625, 16000, 
400
  16200, 16384, 16875, 17280, 17496, 18000, 18225, 18432, 18750, 19200, 19440, 19683, 
401
  20000, 20250, 20480, 20736, 21600, 21870, 22500, 23040, 23328, 24000, 24300, 24576, 
402
  25000, 25600, 25920, 26244, 27000, 27648, 28125, 28800, 29160, 30000, 30375, 30720, 
403
  31104, 31250, 32000, 32400, 32768, 32805, 33750, 34560, 34992, 36000, 36450, 36864, 
404
  37500, 38400, 38880, 39366, 40000, 40500, 40960, 41472, 43200, 43740, 45000, 46080, 
405
  46656, 46875, 48000, 48600, 49152, 50000, 50625, 51200, 51840, 52488, 54000, 54675, 
406
  55296, 56250, 57600, 58320, 59049, 60000, 60750, 61440, 62208, 62500, 64000, 64800, 
407
  65536, 65610, 67500, 69120, 69984, 72000, 72900, 73728, 75000, 76800, 77760, 78125, 
408
  78732, 80000, 81000, 81920, 82944, 84375, 86400, 87480, 90000, 91125, 92160, 93312, 
409
  93750, 96000, 97200, 98304, 98415, 100000, 101250, 102400, 103680, 104976, 108000, 
410
  109350, 110592, 112500, 115200, 116640, 118098, 120000, 121500, 122880, 124416, 125000, 
411
  128000, 129600, 131072, 131220, 135000, 138240, 139968, 140625, 144000, 145800, 147456, 
412
  150000, 151875, 153600, 155520, 156250, 157464, 160000, 162000, 163840, 164025, 165888, 
413
  168750, 172800, 174960, 177147, 180000, 182250, 184320, 186624, 187500, 192000, 194400, 
414
  196608, 196830, 200000, 202500, 204800, 207360, 209952, 216000, 218700, 221184, 225000, 
415
  230400, 233280, 234375, 236196, 240000, 243000, 245760, 248832, 250000, 253125, 256000, 
416
  259200, 262144, 262440, 270000, 273375, 276480, 279936, 281250, 288000, 291600, 294912, 
417
  295245, 300000, 303750, 307200, 311040, 312500, 314928, 320000, 324000, 327680, 328050, 
418
  331776, 337500, 345600, 349920, 354294, 360000, 364500, 368640, 373248, 375000, 384000, 
419
  388800, 390625, 393216, 393660, 400000, 405000, 409600, 414720, 419904, 421875, 432000, 
420
  437400, 442368, 450000, 455625, 460800, 466560, 468750, 472392, 480000, 486000, 491520, 
421
  492075, 497664, 500000, 506250, 512000, 518400, 524288, 524880, 531441, 540000, 546750, 
422
  552960, 559872, 562500, 576000, 583200, 589824, 590490, 600000, 607500, 614400, 622080, 
423
  625000, 629856, 640000, 648000, 655360, 656100, 663552, 675000, 691200, 699840, 703125, 
424
  708588, 720000, 729000, 737280, 746496, 750000, 759375, 768000, 777600, 781250, 786432, 
425
  787320, 800000, 810000, 819200, 820125, 829440, 839808, 843750, 864000, 874800, 884736, 
426
  885735, 900000, 911250, 921600, 933120, 937500, 944784, 960000, 972000, 983040, 984150, 
427
  995328, 1000000, 1012500, 1024000, 1036800, 1048576, 1049760, 1062882, 1080000, 1093500, 
428
  1105920, 1119744, 1125000, 1152000, 1166400, 1171875, 1179648, 1180980, 1200000, 
429
  1215000, 1228800, 1244160, 1250000, 1259712, 1265625, 1280000, 1296000, 1310720, 
430
  1312200, 1327104, 1350000, 1366875, 1382400, 1399680, 1406250, 1417176, 1440000, 
431
  1458000, 1474560, 1476225, 1492992, 1500000, 1518750, 1536000, 1555200, 1562500, 
432
  1572864, 1574640, 1594323, 1600000, 1620000, 1638400, 1640250, 1658880, 1679616, 
433
  1687500, 1728000, 1749600, 1769472, 1771470, 1800000, 1822500, 1843200, 1866240, 
434
  1875000, 1889568, 1920000, 1944000, 1953125, 1966080, 1968300, 1990656, 2000000, 
435
  2025000, 2048000, 2073600, 2097152, 2099520, 2109375, 2125764, 2160000, 2187000, 
436
  2211840, 2239488, 2250000, 2278125, 2304000, 2332800, 2343750, 2359296, 2361960, 
437
  2400000, 2430000, 2457600, 2460375, 2488320, 2500000, 2519424, 2531250, 2560000, 
438
  2592000, 2621440, 2624400, 2654208, 2657205, 2700000, 2733750, 2764800, 2799360, 
439
  2812500, 2834352, 2880000, 2916000, 2949120, 2952450, 2985984, 3000000, 3037500, 
440
  3072000, 3110400, 3125000, 3145728, 3149280, 3188646, 3200000, 3240000, 3276800, 
441
  3280500, 3317760, 3359232, 3375000, 3456000, 3499200, 3515625, 3538944, 3542940, 
442
  3600000, 3645000, 3686400, 3732480, 3750000, 3779136, 3796875, 3840000, 3888000, 
443
  3906250, 3932160, 3936600, 3981312, 4000000, 4050000, 4096000, 4100625, 4147200, 
444
  4194304, 4199040, 4218750, 4251528, 4320000, 4374000, 4423680, 4428675, 4478976, 
445
  4500000, 4556250, 4608000, 4665600, 4687500, 4718592, 4723920, 4782969, 4800000, 
446
  4860000, 4915200, 4920750, 4976640, 5000000, 5038848, 5062500, 5120000, 5184000, 
447
  5242880, 5248800, 5308416, 5314410, 5400000, 5467500, 5529600, 5598720, 5625000, 
448
  5668704, 5760000, 5832000, 5859375, 5898240, 5904900, 5971968, 6000000, 6075000, 
449
  6144000, 6220800, 6250000, 6291456, 6298560, 6328125, 6377292, 6400000, 6480000, 
450
  6553600, 6561000, 6635520, 6718464, 6750000, 6834375, 6912000, 6998400, 7031250, 
451
  7077888, 7085880, 7200000, 7290000, 7372800, 7381125, 7464960, 7500000, 7558272, 
452
  7593750, 7680000, 7776000, 7812500, 7864320, 7873200, 7962624, 7971615, 8000000, 
453
  8100000, 8192000, 8201250, 8294400, 8388608, 8398080, 8437500, 8503056, 8640000, 
454
  8748000, 8847360, 8857350, 8957952, 9000000, 9112500, 9216000, 9331200, 9375000, 
455
  9437184, 9447840, 9565938, 9600000, 9720000, 9765625, 9830400, 9841500, 9953280, 
456
  10000000, 10077696, 10125000, 10240000, 10368000, 10485760, 10497600, 10546875, 10616832, 
457
  10628820, 10800000, 10935000, 11059200, 11197440, 11250000, 11337408, 11390625, 11520000, 
458
  11664000, 11718750, 11796480, 11809800, 11943936, 12000000, 12150000, 12288000, 12301875, 
459
  12441600, 12500000, 12582912, 12597120, 12656250, 12754584, 12800000, 12960000, 13107200, 
460
  13122000, 13271040, 13286025, 13436928, 13500000, 13668750, 13824000, 13996800, 14062500, 
461
  14155776, 14171760, 14400000, 14580000, 14745600, 14762250, 14929920, 15000000, 15116544, 
462
  15187500, 15360000, 15552000, 15625000, 15728640, 15746400, 15925248, 15943230, 16000000, 
463
  16200000, 16384000, 16402500, 16588800, 16777216, 16796160, 16875000, 17006112, 17280000, 
464
  17496000, 17578125, 17694720, 17714700, 17915904, 18000000, 18225000, 18432000, 18662400, 
465
  18750000, 18874368, 18895680, 18984375, 19131876, 19200000, 19440000, 19531250, 19660800, 
466
  19683000, 19906560, 20000000, 20155392, 20250000, 20480000, 20503125, 20736000, 20971520, 
467
  20995200, 21093750, 21233664, 21257640, 21600000, 21870000, 22118400, 22143375, 22394880, 
468
  22500000, 22674816, 22781250, 23040000, 23328000, 23437500, 23592960, 23619600, 23887872, 
469
  23914845, 24000000, 24300000, 24576000, 24603750, 24883200, 25000000, 25165824, 25194240, 
470
  25312500, 25509168, 25600000, 25920000, 26214400, 26244000, 26542080, 26572050, 26873856, 
471
  27000000, 27337500, 27648000, 27993600, 28125000, 28311552, 28343520, 28800000, 29160000, 
472
  29296875, 29491200, 29524500, 29859840, 30000000, 30233088, 30375000, 30720000, 31104000, 
473
  31250000, 31457280, 31492800, 31640625, 31850496, 31886460, 32000000, 32400000, 32768000, 
474
  32805000, 33177600, 33554432, 33592320, 33750000, 34012224, 34171875, 34560000, 34992000, 
475
  35156250, 35389440, 35429400, 35831808, 36000000, 36450000, 36864000, 36905625, 37324800, 
476
  37500000, 37748736, 37791360, 37968750, 38263752, 38400000, 38880000, 39062500, 39321600, 
477
  39366000, 39813120, 39858075, 40000000, 40310784, 40500000, 40960000, 41006250, 41472000, 
478
  41943040, 41990400, 42187500, 42467328, 42515280, 43200000, 43740000, 44236800, 44286750, 
479
  44789760, 45000000, 45349632, 45562500, 46080000, 46656000, 46875000, 47185920, 47239200, 
480
  47775744, 47829690, 48000000, 48600000, 48828125, 49152000, 49207500, 49766400, 50000000, 
481
  50331648, 50388480, 50625000, 51018336, 51200000, 51840000, 52428800, 52488000, 52734375, 
482
  53084160, 53144100, 53747712, 54000000, 54675000, 55296000, 55987200, 56250000, 56623104, 
483
  56687040, 56953125, 57600000, 58320000, 58593750, 58982400, 59049000, 59719680, 60000000, 
484
  60466176, 60750000, 61440000, 61509375, 62208000, 62500000, 62914560, 62985600, 63281250, 
485
  63700992, 63772920, 64000000, 64800000, 65536000, 65610000, 66355200, 66430125, 67108864, 
486
  67184640, 67500000, 68024448, 68343750, 69120000, 69984000, 70312500, 70778880, 70858800, 
487
  71663616, 72000000, 72900000, 73728000, 73811250, 74649600, 75000000, 75497472, 75582720, 
488
  75937500, 76527504, 76800000, 77760000, 78125000, 78643200, 78732000, 79626240, 79716150, 
489
  80000000, 80621568, 81000000, 81920000, 82012500, 82944000, 83886080, 83980800, 84375000, 
490
  84934656, 85030560, 86400000, 87480000, 87890625, 88473600, 88573500, 89579520, 90000000, 
491
  90699264, 91125000, 92160000, 93312000, 93750000, 94371840, 94478400, 94921875, 95551488, 
492
  95659380, 96000000, 97200000, 97656250, 98304000, 98415000, 99532800, 100000000, 
493
  100663296, 100776960, 101250000, 102036672, 102400000, 102515625, 103680000, 104857600, 
494
  104976000, 105468750, 106168320, 106288200, 107495424, 108000000, 109350000, 110592000, 
495
  110716875, 111974400, 112500000, 113246208, 113374080, 113906250, 115200000, 116640000, 
496
  117187500, 117964800, 118098000, 119439360, 119574225, 120000000, 120932352, 121500000, 
497
  122880000, 123018750, 124416000, 125000000, 125829120, 125971200, 126562500, 127401984, 
498
  127545840, 128000000, 129600000, 131072000, 131220000, 132710400, 132860250, 134217728, 
499
  134369280, 135000000, 136048896, 136687500, 138240000, 139968000, 140625000, 141557760, 
500
  141717600, 143327232, 144000000, 145800000, 146484375, 147456000, 147622500, 149299200, 
501
  150000000, 150994944, 151165440, 151875000, 153055008, 153600000, 155520000, 156250000, 
502
  157286400, 157464000, 158203125, 159252480, 159432300, 160000000, 161243136, 162000000, 
503
  163840000, 164025000, 165888000, 167772160, 167961600, 168750000, 169869312, 170061120, 
504
  170859375, 172800000, 174960000, 175781250, 176947200, 177147000, 179159040, 180000000, 
505
  181398528, 182250000, 184320000, 184528125, 186624000, 187500000, 188743680, 188956800, 
506
  189843750, 191102976, 191318760, 192000000, 194400000, 195312500, 196608000, 196830000, 
507
  199065600, 199290375, 200000000, 201326592, 201553920, 202500000, 204073344, 204800000, 
508
  205031250, 207360000, 209715200, 209952000, 210937500, 212336640, 212576400, 214990848, 
509
  216000000, 218700000, 221184000, 221433750, 223948800, 225000000, 226492416, 226748160, 
510
  227812500, 230400000, 233280000, 234375000, 235929600, 236196000, 238878720, 239148450, 
511
  240000000, 241864704, 243000000, 244140625, 245760000, 246037500, 248832000, 250000000, 
512
  251658240, 251942400, 253125000, 254803968, 255091680, 256000000, 259200000, 262144000, 
513
  262440000, 263671875, 265420800, 265720500, 268435456, 268738560, 270000000, 272097792, 
514
  273375000, 276480000, 279936000, 281250000, 283115520, 283435200, 284765625, 286654464, 
515
  288000000, 291600000, 292968750, 294912000, 295245000, 298598400, 300000000, 301989888, 
516
  302330880, 303750000, 306110016, 307200000, 307546875, 311040000, 312500000, 314572800, 
517
  314928000, 316406250, 318504960, 318864600, 320000000, 322486272, 324000000, 327680000, 
518
  328050000, 331776000, 332150625, 335544320, 335923200, 337500000, 339738624, 340122240, 
519
  341718750, 345600000, 349920000, 351562500, 353894400, 354294000, 358318080, 360000000, 
520
  362797056, 364500000, 368640000, 369056250, 373248000, 375000000, 377487360, 377913600, 
521
  379687500, 382205952, 382637520, 384000000, 388800000, 390625000, 393216000, 393660000, 
522
  398131200, 398580750, 400000000, 402653184, 403107840, 405000000, 408146688, 409600000, 
523
  410062500, 414720000, 419430400, 419904000, 421875000, 424673280, 425152800, 429981696, 
524
  432000000, 437400000, 439453125, 442368000, 442867500, 447897600, 450000000, 452984832, 
525
  453496320, 455625000, 460800000, 466560000, 468750000, 471859200, 472392000, 474609375, 
526
  477757440, 478296900, 480000000, 483729408, 486000000, 488281250, 491520000, 492075000, 
527
  497664000, 500000000, 503316480, 503884800, 506250000, 509607936, 510183360, 512000000, 
528
  512578125, 518400000, 524288000, 524880000, 527343750, 530841600, 531441000, 536870912, 
529
  537477120, 540000000, 544195584, 546750000, 552960000, 553584375, 559872000, 562500000, 
530
  566231040, 566870400, 569531250, 573308928, 576000000, 583200000, 585937500, 589824000, 
531
  590490000, 597196800, 597871125, 600000000, 603979776, 604661760, 607500000, 612220032, 
532
  614400000, 615093750, 622080000, 625000000, 629145600, 629856000, 632812500, 637009920, 
533
  637729200, 640000000, 644972544, 648000000, 655360000, 656100000, 663552000, 664301250, 
534
  671088640, 671846400, 675000000, 679477248, 680244480, 683437500, 691200000, 699840000, 
535
  703125000, 707788800, 708588000, 716636160, 720000000, 725594112, 729000000, 732421875, 
536
  737280000, 738112500, 746496000, 750000000, 754974720, 755827200, 759375000, 764411904, 
537
  765275040, 768000000, 777600000, 781250000, 786432000, 787320000, 791015625, 796262400, 
538
  797161500, 800000000, 805306368, 806215680, 810000000, 816293376, 819200000, 820125000, 
539
  829440000, 838860800, 839808000, 843750000, 849346560, 850305600, 854296875, 859963392, 
540
  864000000, 874800000, 878906250, 884736000, 885735000, 895795200, 900000000, 905969664, 
541
  906992640, 911250000, 921600000, 922640625, 933120000, 937500000, 943718400, 944784000, 
542
  949218750, 955514880, 956593800, 960000000, 967458816, 972000000, 976562500, 983040000, 
543
  984150000, 995328000, 996451875, 1000000000, 1006632960, 1007769600, 1012500000, 
544
  1019215872, 1020366720, 1024000000, 1025156250, 1036800000, 1048576000, 1049760000, 
545
  1054687500, 1061683200, 1062882000, 1073741824, 1074954240, 1080000000, 1088391168, 
546
  1093500000, 1105920000, 1107168750, 1119744000, 1125000000, 1132462080, 1133740800, 
547
  1139062500, 1146617856, 1152000000, 1166400000, 1171875000, 1179648000, 1180980000, 
548
  1194393600, 1195742250, 1200000000, 1207959552, 1209323520, 1215000000, 1220703125, 
549
  1224440064, 1228800000, 1230187500, 1244160000, 1250000000, 1258291200, 1259712000, 
550
  1265625000, 1274019840, 1275458400, 1280000000, 1289945088, 1296000000, 1310720000, 
551
  1312200000, 1318359375, 1327104000, 1328602500, 1342177280, 1343692800, 1350000000, 
552
  1358954496, 1360488960, 1366875000, 1382400000, 1399680000, 1406250000, 1415577600, 
553
  1417176000, 1423828125, 1433272320, 1440000000, 1451188224, 1458000000, 1464843750, 
554
  1474560000, 1476225000, 1492992000, 1500000000, 1509949440, 1511654400, 1518750000, 
555
  1528823808, 1530550080, 1536000000, 1537734375, 1555200000, 1562500000, 1572864000, 
556
  1574640000, 1582031250, 1592524800, 1594323000, 1600000000, 1610612736, 1612431360, 
557
  1620000000, 1632586752, 1638400000, 1640250000, 1658880000, 1660753125, 1677721600, 
558
  1679616000, 1687500000, 1698693120, 1700611200, 1708593750, 1719926784, 1728000000, 
559
  1749600000, 1757812500, 1769472000, 1771470000, 1791590400, 1800000000, 1811939328, 
560
  1813985280, 1822500000, 1843200000, 1845281250, 1866240000, 1875000000, 1887436800, 
561
  1889568000, 1898437500, 1911029760, 1913187600, 1920000000, 1934917632, 1944000000, 
562
  1953125000, 1966080000, 1968300000, 1990656000, 1992903750, 2000000000, 2013265920, 
563
  2015539200, 2025000000, 2038431744, 2040733440, 2048000000, 2050312500, 2073600000, 
564
  2097152000, 2099520000, 2109375000, 2123366400, 2125764000
565
};
566
567
static int _ccv_get_optimal_fft_size(int size)
568
12
{
569
12
  int a = 0, b = sizeof(_ccv_optimal_fft_size) / sizeof(_ccv_optimal_fft_size[0]) - 1;
570
12
    if((unsigned)size >= (unsigned)_ccv_optimal_fft_size[b])
571
0
    return -1;
572
142
  
while(12
a < b)
573
130
  {
574
130
    int c = (a + b) >> 1;
575
130
    if(size <= _ccv_optimal_fft_size[c])
576
98
      b = c;
577
32
    else
578
32
      a = c + 1;
579
130
    }
580
12
    return _ccv_optimal_fft_size[b];
581
12
}
582
583
static pthread_mutex_t fftw_plan_mutex = PTHREAD_MUTEX_INITIALIZER;
584
585
static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern)
586
6
{
587
6
  int ch = CCV_GET_CHANNEL(a->type);
588
6
  int fft_type = (CCV_GET_DATA_TYPE(d->type) == CCV_8U || CCV_GET_DATA_TYPE(d->type) == CCV_32F) ? 
CCV_32F5
:
CCV_64F1
;
589
6
  int rows = ccv_min(a->rows + b->rows - 1, _ccv_get_optimal_fft_size(b->rows * 3));
590
6
  int cols = ccv_min(a->cols + b->cols - 1, _ccv_get_optimal_fft_size(b->cols * 3));
591
6
  int cols_2c = 2 * (cols / 2 + 1);
592
6
  void* fftw_a;
593
6
  void* fftw_b;
594
6
  void* fftw_d;
595
6
  fftw_plan p, pinv;
596
6
  fftwf_plan pf, pinvf;
597
6
  if (fft_type == CCV_32F)
598
5
  {
599
5
    fftw_a = fftwf_malloc(rows * cols_2c * ch * sizeof(float));
600
5
    fftw_b = fftwf_malloc(rows * cols_2c * ch * sizeof(float));
601
5
    fftw_d = fftwf_malloc(rows * cols_2c * ch * sizeof(float));
602
5
    pthread_mutex_lock(&fftw_plan_mutex);
603
5
    if (ch == 1)
604
3
    {
605
3
      pf = fftwf_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
606
3
      pinvf = fftwf_plan_dft_c2r_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
607
3
    } else {
608
2
      int ndim[] = {rows, cols};
609
2
      pf = fftwf_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
610
2
      pinvf = fftwf_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
611
2
    }
612
5
    pthread_mutex_unlock(&fftw_plan_mutex);
613
5
  } else {
614
1
    fftw_a = fftw_malloc(rows * cols_2c * ch * sizeof(double));
615
1
    fftw_b = fftw_malloc(rows * cols_2c * ch * sizeof(double));
616
1
    fftw_d = fftw_malloc(rows * cols_2c * ch * sizeof(double));
617
1
    pthread_mutex_lock(&fftw_plan_mutex);
618
1
    if (ch == 1)
619
0
    {
620
0
      p = fftw_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
621
0
      pinv = fftw_plan_dft_c2r_2d(rows, cols, 0, 0, FFTW_ESTIMATE);
622
1
    } else {
623
1
      int ndim[] = {rows, cols};
624
1
      p = fftw_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
625
1
      pinv = fftw_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE);
626
1
    }
627
1
    pthread_mutex_unlock(&fftw_plan_mutex);
628
1
  }
629
6
  memset(fftw_b, 0, rows * cols_2c * ch * CCV_GET_DATA_TYPE_SIZE(fft_type));
630
6
631
6
  int i, j, k;
632
6
  unsigned char* m_ptr = b->data.u8;
633
6
  // to flip matrix b is crucial, this problem only shows when I changed to a more sophisticated test case
634
6
#define for_block(_for_type, _for_get) \
635
6
  _for_type* fftw_ptr = (_for_type*)fftw_b + (b->rows - 1) * cols_2c * ch; \
636
416
  for (i = 0; i < b->rows; 
i++410
) \
637
410
  { \
638
33.1k
    for (j = 0; j < b->cols; 
j++32.7k
) \
639
130k
      
for (k = 0; 32.7k
k < ch;
k++97.7k
) \
640
97.7k
        fftw_ptr[(b->cols - 1 - j) * ch + k] = _for_get(m_ptr, j * ch + k, 0); \
641
410
    fftw_ptr -= cols_2c * ch; \
642
410
    m_ptr += b->step; \
643
410
  }
644
6
  ccv_matrix_typeof(fft_type, ccv_matrix_getter, b->type, for_block);
645
6
#undef for_block
646
6
  if (fft_type == CCV_32F)
647
5
    fftwf_execute_dft_r2c(pf, (float*)fftw_b, (fftwf_complex*)fftw_b);
648
1
  else
649
1
    fftw_execute_dft_r2c(p, (double*)fftw_b, (fftw_complex*)fftw_b);
650
6
  /* why a->cols + cols - 2 * (b->cols & ~1) ?
651
6
   * what we really want is ceiling((a->cols - (b->cols & ~1)) / (cols - (b->cols & ~1)))
652
6
   * in this case, we strip out paddings on the left/right, and compute how many tiles
653
6
   * we need. It then be interpreted in the above integer division form */
654
6
  int tile_x = ccv_max(1, (a->cols + cols - 2 * (b->cols & ~1)) / (cols - (b->cols & ~1)));
655
6
  int tile_y = ccv_max(1, (a->rows + rows - 2 * (b->rows & ~1)) / (rows - (b->rows & ~1)));
656
6
  int brows2 = b->rows / 2;
657
6
  int bcols2 = b->cols / 2;
658
6
#define for_block(_for_type, _cpx_type, _for_set, _for_get) \
659
6
  _for_type scale = 1.0 / (rows * cols); \
660
14
  for (i = 0; i < tile_y; 
i++8
) \
661
27
    
for (j = 0; 8
j < tile_x;
j++19
) \
662
19
    { \
663
19
      int x, y; \
664
19
      memset(fftw_a, 0, rows * cols_2c * ch * sizeof(_for_type)); \
665
19
      int iy = ccv_min(i * (rows - (b->rows & ~1)), ccv_max(a->rows - rows, 0)); \
666
19
      int ix = ccv_min(j * (cols - (b->cols & ~1)), ccv_max(a->cols - cols, 0)); \
667
19
      _for_type* fftw_ptr = (_for_type*)fftw_a; \
668
19
      int end_y = ccv_min(rows, a->rows - iy); \
669
19
      int end_x = ccv_min(cols, a->cols - ix); \
670
19
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(a, iy, ix, 0); \
671
5.48k
      for (y = 0; y < end_y; 
y++5.46k
) \
672
5.46k
      { \
673
4.57M
        for (x = 0; x < end_x * ch; 
x++4.56M
) \
674
4.56M
          fftw_ptr[x] = _for_get(m_ptr, x, 0); \
675
5.46k
        fftw_ptr += cols_2c * ch; \
676
5.46k
        m_ptr += a->step; \
677
5.46k
      } \
678
19
      _cpx_type* fftw_ac = (_cpx_type*)fftw_a; \
679
19
      _cpx_type* fftw_bc = (_cpx_type*)fftw_b; \
680
19
      _cpx_type* fftw_dc = (_cpx_type*)fftw_d; \
681
19
      fft_execute_dft_r2c((_for_type*)fftw_a, (_cpx_type*)fftw_ac); \
682
2.44M
      for (x = 0; x < rows * ch * (cols / 2 + 1); 
x++2.44M
) \
683
2.44M
        fftw_dc[x] = (fftw_ac[x] * fftw_bc[x]) * scale; \
684
19
      fft_execute_dft_c2r((_cpx_type*)fftw_dc, (_for_type*)fftw_d); \
685
19
      fftw_ptr = (_for_type*)fftw_d + ((1 + (i > 0)) * brows2 * cols_2c + (1 + (j > 0)) * bcols2) * ch; \
686
19
      end_y = ccv_min(d->rows - (iy + (i > 0) * brows2), \
687
19
              (rows - (b->rows & ~1)) + (i == 0) * brows2); \
688
19
      end_x = ccv_min(d->cols - (ix + (j > 0) * bcols2), \
689
19
              (cols - (b->cols & ~1)) + (j == 0) * bcols2); \
690
19
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2, 0); \
691
4.58k
      for (y = 0; y < end_y; 
y++4.56k
) \
692
4.56k
      { \
693
2.71M
        for (x = 0; x < end_x * ch; 
x++2.71M
) \
694
2.71M
          _for_set(m_ptr, x, fftw_ptr[x], 0); \
695
4.56k
        m_ptr += d->step; \
696
4.56k
        fftw_ptr += cols_2c * ch; \
697
4.56k
      } \
698
19
      int end_tile_y, end_tile_x; \
699
19
      /* handle edge cases: */ \
700
19
      if (i + 1 == tile_y && 
end_y + iy + (i > 0) * brows2 < d->rows13
) \
701
19
      { \
702
6
        end_tile_y = ccv_min(brows2, d->rows - (iy + (i > 0) * brows2 + end_y)); \
703
6
        fftw_ptr = (_for_type*)fftw_d + (1 + (j > 0)) * bcols2 * ch; \
704
6
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2, 0); \
705
306
        for (y = 0; y < end_tile_y; 
y++300
) \
706
300
        { \
707
204k
          for (x = 0; x < end_x * ch; 
x++204k
) \
708
204k
            _for_set(m_ptr, x, fftw_ptr[x], 0); \
709
300
          m_ptr += d->step; \
710
300
          fftw_ptr += cols_2c * ch; \
711
300
        } \
712
6
      } \
713
19
      if (j + 1 == tile_x && 
end_x + ix + (j > 0) * bcols2 < d->cols8
) \
714
19
      { \
715
6
        end_tile_x = ccv_min(bcols2, d->cols - (ix + (j > 0) * bcols2 + end_x)); \
716
6
        fftw_ptr = (_for_type*)fftw_d + (1 + (i > 0)) * brows2 * cols_2c * ch; \
717
6
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2 + end_x, 0); \
718
1.39k
        for (y = 0; y < end_y; 
y++1.38k
) \
719
1.38k
        { \
720
187k
          for (x = 0; x < end_tile_x * ch; 
x++185k
) \
721
185k
            _for_set(m_ptr, x, fftw_ptr[x], 0); \
722
1.38k
          m_ptr += d->step; \
723
1.38k
          fftw_ptr += cols_2c * ch; \
724
1.38k
        } \
725
6
      } \
726
19
      if (i + 1 == tile_y && 
end_y + iy + (i > 0) * brows2 < d->rows13
&& \
727
19
        
j + 1 == tile_x6
&&
end_x + ix + (j > 0) * bcols2 < d->cols2
) \
728
19
      { \
729
2
        fftw_ptr = (_for_type*)fftw_d; \
730
2
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2 + end_x, 0); \
731
102
        for (y = 0; y < end_tile_y; 
y++100
) \
732
100
        { \
733
15.1k
          for (x = 0; x < end_tile_x * ch; 
x++15.0k
) \
734
15.0k
            _for_set(m_ptr, x, fftw_ptr[x], 0); \
735
100
          m_ptr += d->step; \
736
100
          fftw_ptr += cols_2c * ch; \
737
100
        } \
738
2
      } \
739
19
    }
740
6
  if (fft_type == CCV_32F)
741
5
  {
742
15
#define fft_execute_dft_r2c(r, c) fftwf_execute_dft_r2c(pf, r, c)
743
15
#define fft_execute_dft_c2r(c, r) fftwf_execute_dft_c2r(pinvf, c, r)
744
5
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, float, fftwf_complex);
745
5
#undef fft_execute_dft_r2c
746
5
#undef fft_execute_dft_c2r
747
5
    pthread_mutex_lock(&fftw_plan_mutex);
748
5
    fftwf_destroy_plan(pf);
749
5
    fftwf_destroy_plan(pinvf);
750
5
    pthread_mutex_unlock(&fftw_plan_mutex);
751
5
    fftwf_free(fftw_a);
752
5
    fftwf_free(fftw_b);
753
5
    fftwf_free(fftw_d);
754
5
  } else {
755
4
#define fft_execute_dft_r2c(r, c) fftw_execute_dft_r2c(p, r, c)
756
4
#define fft_execute_dft_c2r(c, r) fftw_execute_dft_c2r(pinv, c, r)
757
1
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, double, fftw_complex);
758
1
#undef fft_execute_dft_r2c
759
1
#undef fft_execute_dft_c2r
760
1
    pthread_mutex_lock(&fftw_plan_mutex);
761
1
    fftw_destroy_plan(p);
762
1
    fftw_destroy_plan(pinv);
763
1
    pthread_mutex_unlock(&fftw_plan_mutex);
764
1
    fftw_free(fftw_a);
765
1
    fftw_free(fftw_b);
766
1
    fftw_free(fftw_d);
767
1
  }
768
6
#undef for_block
769
6
}
770
#else
771
static void _ccv_filter_kissfft(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern)
772
{
773
  int ch = CCV_GET_CHANNEL(a->type);
774
  int fft_type = (CCV_GET_DATA_TYPE(d->type) == CCV_8U || CCV_GET_DATA_TYPE(d->type) == CCV_32F) ? CCV_32F : CCV_64F;
775
  int rows = ((ccv_min(a->rows + b->rows - 1, kiss_fftr_next_fast_size_real(b->rows * 3)) + 1) >> 1) << 1;
776
  int cols = ((ccv_min(a->cols + b->cols - 1, kiss_fftr_next_fast_size_real(b->cols * 3)) + 1) >> 1) << 1;
777
  int ndim[] = {rows, cols};
778
  void* kiss_a;
779
  void* kiss_b;
780
  void* kiss_d;
781
  void* kiss_ac;
782
  void* kiss_bc;
783
  void* kiss_dc;
784
  kiss_fftndr_cfg p;
785
  kiss_fftndr_cfg pinv;
786
  kissf_fftndr_cfg pf;
787
  kissf_fftndr_cfg pinvf;
788
  if (fft_type == CCV_32F)
789
  {
790
    pf = kissf_fftndr_alloc(ndim, 2, 0, 0, 0);
791
    pinvf = kissf_fftndr_alloc(ndim, 2, 1, 0, 0);
792
    kiss_a = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar));
793
    kiss_b = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar));
794
    kiss_d = ccmalloc(rows * cols * ch * sizeof(kissf_fft_scalar));
795
    kiss_ac = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx));
796
    kiss_bc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx));
797
    kiss_dc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kissf_fft_cpx));
798
    memset(kiss_b, 0, rows * cols * ch * sizeof(kissf_fft_scalar));
799
  } else {
800
    p = kiss_fftndr_alloc(ndim, 2, 0, 0, 0);
801
    pinv = kiss_fftndr_alloc(ndim, 2, 1, 0, 0);
802
    kiss_a = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar));
803
    kiss_b = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar));
804
    kiss_d = ccmalloc(rows * cols * ch * sizeof(kiss_fft_scalar));
805
    kiss_ac = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx));
806
    kiss_bc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx));
807
    kiss_dc = ccmalloc(rows * (cols / 2 + 1) * ch * sizeof(kiss_fft_cpx));
808
    memset(kiss_b, 0, rows * cols * ch * sizeof(kiss_fft_scalar));
809
  }
810
  int nch = rows * cols, nchc = rows * (cols / 2 + 1);
811
  int i, j, k;
812
  unsigned char* m_ptr = b->data.u8;
813
  // to flip matrix b is crucial, this problem only shows when I changed to a more sophisticated test case
814
#define for_block(_for_type, _for_get) \
815
  _for_type* kiss_ptr = (_for_type*)kiss_b + (b->rows - 1) * cols; \
816
  for (i = 0; i < b->rows; i++) \
817
  { \
818
    for (j = 0; j < b->cols; j++) \
819
      for (k = 0; k < ch; k++) \
820
        kiss_ptr[k * nch + b->cols - 1 - j] = _for_get(m_ptr, j * ch + k, 0); \
821
    kiss_ptr -= cols; \
822
    m_ptr += b->step; \
823
  }
824
  ccv_matrix_typeof(fft_type, ccv_matrix_getter, b->type, for_block);
825
#undef for_block
826
  if (fft_type == CCV_32F)
827
    for (k = 0; k < ch; k++)
828
      kissf_fftndr(pf, (kissf_fft_scalar*)kiss_b + nch * k, (kissf_fft_cpx*)kiss_bc + nchc * k);
829
  else
830
    for (k = 0; k < ch; k++)
831
      kiss_fftndr(p, (kiss_fft_scalar*)kiss_b + nch * k, (kiss_fft_cpx*)kiss_bc + nchc * k);
832
  /* why a->cols + cols - 2 * (b->cols & ~1) ?
833
   * what we really want is ceiling((a->cols - (b->cols & ~1)) / (cols - (b->cols & ~1)))
834
   * in this case, we strip out paddings on the left/right, and compute how many tiles
835
   * we need. It then be interpreted in the above integer division form */
836
  int tile_x = ccv_max(1, (a->cols + cols - 2 * (b->cols & ~1)) / (cols - (b->cols & ~1)));
837
  int tile_y = ccv_max(1, (a->rows + rows - 2 * (b->rows & ~1)) / (rows - (b->rows & ~1)));
838
  int brows2 = b->rows / 2;
839
  int bcols2 = b->cols / 2;
840
#define for_block(_for_type, _cpx_type, _for_set, _for_get) \
841
  _for_type scale = 1.0 / (rows * cols); \
842
  for (i = 0; i < tile_y; i++) \
843
    for (j = 0; j < tile_x; j++) \
844
    { \
845
      int x, y; \
846
      memset(kiss_a, 0, rows * cols * ch * sizeof(_for_type)); \
847
      int iy = ccv_min(i * (rows - (b->rows & ~1)), ccv_max(a->rows - rows, 0)); \
848
      int ix = ccv_min(j * (cols - (b->cols & ~1)), ccv_max(a->cols - cols, 0)); \
849
      _for_type* kiss_ptr = (_for_type*)kiss_a; \
850
      int end_y = ccv_min(rows, a->rows - iy); \
851
      int end_x = ccv_min(cols, a->cols - ix); \
852
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(a, iy, ix, 0); \
853
      for (y = 0; y < end_y; y++) \
854
      { \
855
        for (x = 0; x < end_x; x++) \
856
          for (k = 0; k < ch; k++) \
857
            kiss_ptr[k * nch + x] = _for_get(m_ptr, x * ch + k, 0); \
858
        kiss_ptr += cols; \
859
        m_ptr += a->step; \
860
      } \
861
      for (k = 0; k < ch; k++) \
862
        fft_ndr((_for_type*)kiss_a + nch * k, (_cpx_type*)kiss_ac + nchc * k); \
863
      _cpx_type* fft_ac = (_cpx_type*)kiss_ac; \
864
      _cpx_type* fft_bc = (_cpx_type*)kiss_bc; \
865
      _cpx_type* fft_dc = (_cpx_type*)kiss_dc; \
866
      for (x = 0; x < rows * ch * (cols / 2 + 1); x++) \
867
      { \
868
        fft_dc[x].r = (fft_ac[x].r * fft_bc[x].r - fft_ac[x].i * fft_bc[x].i) * scale; \
869
        fft_dc[x].i = (fft_ac[x].i * fft_bc[x].r + fft_ac[x].r * fft_bc[x].i) * scale; \
870
      } \
871
      for (k = 0; k < ch; k++) \
872
        fft_ndri((_cpx_type*)kiss_dc + nchc * k, (_for_type*)kiss_d + nch * k); \
873
      kiss_ptr = (_for_type*)kiss_d + (1 + (i > 0)) * brows2 * cols + (1 + (j > 0)) * bcols2; \
874
      end_y = ccv_min(d->rows - (iy + (i > 0) * brows2), \
875
              (rows - (b->rows & ~1)) + (i == 0) * brows2); \
876
      end_x = ccv_min(d->cols - (ix + (j > 0) * bcols2), \
877
              (cols - (b->cols & ~1)) + (j == 0) * bcols2); \
878
      m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2, 0); \
879
      for (y = 0; y < end_y; y++) \
880
      { \
881
        for (x = 0; x < end_x; x++) \
882
          for (k = 0; k < ch; k++) \
883
            _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x], 0); \
884
        m_ptr += d->step; \
885
        kiss_ptr += cols; \
886
      } \
887
      int end_tile_y, end_tile_x; \
888
      /* handle edge cases: */ \
889
      if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows) \
890
      { \
891
        end_tile_y = ccv_min(brows2, d->rows - (iy + (i > 0) * brows2 + end_y)); \
892
        kiss_ptr = (_for_type*)kiss_d + (1 + (j > 0)) * bcols2; \
893
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2, 0); \
894
        for (y = 0; y < end_tile_y; y++) \
895
        { \
896
          for (x = 0; x < end_x; x++) \
897
            for (k = 0; k < ch; k++) \
898
              _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x], 0); \
899
          m_ptr += d->step; \
900
          kiss_ptr += cols; \
901
        } \
902
      } \
903
      if (j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols) \
904
      { \
905
        end_tile_x = ccv_min(bcols2, d->cols - (ix + (j > 0) * bcols2 + end_x)); \
906
        kiss_ptr = (_for_type*)kiss_d + (1 + (i > 0)) * brows2 * cols; \
907
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2, ix + (j > 0) * bcols2 + end_x, 0); \
908
        for (y = 0; y < end_y; y++) \
909
        { \
910
          for (x = 0; x < end_tile_x; x++) \
911
            for (k = 0; k < ch; k++) \
912
              _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x], 0); \
913
          m_ptr += d->step; \
914
          kiss_ptr += cols; \
915
        } \
916
      } \
917
      if (i + 1 == tile_y && end_y + iy + (i > 0) * brows2 < d->rows && \
918
        j + 1 == tile_x && end_x + ix + (j > 0) * bcols2 < d->cols) \
919
      { \
920
        kiss_ptr = (_for_type*)kiss_d; \
921
        m_ptr = (unsigned char*)ccv_get_dense_matrix_cell(d, iy + (i > 0) * brows2 + end_y, ix + (j > 0) * bcols2 + end_x, 0); \
922
        for (y = 0; y < end_tile_y; y++) \
923
        { \
924
          for (x = 0; x < end_tile_x; x++) \
925
            for (k = 0; k < ch; k++) \
926
              _for_set(m_ptr, x * ch + k, kiss_ptr[k * nch + x], 0); \
927
          m_ptr += d->step; \
928
          kiss_ptr += cols; \
929
        } \
930
      } \
931
    }
932
  if (fft_type == CCV_32F)
933
  {
934
#define fft_ndr(r, c) kissf_fftndr(pf, r, c)
935
#define fft_ndri(c, r) kissf_fftndri(pinvf, c, r)
936
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, kissf_fft_scalar, kissf_fft_cpx);
937
#undef fft_ndr
938
#undef fft_ndri
939
    kissf_fft_free(pf);
940
    kissf_fft_free(pinvf);
941
  } else {
942
#define fft_ndr(r, c) kiss_fftndr(p, r, c)
943
#define fft_ndri(c, r) kiss_fftndri(pinv, c, r)
944
    ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, kiss_fft_scalar, kiss_fft_cpx);
945
#undef fft_ndr
946
#undef fft_ndri
947
    kiss_fft_free(p);
948
    kiss_fft_free(pinv);
949
  }
950
#undef for_block
951
  ccfree(kiss_dc);
952
  ccfree(kiss_bc);
953
  ccfree(kiss_ac);
954
  ccfree(kiss_d);
955
  ccfree(kiss_b);
956
  ccfree(kiss_a);
957
}
958
#endif
959
960
static void _ccv_filter_direct_8u(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern)
961
0
{
962
0
  int i, j, y, x, k;
963
0
  int nz = b->rows * b->cols;
964
0
  int* coeff = (int*)ccmalloc(nz * sizeof(int));
965
0
  int* cx = (int*)ccmalloc(nz * sizeof(int));
966
0
  int* cy = (int*)ccmalloc(nz * sizeof(int));
967
0
  int scale = 1 << 14;
968
0
  nz = 0;
969
0
  for (i = 0; i < b->rows; i++)
970
0
    for (j = 0; j < b->cols; j++)
971
0
    {
972
0
      coeff[nz] = (int)(ccv_get_dense_matrix_cell_value(b, i, j, 0) * scale + 0.5);
973
0
      if (coeff[nz] == 0)
974
0
        continue;
975
0
      cy[nz] = i;
976
0
      cx[nz] = j;
977
0
      nz++;
978
0
    }
979
0
  ccv_dense_matrix_t* pa = ccv_dense_matrix_new(a->rows + b->rows / 2 * 2, a->cols + b->cols / 2 * 2, CCV_8U | CCV_C1, 0, 0);
980
0
  /* the padding pattern is different from FFT: |aa{BORDER}|abcd|{BORDER}dd| */
981
0
  for (i = 0; i < pa->rows; i++)
982
0
    for (j = 0; j < pa->cols; j++)
983
0
      pa->data.u8[i * pa->step + j] = a->data.u8[ccv_clamp(i - b->rows / 2, 0, a->rows - 1) * a->step + ccv_clamp(j - b->cols / 2, 0, a->cols - 1)];
984
0
  unsigned char* m_ptr = d->data.u8;
985
0
  unsigned char* a_ptr = pa->data.u8;
986
0
  /* 0.5 denote the overhead for indexing x and y */
987
0
  if (nz < b->rows * b->cols * 0.75)
988
0
  {
989
0
    for (i = 0; i < d->rows; i++)
990
0
    {
991
0
      for (j = 0; j < d->cols; j++)
992
0
      {
993
0
        int z = 0;
994
0
        for (k = 0; k < nz; k++)
995
0
          z += a_ptr[cy[k] * pa->step + j + cx[k]] * coeff[k];
996
0
        m_ptr[j] = ccv_clamp(z >> 14, 0, 255);
997
0
      }
998
0
      m_ptr += d->step;
999
0
      a_ptr += pa->step;
1000
0
    }
1001
0
  } else {
1002
0
    k = 0;
1003
0
    for (i = 0; i < b->rows; i++)
1004
0
      for (j = 0; j < b->cols; j++)
1005
0
      {
1006
0
        coeff[k] = (int)(ccv_get_dense_matrix_cell_value(b, i, j, 0) * scale + 0.5);
1007
0
        k++;
1008
0
      }
1009
0
    for (i = 0; i < d->rows; i++)
1010
0
    {
1011
0
      for (j = 0; j < d->cols; j++)
1012
0
      {
1013
0
        int* c_ptr = coeff;
1014
0
        int z = 0;
1015
0
        for (y = 0; y < b->rows; y++)
1016
0
        {
1017
0
          int iyx = y * pa->step;
1018
0
          for (x = 0; x < b->cols; x++)
1019
0
          {
1020
0
            z += a_ptr[iyx + j + x] * c_ptr[0];
1021
0
            c_ptr++;
1022
0
          }
1023
0
        }
1024
0
        m_ptr[j] = ccv_clamp(z >> 14, 0, 255);
1025
0
      }
1026
0
      m_ptr += d->step;
1027
0
      a_ptr += pa->step;
1028
0
    }
1029
0
  }
1030
0
  ccv_matrix_free(pa);
1031
0
  ccfree(coeff);
1032
0
  ccfree(cx);
1033
0
  ccfree(cy);
1034
0
}
1035
1036
void ccv_filter(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t** d, int type, int padding_pattern)
1037
6
{
1038
6
  ccv_declare_derived_signature(sig, a->sig != 0 && b->sig != 0, ccv_sign_with_literal("ccv_filter"), a->sig, b->sig, CCV_EOF_SIGN);
1039
6
  type = (type == 0) ? 
CCV_GET_DATA_TYPE3
(a->type) | 3
CCV_GET_CHANNEL3
(a->type) :
CCV_GET_DATA_TYPE3
(type) | 3
CCV_GET_CHANNEL3
(a->type);
1040
6
  ccv_dense_matrix_t* dd = *d = ccv_dense_matrix_renew(*d, a->rows, a->cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), type, sig);
1041
6
  ccv_object_return_if_cached(, dd);
1042
6
1043
6
  /* 15 is the constant to indicate the high cost of FFT (even with O(nlog(m)) for
1044
6
   * integer image.
1045
6
   * NOTE: FFT has time complexity of O(nlog(n)), however, for convolution, it
1046
6
   * is not the case. Convolving one image (a) to a kernel (b), can be done by
1047
6
   * dividing image a to several blocks proportional to (b). Thus, one don't need
1048
6
   * to do FFT for the whole image. The image can be divided to n/m part, and
1049
6
   * the FFT itself is O(mlog(m)), so, the convolution process has time complexity
1050
6
   * of O(nlog(m)) */
1051
6
  if ((b->rows * b->cols < (log((double)(b->rows * b->cols)) + 1) * 15) && 
(a->type & CCV_8U)1
)
1052
0
  {
1053
0
    _ccv_filter_direct_8u(a, b, dd, padding_pattern);
1054
6
  } else {
1055
6
#ifdef HAVE_FFTW3
1056
6
    _ccv_filter_fftw(a, b, dd, padding_pattern);
1057
#else
1058
    _ccv_filter_kissfft(a, b, dd, padding_pattern);
1059
#endif
1060
  }
1061
6
}
1062
1063
void ccv_filter_kernel(ccv_dense_matrix_t* x, ccv_filter_kernel_f func, void* data)
1064
2
{
1065
2
  int i, j, k, ch = CCV_GET_CHANNEL(x->type);
1066
2
  unsigned char* m_ptr = x->data.u8;
1067
2
  double rows_2 = (x->rows - 1) * 0.5;
1068
2
  double cols_2 = (x->cols - 1) * 0.5;
1069
2
#define for_block(_, _for_set) \
1070
203
  
for (i = 0; 2
i < x->rows;
i++201
) \
1071
201
  { \
1072
20.4k
    for (j = 0; j < x->cols; 
j++20.2k
) \
1073
20.2k
    { \
1074
20.2k
      double result = func(j - cols_2, i - rows_2, data); \
1075
80.8k
      for (k = 0; k < ch; 
k++60.6k
) \
1076
60.6k
        _for_set(m_ptr, j * ch + k, result, 0); \
1077
20.2k
    } \
1078
201
    m_ptr += x->step; \
1079
201
  }
1080
2
  ccv_matrix_setter(x->type, for_block);
1081
2
#undef for_block
1082
2
  ccv_make_matrix_immutable(x);
1083
2
}
1084
1085
void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_dense_matrix_t** x, int x_type, ccv_dense_matrix_t** y, int y_type, double dx, double dy, double dxx, double dyy, int flag)
1086
2
{
1087
2
  assert(!(flag & CCV_L2_NORM) && (flag & CCV_GSEDT));
1088
2
  ccv_declare_derived_signature(sig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN);
1089
2
  type = (CCV_GET_DATA_TYPE(type) == CCV_64F || CCV_GET_DATA_TYPE(a->type) == CCV_64F || CCV_GET_DATA_TYPE(a->type) == CCV_64S) ? 
CCV_GET_CHANNEL0
(a->type) | CCV_64F0
: CCV_GET_CHANNEL(a->type) | CCV_32F;
1090
2
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), type, sig);
1091
2
  ccv_dense_matrix_t* mx = 0;
1092
2
  ccv_dense_matrix_t* my = 0;
1093
2
  if (x != 0)
1094
0
  {
1095
0
    ccv_declare_derived_signature(xsig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform_x(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN);
1096
0
    mx = *x = ccv_dense_matrix_renew(*x, a->rows, a->cols, CCV_32S | CCV_C1, CCV_32S | CCV_C1, xsig);
1097
0
  }
1098
2
  if (y != 0)
1099
0
  {
1100
0
    ccv_declare_derived_signature(ysig, a->sig != 0, ccv_sign_with_format(64, "ccv_distance_transform_y(%la,%la,%la,%la,%d)", dx, dy, dxx, dyy, flag), a->sig, CCV_EOF_SIGN);
1101
0
    my = *y = ccv_dense_matrix_renew(*y, a->rows, a->cols, CCV_32S | CCV_C1, CCV_32S | CCV_C1, ysig);
1102
0
  }
1103
2
  ccv_object_return_if_cached(, db, mx, my);
1104
2
  ccv_revive_object_if_cached(db, mx, my);
1105
2
  int i, j, k;
1106
2
  unsigned char* a_ptr = a->data.u8;
1107
2
  unsigned char* b_ptr = db->data.u8;
1108
2
  int* v = (int*)alloca(sizeof(int) * ccv_max(db->rows, db->cols));
1109
2
  unsigned char* c_ptr = (unsigned char*)alloca(CCV_GET_DATA_TYPE_SIZE(db->type) * db->rows);
1110
2
  int* x_ptr = mx ? 
mx->data.i320
: 0;
1111
2
  int* y_ptr = my ? 
my->data.i320
: 0;
1112
2
#define for_block(_for_max, _for_type_b, _for_set_b, _for_get_b, _for_get_a) \
1113
2
  _for_type_b _dx = dx, _dy = dy, _dxx = dxx, _dyy = dyy; \
1114
2
  _for_type_b* z = (_for_type_b*)alloca(sizeof(_for_type_b) * (ccv_max(db->rows, db->cols) + 1)); \
1115
2
  if (_dxx > 1e-6) \
1116
2
  { \
1117
652
    for (i = 0; i < a->rows; 
i++650
) \
1118
650
    { \
1119
650
      k = 0; \
1120
650
      v[0] = 0; \
1121
650
      z[0] = (_for_type_b)-_for_max; \
1122
650
      z[1] = (_for_type_b)_for_max; \
1123
260k
      for (j = 1; j < a->cols; 
j++259k
) \
1124
259k
      { \
1125
259k
        _for_type_b s; \
1126
259k
        for (;;) \
1127
270k
        { \
1128
270k
          assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1129
270k
          s = ((SGN _for_get_a(a_ptr, j, 0) + _dxx * j * j - _dx * j) - (SGN _for_get_a(a_ptr, v[k], 0) + _dxx * v[k] * v[k] - _dx * v[k])) / (2.0 * _dxx * (j - v[k])); \
1130
270k
          if (s > z[k]) 
break259k
; \
1131
270k
          --k; \
1132
11.4k
        } \
1133
259k
        ++k; \
1134
259k
        assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1135
259k
        v[k] = j; \
1136
259k
        z[k] = s; \
1137
259k
        z[k + 1] = (_for_type_b)_for_max; \
1138
259k
      } \
1139
650
      assert(z[k + 1] >= a->cols - 1); \
1140
650
      k = 0; \
1141
650
      if (mx) \
1142
650
      { \
1143
0
        for (j = 0; j < a->cols; j++) \
1144
0
        { \
1145
0
          while (z[k + 1] < j) \
1146
0
          { \
1147
0
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1148
0
            ++k; \
1149
0
          } \
1150
0
          _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k], 0), 0); \
1151
0
          x_ptr[j] = j - v[k]; \
1152
0
        } \
1153
0
        x_ptr += mx->cols; \
1154
650
      } else { \
1155
260k
        for (j = 0; j < a->cols; 
j++260k
) \
1156
260k
        { \
1157
507k
          while (z[k + 1] < j) \
1158
260k
          { \
1159
247k
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1160
247k
            ++k; \
1161
247k
          } \
1162
260k
          _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k], 0), 0); \
1163
260k
        } \
1164
650
      } \
1165
650
      a_ptr += a->step; \
1166
650
      b_ptr += db->step; \
1167
650
    } \
1168
2
  } else { /* above algorithm cannot handle dxx == 0 properly, below is special casing for that */ \
1169
0
    assert(mx == 0); \
1170
0
    for (i = 0; i < a->rows; i++) \
1171
0
    { \
1172
0
      for (j = 0; j < a->cols; j++) \
1173
0
        _for_set_b(b_ptr, j, SGN _for_get_a(a_ptr, j, 0), 0); \
1174
0
      for (j = 1; j < a->cols; j++) \
1175
0
        _for_set_b(b_ptr, j, ccv_min(_for_get_b(b_ptr, j, 0), _for_get_b(b_ptr, j - 1, 0) + _dx), 0); \
1176
0
      for (j = a->cols - 2; j >= 0; j--) \
1177
0
        _for_set_b(b_ptr, j, ccv_min(_for_get_b(b_ptr, j, 0), _for_get_b(b_ptr, j + 1, 0) - _dx), 0); \
1178
0
      a_ptr += a->step; \
1179
0
      b_ptr += db->step; \
1180
0
    } \
1181
0
  } \
1182
2
  b_ptr = db->data.u8; \
1183
2
  if (_dyy > 1e-6) \
1184
2
  { \
1185
802
    for (j = 0; j < db->cols; 
j++800
) \
1186
800
    { \
1187
260k
      for (i = 0; i < db->rows; 
i++260k
) \
1188
260k
        _for_set_b(c_ptr, i, _for_get_b(b_ptr + i * db->step, j, 0), 0); \
1189
800
      k = 0; \
1190
800
      v[0] = 0; \
1191
800
      z[0] = (_for_type_b)-_for_max; \
1192
800
      z[1] = (_for_type_b)_for_max; \
1193
260k
      for (i = 1; i < db->rows; 
i++259k
) \
1194
259k
      { \
1195
259k
        _for_type_b s; \
1196
259k
        for (;;) \
1197
274k
        { \
1198
274k
          assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1199
274k
          s = ((_for_get_b(c_ptr, i, 0) + _dyy * i * i - _dy * i) - (_for_get_b(c_ptr, v[k], 0) + _dyy * v[k] * v[k] - _dy * v[k])) / (2.0 * _dyy * (i - v[k])); \
1200
274k
          if (s > z[k]) 
break259k
; \
1201
274k
          --k; \
1202
15.6k
        } \
1203
259k
        ++k; \
1204
259k
        assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \
1205
259k
        v[k] = i; \
1206
259k
        z[k] = s; \
1207
259k
        z[k + 1] = (_for_type_b)_for_max; \
1208
259k
      } \
1209
800
      assert(z[k + 1] >= db->rows - 1); \
1210
800
      k = 0; \
1211
800
      if (my) \
1212
800
      { \
1213
0
        for (i = 0; i < db->rows; i++) \
1214
0
        { \
1215
0
          while (z[k + 1] < i) \
1216
0
          { \
1217
0
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1218
0
            ++k; \
1219
0
          } \
1220
0
          _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k], 0), 0); \
1221
0
          y_ptr[i * my->cols] = i - v[k]; \
1222
0
        } \
1223
0
        ++y_ptr; \
1224
800
      } else { \
1225
260k
        for (i = 0; i < db->rows; 
i++260k
) \
1226
260k
        { \
1227
503k
          while (z[k + 1] < i) \
1228
260k
          { \
1229
243k
            assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \
1230
243k
            ++k; \
1231
243k
          } \
1232
260k
          _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k], 0), 0); \
1233
260k
        } \
1234
800
      } \
1235
800
    } \
1236
2
  } else { \
1237
0
    assert(my == 0); \
1238
0
    for (j = 0; j < db->cols; j++) \
1239
0
    { \
1240
0
      for (i = 1; i < db->rows; i++) \
1241
0
        _for_set_b(b_ptr + i * db->step, j, ccv_min(_for_get_b(b_ptr + i * db->step, j, 0), _for_get_b(b_ptr + (i - 1) * db->step, j, 0) + _dy), 0); \
1242
0
      for (i = db->rows - 2; i >= 0; i--) \
1243
0
        _for_set_b(b_ptr + i * db->step, j, ccv_min(_for_get_b(b_ptr + i * db->step, j, 0), _for_get_b(b_ptr + (i + 1) * db->step, j, 0) - _dy), 0); \
1244
0
    } \
1245
0
  }
1246
2
  if (flag & CCV_NEGATIVE)
1247
1
  {
1248
261k
#define SGN -
1249
1
    if (db->type & CCV_64F)
1250
0
    {
1251
0
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, DBL_MAX);
1252
1
    } else {
1253
1
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, FLT_MAX);
1254
1
    }
1255
1
#undef SGN
1256
1
  } else {
1257
280k
#define SGN +
1258
1
    if (db->type & CCV_64F)
1259
0
    {
1260
0
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, DBL_MAX);
1261
1
    } else {
1262
1
      ccv_matrix_typeof_setter_getter(db->type, ccv_matrix_getter, a->type, for_block, FLT_MAX);
1263
1
    }
1264
1
#undef SGN
1265
1
  }
1266
2
#undef for_block
1267
2
}