Coverage Report

Created: 2022-08-03 23:52

/home/liu/buildslave/linux-x64-runtests/build/lib/ccv_convnet.c
#include "ccv.h"
#include "ccv_internal.h"
#if defined(HAVE_SSE2)
#include <xmmintrin.h>
#elif defined(HAVE_NEON)
#include <arm_neon.h>
#endif
#ifdef HAVE_GSL
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#endif
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif
#ifdef HAVE_CUDA
#include "cuda/cwc.h"
#endif
#include "3rdparty/sqlite3/sqlite3.h"
#include "inc/ccv_convnet_internal.h"

#ifndef CASE_TESTS

ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count)
{
  ccv_convnet_t* convnet = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2);
  convnet->use_cwc_accel = use_cwc_accel;
#ifdef HAVE_GSL
  gsl_rng_env_setup();
  gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
  gsl_rng_set(rng, (unsigned long int)convnet);
#endif
  convnet->reserved = 0;
  convnet->layers = (ccv_convnet_layer_t*)(convnet + 1);
  convnet->acts = (ccv_dense_matrix_t**)(convnet->layers + count);
  memset(convnet->acts, 0, sizeof(ccv_dense_matrix_t*) * count);
  convnet->denoms = (ccv_dense_matrix_t**)(convnet->acts + count);
  memset(convnet->denoms, 0, sizeof(ccv_dense_matrix_t*) * count);
  convnet->count = count;
  convnet->input = input;
  convnet->rows = params[0].input.matrix.rows;
  convnet->cols = params[0].input.matrix.cols;
  convnet->channels = params[0].input.matrix.channels;
  convnet->mean_activity = ccv_dense_matrix_new(convnet->input.height, convnet->input.width, convnet->channels | CCV_32F, 0, 0);
  ccv_zero(convnet->mean_activity);
  ccv_convnet_layer_t* layers = convnet->layers;
  int i, j;
  for (i = 0; i < count; i++)
  {
    layers[i].type = params[i].type;
    layers[i].input = params[i].input;
    layers[i].net = params[i].output;
    layers[i].reserved = 0;
    switch (params[i].type)
    {
      case CCV_CONVNET_CONVOLUTIONAL:
        assert(params[i].input.matrix.channels % params[i].input.matrix.partition == 0);
        assert(params[i].output.convolutional.count % params[i].output.convolutional.partition == 0);
        assert(params[i].output.convolutional.partition % params[i].input.matrix.partition == 0);
        assert(params[i].output.convolutional.partition >= params[i].input.matrix.partition);
        layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition * params[i].output.convolutional.count;
        layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.convolutional.count));
        layers[i].bias = layers[i].w + layers[i].wnum;
#ifdef HAVE_GSL
        for (j = 0; j < layers[i].wnum; j++)
          layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition + params[i].output.convolutional.count);
#else
        for (j = 0; j < layers[i].wnum; j++)
          layers[i].w[j] = 0;
#endif
        for (j = 0; j < params[i].output.convolutional.count; j++)
          layers[i].bias[j] = params[i].bias;
        break;
      case CCV_CONVNET_FULL_CONNECT:
        layers[i].wnum = params[i].input.node.count * params[i].output.full_connect.count;
        layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.full_connect.count));
        layers[i].bias = layers[i].w + layers[i].wnum;
#ifdef HAVE_GSL
        for (j = 0; j < layers[i].wnum; j++)
          layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].input.node.count + params[i].output.full_connect.count);
#else
        for (j = 0; j < layers[i].wnum; j++)
          layers[i].w[j] = 0;
#endif
        for (j = 0; j < params[i].output.full_connect.count; j++)
          layers[i].bias[j] = params[i].bias;
        break;
      default:
        layers[i].wnum = 0;
        layers[i].w = 0;
        layers[i].bias = 0;
        break;
    }
  }
#ifdef HAVE_GSL
  gsl_rng_free(rng);
#endif
  return convnet;
}

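/* The initialization above is the Glorot/Xavier heuristic: each weight is
 * drawn uniformly from (-1, 1) and scaled by glorot / sqrtf(fan_in + fan_out).
 * Worked example (illustrative numbers, not from this file): a 5x5x3 filter
 * bank with 32 filters and partition 1 gives fan_in = 5 * 5 * 3 = 75 and
 * fan_out = 32, so every weight falls in (-glorot / sqrtf(107), glorot / sqrtf(107)). */
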
int ccv_convnet_verify(ccv_convnet_t* convnet, int output)
{
  int i, out_rows, out_cols, out_partition, out_channels;
  if (convnet->count < 1)
    return -1;
  // the last layer has to be full connect
  if (convnet->layers[convnet->count - 1].type != CCV_CONVNET_FULL_CONNECT)
    return -1;
  // you cannot enable relu on the last layer
  if (convnet->layers[convnet->count - 1].net.full_connect.relu)
    return -1;
  out_channels = 3;
  for (i = 0; i < convnet->count; i++)
  {
    ccv_convnet_layer_t* layer = convnet->layers + i;
    if (i > 0 && (out_rows != layer->input.matrix.rows || out_cols != layer->input.matrix.cols))
      return -1;
    // the input channels should equal the previous output channels; skip this check for full connect as it is meaningless there
    if (out_channels != layer->input.matrix.channels && layer->type != CCV_CONVNET_FULL_CONNECT)
      return -1;
    ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition);
    if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
    {
      // check that the input matrix channel count equals the expected input of the convolutional layer filters
      if (layer->input.matrix.channels != layer->net.convolutional.channels)
        return -1;
      // if this layer is a convolutional layer, its filter count becomes the next layer's channel input
      out_channels = layer->net.convolutional.count;
    }
  }
  if (out_rows * out_cols != output)
    return -1;
  int count = 0;
  for (i = 0; i < convnet->count; i++)
  {
    ccv_convnet_layer_t* layer = convnet->layers + i;
    if (layer->type == CCV_CONVNET_FULL_CONNECT)
    {
      count = i;
      break;
    }
  }
  // all the layers after the first full connect layer should only be full connect layers
  for (i = count; i < convnet->count; i++)
    if (convnet->layers[i].type != CCV_CONVNET_FULL_CONNECT ||
      convnet->layers[i].input.matrix.rows * convnet->layers[i].input.matrix.cols * convnet->layers[i].input.matrix.channels != convnet->layers[i].input.node.count)
      return -1;
  return 0;
}

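/* A usage sketch (assumed values, not part of this file): after building a
 * network, verify that the layer shapes chain together before using it, e.g.
 *
 *   ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(257, 257), params, count);
 *   assert(ccv_convnet_verify(convnet, 1000) == 0); // 1000 = assumed output classes
 */
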
#endif

#if defined(HAVE_SSE2) || defined(HAVE_NEON)

static void _ccv_convnet_layer_simd_alloc_reserved(ccv_convnet_layer_t* layer)
{
  if (layer->reserved)
    return;
  int partition = layer->input.matrix.partition;
  int ch = layer->net.convolutional.channels;
  int count = layer->net.convolutional.count;
  int kernel_rows = layer->net.convolutional.rows;
  int kernel_cols = layer->net.convolutional.cols;
  int ch_per_partition = ch / partition;
  int count_per_4 = count / 4;
  float* simd_w = (float*)ccmalloc(sizeof(float) * layer->wnum);
  int i, j, k, c;
  for (k = 0; k < count_per_4; k++)
    for (i = 0; i < kernel_rows * kernel_cols; i++)
      for (j = 0; j < ch_per_partition; j++)
        for (c = 0; c < 4; c++)
          simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j];
  layer->reserved = simd_w;
}

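/* The reordering above interleaves every group of 4 filters so that a single
 * 4-float load fetches the same filter tap for filters 4k .. 4k + 3. Worked
 * example (added for clarity): with kernel_rows = kernel_cols = 1 and
 * ch_per_partition = 2, the row-major layout
 *   f0c0 f0c1 f1c0 f1c1 f2c0 f2c1 f3c0 f3c1
 * becomes
 *   f0c0 f1c0 f2c0 f3c0 f0c1 f1c1 f2c1 f3c1. */
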
#endif

#define SIMD(x) ((float*)((x)->reserved))

#if defined(HAVE_SSE2)
static inline void _ccv_convnet_convolutional_forward_propagate_sse2(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
{
  assert(SIMD(layer));
#define main_for(block) \
  parallel_for(k, (count >> 2)) { \
    int i, j, x, y, c; \
    int p = k * 4 / count_per_partition; \
    float* ap = a->data.f32 + p * ch_per_partition; \
    float* bp = db->data.f32 + k * 4; \
    float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
    float bias[4] __attribute__ ((__aligned__(16))); \
    memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
    /* 4 accumulators */ \
    __m128 z4 = _mm_setzero_ps(); \
    for (i = 0; i < db->rows; i++) \
    { \
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
      comy *= ch_per_partition * kernel_cols; \
      for (j = 0; j < db->cols; j++) \
      { \
        __m128 v40 = _mm_load_ps(bias); \
        __m128 v41 = _mm_setzero_ps(); \
        __m128 v42 = _mm_setzero_ps(); \
        __m128 v43 = _mm_setzero_ps(); \
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
        float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
        float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
        /* when we have border, we simply do zero padding */ \
        for (y = 0; y < maxy; y++) \
        { \
          /* special casing for these cases to speed up SIMD computation */ \
          for (x = 0; x < maxx; x++) \
          { \
            c = 0; \
            for (; c < ch_per_partition - 3; c += 4) \
            { \
              __m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \
              __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
              __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
              __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
              __m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \
              __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \
              __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \
              __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \
              __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \
              v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
              v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
              v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
              v43 = _mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
            } \
            block /* insert executions for tail partition */ \
          } \
          w += kernel_cols * ch_per_partition * 4; \
          apz += a->cols * ch; \
        } \
        __m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \
        _mm_storeu_ps(bp + j * count, v4); /* ReLU */ \
      } \
      bp += db->cols * count; \
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
    } \
  } parallel_endfor
  if (ch_per_partition % 4 == 0)
  {
    main_for();
  } else if (ch_per_partition % 4 == 3) { // unroll the last for-loops
#define block \
    __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
    __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
    __m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \
    __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
    __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
    __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
    v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
    v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
    v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
    main_for(block);
#undef block
  } else if (ch_per_partition % 4 == 2) { // unroll the last for-loops
#define block \
    __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
    __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
    __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
    __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
    v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
    v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
    main_for(block);
#undef block
  } else {
#define block \
    __m128 apz4 = _mm_load1_ps(apz + x * ch + c); \
    __m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
    v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
    main_for(block);
#undef block
  }
#undef main_for
}
#elif defined(HAVE_NEON)
static inline void _ccv_convnet_convolutional_forward_propagate_neon(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
{
  assert(SIMD(layer));
#define main_for(block) \
  parallel_for(k, (count >> 2)) { \
    int i, j, x, y, c; \
    int p = k * 4 / count_per_partition; \
    float* ap = a->data.f32 + p * ch_per_partition; \
    float* bp = db->data.f32 + k * 4; \
    float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
    float bias[4] __attribute__ ((__aligned__(16))); \
    memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
    float32x4_t z4 = vmovq_n_f32(0); \
    for (i = 0; i < db->rows; i++) \
    { \
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
      comy *= ch_per_partition * kernel_cols; \
      for (j = 0; j < db->cols; j++) \
      { \
        float32x4_t v40 = vld1q_f32(bias); \
        float32x4_t v41 = vmovq_n_f32(0); \
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
        float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
        float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
        /* when we have border, we simply do zero padding */ \
        for (y = 0; y < maxy; y++) \
        { \
          for (x = 0; x < maxx; x++) \
          { \
            c = 0; \
            for (; c < ch_per_partition - 1; c += 2) \
            { \
              float32x2_t apz4 = vld1_f32(apz + x * ch + c); \
              float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
              float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
              float32x4_t w40 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
              float32x4_t w41 = vld1q_f32(w + (x * ch_per_partition + c + 1) * 4); \
              v40 = vmlaq_f32(v40, w40, apz40); \
              v41 = vmlaq_f32(v41, w41, apz41); \
            } \
            block /* insert executions for tail partition */ \
          } \
          w += kernel_cols * ch_per_partition * 4; \
          apz += a->cols * ch; \
        } \
        float32x4_t v4 = vmaxq_f32(z4, vaddq_f32(v40, v41)); \
        vst1q_f32(bp + j * count, v4); /* ReLU */ \
      } \
      bp += db->cols * count; \
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
    } \
  } parallel_endfor
  if (ch_per_partition % 2 == 0)
  {
    main_for();
  } else { // unroll the last for-loops
#define block \
    float32x4_t apz4 = vmovq_n_f32(apz[x * ch + c]); \
    float32x4_t w4 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
    v40 = vmlaq_f32(v40, w4, apz4);
    main_for(block);
#undef block
  }
#undef main_for
}
#else
static inline void _ccv_convnet_convolutional_forward_propagate_fallback(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
{
  parallel_for(k, count) {
    int i, j, x, y, c;
    int p = k / count_per_partition;
    float* ap = a->data.f32 + p * ch_per_partition;
    float* bp = db->data.f32 + k;
    float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
    float bias = layer->bias[k];
    for (i = 0; i < db->rows; i++)
    {
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows));
      comy *= ch_per_partition * kernel_cols;
      for (j = 0; j < db->cols; j++)
      {
        float v = bias;
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols));
        float* w = layer_w + comx * ch_per_partition + comy;
        float* apz = ap + ccv_max(j * strides - border, 0) * ch;
        // when we have border, we simply do zero padding
        for (y = 0; y < maxy; y++)
        {
          for (x = 0; x < maxx; x++)
            for (c = 0; c < ch_per_partition; c++)
              v += w[x * ch_per_partition + c] * apz[x * ch + c];
          w += kernel_cols * ch_per_partition;
          apz += a->cols * ch;
        }
        bp[j * count] = ccv_max(0, v); // ReLU
      }
      bp += db->cols * count;
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
    }
  } parallel_endfor
}
#endif

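/* All three variants above compute the same quantity; in scalar form, for
 * output position (i, j) and filter k:
 *   b[i][j][k] = max(0, bias[k] + sum over (y, x, c) of
 *                w[k][y][x][c] * a[i * strides - border + y][j * strides - border + x][c])
 * where out-of-bounds input taps are treated as zero padding (the comy/comx
 * and maxy/maxx clamps implement exactly this clipping). */
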
static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
  int rows, cols, partition;
  ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
  int ch = layer->net.convolutional.channels;
  int count = layer->net.convolutional.count;
  int strides = layer->net.convolutional.strides;
  int border = layer->net.convolutional.border;
  int kernel_rows = layer->net.convolutional.rows;
  int kernel_cols = layer->net.convolutional.cols;
  int type = CCV_32F | count;
  assert(CCV_GET_CHANNEL(a->type) == ch);
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
  int ch_per_partition = ch / partition;
  int count_per_partition = count / partition;
  assert(count_per_partition % 4 == 0);
#if defined(HAVE_SSE2) || defined(HAVE_NEON)
  _ccv_convnet_layer_simd_alloc_reserved(layer);
#endif
#if defined(HAVE_SSE2)
  _ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
#elif defined(HAVE_NEON)
  _ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
#else
  _ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
#endif
}

static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
  int ch = CCV_GET_CHANNEL(a->type);
  int rows = a->rows, cols = a->cols;
  // reshape a for gemm
  assert(a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch);
  a->rows = rows * cols * ch, a->cols = 1, a->type = (a->type - ch) | CCV_C1;
  assert(a->rows * db->rows == layer->wnum);
  a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type);
  int i;
  float* bptr = db->data.f32;
  for (i = 0; i < db->rows; i++)
    bptr[i] = layer->bias[i];
  ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0);
  ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supplying db as matrix C is allowed
  if (layer->net.full_connect.relu)
    for (i = 0; i < db->rows; i++)
      bptr[i] = ccv_max(0, bptr[i]); // relu
  a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch;
  a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type);
}

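/* In matrix form the layer above computes b = max(0, W * vec(a) + bias): the
 * input is reshaped in place into a (rows * cols * ch) x 1 column vector, W
 * is net.full_connect.count rows by that length, and the gemm accumulates
 * into db, which was pre-filled with the bias. */
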
static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
{
  int rows, cols, partition;
  ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
  int size = layer->net.rnorm.size;
  float kappa = layer->net.rnorm.kappa;
  float alpha = layer->net.rnorm.alpha;
  float beta = layer->net.rnorm.beta;
  int way = size / 2;
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  int ch = CCV_GET_CHANNEL(a->type);
  int type = CCV_32F | ch;
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
  int i, j, k, x, p;
  float* ap = a->data.f32;
  float* bp = db->data.f32;
  int ch_per_partition = ch / partition;
  if (denoms)
  {
    ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0);
    float* dp = ddenoms->data.f32;
    for (i = 0; i < db->rows; i++)
    {
      for (j = 0; j < db->cols; j++)
        for (p = 0; p < partition; p++)
          for (k = 0; k < ch_per_partition; k++)
          {
            float v = ap[j * ch + p * ch_per_partition + k];
            float denom = 0;
            for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
              denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
            denom = kappa + alpha * denom;
            dp[j * ch + p * ch_per_partition + k] = denom;
            bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
          }
      ap += a->cols * ch;
      dp += ddenoms->cols * ch;
      bp += db->cols * ch;
    }
  } else {
    for (i = 0; i < db->rows; i++)
    {
      for (j = 0; j < db->cols; j++)
        for (p = 0; p < partition; p++)
          for (k = 0; k < ch_per_partition; k++)
          {
            float v = ap[j * ch + p * ch_per_partition + k];
            float denom = 0;
            for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
              denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
            denom = kappa + alpha * denom;
            bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
          }
      ap += a->cols * ch;
      bp += db->cols * ch;
    }
  }
}

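/* This is local response normalization across channels: within a partition,
 *   b_k = a_k * (kappa + alpha * sum of a_x^2 for x in [k - size/2, k + size/2])^(-beta)
 * with the window clamped to the partition's channels. The denominators are
 * stashed in *denoms so the backward pass can reuse them instead of
 * recomputing the sums. */
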
static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
  int rows, cols, partition;
  ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
  int size = layer->net.pool.size;
  int strides = layer->net.pool.strides;
  int border = layer->net.pool.border;
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  int ch = CCV_GET_CHANNEL(a->type);
  int type = CCV_32F | ch;
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
  int i, j, k, x, y;
  float* ap = a->data.f32;
  float* bp = db->data.f32;
  for (i = 0; i < db->rows; i++)
  {
    const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
    const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border);
    for (j = 0; j < db->cols; j++)
    {
      const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
      const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border);
      for (k = 0; k < ch; k++)
      {
        float v = 0;
        for (y = start_y; y < end_y; y++)
          for (x = start_x; x < end_x; x++)
            if (x == start_x && y == start_y)
              v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
            else if (ap[(j * strides - border + x + (y - border) * a->cols) * ch + k] > v)
              v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
        bp[j * ch + k] = v;
      }
    }
    ap += a->cols * ch * strides;
    bp += db->cols * ch;
  }
}

static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
  int rows, cols, partition;
  ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
  int size = layer->net.pool.size;
  int strides = layer->net.pool.strides;
  int border = layer->net.pool.border;
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  int ch = CCV_GET_CHANNEL(a->type);
  int type = CCV_32F | ch;
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
  int i, j, k, x, y;
  float* ap = a->data.f32;
  float* bp = db->data.f32;
  for (i = 0; i < db->rows; i++)
  {
    const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
    const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border);
    for (j = 0; j < db->cols; j++)
    {
      const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
      const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border);
      for (k = 0; k < ch; k++)
      {
        float v = 0;
        for (y = start_y; y < end_y; y++)
          for (x = start_x; x < end_x; x++)
            v += ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
        bp[j * ch + k] = v / ((end_x - start_x) * (end_y - start_y));
      }
    }
    ap += a->cols * ch * strides;
    bp += db->cols * ch;
  }
}

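/* For both pooling layers the start/end arithmetic above clips the window to
 * the input. Worked example (added for clarity): size = 3, strides = 2,
 * border = 1 maps output column j to input columns 2j - 1 .. 2j + 1, clipped
 * at the edges; average pooling then divides by the clipped window area
 * (end_x - start_x) * (end_y - start_y), not by size * size. */
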
static void _ccv_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
{
  switch(layer->type)
  {
    case CCV_CONVNET_CONVOLUTIONAL:
      _ccv_convnet_convolutional_forward_propagate(layer, a, b);
      break;
    case CCV_CONVNET_FULL_CONNECT:
      _ccv_convnet_full_connect_forward_propagate(layer, a, b);
      break;
    case CCV_CONVNET_LOCAL_RESPONSE_NORM:
      _ccv_convnet_rnorm_forward_propagate(layer, a, b, denoms);
      break;
    case CCV_CONVNET_MAX_POOL:
      _ccv_convnet_max_pool_forward_propagate(layer, a, b);
      break;
    case CCV_CONVNET_AVERAGE_POOL:
      _ccv_convnet_average_pool_forward_propagate(layer, a, b);
      break;
  }
}

static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, layer->net.full_connect.count, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
  // reshape a for gemm
  int i, j;
  float* bptr = db->data.f32;
  for (i = 0; i < db->rows; i++)
  {
    for (j = 0; j < db->cols; j++)
      bptr[j] = layer->bias[j];
    bptr += db->cols;
  }
  ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0);
  ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supplying db as matrix C is allowed
  bptr = db->data.f32;
  if (layer->net.full_connect.relu)
    for (i = 0; i < db->rows; i++)
    {
      for (j = 0; j < db->cols; j++)
        bptr[j] = ccv_max(0, bptr[j]); // relu
      bptr += db->cols;
    }
}

static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
{
  assert(CCV_GET_CHANNEL(a->type) == CCV_C1);
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, 1, a->cols, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
  ccv_zero(db);
  int i, j;
  float* aptr = a->data.f32;
  float* bptr = db->data.f32;
  float* cptr = (float*)ccmalloc(sizeof(float) * a->cols);
  for (i = 0; i < a->rows; i++)
  {
    double max = aptr[0];
    for (j = 1; j < a->cols; j++)
      if (aptr[j] > max)
        max = aptr[j];
    double tt = 0;
    for (j = 0; j < a->cols; j++)
      tt += (cptr[j] = expf(aptr[j] - max));
    tt = 1.0 / tt;
    for (j = 0; j < a->cols; j++)
      bptr[j] += cptr[j] * tt;
    aptr += a->cols;
  }
  ccfree(cptr);
}

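/* Numerically stable softmax, summed over the a->rows input vectors:
 *   softmax(x)_j = exp(x_j - max(x)) / sum over k of exp(x_k - max(x))
 * Subtracting the per-row maximum before expf avoids overflow without
 * changing the result; the caller normalizes the accumulated scores later. */
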
#ifndef CASE_TESTS

void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch)
{
#ifdef HAVE_CUDA
  if (convnet->use_cwc_accel)
    cwc_convnet_encode(convnet, a, b, batch);
  else {
#endif
  assert(batch == 1);
  assert(CCV_GET_CHANNEL((*a)->type) == convnet->channels);
  assert((*a)->rows == convnet->rows);
  assert((*a)->cols == convnet->cols);
  int i;
  // save the last layer of neuron cache in case that we encode to a different matrix
  ccv_dense_matrix_t* out_neuron = convnet->acts[convnet->count - 1];
  convnet->acts[convnet->count - 1] = *b;
  _ccv_convnet_layer_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms);
  for (i = 1; i < convnet->count; i++)
    _ccv_convnet_layer_forward_propagate(convnet->layers + i, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i);
  if (convnet->acts + convnet->count - 1 != b)
  {
    *b = convnet->acts[convnet->count - 1];
    // restore the last layer of neuron cache
    convnet->acts[convnet->count - 1] = out_neuron;
  }
#ifdef HAVE_CUDA
  }
#endif
}

// find the layer for scanning (it is the last convolutional layer)
static int _ccv_convnet_find_scan(ccv_convnet_t* convnet)
{
  int i;
  ccv_convnet_layer_t* layers = convnet->layers;
  for (i = convnet->count - 1; i >= 0; i--)
    if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL)
      return i;
  return -1;
}

static int _ccv_convnet_derive_scale(ccv_convnet_t* convnet, int scan)
{
  int i, scale = 1;
  for (i = scan; i >= 0; i--)
  {
    ccv_convnet_layer_t* layer = convnet->layers + i;
    switch (layer->type)
    {
      case CCV_CONVNET_CONVOLUTIONAL:
        scale *= layer->net.convolutional.strides;
        break;
      case CCV_CONVNET_MAX_POOL:
      case CCV_CONVNET_AVERAGE_POOL:
        scale *= layer->net.pool.strides;
        break;
    }
  }
  return scale;
}

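/* scale is the cumulative downsampling factor up to (and including) the scan
 * layer; e.g. a stride-4 convolution followed by a stride-2 max pool yields
 * scale = 8, so sliding the input by 8 pixels moves the scan layer's output
 * by exactly one cell (illustrative numbers, not from this file). */
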
static int _ccv_convnet_find_full_connect(ccv_convnet_t* convnet)
{
  int i;
  for (i = 0; i < convnet->count; i++)
    if (convnet->layers[i].type == CCV_CONVNET_FULL_CONNECT)
      return i;
  return -1;
}

void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch)
{
#ifdef HAVE_CUDA
  if (convnet->use_cwc_accel)
    cwc_convnet_classify(convnet, a, symmetric, ranks, tops, batch);
  else {
#endif
  int i, j, k, t;
  ccv_dense_matrix_t** b = (ccv_dense_matrix_t**)alloca(sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
  int scan = _ccv_convnet_find_scan(convnet);
  int scale = _ccv_convnet_derive_scale(convnet, scan);
  int full_connect = _ccv_convnet_find_full_connect(convnet);
  assert(scan >= 0 && scan < convnet->count);
  assert(full_connect >= 0 && full_connect < convnet->count);
  memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
  for (i = 0; i < batch; i++)
  {
    assert(CCV_GET_CHANNEL(a[i]->type) == convnet->channels);
    assert(a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width);
    assert(a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width);
    // find optimal rows and cols to slice to
    int rows = convnet->rows + ((a[i]->rows - convnet->rows) / scale) * scale;
    int cols = convnet->cols + ((a[i]->cols - convnet->cols) / scale) * scale;
    assert(rows == convnet->input.height || cols == convnet->input.width);
    assert(rows <= a[i]->rows && cols <= a[i]->cols);
    ccv_dense_matrix_t* slice = 0;
    ccv_slice(a[i], (ccv_matrix_t**)&slice, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols);
    ccv_dense_matrix_t* mean_activity = 0;
    // scale mean activity up to be subtractable (from this point on, the CPU implementation is an approximation of the GPU implementation)
    ccv_resample(convnet->mean_activity, &mean_activity, 0, rows, cols, CCV_INTER_CUBIC);
    ccv_subtract(slice, mean_activity, (ccv_matrix_t**)b, CCV_32F);
    ccv_matrix_free(mean_activity);
    ccv_matrix_free(slice);
    // run the first few layers up to and including the scan layer
    int out_rows, out_cols, out_partition;
    ccv_dense_matrix_t* c = ccv_dense_matrix_new(5 * (!!symmetric + 1), convnet->layers[full_connect].input.node.count, CCV_32F | CCV_C1, 0, 0);
    for (t = 0; t <= !!symmetric; t++)
    {
      rows = b[0]->rows, cols = b[0]->cols;
      for (j = 0; j < scan + 1; j++)
      {
        ccv_convnet_layer_t* layer = convnet->layers + j;
        ccv_convnet_make_output(layer, rows, cols, &out_rows, &out_cols, &out_partition);
        _ccv_convnet_layer_forward_propagate(layer, b[j], b + j + 1, 0);
        assert(b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols);
        if (j > 0)
          ccv_matrix_free(b[j]);
        rows = out_rows, cols = out_cols;
      }
      int offsets[5][2] = {
        {0, 0},
        {cols - convnet->layers[scan + 1].input.matrix.cols, 0},
        {(cols - convnet->layers[scan + 1].input.matrix.cols) / 2, (rows - convnet->layers[scan + 1].input.matrix.rows) / 2},
        {0, rows - convnet->layers[scan + 1].input.matrix.rows},
        {cols - convnet->layers[scan + 1].input.matrix.cols, rows - convnet->layers[scan + 1].input.matrix.rows},
      };
      for (k = 0; k < 5; k++)
      {
        ccv_dense_matrix_t* input = 0;
        ccv_convnet_layer_t* layer = convnet->layers + scan + 1;
        ccv_slice(b[scan + 1], (ccv_matrix_t**)&input, CCV_32F, offsets[k][1], offsets[k][0], layer->input.matrix.rows, layer->input.matrix.cols);
        // copy the last layer for full connect compute
        b[full_connect] = ccv_dense_matrix_new(convnet->layers[full_connect].input.matrix.rows, convnet->layers[full_connect].input.matrix.cols, CCV_NO_DATA_ALLOC | CCV_32F | convnet->layers[full_connect].input.matrix.channels, c->data.f32 + (t * 5 + k) * convnet->layers[full_connect].input.node.count, 0);
        for (j = scan + 1; j < full_connect; j++)
        {
          layer = convnet->layers + j;
          _ccv_convnet_layer_forward_propagate(layer, j > scan + 1 ? b[j] : input, b + j + 1, 0);
          if (j > scan + 1)
            ccv_matrix_free(b[j]);
          else
            ccv_matrix_free(input);
        }
        ccv_matrix_free(b[full_connect]);
        // set it to 0
        memset(b + scan + 2, 0, sizeof(ccv_dense_matrix_t*) * (full_connect - scan - 1));
      }
      ccv_matrix_free(b[scan + 1]);
      memset(b + 1, 0, sizeof(ccv_dense_matrix_t*) * (scan + 1));
      if (t < !!symmetric)
        ccv_flip(b[0], &b[0], 0, CCV_FLIP_X);
    }
    ccv_matrix_free(b[0]);
    // now everything is in c, do the last full connect propagate
    b[full_connect] = c;
    for (j = full_connect; j < convnet->count; j++)
    {
      ccv_convnet_layer_t* layer = convnet->layers + j;
      assert(layer->type == CCV_CONVNET_FULL_CONNECT);
      _ccv_convnet_full_connect_forward_propagate_parallel(layer, b[j], b + j + 1);
      ccv_matrix_free(b[j]);
    }
    ccv_dense_matrix_t* softmax = 0;
    _ccv_convnet_compute_softmax_parallel(b[convnet->count], &softmax, 0);
    ccv_matrix_free(b[convnet->count]);
    ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0);
    float* r = softmax->data.f32;
    assert(tops <= softmax->cols);
    for (j = 0; j < tops; j++)
    {
      float max_val = -1;
      int max_idx = -1;
      for (k = 0; k < softmax->cols; k++)
        if (r[k] >= 0 && r[k] > max_val)
          max_val = r[k], max_idx = k;
      assert(max_idx >= 0);
      r[max_idx] = -1;
      ccv_classification_t classification = {
        .id = max_idx,
        .confidence = max_val / ((!!symmetric + 1) * 5),
      };
      ccv_array_push(ranks[i], &classification);
    }
    ccv_matrix_free(softmax);
    memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
  }
#ifdef HAVE_CUDA
  }
#endif
}

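/* The k-loop above evaluates 5 crops (four corners plus center) of the scan
 * layer's output, and the t-loop doubles that to 10 views when symmetric is
 * set by repeating the pass on the horizontally flipped input. All views are
 * batched into c, pushed through the remaining full connect layers at once,
 * and their softmax scores summed, which is why confidence is divided by
 * (!!symmetric + 1) * 5. */
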
#endif

#ifdef HAVE_GSL

// compute back propagated gradient & weight update delta
static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
{
  // a is the input gradient (for back prop).
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
  // note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it
  int rows, cols, partition;
  ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
  int ch = layer->net.convolutional.channels;
  int count = layer->net.convolutional.count;
  int strides = layer->net.convolutional.strides;
  int border = layer->net.convolutional.border;
  int kernel_rows = layer->net.convolutional.rows;
  int kernel_cols = layer->net.convolutional.cols;
  assert(a->rows == rows);
  assert(a->cols == cols);
  assert(CCV_GET_CHANNEL(a->type) == count);
  int a_rows = a->rows, a_cols = a->cols, a_ch = CCV_GET_CHANNEL(a->type);
  a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count;
  assert(CCV_GET_CHANNEL(m->type) == ch);
  assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F);
  int count_per_partition = count / partition;
  int ch_per_partition = ch / partition;
  // update weight gradient
  parallel_for(k, count) {
    int i, j, x, y, c;
    int p = k / count_per_partition;
    float* mp = m->data.f32 + p * ch_per_partition;
    float* ap = a->data.f32 + k;
    float* np = n->data.f32 + k;
    float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition;
    float bias = 0;
    for (i = 0; i < rows; i++)
    {
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows));
      comy *= ch_per_partition * kernel_cols;
      for (j = 0; j < cols; j++)
      {
        if (np[j * count] > 0)
        { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
          float v = ap[j * count];
          bias += v;
          int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
          int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols));
          float* w = update_w + comx * ch_per_partition + comy;
          float* mpz = mp + ccv_max(j * strides - border, 0) * ch;
          /* when we have border, we simply do zero padding */
          for (y = 0; y < maxy; y++)
          {
            for (x = 0; x < maxx; x++)
              for (c = 0; c < ch_per_partition; c++)
                w[x * ch_per_partition + c] += v * mpz[x * ch + c];
            w += kernel_cols * ch_per_partition;
            mpz += m->cols * ch;
          }
        }
      }
      ap += a->cols * count;
      np += n->cols * count;
      mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
    }
    update_params->bias[k] += bias;
  } parallel_endfor
  if (b)
  {
    ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0);
    // clear it up before propagating the result
    ccv_zero(db);
    int k;
    for (k = 0; k < count; k++)
    {
      int i, j, x, y, c;
      int p = k / count_per_partition;
      float* bp = db->data.f32 + p * ch_per_partition;
      float* ap = a->data.f32 + k;
      float* np = n->data.f32 + k;
      float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
      for (i = 0; i < rows; i++)
      {
        int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
        int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows));
        comy *= ch_per_partition * kernel_cols;
        for (j = 0; j < cols; j++)
        {
          if (np[j * count] > 0)
          { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
            float v = ap[j * count];
            int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
            int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols));
            float* w = layer_w + comx * ch_per_partition + comy;
            float* bpz = bp + ccv_max(j * strides - border, 0) * ch;
            /* when we have border, we simply do zero padding */
            for (y = 0; y < maxy; y++)
            {
              for (x = 0; x < maxx; x++)
                for (c = 0; c < ch_per_partition; c++)
                  bpz[x * ch + c] += v * w[x * ch_per_partition + c];
              w += kernel_cols * ch_per_partition;
              bpz += db->cols * ch;
            }
          }
        }
        ap += a->cols * count;
        np += n->cols * count;
        bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
      }
    }
  }
  a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch;
}

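/* For each filter k the pass above accumulates the standard convolution
 * gradients, gated by the ReLU mask (n > 0):
 *   dbias[k] += sum of the incoming gradient a over active output positions
 *   dw[k]    += a * (input patch under the kernel)
 * and, when b is requested, scatters the error back through the same taps:
 *   db(patch) += a * w[k]. */
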
static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
{
  // a is the input gradient (for back prop), y is the output (for forward prop)
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
  ccv_dense_matrix_t* db = 0;
  if (b)
    db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0);
  int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type);
  x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1;
  x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type);
  int i;
  if (layer->net.full_connect.relu)
    for (i = 0; i < y->rows; i++)
      if (y->data.f32[i] <= 0)
        a->data.f32[i] = 0;
  ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0);
  ccv_dense_matrix_t* dw = &w;
  // compute bias gradient
  ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0);
  ccv_dense_matrix_t* dbias = &bias;
  ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0);
  // compute weight gradient
  ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0);
  w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0);
  // propagate error
  if (db)
  {
    db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1;
    db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type);
    ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0);
    db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch;
    db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type);
  }
  x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)) | x_ch;
  x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type) * CCV_GET_CHANNEL(x->type);
}

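/* With the incoming gradient a masked by the ReLU, the two gemm calls above
 * compute dw += a * x^T and db = w^T * a, while ccv_add folds a into the
 * bias gradient; these are the usual full connect layer gradients. */
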
static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t* denoms, ccv_dense_matrix_t** b)
{
  int rows, cols, partition;
  ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
  int size = layer->net.rnorm.size;
  float alpha = layer->net.rnorm.alpha;
  float beta = layer->net.rnorm.beta;
  int way = size / 2;
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
  int ch = CCV_GET_CHANNEL(a->type);
  int type = CCV_32F | ch;
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
  int i, j, k, x, p;
  float* ap = a->data.f32;
  float* np = n->data.f32;
  float* mp = m->data.f32;
  float* dp = denoms->data.f32;
  float* bp = db->data.f32;
  int ch_per_partition = ch / partition;
  for (i = 0; i < db->rows; i++)
  {
    for (j = 0; j < db->cols; j++)
      for (p = 0; p < partition; p++)
        for (k = 0; k < ch_per_partition; k++)
        {
          float nom = 0;
          for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
            nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition];
          bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta);
        }
    ap += a->cols * ch;
    np += n->cols * ch;
    mp += m->cols * ch;
    dp += denoms->cols * ch;
    bp += db->cols * ch;
  }
}

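/* Derivation: with n_x = m_x * d_x^(-beta) and d_x = kappa + alpha * sum of
 * m^2 over the window, the chain rule gives
 *   dm_k = a_k * d_k^(-beta) - 2 * alpha * beta * m_k * sum over x of a_x * m_x * d_x^(-beta - 1)
 * for every output x whose window contains k. Since n_x / d_x equals
 * m_x * d_x^(-beta - 1), the inner sum above reuses n and the cached
 * denominators rather than calling powf again. */
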
static void _ccv_convnet_max_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
{
  // a is the input gradient (for back prop), y is the output (from forward prop),
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
  // pooling layer doesn't need the dropout
  if (b)
  {
    assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type));
    assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
    int ch = CCV_GET_CHANNEL(a->type);
    ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
    ccv_zero(db);
    int size = layer->net.pool.size;
    int strides = layer->net.pool.strides;
    int border = layer->net.pool.border;
    int i, j, k, x, y;
    float* ap = a->data.f32;
    float* bp = db->data.f32;
    float* np = n->data.f32;
    float* mp = m->data.f32;
    for (i = 0; i < a->rows; i++)
    {
      const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
      const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border);
      for (j = 0; j < a->cols; j++)
      {
        const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
        const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border);
        for (k = 0; k < ch; k++)
        {
          float v = np[j * ch + k];
          float u = ap[j * ch + k];
          for (y = start_y; y < end_y; y++)
            for (x = start_x; x < end_x; x++)
              // we have to do a direct comparison, otherwise the gradient would contribute to too many cells
              // and the propagation wouldn't work; but the CPU will produce a different result than the GPU
              if (mp[(j * strides - border + x + (y - border) * m->cols) * ch + k] == v)
                bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
        }
      }
      ap += a->cols * ch;
      np += n->cols * ch;
      bp += db->cols * ch * strides;
      mp += m->cols * ch * strides;
    }
  }
}

static void _ccv_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
{
  // a is the input gradient (for back prop), y is the output (from forward prop),
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
  // pooling layer doesn't need the dropout
  if (b)
  {
    assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
    int ch = CCV_GET_CHANNEL(a->type);
    ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
    ccv_zero(db);
    int size = layer->net.pool.size;
    int strides = layer->net.pool.strides;
    int border = layer->net.pool.border;
    int i, j, k, x, y;
    float* ap = a->data.f32;
    float* bp = db->data.f32;
    for (i = 0; i < a->rows; i++)
    {
      const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
      const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border);
      for (j = 0; j < a->cols; j++)
      {
        const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
        const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border);
        for (k = 0; k < ch; k++)
        {
          float u = ap[j * ch + k] / ((end_x - start_x) * (end_y - start_y));
          for (y = start_y; y < end_y; y++)
            for (x = start_x; x < end_x; x++)
              bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
        }
      }
      ap += a->cols * ch;
      bp += db->cols * ch * strides;
    }
  }
}

1121
static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, ccv_dense_matrix_t* dloss, ccv_convnet_t* update_params)
1122
1
{
1123
1
  int i;
1124
1
  ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1;
1125
1
  assert(layer->type == CCV_CONVNET_FULL_CONNECT); // the last layer has too be a full connect one to generate softmax result
1126
1
  _ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 
00
, update_params->layers + convnet->count - 1);
1127
2
  for (i = convnet->count - 2; i >= 0; 
i--1
)
1128
1
  {
1129
1
    layer = convnet->layers + i;
1130
1
    switch (layer->type)
1131
1
    {
1132
1
      case CCV_CONVNET_CONVOLUTIONAL:
1133
1
        _ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? 
convnet->acts[i - 1]0
: a, i > 0 ?
update_params->acts + i - 10
: 0, update_params->layers + i);
1134
1
        break;
1135
0
      case CCV_CONVNET_FULL_CONNECT:
1136
0
        _ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
1137
0
        break;
1138
0
      case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1139
0
        _ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? update_params->acts + i - 1 : 0);
1140
0
        break;
1141
0
      case CCV_CONVNET_MAX_POOL:
1142
0
        _ccv_convnet_max_pool_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
1143
0
        break;
1144
0
      case CCV_CONVNET_AVERAGE_POOL:
1145
0
        _ccv_convnet_average_pool_backward_propagate(layer, update_params->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
1146
0
        break;
1147
1
    }
1148
1
  }
1149
1
}
1150
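Editor's note: the dispatch above walks the layers back to front; each backward call consumes the gradient stored for layer i and, when i > 0, deposits the gradient with respect to that layer's input into update_params->acts[i - 1] for the next iteration to pick up. A tiny chain-rule illustration of that ordering for a two-stage pipeline y = g(f(x)); backprop_two_stage is a hypothetical helper, not ccv API:

  static float backprop_two_stage(float x, float dLdy, float (*f)(float), float (*df)(float), float (*dg)(float))
  {
    float dLdu = dg(f(x)) * dLdy; // back through the later stage g first
    return df(x) * dLdu;          // then through the earlier stage f
  }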
1151
static void _ccv_convnet_update(ccv_convnet_t* convnet, int batch, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params)
1152
0
{
1153
0
  int i, j;
1154
0
  float learn_rate;
1155
0
  for (i = 0; i < convnet->count; i++)
1156
0
    switch (update_params->layers[i].type)
1157
0
    {
1158
0
      case CCV_CONVNET_CONVOLUTIONAL:
1159
0
      {
1160
0
        float* w = convnet->layers[i].w;
1161
0
        float* vw = momentum->layers[i].w;
1162
0
        float* dw = update_params->layers[i].w;
1163
0
        learn_rate = layer_params[i].w.learn_rate / batch;
1164
0
        for (j = 0; j < convnet->layers[i].wnum; j++)
1165
0
        {
1166
0
          vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j];
1167
0
          w[j] += vw[j];
1168
0
        }
1169
0
        float* bias = convnet->layers[i].bias;
1170
0
        float* vbias = momentum->layers[i].bias;
1171
0
        float* dbias = update_params->layers[i].bias;
1172
0
        learn_rate = layer_params[i].bias.learn_rate / batch;
1173
0
        for (j = 0; j < convnet->layers[i].net.convolutional.count; j++)
1174
0
        {
1175
0
          vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j];
1176
0
          bias[j] += vbias[j];
1177
0
        }
1178
0
        break;
1179
0
      }
1180
0
      case CCV_CONVNET_FULL_CONNECT:
1181
0
      {
1182
0
        float* w = convnet->layers[i].w;
1183
0
        float* vw = momentum->layers[i].w;
1184
0
        float* dw = update_params->layers[i].w;
1185
0
        learn_rate = layer_params[i].w.learn_rate / batch;
1186
0
        for (j = 0; j < convnet->layers[i].wnum; j++)
1187
0
        {
1188
0
          vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j];
1189
0
          w[j] += vw[j];
1190
0
        }
1191
0
        float* bias = convnet->layers[i].bias;
1192
0
        float* vbias = momentum->layers[i].bias;
1193
0
        float* dbias = update_params->layers[i].bias;
1194
0
        learn_rate = layer_params[i].bias.learn_rate / batch;
1195
0
        for (j = 0; j < convnet->layers[i].net.full_connect.count; j++)
1196
0
        {
1197
0
          vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j];
1198
0
          bias[j] += vbias[j];
1199
0
        }
1200
0
        break;
1201
0
      }
1202
0
    }
1203
0
}
1204
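Editor's note: the weight update above is momentum SGD with L2 weight decay folded into the velocity; learn_rate is pre-divided by the mini-batch size because the accumulated gradient dw is summed, not averaged, over the batch. A standalone restatement of the per-weight arithmetic; sgd_momentum_step is a hypothetical helper, not ccv API:

  // v <- momentum * v - decay * lr * w + (lr / batch) * g
  // w <- w + v
  static void sgd_momentum_step(float* w, float* v, const float* g, int n, float lr, float momentum, float decay, int batch)
  {
    int j;
    float learn_rate = lr / batch; // g holds a batch-summed gradient
    for (j = 0; j < n; j++)
    {
      v[j] = momentum * v[j] - decay * lr * w[j] + learn_rate * g[j];
      w[j] += v[j];
    }
  }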
1205
static void _ccv_convnet_update_zero(ccv_convnet_t* update_params)
1206
9
{
1207
9
  int i;
1208
20
  for (i = 0; i < update_params->count; i++)
1209
11
    switch (update_params->layers[i].type)
1210
11
    {
1211
7
      case CCV_CONVNET_CONVOLUTIONAL:
1212
7
        memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
1213
7
        memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.convolutional.count);
1214
7
        break;
1215
3
      case CCV_CONVNET_FULL_CONNECT:
1216
3
        assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0);
1217
3
        memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
1218
3
        memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.full_connect.count);
1219
3
        break;
1220
11
    }
1221
9
}
1222
1223
static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet)
1224
8
{
1225
8
  ccv_convnet_t* update_params = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(ccv_dense_matrix_t*) * convnet->count);
1226
8
  update_params->reserved = 0;
1227
8
  update_params->layers = (ccv_convnet_layer_t*)(update_params + 1);
1228
8
  update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count);
1229
8
  memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count);
1230
8
  update_params->denoms = 0;
1231
8
  update_params->input = convnet->input;
1232
8
  update_params->rows = convnet->rows;
1233
8
  update_params->cols = convnet->cols;
1234
8
  update_params->count = convnet->count;
1235
8
  update_params->channels = convnet->channels;
1236
8
  update_params->mean_activity = 0;
1237
8
  int i;
1238
18
  for (i = 0; i < convnet->count; i++)
1239
10
  {
1240
10
    update_params->layers[i].type = convnet->layers[i].type;
1241
10
    update_params->layers[i].input = convnet->layers[i].input;
1242
10
    update_params->layers[i].net = convnet->layers[i].net;
1243
10
    update_params->layers[i].wnum = convnet->layers[i].wnum;
1244
10
    update_params->layers[i].reserved = 0;
1245
10
    switch (update_params->layers[i].type)
1246
10
    {
1247
6
      case CCV_CONVNET_CONVOLUTIONAL:
1248
6
        update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count, sizeof(float));
1249
6
        update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
1250
6
        break;
1251
3
      case CCV_CONVNET_FULL_CONNECT:
1252
3
        assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0);
1253
3
        update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count, sizeof(float));
1254
3
        update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
1255
3
        break;
1256
1
      case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1257
1
      case CCV_CONVNET_MAX_POOL:
1258
1
      case CCV_CONVNET_AVERAGE_POOL:
1259
1
        update_params->layers[i].w = 0;
1260
1
        update_params->layers[i].bias = 0;
1261
1
        break;
1262
10
    }
1263
10
  }
1264
8
  return update_params;
1265
8
}
1266
1267
static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
1268
5.26k
{
1269
5.26k
  int ch = CCV_GET_CHANNEL(a->type);
1270
5.26k
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
1271
5.26k
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0);
1272
5.26k
  int i;
1273
5.26k
  float* aptr = a->data.f32;
1274
5.26k
  float* bptr = db->data.f32;
1275
5.26k
  double max = aptr[0];
1276
7.85M
  for (i = 1; i < a->rows * a->cols * ch; i++)
1277
7.84M
    if (aptr[i] > max)
1278
231k
      max = aptr[i];
1279
5.26k
  double tt = 0;
1280
7.85M
  for (i = 0; i < a->rows * a->cols * ch; i++)
1281
7.85M
    tt += (bptr[i] = expf(aptr[i] - max));
1282
5.26k
  tt = 1.0 / tt;
1283
7.85M
  for (i = 0; i < a->rows * a->cols * ch; i++)
1284
7.85M
    bptr[i] *= tt;
1285
5.26k
}
1286
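Editor's note: subtracting the running maximum before exponentiating keeps expf from overflowing, and the result is unchanged because softmax is invariant to shifting all inputs by a constant. A compact, self-contained sketch of the same trick; softmax_stable is a hypothetical name:

  #include <math.h>

  // softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
  static void softmax_stable(const float* x, float* y, int n)
  {
    int i;
    float max = x[0];
    for (i = 1; i < n; i++)
      if (x[i] > max)
        max = x[i];
    double sum = 0;
    for (i = 0; i < n; i++)
      sum += (y[i] = expf(x[i] - max)); // the shift keeps expf in range
    for (i = 0; i < n; i++)
      y[i] /= sum;
  }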
1287
static void _ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch)
1288
0
{
1289
0
  assert(batch == 1);
1290
0
  ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1);
1291
0
  int i, c = 0;
1292
0
  ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1];
1293
0
  float maxc = b->data.f32[0];
1294
0
  for (i = 1; i < b->rows; i++)
1295
0
    if (b->data.f32[i] > maxc)
1296
0
      maxc = b->data.f32[i], c = i;
1297
0
  labels[0] = c;
1298
0
}
1299
1300
#endif
1301
1302
#ifndef CASE_TESTS
1303
1304
void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params)
1305
{
1306
#ifdef HAVE_GSL
1307
#ifdef HAVE_CUDA
1308
  if (convnet->use_cwc_accel)
1309
    cwc_convnet_supervised_train(convnet, categorizeds, tests, filename, params);
1310
  else {
1311
#endif
1312
  int i, j, t;
1313
  gsl_rng_env_setup();
1314
  gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
1315
  int aligned_padding = categorizeds->rnum % params.mini_batch;
1316
  int aligned_rnum = categorizeds->rnum - aligned_padding;
1317
  int* idx = (int*)ccmalloc(sizeof(int) * (categorizeds->rnum + aligned_padding));
1318
  for (i = 0; i < categorizeds->rnum; i++)
1319
    idx[i] = i;
1320
  gsl_ran_shuffle(rng, idx, categorizeds->rnum, sizeof(int));
1321
  // the last layer has to be full connect, so we can use it as the softmax layer
1322
  assert(convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT);
1323
  int category_count = convnet->layers[convnet->count - 1].net.full_connect.count;
1324
  ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet);
1325
  ccv_convnet_t* momentum = _ccv_convnet_update_new(convnet);
1326
  for (t = 0; t < params.max_epoch; t++)
1327
  {
1328
    for (i = 0; i < aligned_rnum; i++)
1329
    {
1330
      // dropout the first hidden layer
1331
      ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx[i]);
1332
      ccv_convnet_encode(convnet, &categorized->matrix, convnet->acts + convnet->count - 1, 1);
1333
      ccv_dense_matrix_t* softmax = convnet->acts[convnet->count - 1];
1334
      float* dloss = softmax->data.f32;
1335
      _ccv_convnet_compute_softmax(softmax, &softmax, 0);
1336
      assert(softmax->rows == category_count && softmax->cols == 1);
1337
      // this mashes softmax and logistic regression together
1338
      // also, it gives you -D[loss w.r.t. x_i] (note the negative sign)
1339
      for (j = 0; j < category_count; j++)
1340
        dloss[j] = (j == categorized->c) - dloss[j];
1341
      _ccv_convnet_propagate_loss(convnet, categorized->matrix, softmax, update_params);
1342
      if ((i + 1) % params.mini_batch == 0)
1343
      {
1344
        FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => stochastic gradient descent at %d / %d", t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum / params.mini_batch);
1345
        // update weights
1346
        _ccv_convnet_update(convnet, params.mini_batch, momentum, update_params, params.layer_params);
1347
        _ccv_convnet_update_zero(update_params);
1348
        // compact the convnet to avoid any stale temporary resources
1349
        ccv_convnet_compact(convnet);
1350
      }
1351
    }
1352
    int miss = 0;
1353
    for (i = 0; i < tests->rnum; i++)
1354
    {
1355
      FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => going through %d / %d for tests", t + 1, params.max_epoch, i + 1, tests->rnum);
1356
      ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i);
1357
      int c = 0;
1358
      _ccv_convnet_classify(convnet, &test->matrix, &c, 1);
1359
      if (c != test->c)
1360
        ++miss;
1361
    }
1362
    FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => with miss rate %.2f%%\n", t + 1, params.max_epoch, miss * 100.0f / tests->rnum);
1363
    if (t + 1 < params.max_epoch)
1364
    {
1365
      // reshuffle the parts we visited and move the rest to the beginning
1366
      memcpy(idx + categorizeds->rnum, idx + aligned_rnum, sizeof(int) * aligned_padding);
1367
      memmove(idx + aligned_padding, idx, sizeof(int) * aligned_rnum);
1368
      memcpy(idx, idx + categorizeds->rnum, sizeof(int) * aligned_padding);
1369
      gsl_ran_shuffle(rng, idx + aligned_padding, aligned_rnum, sizeof(int));
1370
    }
1371
  }
1372
  ccfree(idx);
1373
  ccv_convnet_free(momentum);
1374
  ccv_convnet_free(update_params);
1375
  gsl_rng_free(rng);
1376
#ifdef HAVE_CUDA
1377
  }
1378
#endif
1379
#else
1380
  assert(0 && "ccv_convnet_supervised_train requires GSL library support");
1381
#endif
1382
}
1383
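Editor's note: in the training loop above, dloss[j] = (j == categorized->c) - dloss[j] is the negative gradient of the softmax cross-entropy loss with respect to the pre-softmax activations, which is why no separate loss derivative is computed. Restated as a standalone helper; softmax_xent_neg_grad is a hypothetical name:

  // For label c and softmax output p, -dL/dlogits = one_hot(c) - p.
  static void softmax_xent_neg_grad(const float* p, int n, int c, float* dloss)
  {
    int j;
    for (j = 0; j < n; j++)
      dloss[j] = (j == c ? 1.0f : 0.0f) - p[j];
  }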
1384
void ccv_convnet_compact(ccv_convnet_t* convnet)
1385
{
1386
#ifdef HAVE_CUDA
1387
  cwc_convnet_compact(convnet);
1388
#endif
1389
  int i;
1390
  for (i = 0; i < convnet->count; i++)
1391
  {
1392
    if (convnet->acts[i])
1393
      ccv_matrix_free(convnet->acts[i]);
1394
    convnet->acts[i] = 0;
1395
    if (convnet->denoms)
1396
    {
1397
      if (convnet->denoms[i])
1398
        ccv_matrix_free(convnet->denoms[i]);
1399
      convnet->denoms[i] = 0;
1400
    }
1401
    if (SIMD(convnet->layers + i))
1402
    {
1403
      ccfree(convnet->layers[i].reserved);
1404
      convnet->layers[i].reserved = 0;
1405
    }
1406
  }
1407
}
1408
1409
void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params)
1410
{
1411
  sqlite3* db = 0;
1412
  if (SQLITE_OK == sqlite3_open(filename, &db))
1413
  {
1414
    const char layer_create_table_qs[] =
1415
      "CREATE TABLE IF NOT EXISTS layer_params "
1416
      "(layer INTEGER PRIMARY KEY ASC, type INTEGER, "
1417
      "input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, "
1418
      "output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, "
1419
      "output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);"
1420
      "CREATE TABLE IF NOT EXISTS convnet_params "
1421
      "(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);"
1422
      "CREATE TABLE IF NOT EXISTS layer_data "
1423
      "(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB, half_precision INTEGER);";
1424
    assert(SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0));
1425
    const char layer_params_insert_qs[] = 
1426
      "REPLACE INTO layer_params "
1427
      "(layer, type, "
1428
      "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, "
1429
      "output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, "
1430
      "output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES "
1431
      "($layer, $type, " // 1
1432
      "$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6
1433
      "$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13
1434
      "$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18
1435
    sqlite3_stmt* layer_params_insert_stmt = 0;
1436
    assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0));
1437
    const char layer_data_insert_qs[] =
1438
      "REPLACE INTO layer_data "
1439
      "(layer, weight, bias, half_precision) VALUES ($layer, $weight, $bias, $half_precision);";
1440
    sqlite3_stmt* layer_data_insert_stmt = 0;
1441
    assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0));
1442
    int i;
1443
    for (i = 0; i < convnet->count; i++)
1444
    {
1445
      ccv_convnet_layer_t* layer = convnet->layers + i;
1446
      // insert layer params
1447
      sqlite3_bind_int(layer_params_insert_stmt, 1, i);
1448
      sqlite3_bind_int(layer_params_insert_stmt, 2, layer->type);
1449
      sqlite3_bind_int(layer_params_insert_stmt, 3, layer->input.matrix.rows);
1450
      sqlite3_bind_int(layer_params_insert_stmt, 4, layer->input.matrix.cols);
1451
      sqlite3_bind_int(layer_params_insert_stmt, 5, layer->input.matrix.channels);
1452
      sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.matrix.partition);
1453
      sqlite3_bind_int(layer_params_insert_stmt, 7, layer->input.node.count);
1454
      switch (layer->type)
1455
      {
1456
        case CCV_CONVNET_CONVOLUTIONAL:
1457
          sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.rows);
1458
          sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.cols);
1459
          sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.channels);
1460
          sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.partition);
1461
          sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.count);
1462
          sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.convolutional.strides);
1463
          sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.convolutional.border);
1464
          break;
1465
        case CCV_CONVNET_FULL_CONNECT:
1466
          sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count);
1467
          sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu);
1468
          break;
1469
        case CCV_CONVNET_MAX_POOL:
1470
        case CCV_CONVNET_AVERAGE_POOL:
1471
          sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.strides);
1472
          sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.pool.border);
1473
          sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.pool.size);
1474
          break;
1475
        case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1476
          sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.rnorm.size);
1477
          sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.kappa);
1478
          sqlite3_bind_double(layer_params_insert_stmt, 17, layer->net.rnorm.alpha);
1479
          sqlite3_bind_double(layer_params_insert_stmt, 18, layer->net.rnorm.beta);
1480
          break;
1481
      }
1482
      assert(SQLITE_DONE == sqlite3_step(layer_params_insert_stmt));
1483
      sqlite3_reset(layer_params_insert_stmt);
1484
      sqlite3_clear_bindings(layer_params_insert_stmt);
1485
      // insert layer data
1486
      if (layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT)
1487
      {
1488
        sqlite3_bind_int(layer_data_insert_stmt, 1, i);
1489
        if (params.half_precision)
1490
        {
1491
          uint16_t* w = (uint16_t*)ccmalloc(sizeof(uint16_t) * layer->wnum);
1492
          ccv_float_to_half_precision(layer->w, w, layer->wnum);
1493
          uint16_t* bias = (uint16_t*)ccmalloc(sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
1494
          ccv_float_to_half_precision(layer->bias, bias, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
1495
          sqlite3_bind_blob(layer_data_insert_stmt, 2, w, sizeof(uint16_t) * layer->wnum, ccfree);
1496
          sqlite3_bind_blob(layer_data_insert_stmt, 3, bias, sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), ccfree);
1497
        } else {
1498
          sqlite3_bind_blob(layer_data_insert_stmt, 2, layer->w, sizeof(float) * layer->wnum, SQLITE_STATIC);
1499
          sqlite3_bind_blob(layer_data_insert_stmt, 3, layer->bias, sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), SQLITE_STATIC);
1500
        }
1501
        sqlite3_bind_int(layer_data_insert_stmt, 4, params.half_precision);
1502
        assert(SQLITE_DONE == sqlite3_step(layer_data_insert_stmt));
1503
        sqlite3_reset(layer_data_insert_stmt);
1504
        sqlite3_clear_bindings(layer_data_insert_stmt);
1505
      }
1506
    }
1507
    // insert convnet related params
1508
    const char convnet_params_insert_qs[] =
1509
      "REPLACE INTO convnet_params "
1510
      "(convnet, mean_activity, input_height, input_width) VALUES (0, $mean_activity, $input_height, $input_width);";
1511
    sqlite3_stmt* convnet_params_insert_stmt = 0;
1512
    assert(SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0));
1513
    assert(convnet->mean_activity->rows == convnet->input.height);
1514
    assert(convnet->mean_activity->cols == convnet->input.width);
1515
    assert(CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels);
1516
    assert(CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F);
1517
    sqlite3_bind_blob(convnet_params_insert_stmt, 1, convnet->mean_activity->data.f32, sizeof(float) * convnet->input.height * convnet->input.width * convnet->channels, SQLITE_STATIC);
1518
    sqlite3_bind_int(convnet_params_insert_stmt, 2, convnet->input.height);
1519
    sqlite3_bind_int(convnet_params_insert_stmt, 3, convnet->input.width);
1520
    assert(SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt));
1521
    sqlite3_reset(convnet_params_insert_stmt);
1522
    sqlite3_clear_bindings(convnet_params_insert_stmt);
1523
1524
    sqlite3_finalize(layer_params_insert_stmt);
1525
    sqlite3_finalize(layer_data_insert_stmt);
1526
    sqlite3_finalize(convnet_params_insert_stmt);
1527
    sqlite3_close(db);
1528
  }
1529
}
1530
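Editor's note: a possible call site for the writer above; setting half_precision stores weights and biases as 16-bit halves via ccv_float_to_half_precision, roughly halving the database size at the cost of float16 rounding. The file name is illustrative:

  ccv_convnet_write_param_t write_params;
  write_params.half_precision = 1; // store weight/bias blobs as uint16_t halves
  ccv_convnet_write(convnet, "convnet.sqlite3", write_params);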
1531
ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename)
1532
{
1533
  sqlite3* db = 0;
1534
  if (SQLITE_OK == sqlite3_open(filename, &db))
1535
  {
1536
    ccv_convnet_t* convnet = 0;
1537
    sqlite3_stmt* layer_params_stmt = 0;
1538
    // load layer params
1539
    const char layer_params_qs[] =
1540
      "SELECT type, " // 1
1541
      "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6
1542
      "output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13
1543
      "output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18
1544
    if (SQLITE_OK == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0))
1545
    {
1546
      ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0);
1547
      while (sqlite3_step(layer_params_stmt) == SQLITE_ROW)
1548
      {
1549
        ccv_convnet_layer_param_t layer_param;
1550
        layer_param.type = sqlite3_column_int(layer_params_stmt, 0);
1551
        layer_param.input.matrix.rows = sqlite3_column_int(layer_params_stmt, 1);
1552
        layer_param.input.matrix.cols = sqlite3_column_int(layer_params_stmt, 2);
1553
        layer_param.input.matrix.channels = sqlite3_column_int(layer_params_stmt, 3);
1554
        layer_param.input.matrix.partition = sqlite3_column_int(layer_params_stmt, 4);
1555
        layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 5);
1556
        layer_param.bias = layer_param.glorot = 0; // these are irrelevant when reading a convnet
1557
        switch (layer_param.type)
1558
        {
1559
          case CCV_CONVNET_CONVOLUTIONAL:
1560
            layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 6);
1561
            layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 7);
1562
            layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 8);
1563
            layer_param.output.convolutional.partition = sqlite3_column_int(layer_params_stmt, 9);
1564
            layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 10);
1565
            layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 11);
1566
            layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 12);
1567
            break;
1568
          case CCV_CONVNET_FULL_CONNECT:
1569
            layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10);
1570
            layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17);
1571
            break;
1572
          case CCV_CONVNET_MAX_POOL:
1573
          case CCV_CONVNET_AVERAGE_POOL:
1574
            layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 11);
1575
            layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 12);
1576
            layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 13);
1577
            break;
1578
          case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1579
            layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 13);
1580
            layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 14);
1581
            layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 15);
1582
            layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 16);
1583
            break;
1584
        }
1585
        ccv_array_push(layer_params, &layer_param);
1586
      }
1587
      sqlite3_finalize(layer_params_stmt);
1588
      sqlite3_stmt* convnet_params_input_stmt = 0;
1589
      // load convnet params for input
1590
      const char convnet_params_input_qs[] =
1591
        "SELECT input_height, input_width FROM convnet_params WHERE convnet = 0;";
1592
      ccv_size_t input = ccv_size(0, 0);
1593
      if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_input_qs, sizeof(convnet_params_input_qs), &convnet_params_input_stmt, 0))
1594
      {
1595
        if (sqlite3_step(convnet_params_input_stmt) == SQLITE_ROW)
1596
        {
1597
          input.height = sqlite3_column_int(convnet_params_input_stmt, 0);
1598
          input.width = sqlite3_column_int(convnet_params_input_stmt, 1);
1599
        }
1600
        sqlite3_finalize(convnet_params_input_stmt);
1601
      }
1602
      assert(input.height != 0 && input.width != 0);
1603
      convnet = ccv_convnet_new(use_cwc_accel, input, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0), layer_params->rnum);
1604
      ccv_array_free(layer_params);
1605
      // load layer data
1606
      sqlite3_stmt* layer_data_stmt = 0;
1607
      const char layer_data_qs[] =
1608
        "SELECT layer, weight, bias, half_precision FROM layer_data;";
1609
      if (SQLITE_OK == sqlite3_prepare_v2(db, layer_data_qs, sizeof(layer_data_qs), &layer_data_stmt, 0))
1610
      {
1611
        while (sqlite3_step(layer_data_stmt) == SQLITE_ROW)
1612
        {
1613
          ccv_convnet_layer_t* layer = convnet->layers + sqlite3_column_int(layer_data_stmt, 0);
1614
          int half_precision = sqlite3_column_int(layer_data_stmt, 3);
1615
          int wnum = sqlite3_column_bytes(layer_data_stmt, 1) / (half_precision ? sizeof(uint16_t) : sizeof(float));
1616
          // if weights available, load weights
1617
          if (wnum == layer->wnum)
1618
          {
1619
            const void* w = sqlite3_column_blob(layer_data_stmt, 1);
1620
            if (half_precision)
1621
            {
1622
              float* f = (float*)ccmalloc(sizeof(float) * layer->wnum);
1623
              ccv_half_precision_to_float((uint16_t*)w, f, layer->wnum);
1624
              w = f;
1625
            }
1626
            switch (layer->type)
1627
            {
1628
              case CCV_CONVNET_CONVOLUTIONAL:
1629
                memcpy(layer->w, w, sizeof(float) * layer->wnum);
1630
                break;
1631
              case CCV_CONVNET_FULL_CONNECT:
1632
                memcpy(layer->w, w, sizeof(float) * layer->wnum);
1633
                break;
1634
            }
1635
            if (half_precision)
1636
              ccfree((void*)w);
1637
          }
1638
          int bnum = sqlite3_column_bytes(layer_data_stmt, 2) / (half_precision ? sizeof(uint16_t) : sizeof(float));
1639
          // if bias available, load bias
1640
          if (bnum == (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count))
1641
          {
1642
            const void* bias = sqlite3_column_blob(layer_data_stmt, 2);
1643
            if (half_precision)
1644
            {
1645
              float* f = (float*)ccmalloc(sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
1646
              ccv_half_precision_to_float((uint16_t*)bias, f, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
1647
              bias = f;
1648
            }
1649
            switch (layer->type)
1650
            {
1651
              case CCV_CONVNET_CONVOLUTIONAL:
1652
                memcpy(layer->bias, bias, sizeof(float) * layer->net.convolutional.count);
1653
                break;
1654
              case CCV_CONVNET_FULL_CONNECT:
1655
                memcpy(layer->bias, bias, sizeof(float) * layer->net.full_connect.count);
1656
                break;
1657
            }
1658
            if (half_precision)
1659
              ccfree((void*)bias);
1660
          }
1661
        }
1662
        sqlite3_finalize(layer_data_stmt);
1663
      }
1664
      sqlite3_stmt* convnet_params_mean_activity_stmt = 0;
1665
      // load convnet params for mean activity
1666
      const char convnet_params_mean_activity_qs[] =
1667
        "SELECT mean_activity FROM convnet_params WHERE convnet = 0;";
1668
      if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_mean_activity_qs, sizeof(convnet_params_mean_activity_qs), &convnet_params_mean_activity_stmt, 0))
1669
      {
1670
        if (sqlite3_step(convnet_params_mean_activity_stmt) == SQLITE_ROW)
1671
        {
1672
          int elems = sqlite3_column_bytes(convnet_params_mean_activity_stmt, 0) / sizeof(float);
1673
          if (elems == convnet->input.height * convnet->input.width * convnet->channels)
1674
            memcpy(convnet->mean_activity->data.f32, sqlite3_column_blob(convnet_params_mean_activity_stmt, 0), sizeof(float) * elems);
1675
        }
1676
        sqlite3_finalize(convnet_params_mean_activity_stmt);
1677
      }
1678
    }
1679
    sqlite3_close(db);
1680
    return convnet;
1681
  }
1682
  return 0;
1683
}
1684
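Editor's note: ccv_convnet_read returns 0 when the database cannot be opened, so a caller should check the result before use. A minimal round-trip sketch; the file name is illustrative:

  ccv_convnet_t* convnet = ccv_convnet_read(0 /* no cwc acceleration */, "convnet.sqlite3");
  if (convnet)
  {
    // ... run ccv_convnet_encode / classification here ...
    ccv_convnet_free(convnet);
  }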
1685
void ccv_convnet_input_formation(ccv_size_t input, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
1686
{
1687
  if (a->rows > input.height && a->cols > input.width)
1688
    ccv_resample(a, b, CCV_32F, ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)), ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)), CCV_INTER_AREA);
1689
  else if (a->rows < input.height || a->cols < input.width)
1690
    ccv_resample(a, b, CCV_32F, ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)), ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)), CCV_INTER_CUBIC);
1691
  else
1692
    ccv_shift(a, (ccv_matrix_t**)b, CCV_32F, 0, 0); // converting to 32f
1693
}
1694
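Editor's note: input formation downsamples with CCV_INTER_AREA, upsamples with CCV_INTER_CUBIC, and otherwise only converts the matrix to 32-bit float. A hedged usage sketch, assuming ccv_read and the CCV_IO_* flags from the wider library; the file name is illustrative:

  ccv_dense_matrix_t* image = 0;
  ccv_read("photo.png", &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
  ccv_dense_matrix_t* input = 0;
  ccv_convnet_input_formation(convnet->input, image, &input); // resample to the convnet's input size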
1695
void ccv_convnet_free(ccv_convnet_t* convnet)
1696
{
1697
  ccv_convnet_compact(convnet);
1698
  int i;
1699
  for (i = 0; i < convnet->count; i++)
1700
    if (convnet->layers[i].w)
1701
      ccfree(convnet->layers[i].w);
1702
  if (convnet->mean_activity)
1703
    ccv_matrix_free(convnet->mean_activity);
1704
  ccfree(convnet);
1705
}
1706
1707
#endif