Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/ccv_convnet.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#if defined(HAVE_SSE2)
4
#include <xmmintrin.h>
5
#elif defined(HAVE_NEON)
6
#include <arm_neon.h>
7
#endif
8
#ifdef HAVE_GSL
9
#include <gsl/gsl_rng.h>
10
#include <gsl/gsl_randist.h>
11
#endif
12
#ifdef USE_OPENMP
13
#include <omp.h>
14
#endif
15
#ifdef USE_DISPATCH
16
#include <dispatch/dispatch.h>
17
#endif
18
#ifdef HAVE_CUDA
19
#include "cuda/cwc.h"
20
#endif
21
#include "3rdparty/sqlite3/sqlite3.h"
22
#include "inc/ccv_convnet_internal.h"
23
24
#ifndef CASE_TESTS
25
26
ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count)
27
29
{
28
29
  ccv_convnet_t* convnet = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2);
29
29
  convnet->use_cwc_accel = use_cwc_accel;
30
29
#ifdef HAVE_GSL
31
29
  gsl_rng_env_setup();
32
29
  gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
33
29
  gsl_rng_set(rng, (unsigned long int)convnet);
34
29
#endif
35
29
  convnet->reserved = 0;
36
29
  convnet->layers = (ccv_convnet_layer_t*)(convnet + 1);
37
29
  convnet->acts = (ccv_dense_matrix_t**)(convnet->layers + count);
38
29
  memset(convnet->acts, 0, sizeof(ccv_dense_matrix_t*) * count);
39
29
  convnet->denoms = (ccv_dense_matrix_t**)(convnet->acts + count);
40
29
  memset(convnet->denoms, 0, sizeof(ccv_dense_matrix_t*) * count);
41
29
  convnet->count = count;
42
29
  convnet->input = input;
43
29
  convnet->rows = params[0].input.matrix.rows;
44
29
  convnet->cols = params[0].input.matrix.cols;
45
29
  convnet->channels = params[0].input.matrix.channels;
46
29
  convnet->mean_activity = ccv_dense_matrix_new(convnet->input.height, convnet->input.width, convnet->channels | CCV_32F, 0, 0);
47
29
  ccv_zero(convnet->mean_activity);
48
29
  ccv_convnet_layer_t* layers = convnet->layers;
49
29
  int i, j;
50
100
  for (i = 0; i < count; 
i++71
)
51
71
  {
52
71
    layers[i].type = params[i].type;
53
71
    layers[i].input = params[i].input;
54
71
    layers[i].net = params[i].output;
55
71
    layers[i].reserved = 0;
56
71
    switch (params[i].type)
57
71
    {
58
38
      case CCV_CONVNET_CONVOLUTIONAL:
59
38
        assert(params[i].input.matrix.channels % params[i].input.matrix.partition == 0);
60
38
        assert(params[i].output.convolutional.count % params[i].output.convolutional.partition == 0);
61
38
        assert(params[i].output.convolutional.partition % params[i].input.matrix.partition == 0);
62
38
        assert(params[i].output.convolutional.partition >= params[i].input.matrix.partition);
63
38
        layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition * params[i].output.convolutional.count;
64
38
        layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.convolutional.count));
65
38
        layers[i].bias = layers[i].w + layers[i].wnum;
66
38
#ifdef HAVE_GSL
67
29.4M
        for (j = 0; j < layers[i].wnum; 
j++29.4M
)
68
29.4M
          layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition + params[i].output.convolutional.count);
69
#else
70
        for (j = 0; j < layers[i].wnum; j++)
71
          layers[i].w[j] = 0;
72
#endif
73
8.57k
        for (j = 0; j < params[i].output.convolutional.count; 
j++8.53k
)
74
8.53k
          layers[i].bias[j] = params[i].bias;
75
38
        break;
76
10
      case CCV_CONVNET_FULL_CONNECT:
77
10
        layers[i].wnum = params[i].input.node.count * params[i].output.full_connect.count;
78
10
        layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.full_connect.count));
79
10
        layers[i].bias = layers[i].w + layers[i].wnum;
80
10
#ifdef HAVE_GSL
81
237M
        for (j = 0; j < layers[i].wnum; 
j++237M
)
82
237M
          layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].input.node.count + params[i].output.full_connect.count);
83
#else
84
        for (j = 0; j < layers[i].wnum; j++)
85
          layers[i].w[j] = 0;
86
#endif
87
20.4k
        for (j = 0; j < params[i].output.full_connect.count; 
j++20.4k
)
88
20.4k
          layers[i].bias[j] = params[i].bias;
89
10
        break;
90
23
      default:
91
23
        layers[i].wnum = 0;
92
23
        layers[i].w = 0;
93
23
        layers[i].bias = 0;
94
23
        break;
95
71
    }
96
71
  }
97
29
#ifdef HAVE_GSL
98
29
  gsl_rng_free(rng);
99
29
#endif
100
29
  return convnet;
101
29
}
102
103
int ccv_convnet_verify(ccv_convnet_t* convnet, int output)
104
0
{
105
0
  int i, out_rows, out_cols, out_partition, out_channels;
106
0
  if (convnet->count < 1)
107
0
    return -1;
108
  // the last layer has to be full connect
109
0
  if (convnet->layers[convnet->count - 1].type != CCV_CONVNET_FULL_CONNECT)
110
0
    return -1;
111
  // you cannot enable relu on the last layer
112
0
  if (convnet->layers[convnet->count - 1].net.full_connect.relu)
113
0
    return -1;
114
0
  out_channels = 3;
115
0
  for (i = 0; i < convnet->count; i++)
116
0
  {
117
0
    ccv_convnet_layer_t* layer = convnet->layers + i;
118
0
    if (i > 0 && (out_rows != layer->input.matrix.rows || out_cols != layer->input.matrix.cols))
119
0
      return -1;
120
    // the input channels should be equal to the previous output channels, skip this check for full connect as it is meaningless
121
0
    if (out_channels != layer->input.matrix.channels && layer->type != CCV_CONVNET_FULL_CONNECT)
122
0
      return -1;
123
0
    ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition);
124
0
    if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
125
0
    {
126
      // check to see if the input matrix channel is equal to the expected input of the convolutional layer filters
127
0
      if (layer->input.matrix.channels != layer->net.convolutional.channels)
128
0
        return -1;
129
      // if this layer is convolutional layer, its filter output should equal to next layer's channel input
130
0
      out_channels = layer->net.convolutional.count;
131
0
    }
132
0
  }
133
0
  if (out_rows * out_cols != output)
134
0
    return -1;
135
0
  int count = 0;
136
0
  for (i = 0; i < convnet->count; i++)
137
0
  {
138
0
    ccv_convnet_layer_t* layer = convnet->layers + i;
139
0
    if (layer->type == CCV_CONVNET_FULL_CONNECT)
140
0
    {
141
0
      count = i;
142
0
      break;
143
0
    }
144
0
  }
145
  // all the layers after the first full connect layer should only be full connect layer
146
0
  for (i = count; i < convnet->count; i++)
147
0
    if (convnet->layers[i].type != CCV_CONVNET_FULL_CONNECT ||
148
0
      convnet->layers[i].input.matrix.rows * convnet->layers[i].input.matrix.cols * convnet->layers[i].input.matrix.channels != convnet->layers[i].input.node.count)
149
0
      return -1;
150
0
  return 0;
151
0
}
152
153
#endif
154
155
#if defined(HAVE_SSE2) || defined(HAVE_NEON)
156
157
static void _ccv_convnet_layer_simd_alloc_reserved(ccv_convnet_layer_t* layer)
158
2.37k
{
159
2.37k
  if (layer->reserved)
160
1
    return;
161
2.37k
  int partition = layer->input.matrix.partition;
162
2.37k
  int ch = layer->net.convolutional.channels;
163
2.37k
  int count = layer->net.convolutional.count;
164
2.37k
  int kernel_rows = layer->net.convolutional.rows;
165
2.37k
  int kernel_cols = layer->net.convolutional.cols;
166
2.37k
  int ch_per_partition = ch / partition;
167
2.37k
  int count_per_4 = count / 4;
168
2.37k
  float* simd_w = (float*)ccmalloc(sizeof(float) * layer->wnum);
169
2.37k
  int i, j, k, c;
170
6.84k
  for (k = 0; k < count_per_4; 
k++4.47k
)
171
77.7k
    
for (i = 0; 4.47k
i < kernel_rows * kernel_cols;
i++73.2k
)
172
7.56M
      
for (j = 0; 73.2k
j < ch_per_partition;
j++7.49M
)
173
37.4M
        
for (c = 0; 7.49M
c < 4;
c++29.9M
)
174
29.9M
          simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j];
175
2.37k
  layer->reserved = simd_w;
176
2.37k
}
Unexecuted instantiation: convnet.tests.c:_ccv_convnet_layer_simd_alloc_reserved
ccv_convnet.c:_ccv_convnet_layer_simd_alloc_reserved
Line
Count
Source
158
2.37k
{
159
2.37k
  if (layer->reserved)
160
1
    return;
161
2.37k
  int partition = layer->input.matrix.partition;
162
2.37k
  int ch = layer->net.convolutional.channels;
163
2.37k
  int count = layer->net.convolutional.count;
164
2.37k
  int kernel_rows = layer->net.convolutional.rows;
165
2.37k
  int kernel_cols = layer->net.convolutional.cols;
166
2.37k
  int ch_per_partition = ch / partition;
167
2.37k
  int count_per_4 = count / 4;
168
2.37k
  float* simd_w = (float*)ccmalloc(sizeof(float) * layer->wnum);
169
2.37k
  int i, j, k, c;
170
6.84k
  for (k = 0; k < count_per_4; 
k++4.47k
)
171
77.7k
    
for (i = 0; 4.47k
i < kernel_rows * kernel_cols;
i++73.2k
)
172
7.56M
      
for (j = 0; 73.2k
j < ch_per_partition;
j++7.49M
)
173
37.4M
        
for (c = 0; 7.49M
c < 4;
c++29.9M
)
174
29.9M
          simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j];
175
2.37k
  layer->reserved = simd_w;
176
2.37k
}
177
178
#endif
179
180
8.00k
#define SIMD(x) ((float*)((x)->reserved))
181
182
#if defined(HAVE_SSE2)
183
static inline void _ccv_convnet_convolutional_forward_propagate_sse2(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
184
2.37k
{
185
2.37k
  assert(SIMD(layer));
186
2.37k
#define main_for(block) \
187
4.47k
  
parallel_for2.37k
(k, (count >> 2)) { \
188
4.47k
    int i, j, x, y, c; \
189
4.47k
    int p = k * 4 / count_per_partition; \
190
4.47k
    float* ap = a->data.f32 + p * ch_per_partition; \
191
4.47k
    float* bp = db->data.f32 + k * 4; \
192
4.47k
    float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
193
4.47k
    float bias[4] __attribute__ ((__aligned__(16))); \
194
4.47k
    memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
195
    /* 4 accumulators */ \
196
4.47k
    __m128 z4 = _mm_setzero_ps(); \
197
150k
    for (i = 0; i < db->rows; 
i++145k
) \
198
145k
    { \
199
145k
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
200
145k
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
201
145k
      comy *= ch_per_partition * kernel_cols; \
202
8.74M
      for (j = 0; j < db->cols; 
j++8.59M
) \
203
8.59M
      { \
204
8.59M
        __m128 v40 = _mm_load_ps(bias); \
205
8.59M
        __m128 v41 = _mm_setzero_ps(); \
206
8.59M
        __m128 v42 = _mm_setzero_ps(); \
207
8.59M
        __m128 v43 = _mm_setzero_ps(); \
208
8.59M
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
209
8.59M
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
210
8.59M
        float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
211
8.59M
        float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
212
        /* when we have border, we simply do zero padding */ \
213
37.8M
        for (y = 0; y < maxy; 
y++29.2M
) \
214
29.2M
        { \
215
          /* special casing for these cases to speed up SIMD computation */ \
216
134M
          for (x = 0; x < maxx; 
x++104M
) \
217
104M
          { \
218
104M
            c = 0; \
219
1.85G
            for (; c < ch_per_partition - 3; 
c += 41.75G
) \
220
1.75G
            { \
221
1.75G
              __m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \
222
1.75G
              __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
223
1.75G
              __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
224
1.75G
              __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
225
1.75G
              __m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \
226
1.75G
              __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \
227
1.75G
              __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \
228
1.75G
              __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \
229
1.75G
              __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \
230
1.75G
              v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
231
1.75G
              v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
232
1.75G
              v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
233
1.75G
              v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
234
1.75G
            } \
235
104M
            block /* insert executions for tail partition */ \
236
104M
          } \
237
29.2M
          w += kernel_cols * ch_per_partition * 4; \
238
29.2M
          apz += a->cols * ch; \
239
29.2M
        } \
240
8.59M
        __m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \
241
8.59M
        _mm_storeu_ps(bp + j * count, v4); /* ReLU */ \
242
8.59M
      } \
243
145k
      bp += db->cols * count; \
244
145k
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
245
145k
    } \
246
4.47k
  } parallel_endfor
247
2.37k
  if (ch_per_partition % 4 == 0)
248
24
  {
249
24
    main_for();
250
2.35k
  } else if (ch_per_partition % 4 == 3) { // unroll the last for-loops
251
1.22k
#define block \
252
1.22k
    __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
253
1.22k
    __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
254
1.22k
    __m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \
255
1.22k
    __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
256
1.22k
    __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
257
1.22k
    __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
258
1.22k
    v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
259
1.22k
    v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
260
1.22k
    v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
261
1.22k
    main_for(block);
262
1.22k
#undef block
263
1.22k
  } else 
if (1.13k
ch_per_partition % 4 == 21.13k
) { // unroll the last for-loops
264
1.12k
#define block \
265
1.12k
    __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
266
1.12k
    __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
267
1.12k
    __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
268
1.12k
    __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
269
1.12k
    v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
270
1.12k
    v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
271
1.12k
    main_for(block);
272
1.12k
#undef block
273
1.12k
  } else {
274
3
#define block \
275
3
    __m128 apz4 = _mm_load1_ps(apz + x * ch + c); \
276
3
    __m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
277
3
    v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
278
3
    main_for(block);
279
3
#undef block
280
3
  }
281
2.37k
#undef main_for
282
2.37k
}
Unexecuted instantiation: convnet.tests.c:_ccv_convnet_convolutional_forward_propagate_sse2
ccv_convnet.c:_ccv_convnet_convolutional_forward_propagate_sse2
Line
Count
Source
184
2.37k
{
185
2.37k
  assert(SIMD(layer));
186
2.37k
#define main_for(block) \
187
2.37k
  parallel_for(k, (count >> 2)) { \
188
2.37k
    int i, j, x, y, c; \
189
2.37k
    int p = k * 4 / count_per_partition; \
190
2.37k
    float* ap = a->data.f32 + p * ch_per_partition; \
191
2.37k
    float* bp = db->data.f32 + k * 4; \
192
2.37k
    float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
193
2.37k
    float bias[4] __attribute__ ((__aligned__(16))); \
194
2.37k
    memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
195
    /* 4 accumulators */ \
196
2.37k
    __m128 z4 = _mm_setzero_ps(); \
197
2.37k
    for (i = 0; i < db->rows; i++) \
198
2.37k
    { \
199
2.37k
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
200
2.37k
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
201
2.37k
      comy *= ch_per_partition * kernel_cols; \
202
2.37k
      for (j = 0; j < db->cols; j++) \
203
2.37k
      { \
204
2.37k
        __m128 v40 = _mm_load_ps(bias); \
205
2.37k
        __m128 v41 = _mm_setzero_ps(); \
206
2.37k
        __m128 v42 = _mm_setzero_ps(); \
207
2.37k
        __m128 v43 = _mm_setzero_ps(); \
208
2.37k
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
209
2.37k
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
210
2.37k
        float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
211
2.37k
        float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
212
        /* when we have border, we simply do zero padding */ \
213
2.37k
        for (y = 0; y < maxy; y++) \
214
2.37k
        { \
215
          /* special casing for these cases to speed up SIMD computation */ \
216
2.37k
          for (x = 0; x < maxx; x++) \
217
2.37k
          { \
218
2.37k
            c = 0; \
219
2.37k
            for (; c < ch_per_partition - 3; c += 4) \
220
2.37k
            { \
221
2.37k
              __m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \
222
2.37k
              __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
223
2.37k
              __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
224
2.37k
              __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
225
2.37k
              __m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \
226
2.37k
              __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \
227
2.37k
              __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \
228
2.37k
              __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \
229
2.37k
              __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \
230
2.37k
              v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
231
2.37k
              v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
232
2.37k
              v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
233
2.37k
              v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
234
2.37k
            } \
235
2.37k
            block /* insert executions for tail partition */ \
236
2.37k
          } \
237
2.37k
          w += kernel_cols * ch_per_partition * 4; \
238
2.37k
          apz += a->cols * ch; \
239
2.37k
        } \
240
2.37k
        __m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \
241
2.37k
        _mm_storeu_ps(bp + j * count, v4); /* ReLU */ \
242
2.37k
      } \
243
2.37k
      bp += db->cols * count; \
244
2.37k
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
245
2.37k
    } \
246
2.37k
  } parallel_endfor
247
2.37k
  if (ch_per_partition % 4 == 0)
248
24
  {
249
24
    main_for();
250
2.35k
  } else if (ch_per_partition % 4 == 3) { // unroll the last for-loops
251
1.22k
#define block \
252
1.22k
    __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
253
1.22k
    __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
254
1.22k
    __m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \
255
1.22k
    __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
256
1.22k
    __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
257
1.22k
    __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
258
1.22k
    v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
259
1.22k
    v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
260
1.22k
    v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
261
1.22k
    main_for(block);
262
1.22k
#undef block
263
1.22k
  } else 
if (1.13k
ch_per_partition % 4 == 21.13k
) { // unroll the last for-loops
264
1.12k
#define block \
265
1.12k
    __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
266
1.12k
    __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
267
1.12k
    __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
268
1.12k
    __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
269
1.12k
    v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
270
1.12k
    v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
271
1.12k
    main_for(block);
272
1.12k
#undef block
273
1.12k
  } else {
274
3
#define block \
275
3
    __m128 apz4 = _mm_load1_ps(apz + x * ch + c); \
276
3
    __m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
277
3
    v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
278
3
    main_for(block);
279
3
#undef block
280
3
  }
281
2.37k
#undef main_for
282
2.37k
}
283
#elif defined(HAVE_NEON)
284
static inline void _ccv_convnet_convolutional_forward_propagate_neon(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
285
{
286
  assert(SIMD(layer));
287
#define main_for(block) \
288
  parallel_for(k, (count >> 2)) { \
289
    int i, j, x, y, c; \
290
    int p = k * 4 / count_per_partition; \
291
    float* ap = a->data.f32 + p * ch_per_partition; \
292
    float* bp = db->data.f32 + k * 4; \
293
    float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
294
    float bias[4] __attribute__ ((__aligned__(16))); \
295
    memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
296
    float32x4_t z4 = vmovq_n_f32(0); \
297
    for (i = 0; i < db->rows; i++) \
298
    { \
299
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
300
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
301
      comy *= ch_per_partition * kernel_cols; \
302
      for (j = 0; j < db->cols; j++) \
303
      { \
304
        float32x4_t v40 = vld1q_f32(bias); \
305
        float32x4_t v41 = vmovq_n_f32(0); \
306
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
307
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
308
        float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
309
        float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
310
        /* when we have border, we simply do zero padding */ \
311
        for (y = 0; y < maxy; y++) \
312
        { \
313
          for (x = 0; x < maxx; x++) \
314
          { \
315
            c = 0; \
316
            for (; c < ch_per_partition - 1; c += 2) \
317
            { \
318
              float32x2_t apz4 = vld1_f32(apz + x * ch + c); \
319
              float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
320
              float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
321
              float32x4_t w40 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
322
              float32x4_t w41 = vld1q_f32(w + (x * ch_per_partition + c + 1) * 4); \
323
              v40 = vmlaq_f32(v40, w40, apz40); \
324
              v41 = vmlaq_f32(v41, w41, apz41); \
325
            } \
326
            block /* insert executions for tail partition */ \
327
          } \
328
          w += kernel_cols * ch_per_partition * 4; \
329
          apz += a->cols * ch; \
330
        } \
331
        float32x4_t v4 = vmaxq_f32(z4, vaddq_f32(v40, v41)); \
332
        vst1q_f32(bp + j * count, v4); /* ReLU */ \
333
      } \
334
      bp += db->cols * count; \
335
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
336
    } \
337
  } parallel_endfor
338
  if (ch_per_partition % 2 == 0)
339
  {
340
    main_for();
341
  } else { // unroll the last for-loops
342
#define block \
343
    float32x4_t apz4 = vmovq_n_f32(apz[x * ch + c]); \
344
    float32x4_t w4 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
345
    v40 = vmlaq_f32(v40, w4, apz4);
346
    main_for(block);
347
#undef block
348
  }
349
#undef main_for
350
}
351
#else
352
static inline void _ccv_convnet_convolutional_forward_propagate_fallback(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
353
{
354
  parallel_for(k, count) {
355
    int i, j, x, y, c;
356
    int p = k / count_per_partition;
357
    float* ap = a->data.f32 + p * ch_per_partition;
358
    float* bp = db->data.f32 + k;
359
    float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
360
    float bias = layer->bias[k];
361
    for (i = 0; i < db->rows; i++)
362
    {
363
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
364
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows));
365
      comy *= ch_per_partition * kernel_cols;
366
      for (j = 0; j < db->cols; j++)
367
      {
368
        float v = bias;
369
        int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
370
        int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols));
371
        float* w = layer_w + comx * ch_per_partition + comy;
372
        float* apz = ap + ccv_max(j * strides - border, 0) * ch;
373
        // when we have border, we simply do zero padding
374
        for (y = 0; y < maxy; y++)
375
        {
376
          for (x = 0; x < maxx; x++)
377
            for (c = 0; c < ch_per_partition; c++)
378
              v += w[x * ch_per_partition + c] * apz[x * ch + c];
379
          w += kernel_cols * ch_per_partition;
380
          apz += a->cols * ch;
381
        }
382
        bp[j * count] = ccv_max(0, v); // ReLU
383
      }
384
      bp += db->cols * count;
385
      ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
386
    }
387
  } parallel_endfor
388
}
389
#endif
390
391
static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
392
2.37k
{
393
2.37k
  int rows, cols, partition;
394
2.37k
  ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
395
2.37k
  int ch = layer->net.convolutional.channels;
396
2.37k
  int count = layer->net.convolutional.count;
397
2.37k
  int strides = layer->net.convolutional.strides;
398
2.37k
  int border = layer->net.convolutional.border;
399
2.37k
  int kernel_rows = layer->net.convolutional.rows;
400
2.37k
  int kernel_cols = layer->net.convolutional.cols;
401
2.37k
  int type = CCV_32F | count;
402
2.37k
  assert(CCV_GET_CHANNEL(a->type) == ch);
403
2.37k
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
404
2.37k
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
405
2.37k
  int ch_per_partition = ch / partition;
406
2.37k
  int count_per_partition = count / partition;
407
2.37k
  assert(count_per_partition % 4 == 0);
408
2.37k
#if defined(HAVE_SSE2) || defined(HAVE_NEON)
409
2.37k
  _ccv_convnet_layer_simd_alloc_reserved(layer);
410
2.37k
#endif
411
2.37k
#if defined(HAVE_SSE2)
412
2.37k
  _ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
413
#elif defined(HAVE_NEON)
414
  _ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
415
#else
416
  _ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
417
#endif
418
2.37k
}
Unexecuted instantiation: convnet.tests.c:_ccv_convnet_convolutional_forward_propagate
ccv_convnet.c:_ccv_convnet_convolutional_forward_propagate
Line
Count
Source
392
2.37k
{
393
2.37k
  int rows, cols, partition;
394
2.37k
  ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
395
2.37k
  int ch = layer->net.convolutional.channels;
396
2.37k
  int count = layer->net.convolutional.count;
397
2.37k
  int strides = layer->net.convolutional.strides;
398
2.37k
  int border = layer->net.convolutional.border;
399
2.37k
  int kernel_rows = layer->net.convolutional.rows;
400
2.37k
  int kernel_cols = layer->net.convolutional.cols;
401
2.37k
  int type = CCV_32F | count;
402
2.37k
  assert(CCV_GET_CHANNEL(a->type) == ch);
403
2.37k
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
404
2.37k
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
405
2.37k
  int ch_per_partition = ch / partition;
406
2.37k
  int count_per_partition = count / partition;
407
2.37k
  assert(count_per_partition % 4 == 0);
408
2.37k
#if defined(HAVE_SSE2) || defined(HAVE_NEON)
409
2.37k
  _ccv_convnet_layer_simd_alloc_reserved(layer);
410
2.37k
#endif
411
2.37k
#if defined(HAVE_SSE2)
412
2.37k
  _ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
413
#elif defined(HAVE_NEON)
414
  _ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
415
#else
416
  _ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
417
#endif
418
2.37k
}
419
420
static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
421
3.23k
{
422
3.23k
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
423
3.23k
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
424
3.23k
  int ch = CCV_GET_CHANNEL(a->type);
425
3.23k
  int rows = a->rows, cols = a->cols;
426
  // reshape a for gemm
427
3.23k
  assert(a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch);
428
3.23k
  a->rows = rows * cols * ch, a->cols = 1, a->type = (a->type - ch) | CCV_C1;
429
3.23k
  assert(a->rows * db->rows == layer->wnum);
430
3.23k
  a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type);
431
3.23k
  int i;
432
3.23k
  float* bptr = db->data.f32;
433
55.9k
  for (i = 0; i < db->rows; 
i++52.7k
)
434
52.7k
    bptr[i] = layer->bias[i];
435
3.23k
  ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0);
436
3.23k
  ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
437
3.23k
  if (layer->net.full_connect.relu)
438
16.3k
    
for (i = 0; 4
i < db->rows;
i++16.3k
)
439
16.3k
      bptr[i] = ccv_max(0, bptr[i]); // relu
440
3.23k
  a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch;
441
3.23k
  a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type);
442
3.23k
}
Unexecuted instantiation: convnet.tests.c:_ccv_convnet_full_connect_forward_propagate
ccv_convnet.c:_ccv_convnet_full_connect_forward_propagate
Line
Count
Source
421
3.23k
{
422
3.23k
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
423
3.23k
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
424
3.23k
  int ch = CCV_GET_CHANNEL(a->type);
425
3.23k
  int rows = a->rows, cols = a->cols;
426
  // reshape a for gemm
427
3.23k
  assert(a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch);
428
3.23k
  a->rows = rows * cols * ch, a->cols = 1, a->type = (a->type - ch) | CCV_C1;
429
3.23k
  assert(a->rows * db->rows == layer->wnum);
430
3.23k
  a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type);
431
3.23k
  int i;
432
3.23k
  float* bptr = db->data.f32;
433
55.9k
  for (i = 0; i < db->rows; 
i++52.7k
)
434
52.7k
    bptr[i] = layer->bias[i];
435
3.23k
  ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0);
436
3.23k
  ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
437
3.23k
  if (layer->net.full_connect.relu)
438
16.3k
    
for (i = 0; 4
i < db->rows;
i++16.3k
)
439
16.3k
      bptr[i] = ccv_max(0, bptr[i]); // relu
440
3.23k
  a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch;
441
3.23k
  a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type);
442
3.23k
}
443
444
static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
{
	// Local response normalization (LRN) across channels within each partition:
	//   b[k] = a[k] * (kappa + alpha * sum_{x in [k-way, k+way]} a[x]^2)^(-beta)
	// where the sum window is clamped to the channels of the current partition.
	// When denoms is non-zero, the per-element denominator is stored as well so
	// a later pass can reuse it (two otherwise-identical loop nests below).
	int rows, cols, partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
	int size = layer->net.rnorm.size;
	float kappa = layer->net.rnorm.kappa;
	float alpha = layer->net.rnorm.alpha;
	float beta = layer->net.rnorm.beta;
	int way = size / 2; // half-width of the cross-channel window
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, p;
	float* ap = a->data.f32;
	float* bp = db->data.f32;
	int ch_per_partition = ch / partition;
	if (denoms)
	{
		ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0);
		float* dp = ddenoms->data.f32;
		for (i = 0; i < db->rows; i++)
		{
			for (j = 0; j < db->cols; j++)
				for (p = 0; p < partition; p++)
					for (k = 0; k < ch_per_partition; k++)
					{
						float v = ap[j * ch + p * ch_per_partition + k];
						float denom = 0;
						// sum of squares over the clamped channel neighborhood
						for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
							denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
						denom = kappa + alpha * denom;
						dp[j * ch + p * ch_per_partition + k] = denom; // cached for reuse
						bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
					}
			// advance all three row pointers by one full row
			ap += a->cols * ch;
			dp += ddenoms->cols * ch;
			bp += db->cols * ch;
		}
	} else {
		// same computation without storing the denominators
		for (i = 0; i < db->rows; i++)
		{
			for (j = 0; j < db->cols; j++)
				for (p = 0; p < partition; p++)
					for (k = 0; k < ch_per_partition; k++)
					{
						float v = ap[j * ch + p * ch_per_partition + k];
						float denom = 0;
						for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
							denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
						denom = kappa + alpha * denom;
						bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
					}
			ap += a->cols * ch;
			bp += db->cols * ch;
		}
	}
}
502
503
static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	// Max pooling: every output cell takes the maximum over a size x size
	// window of the input, moved with the given strides; the window is
	// clamped where it would overhang the (border-padded) input edges.
	int rows, cols, partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
	int size = layer->net.pool.size;
	int strides = layer->net.pool.strides;
	int border = layer->net.pool.border;
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, y;
	float* ap = a->data.f32;
	float* bp = db->data.f32;
	for (i = 0; i < db->rows; i++)
	{
		// clamp the vertical window extent against the input borders
		const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
		const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border);
		for (j = 0; j < db->cols; j++)
		{
			// clamp the horizontal window extent against the input borders
			const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
			const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border);
			for (k = 0; k < ch; k++)
			{
				float v = 0;
				for (y = start_y; y < end_y; y++)
					for (x = start_x; x < end_x; x++)
						// seed v from the first in-window element (rather than 0)
						// so all-negative windows still produce the true maximum
						if (x == start_x && y == start_y)
							v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
						else if (ap[(j * strides - border + x + (y - border) * a->cols) * ch + k] > v)
							v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
				bp[j * ch + k] = v;
			}
		}
		// input advances by `strides` rows per output row
		ap += a->cols * ch * strides;
		bp += db->cols * ch;
	}
}
541
542
static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	// Average pooling: every output cell is the mean over a size x size window
	// of the input, moved with the given strides; the window is clamped where
	// it would overhang the (border-padded) input edges, and the divisor is
	// the actual clamped window area.
	int out_rows, out_cols, out_partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &out_rows, &out_cols, &out_partition);
	int size = layer->net.pool.size;
	int strides = layer->net.pool.strides;
	int border = layer->net.pool.border;
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, out_rows, out_cols, type, type, 0);
	int i, j, k, x, y;
	float* row_in = a->data.f32;
	float* row_out = db->data.f32;
	for (i = 0; i < db->rows; i++, row_in += a->cols * ch * strides, row_out += db->cols * ch)
	{
		// clamp the vertical window extent against the input borders
		const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
		const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border);
		for (j = 0; j < db->cols; j++)
		{
			// clamp the horizontal window extent against the input borders
			const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
			const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border);
			for (k = 0; k < ch; k++)
			{
				float sum = 0;
				for (y = start_y; y < end_y; y++)
					for (x = start_x; x < end_x; x++)
						sum += row_in[(j * strides - border + x + (y - border) * a->cols) * ch + k];
				// divide by the effective (possibly clamped) window area
				row_out[j * ch + k] = sum / ((end_x - start_x) * (end_y - start_y));
			}
		}
	}
}
577
578
static void _ccv_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
{
	// Dispatch forward propagation to the implementation for this layer type.
	// denoms is only consumed by the local response normalization layer; the
	// other layer types ignore it.
	switch(layer->type)
	{
		case CCV_CONVNET_CONVOLUTIONAL:
			_ccv_convnet_convolutional_forward_propagate(layer, a, b);
			break;
		case CCV_CONVNET_FULL_CONNECT:
			_ccv_convnet_full_connect_forward_propagate(layer, a, b);
			break;
		case CCV_CONVNET_LOCAL_RESPONSE_NORM:
			_ccv_convnet_rnorm_forward_propagate(layer, a, b, denoms);
			break;
		case CCV_CONVNET_MAX_POOL:
			_ccv_convnet_max_pool_forward_propagate(layer, a, b);
			break;
		case CCV_CONVNET_AVERAGE_POOL:
			_ccv_convnet_average_pool_forward_propagate(layer, a, b);
			break;
		default:
			// Previously an unknown layer type fell through silently, leaving
			// *b stale; fail fast in debug builds instead of propagating
			// garbage through the rest of the network.
			assert(0 && "unsupported layer type in forward propagation");
			break;
	}
}
599
600
static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	// Batched fully-connected forward pass: every row of a is one input
	// vector, every row of the output is bias + W * input (with optional relu).
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, layer->net.full_connect.count, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
	int row, col;
	// seed every output row with the bias vector so gemm can accumulate on top
	float* out = db->data.f32;
	for (row = 0; row < db->rows; row++, out += db->cols)
		for (col = 0; col < db->cols; col++)
			out[col] = layer->bias[col];
	ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0);
	ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
	if (layer->net.full_connect.relu)
	{
		out = db->data.f32;
		for (row = 0; row < db->rows; row++, out += db->cols)
			for (col = 0; col < db->cols; col++)
				out[col] = ccv_max(0, out[col]); // relu
	}
}
624
625
static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
{
	// Averages the softmax of every row of a (each row is one logit vector)
	// into a single 1 x a->cols output row. The type parameter is unused here.
	assert(CCV_GET_CHANNEL(a->type) == CCV_C1);
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, 1, a->cols, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
	ccv_zero(db);
	int row, col;
	float* logits = a->data.f32;
	float* acc = db->data.f32;
	float* scratch = (float*)ccmalloc(sizeof(float) * a->cols);
	for (row = 0; row < a->rows; row++, logits += a->cols)
	{
		// subtract the row maximum before exponentiation for numerical stability
		double max = logits[0];
		for (col = 1; col < a->cols; col++)
			if (logits[col] > max)
				max = logits[col];
		double tt = 0;
		for (col = 0; col < a->cols; col++)
			tt += (scratch[col] = expf(logits[col] - max));
		tt = 1.0 / tt;
		// accumulate this row's normalized probabilities into the output
		for (col = 0; col < a->cols; col++)
			acc[col] += scratch[col] * tt;
	}
	ccfree(scratch);
}
651
652
#ifndef CASE_TESTS
653
654
void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch)
{
	// Runs the full forward pass over one input, writing the final layer's
	// activation to *b. Delegates to the GPU path when cwc acceleration is on;
	// the CPU path below reuses convnet->acts as the per-layer activation cache.
#ifdef HAVE_CUDA
	if (convnet->use_cwc_accel)
		cwc_convnet_encode(convnet, a, b, batch);
	else {
#endif
	// CPU path handles exactly one input at a time
	assert(batch == 1);
	assert(CCV_GET_CHANNEL((*a)->type) == convnet->channels);
	assert((*a)->rows == convnet->rows);
	assert((*a)->cols == convnet->cols);
	int i;
	// save the last layer of neuron cache in case that we encode to a different matrix
	ccv_dense_matrix_t* out_neuron = convnet->acts[convnet->count - 1];
	convnet->acts[convnet->count - 1] = *b;
	// first layer consumes *a directly; subsequent layers chain off the cache
	_ccv_convnet_layer_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms);
	for (i = 1; i < convnet->count; i++)
		_ccv_convnet_layer_forward_propagate(convnet->layers + i, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i);
	if (convnet->acts + convnet->count - 1 != b)
	{
		// caller's slot differs from the cache slot: hand the result back
		*b = convnet->acts[convnet->count - 1];
		// restore the last layer of neuron cache
		convnet->acts[convnet->count - 1] = out_neuron;
	}
#ifdef HAVE_CUDA
	}
#endif
}
682
683
// find the layer for scanning (it is the last convolutional layer)
684
// find the layer for scanning (it is the last convolutional layer)
static int _ccv_convnet_find_scan(ccv_convnet_t* convnet)
{
	// walk backwards so the first match is the last convolutional layer
	int i;
	for (i = convnet->count - 1; i >= 0; i--)
		if (convnet->layers[i].type == CCV_CONVNET_CONVOLUTIONAL)
			return i;
	return -1; // no convolutional layer in the network
}
693
694
static int _ccv_convnet_derive_scale(ccv_convnet_t* convnet, int scan)
{
	// Accumulate the total down-sampling factor from layer 0 through the scan
	// layer: convolutional and pooling layers each shrink by their strides,
	// all other layer types leave the spatial resolution unchanged.
	int scale = 1;
	int i;
	for (i = scan; i >= 0; i--)
	{
		const ccv_convnet_layer_t* layer = convnet->layers + i;
		if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
			scale *= layer->net.convolutional.strides;
		else if (layer->type == CCV_CONVNET_MAX_POOL || layer->type == CCV_CONVNET_AVERAGE_POOL)
			scale *= layer->net.pool.strides;
	}
	return scale;
}
713
714
static int _ccv_convnet_find_full_connect(ccv_convnet_t* convnet)
{
	// scan forward for the first fully-connected layer
	int i = 0;
	while (i < convnet->count && convnet->layers[i].type != CCV_CONVNET_FULL_CONNECT)
		i++;
	return i < convnet->count ? i : -1; // -1 when none exists
}
722
723
void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch)
{
	// Multi-view classification: for each input, runs the convolutional
	// front-end once, slices 5 crops (corners + center) of its output — and,
	// when symmetric, repeats on the horizontally flipped input for 10 views —
	// then pushes all views through the fully-connected tail in one batched
	// gemm, averages the softmax over the views, and emits the top `tops`
	// classifications into ranks[i].
#ifdef HAVE_CUDA
	if (convnet->use_cwc_accel)
		cwc_convnet_classify(convnet, a, symmetric, ranks, tops, batch);
	else {
#endif
	int i, j, k, t;
	// b[j] holds the activation output of layer j-1 (b[0] is the prepared input)
	ccv_dense_matrix_t** b = (ccv_dense_matrix_t**)alloca(sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
	int scan = _ccv_convnet_find_scan(convnet);
	int scale = _ccv_convnet_derive_scale(convnet, scan);
	int full_connect = _ccv_convnet_find_full_connect(convnet);
	assert(scan >= 0 && scan < convnet->count);
	assert(full_connect >= 0 && full_connect < convnet->count);
	memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
	for (i = 0; i < batch; i++)
	{
		assert(CCV_GET_CHANNEL(a[i]->type) == convnet->channels);
		assert(a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width);
		assert(a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width);
		// find optimal rows and cols to slice to: the largest size that still
		// maps onto the network's stride grid (multiples of `scale`)
		int rows = convnet->rows + ((a[i]->rows - convnet->rows) / scale) * scale;
		int cols = convnet->cols + ((a[i]->cols - convnet->cols) / scale) * scale;
		assert(rows == convnet->input.height || cols == convnet->input.width);
		assert(rows <= a[i]->rows && cols <= a[i]->cols);
		ccv_dense_matrix_t* slice = 0;
		ccv_slice(a[i], (ccv_matrix_t**)&slice, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols);
		ccv_dense_matrix_t* mean_activity = 0;
		// scale mean activity up to be subtractable (from this one, the CPU implementation is an approximation of GPU implementation)
		ccv_resample(convnet->mean_activity, &mean_activity, 0, (double)rows / (double)convnet->mean_activity->rows, (double)cols / (double)convnet->mean_activity->cols, CCV_INTER_CUBIC);
		ccv_subtract(slice, mean_activity, (ccv_matrix_t**)b, CCV_32F);
		ccv_matrix_free(mean_activity);
		ccv_matrix_free(slice);
		// doing the first few layers until the first scan layer
		int out_rows, out_cols, out_partition;
		// c collects one flattened feature vector per view (5 or 10 rows)
		ccv_dense_matrix_t* c = ccv_dense_matrix_new(5 * (!!symmetric + 1), convnet->layers[full_connect].input.node.count, CCV_32F | CCV_C1, 0, 0);
		for (t = 0; t <= !!symmetric; t++)
		{
			rows = b[0]->rows, cols = b[0]->cols;
			for (j = 0; j < scan + 1; j++)
			{
				ccv_convnet_layer_t* layer = convnet->layers + j;
				ccv_convnet_make_output(layer, rows, cols, &out_rows, &out_cols, &out_partition);
				_ccv_convnet_layer_forward_propagate(layer, b[j], b + j + 1, 0);
				assert(b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols);
				if (j > 0)
					ccv_matrix_free(b[j]);
				rows = out_rows, cols = out_cols;
			}
			// 5 crop offsets into the scan layer's output: top-left, top-right,
			// center, bottom-left, bottom-right ({x, y} pairs)
			int offsets[5][2] = {
				{0, 0},
				{cols - convnet->layers[scan + 1].input.matrix.cols, 0},
				{(cols - convnet->layers[scan + 1].input.matrix.cols) / 2, (rows - convnet->layers[scan + 1].input.matrix.rows) / 2},
				{0, rows - convnet->layers[scan + 1].input.matrix.rows},
				{cols - convnet->layers[scan + 1].input.matrix.cols, rows - convnet->layers[scan + 1].input.matrix.rows},
			};
			for (k = 0; k < 5; k++)
			{
				ccv_dense_matrix_t* input = 0;
				ccv_convnet_layer_t* layer = convnet->layers + scan + 1;
				ccv_slice(b[scan + 1], (ccv_matrix_t**)&input, CCV_32F, offsets[k][1], offsets[k][0], layer->input.matrix.rows, layer->input.matrix.cols);
				// copy the last layer for full connect compute: alias a
				// no-alloc matrix over this view's row inside c
				b[full_connect] = ccv_dense_matrix_new(convnet->layers[full_connect].input.matrix.rows, convnet->layers[full_connect].input.matrix.cols, CCV_NO_DATA_ALLOC | CCV_32F | convnet->layers[full_connect].input.matrix.channels, c->data.f32 + (t * 5 + k) * convnet->layers[full_connect].input.node.count, 0);
				for (j = scan + 1; j < full_connect; j++)
				{
					layer = convnet->layers + j;
					_ccv_convnet_layer_forward_propagate(layer, j > scan + 1 ? b[j] : input, b + j + 1, 0);
					if (j > scan + 1)
						ccv_matrix_free(b[j]);
					else
						ccv_matrix_free(input);
				}
				// frees only the header; the data lives in c
				ccv_matrix_free(b[full_connect]);
				// set it to 0
				memset(b + scan + 2, 0, sizeof(ccv_dense_matrix_t*) * (full_connect - scan - 1));
			}
			ccv_matrix_free(b[scan + 1]);
			memset(b + 1, 0, sizeof(ccv_dense_matrix_t*) * (scan + 1));
			// mirror the input in place for the second (symmetric) pass
			if (t < !!symmetric)
				ccv_flip(b[0], &b[0], 0, CCV_FLIP_X);
		}
		ccv_matrix_free(b[0]);
		// now have everything in c, do the last full connect propagate
		b[full_connect] = c;
		for (j = full_connect; j < convnet->count; j++)
		{
			ccv_convnet_layer_t* layer = convnet->layers + j;
			assert(layer->type == CCV_CONVNET_FULL_CONNECT);
			_ccv_convnet_full_connect_forward_propagate_parallel(layer, b[j], b + j + 1);
			ccv_matrix_free(b[j]);
		}
		ccv_dense_matrix_t* softmax = 0;
		_ccv_convnet_compute_softmax_parallel(b[convnet->count], &softmax, 0);
		ccv_matrix_free(b[convnet->count]);
		ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0);
		float* r = softmax->data.f32;
		assert(tops <= softmax->cols);
		// repeated selection of the maximum; each pick is knocked out with -1
		for (j = 0; j < tops; j++)
		{
			float max_val = -1;
			int max_idx = -1;
			for (k = 0; k < softmax->cols; k++)
				if (r[k] >= 0 && r[k] > max_val)
					max_val = r[k], max_idx = k;
			assert(max_idx >= 0);
			r[max_idx] = -1;
			ccv_classification_t classification = {
				.id = max_idx,
				// normalize the summed softmax by the number of views
				.confidence = max_val / ((!!symmetric + 1) * 5),
			};
			ccv_array_push(ranks[i], &classification);
		}
		ccv_matrix_free(softmax);
		memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
	}
#ifdef HAVE_CUDA
	}
#endif
}
842
843
#endif
844
845
#ifdef HAVE_GSL
846
847
// compute back propagated gradient & weight update delta
static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
{
	// a is the input gradient (for back prop), i.e. the gradient w.r.t. this layer's output
	// n is the output from forward prop; it gates the gradient through the ReLU (only cells with n > 0 pass)
	// m is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
	// update_params accumulates the weight gradient (->w) and bias gradient (->bias) for this layer
	int rows, cols, partition;
	ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
	int ch = layer->net.convolutional.channels;
	int count = layer->net.convolutional.count;
	int strides = layer->net.convolutional.strides;
	int border = layer->net.convolutional.border;
	int kernel_rows = layer->net.convolutional.rows;
	int kernel_cols = layer->net.convolutional.cols;
	assert(a->rows == rows);
	assert(a->cols == cols);
	assert(CCV_GET_CHANNEL(a->type) == count);
	// temporarily reinterpret a as rows x cols with `count` channels; restored at the end
	int a_rows = a->rows, a_cols = a->cols, a_ch = CCV_GET_CHANNEL(a->type);
	a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count;
	assert(CCV_GET_CHANNEL(m->type) == ch);
	assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F);
	int count_per_partition = count / partition;
	int ch_per_partition = ch / partition;
	// update weight gradient: one parallel task per output filter k
	parallel_for(k, count) {
		int i, j, x, y, c;
		int p = k / count_per_partition; // which partition this filter belongs to
		float* mp = m->data.f32 + p * ch_per_partition;
		float* ap = a->data.f32 + k;
		float* np = n->data.f32 + k;
		float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition;
		float bias = 0;
		for (i = 0; i < rows; i++)
		{
			// comy/maxy clip the kernel window vertically against the zero-padding border
			int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows));
			comy *= ch_per_partition * kernel_cols;
			for (j = 0; j < cols; j++)
			{
				if (np[j * count] > 0)
				{ /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
					float v = ap[j * count];
					bias += v;
					// comx/maxx clip the kernel window horizontally against the border
					int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
					int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols));
					float* w = update_w + comx * ch_per_partition + comy;
					float* mpz = mp + ccv_max(j * strides - border, 0) * ch;
					/* when we have border, we simply do zero padding */
					for (y = 0; y < maxy; y++)
					{
						for (x = 0; x < maxx; x++)
							for (c = 0; c < ch_per_partition; c++)
								w[x * ch_per_partition + c] += v * mpz[x * ch + c];
						w += kernel_cols * ch_per_partition;
						mpz += m->cols * ch;
					}
				}
			}
			ap += a->cols * count;
			np += n->cols * count;
			mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
		}
		update_params->bias[k] += bias;
	} parallel_endfor
	if (b)
	{
		// propagate the error back to the layer input: db accumulates v * w over each kernel window
		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0);
		// clear it up before propagate result
		ccv_zero(db);
		int k;
		for (k = 0; k < count; k++)
		{
			int i, j, x, y, c;
			int p = k / count_per_partition;
			float* bp = db->data.f32 + p * ch_per_partition;
			float* ap = a->data.f32 + k;
			float* np = n->data.f32 + k;
			float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
			for (i = 0; i < rows; i++)
			{
				// same vertical window clipping as in the weight-gradient pass above
				int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
				int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows));
				comy *= ch_per_partition * kernel_cols;
				for (j = 0; j < cols; j++)
				{
					if (np[j * count] > 0)
					{ /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
						float v = ap[j * count];
						int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
						int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols));
						float* w = layer_w + comx * ch_per_partition + comy;
						float* bpz = bp + ccv_max(j * strides - border, 0) * ch;
						/* when we have border, we simply do zero padding */
						for (y = 0; y < maxy; y++)
						{
							for (x = 0; x < maxx; x++)
								for (c = 0; c < ch_per_partition; c++)
									bpz[x * ch + c] += v * w[x * ch_per_partition + c];
							w += kernel_cols * ch_per_partition;
							bpz += db->cols * ch;
						}
					}
				}
				ap += a->cols * count;
				np += n->cols * count;
				bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
			}
		}
	}
	// restore a's original shape / channel layout
	a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch;
}
convnet.tests.c:_ccv_convnet_convolutional_backward_propagate
Line
Count
Source
849
7
{
850
  // a is the input gradient (for back prop).
851
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
852
  // note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it
853
7
  int rows, cols, partition;
854
7
  ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
855
7
  int ch = layer->net.convolutional.channels;
856
7
  int count = layer->net.convolutional.count;
857
7
  int strides = layer->net.convolutional.strides;
858
7
  int border = layer->net.convolutional.border;
859
7
  int kernel_rows = layer->net.convolutional.rows;
860
7
  int kernel_cols = layer->net.convolutional.cols;
861
7
  assert(a->rows == rows);
862
7
  assert(a->cols == cols);
863
7
  assert(CCV_GET_CHANNEL(a->type) == count);
864
7
  int a_rows = a->rows, a_cols = a->cols, a_ch = CCV_GET_CHANNEL(a->type);
865
7
  a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count;
866
7
  assert(CCV_GET_CHANNEL(m->type) == ch);
867
7
  assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F);
868
7
  int count_per_partition = count / partition;
869
7
  int ch_per_partition = ch / partition;
870
  // update weight gradient
871
60
  
parallel_for7
(k, count) {
872
60
    int i, j, x, y, c;
873
60
    int p = k / count_per_partition;
874
60
    float* mp = m->data.f32 + p * ch_per_partition;
875
60
    float* ap = a->data.f32 + k;
876
60
    float* np = n->data.f32 + k;
877
60
    float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition;
878
60
    float bias = 0;
879
1.81k
    for (i = 0; i < rows; 
i++1.75k
)
880
1.75k
    {
881
1.75k
      int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
882
1.75k
      int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows));
883
1.75k
      comy *= ch_per_partition * kernel_cols;
884
55.6k
      for (j = 0; j < cols; 
j++53.9k
)
885
53.9k
      {
886
53.9k
        if (np[j * count] > 0)
887
43.5k
        { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
888
43.5k
          float v = ap[j * count];
889
43.5k
          bias += v;
890
43.5k
          int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
891
43.5k
          int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols));
892
43.5k
          float* w = update_w + comx * ch_per_partition + comy;
893
43.5k
          float* mpz = mp + ccv_max(j * strides - border, 0) * ch;
894
          /* when we have border, we simply do zero padding */
895
252k
          for (y = 0; y < maxy; 
y++209k
)
896
209k
          {
897
1.21M
            for (x = 0; x < maxx; 
x++1.00M
)
898
3.77M
              
for (c = 0; 1.00M
c < ch_per_partition;
c++2.76M
)
899
2.76M
                w[x * ch_per_partition + c] += v * mpz[x * ch + c];
900
209k
            w += kernel_cols * ch_per_partition;
901
209k
            mpz += m->cols * ch;
902
209k
          }
903
43.5k
        }
904
53.9k
      }
905
1.75k
      ap += a->cols * count;
906
1.75k
      np += n->cols * count;
907
1.75k
      mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
908
1.75k
    }
909
60
    update_params->bias[k] += bias;
910
60
  } parallel_endfor
911
7
  if (b)
912
6
  {
913
6
    ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0);
914
    // clear it up before propagate result
915
6
    ccv_zero(db);
916
6
    int k;
917
62
    for (k = 0; k < count; 
k++56
)
918
56
    {
919
56
      int i, j, x, y, c;
920
56
      int p = k / count_per_partition;
921
56
      float* bp = db->data.f32 + p * ch_per_partition;
922
56
      float* ap = a->data.f32 + k;
923
56
      float* np = n->data.f32 + k;
924
56
      float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
925
1.79k
      for (i = 0; i < rows; 
i++1.73k
)
926
1.73k
      {
927
1.73k
        int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
928
1.73k
        int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows));
929
1.73k
        comy *= ch_per_partition * kernel_cols;
930
55.5k
        for (j = 0; j < cols; 
j++53.8k
)
931
53.8k
        {
932
53.8k
          if (np[j * count] > 0)
933
43.4k
          { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
934
43.4k
            float v = ap[j * count];
935
43.4k
            int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
936
43.4k
            int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols));
937
43.4k
            float* w = layer_w + comx * ch_per_partition + comy;
938
43.4k
            float* bpz = bp + ccv_max(j * strides - border, 0) * ch;
939
            /* when we have border, we simply do zero padding */
940
252k
            for (y = 0; y < maxy; 
y++209k
)
941
209k
            {
942
1.21M
              for (x = 0; x < maxx; 
x++1.00M
)
943
3.76M
                
for (c = 0; 1.00M
c < ch_per_partition;
c++2.76M
)
944
2.76M
                  bpz[x * ch + c] += v * w[x * ch_per_partition + c];
945
209k
              w += kernel_cols * ch_per_partition;
946
209k
              bpz += db->cols * ch;
947
209k
            }
948
43.4k
          }
949
53.8k
        }
950
1.73k
        ap += a->cols * count;
951
1.73k
        np += n->cols * count;
952
1.73k
        bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
953
1.73k
      }
954
56
    }
955
6
  }
956
7
  a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch;
957
7
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_convolutional_backward_propagate
958
959
static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
{
	// a is the input gradient (for back prop), y is the output (for forward prop)
	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
	// update_params accumulates the weight gradient (a * x^T) and bias gradient (a)
	ccv_dense_matrix_t* db = 0;
	if (b)
		db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0);
	// temporarily flatten x into a (rows * cols * channels) x 1 column vector; restored at the end
	int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type);
	x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1;
	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type);
	int i;
	// ReLU gate: zero the incoming gradient wherever the forward output was non-positive
	if (layer->net.full_connect.relu)
		for (i = 0; i < y->rows; i++)
			if (y->data.f32[i] <= 0)
				a->data.f32[i] = 0;
	ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0);
	ccv_dense_matrix_t* dw = &w;
	// compute bias gradient: dbias += a
	ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0);
	ccv_dense_matrix_t* dbias = &bias;
	ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0);
	// compute weight gradient: dw += a * x^T
	ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0);
	// rebind w to the layer's actual weights for the error propagation below
	w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0);
	// propagate error: db = w^T * a, computed on db viewed as a column vector,
	// then db is restored to x's original rows/cols/channel layout
	if (db)
	{
		db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1;
		db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type);
		ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0);
		db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch;
		db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type);
	}
	// restore x's original shape
	x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)) | x_ch;
	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type) * CCV_GET_CHANNEL(x->type);
}
convnet.tests.c:_ccv_convnet_full_connect_backward_propagate
Line
Count
Source
960
3
{
961
  // a is the input gradient (for back prop), y is the output (for forward prop)
962
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
963
3
  ccv_dense_matrix_t* db = 0;
964
3
  if (b)
965
3
    db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0);
966
3
  int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type);
967
3
  x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1;
968
3
  x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type);
969
3
  int i;
970
3
  if (layer->net.full_connect.relu)
971
0
    for (i = 0; i < y->rows; i++)
972
0
      if (y->data.f32[i] <= 0)
973
0
        a->data.f32[i] = 0;
974
3
  ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0);
975
3
  ccv_dense_matrix_t* dw = &w;
976
  // compute bias gradient
977
3
  ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0);
978
3
  ccv_dense_matrix_t* dbias = &bias;
979
3
  ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0);
980
  // compute weight gradient
981
3
  ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0);
982
3
  w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0);
983
  // propagate error
984
3
  if (db)
985
3
  {
986
3
    db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1;
987
3
    db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type);
988
3
    ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0);
989
3
    db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch;
990
3
    db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type);
991
3
  }
992
3
  x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)) | x_ch;
993
3
  x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type) * CCV_GET_CHANNEL(x->type);
994
3
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_full_connect_backward_propagate
995
996
static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t* denoms, ccv_dense_matrix_t** b)
{
	// back-propagate through local response normalization (across channels)
	// a: gradient w.r.t. the rnorm output; n: forward-prop output; m: forward-prop input
	// denoms: per-cell denominators saved by the forward pass
	// b: receives the gradient w.r.t. m
	int rows, cols, partition;
	ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
	int size = layer->net.rnorm.size;
	float alpha = layer->net.rnorm.alpha;
	float beta = layer->net.rnorm.beta;
	int way = size / 2; // half-window: each channel k interacts with channels [k - way, k + way]
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, p;
	float* ap = a->data.f32;
	float* np = n->data.f32;
	float* mp = m->data.f32;
	float* dp = denoms->data.f32;
	float* bp = db->data.f32;
	int ch_per_partition = ch / partition;
	for (i = 0; i < db->rows; i++)
	{
		for (j = 0; j < db->cols; j++)
			for (p = 0; p < partition; p++)
				for (k = 0; k < ch_per_partition; k++)
				{
					// nom: cross-channel term, summed over the (clipped) normalization window;
					// the derivative of denom^-beta w.r.t. each contributing input brings in -2 * alpha * beta
					float nom = 0;
					for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
						nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition];
					// total gradient: input * cross-channel term + direct term a * denom^-beta
					bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta);
				}
		// advance all row pointers to the next matrix row
		ap += a->cols * ch;
		np += n->cols * ch;
		mp += m->cols * ch;
		dp += denoms->cols * ch;
		bp += db->cols * ch;
	}
}
convnet.tests.c:_ccv_convnet_rnorm_backward_propagate
Line
Count
Source
997
4
{
998
4
  int rows, cols, partition;
999
4
  ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
1000
4
  int size = layer->net.rnorm.size;
1001
4
  float alpha = layer->net.rnorm.alpha;
1002
4
  float beta = layer->net.rnorm.beta;
1003
4
  int way = size / 2;
1004
4
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
1005
4
  int ch = CCV_GET_CHANNEL(a->type);
1006
4
  int type = CCV_32F | ch;
1007
4
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
1008
4
  int i, j, k, x, p;
1009
4
  float* ap = a->data.f32;
1010
4
  float* np = n->data.f32;
1011
4
  float* mp = m->data.f32;
1012
4
  float* dp = denoms->data.f32;
1013
4
  float* bp = db->data.f32;
1014
4
  int ch_per_partition = ch / partition;
1015
116
  for (i = 0; i < db->rows; 
i++112
)
1016
112
  {
1017
3.26k
    for (j = 0; j < db->cols; 
j++3.14k
)
1018
7.02k
      
for (p = 0; 3.14k
p < partition;
p++3.87k
)
1019
16.4k
        
for (k = 0; 3.87k
k < ch_per_partition;
k++12.5k
)
1020
12.5k
        {
1021
12.5k
          float nom = 0;
1022
42.6k
          for (x = 
ccv_max12.5k
(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1);
x++30.0k
)
1023
30.0k
            nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition];
1024
12.5k
          bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta);
1025
12.5k
        }
1026
112
    ap += a->cols * ch;
1027
112
    np += n->cols * ch;
1028
112
    mp += m->cols * ch;
1029
112
    dp += denoms->cols * ch;
1030
112
    bp += db->cols * ch;
1031
112
  }
1032
4
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_rnorm_backward_propagate
1033
1034
static void _ccv_convnet_max_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
{
	// Back-propagate through max pooling: each output cell's gradient is routed to the
	// input cell(s) that produced the maximum during forward prop.
	// a: gradient w.r.t. the pooled output; n: forward-prop output (the maxima);
	// m: forward-prop input; *b: receives the gradient w.r.t. m (zero elsewhere).
	// pooling layer doesn't need the dropout
	if (!b)
		return;
	assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type));
	assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
	const int ch = CCV_GET_CHANNEL(a->type);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
	ccv_zero(db);
	const int size = layer->net.pool.size;
	const int strides = layer->net.pool.strides;
	const int border = layer->net.pool.border;
	float* grad_row = a->data.f32;
	float* out_row = db->data.f32;
	float* max_row = n->data.f32;
	float* in_row = m->data.f32;
	int row, col, c, px, py;
	for (row = 0; row < a->rows; row++)
	{
		// clip the pooling window against the top / bottom zero-padding border
		const int start_y = ccv_max(row * strides - border, 0) - (row * strides - border);
		const int end_y = size + ccv_min(row * strides + size - border, db->rows) - (row * strides + size - border);
		for (col = 0; col < a->cols; col++)
		{
			// clip the pooling window against the left / right border
			const int start_x = ccv_max(col * strides - border, 0) - (col * strides - border);
			const int end_x = size + ccv_min(col * strides + size - border, db->cols) - (col * strides + size - border);
			for (c = 0; c < ch; c++)
			{
				const float max_val = max_row[col * ch + c];
				const float grad_val = grad_row[col * ch + c];
				for (py = start_y; py < end_y; py++)
					for (px = start_x; px < end_x; px++)
						// we have to do direct comparison otherwise it will contribute to too many cells
						// and the propagation won't work. But CPU will have different result comparing with GPU
						if (in_row[(col * strides - border + px + (py - border) * m->cols) * ch + c] == max_val)
							out_row[(col * strides - border + px + (py - border) * db->cols) * ch + c] += grad_val;
			}
		}
		grad_row += a->cols * ch;
		max_row += n->cols * ch;
		out_row += db->cols * ch * strides;
		in_row += m->cols * ch * strides;
	}
}
convnet.tests.c:_ccv_convnet_max_pool_backward_propagate
Line
Count
Source
1035
1
{
1036
  // a is the input gradient (for back prop), y is the output (from forward prop),
1037
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
1038
  // pooling layer doesn't need the dropout
1039
1
  if (b)
1040
1
  {
1041
1
    assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type));
1042
1
    assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
1043
1
    int ch = CCV_GET_CHANNEL(a->type);
1044
1
    ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
1045
1
    ccv_zero(db);
1046
1
    int size = layer->net.pool.size;
1047
1
    int strides = layer->net.pool.strides;
1048
1
    int border = layer->net.pool.border;
1049
1
    int i, j, k, x, y;
1050
1
    float* ap = a->data.f32;
1051
1
    float* bp = db->data.f32;
1052
1
    float* np = n->data.f32;
1053
1
    float* mp = m->data.f32;
1054
16
    for (i = 0; i < a->rows; 
i++15
)
1055
15
    {
1056
15
      const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
1057
15
      const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border);
1058
240
      for (j = 0; j < a->cols; 
j++225
)
1059
225
      {
1060
225
        const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
1061
225
        const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border);
1062
675
        for (k = 0; k < ch; 
k++450
)
1063
450
        {
1064
450
          float v = np[j * ch + k];
1065
450
          float u = ap[j * ch + k];
1066
1.80k
          for (y = start_y; y < end_y; 
y++1.35k
)
1067
5.40k
            
for (x = start_x; 1.35k
x < end_x;
x++4.05k
)
1068
              // we have to do direct comparison otherwise it will contribute to too many cells
1069
              // and the propagation won't work. But CPU will have different result comparing with GPU
1070
4.05k
              if (mp[(j * strides - border + x + (y - border) * m->cols) * ch + k] == v)
1071
450
                bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
1072
450
        }
1073
225
      }
1074
15
      ap += a->cols * ch;
1075
15
      np += n->cols * ch;
1076
15
      bp += db->cols * ch * strides;
1077
15
      mp += m->cols * ch * strides;
1078
15
    }
1079
1
  }
1080
1
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_max_pool_backward_propagate
1081
1082
static void _ccv_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
{
	// Back-propagate through average pooling: each output gradient is spread uniformly
	// over the input cells of its (border-clipped) pooling window.
	// a: gradient w.r.t. the pooled output; m: forward-prop input;
	// *b: receives the gradient w.r.t. m.
	// pooling layer doesn't need the dropout
	if (!b)
		return;
	assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
	const int ch = CCV_GET_CHANNEL(a->type);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
	ccv_zero(db);
	const int size = layer->net.pool.size;
	const int strides = layer->net.pool.strides;
	const int border = layer->net.pool.border;
	float* grad_row = a->data.f32;
	float* out_row = db->data.f32;
	int row, col, c, px, py;
	for (row = 0; row < a->rows; row++)
	{
		// clip the pooling window against the top / bottom zero-padding border
		const int start_y = ccv_max(row * strides - border, 0) - (row * strides - border);
		const int end_y = size + ccv_min(row * strides + size - border, db->rows) - (row * strides + size - border);
		for (col = 0; col < a->cols; col++)
		{
			// clip the pooling window against the left / right border
			const int start_x = ccv_max(col * strides - border, 0) - (col * strides - border);
			const int end_x = size + ccv_min(col * strides + size - border, db->cols) - (col * strides + size - border);
			for (c = 0; c < ch; c++)
			{
				// divide by the actual (clipped) window area so the spread gradient averages out
				const float share = grad_row[col * ch + c] / ((end_x - start_x) * (end_y - start_y));
				for (py = start_y; py < end_y; py++)
					for (px = start_x; px < end_x; px++)
						out_row[(col * strides - border + px + (py - border) * db->cols) * ch + c] += share;
			}
		}
		grad_row += a->cols * ch;
		out_row += db->cols * ch * strides;
	}
}
convnet.tests.c:_ccv_convnet_average_pool_backward_propagate
Line
Count
Source
1083
1
{
1084
  // a is the input gradient (for back prop), y is the output (from forward prop),
1085
  // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
1086
  // pooling layer doesn't need the dropout
1087
1
  if (b)
1088
1
  {
1089
1
    assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
1090
1
    int ch = CCV_GET_CHANNEL(a->type);
1091
1
    ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
1092
1
    ccv_zero(db);
1093
1
    int size = layer->net.pool.size;
1094
1
    int strides = layer->net.pool.strides;
1095
1
    int border = layer->net.pool.border;
1096
1
    int i, j, k, x, y;
1097
1
    float* ap = a->data.f32;
1098
1
    float* bp = db->data.f32;
1099
16
    for (i = 0; i < a->rows; 
i++15
)
1100
15
    {
1101
15
      const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
1102
15
      const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border);
1103
240
      for (j = 0; j < a->cols; 
j++225
)
1104
225
      {
1105
225
        const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
1106
225
        const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border);
1107
675
        for (k = 0; k < ch; 
k++450
)
1108
450
        {
1109
450
          float u = ap[j * ch + k] / ((end_x - start_x) * (end_y - start_y));
1110
1.80k
          for (y = start_y; y < end_y; 
y++1.35k
)
1111
5.40k
            
for (x = start_x; 1.35k
x < end_x;
x++4.05k
)
1112
4.05k
              bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
1113
450
        }
1114
225
      }
1115
15
      ap += a->cols * ch;
1116
15
      bp += db->cols * ch * strides;
1117
15
    }
1118
1
  }
1119
1
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_average_pool_backward_propagate
1120
1121
/* Back-propagate the loss gradient through the whole network, accumulating
 * per-layer weight/bias gradients into update_params.
 * a      : the original network input (forward input of layer 0)
 * dloss  : -dL/dx at the softmax output (see the training loop)
 * The output layer must be full-connect; its gradient is propagated first,
 * then every remaining layer is walked from back to front. */
static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, ccv_dense_matrix_t* dloss, ccv_convnet_t* update_params)
{
	const int last = convnet->count - 1;
	ccv_convnet_layer_t* layer = convnet->layers + last;
	// the last layer has to be full-connect so it can act as the softmax layer
	assert(layer->type == CCV_CONVNET_FULL_CONNECT);
	_ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[last], convnet->acts[last - 1], last > 0 ? update_params->acts + last - 1 : 0, update_params->layers + last);
	int i;
	for (i = last - 1; i >= 0; i--)
	{
		layer = convnet->layers + i;
		// forward-pass input of this layer: the previous activation, or the network input for layer 0
		ccv_dense_matrix_t* x = i > 0 ? convnet->acts[i - 1] : a;
		// destination for the propagated error; layer 0 has no upstream to propagate to
		ccv_dense_matrix_t** b = i > 0 ? update_params->acts + i - 1 : 0;
		switch (layer->type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				_ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], x, b, update_params->layers + i);
				break;
			case CCV_CONVNET_FULL_CONNECT:
				_ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], x, b, update_params->layers + i);
				break;
			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
				_ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], x, convnet->denoms[i], b);
				break;
			case CCV_CONVNET_MAX_POOL:
				_ccv_convnet_max_pool_backward_propagate(layer, update_params->acts[i], convnet->acts[i], x, b);
				break;
			case CCV_CONVNET_AVERAGE_POOL:
				_ccv_convnet_average_pool_backward_propagate(layer, update_params->acts[i], x, b);
				break;
		}
	}
}
convnet.tests.c:_ccv_convnet_propagate_loss
Line
Count
Source
1122
1
{
1123
1
  int i;
1124
1
  ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1;
1125
1
  assert(layer->type == CCV_CONVNET_FULL_CONNECT); // the last layer has too be a full connect one to generate softmax result
1126
1
  _ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 
00
, update_params->layers + convnet->count - 1);
1127
2
  for (i = convnet->count - 2; i >= 0; 
i--1
)
1128
1
  {
1129
1
    layer = convnet->layers + i;
1130
1
    switch (layer->type)
1131
1
    {
1132
1
      case CCV_CONVNET_CONVOLUTIONAL:
1133
1
        _ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? 
convnet->acts[i - 1]0
: a, i > 0 ?
update_params->acts + i - 10
: 0, update_params->layers + i);
1134
1
        break;
1135
0
      case CCV_CONVNET_FULL_CONNECT:
1136
0
        _ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
1137
0
        break;
1138
0
      case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1139
0
        _ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? update_params->acts + i - 1 : 0);
1140
0
        break;
1141
0
      case CCV_CONVNET_MAX_POOL:
1142
0
        _ccv_convnet_max_pool_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
1143
0
        break;
1144
0
      case CCV_CONVNET_AVERAGE_POOL:
1145
0
        _ccv_convnet_average_pool_backward_propagate(layer, update_params->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
1146
0
        break;
1147
1
    }
1148
1
  }
1149
1
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_propagate_loss
1150
1151
/* One momentum-SGD step applied in place over n parameters:
 *   vx[j] = momentum * vx[j] - decay * learn_rate * x[j] + (learn_rate / batch) * dx[j]
 *   x[j] += vx[j]
 * x  : the parameters being trained
 * vx : the momentum (velocity) buffer
 * dx : the gradient accumulated over the mini-batch (hence the / batch) */
static void _ccv_convnet_sgd_step(float* x, float* vx, const float* dx, int n, float learn_rate, float momentum, float decay, int batch)
{
	int j;
	float rate = learn_rate / batch;
	for (j = 0; j < n; j++)
	{
		vx[j] = momentum * vx[j] - decay * learn_rate * x[j] + rate * dx[j];
		x[j] += vx[j];
	}
}

/* Apply the accumulated gradients in update_params to convnet's weights with
 * momentum SGD, using per-layer hyper-parameters from layer_params.
 * Only convolutional and full-connect layers carry weights; pooling and
 * local-response-norm layers are intentionally untouched.
 * The weight-vs-bias update logic was previously copy-pasted four times; it is
 * now factored into _ccv_convnet_sgd_step (identical float arithmetic). */
static void _ccv_convnet_update(ccv_convnet_t* convnet, int batch, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params)
{
	int i;
	for (i = 0; i < convnet->count; i++)
		switch (update_params->layers[i].type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				_ccv_convnet_sgd_step(convnet->layers[i].w, momentum->layers[i].w, update_params->layers[i].w, convnet->layers[i].wnum, layer_params[i].w.learn_rate, layer_params[i].w.momentum, layer_params[i].w.decay, batch);
				_ccv_convnet_sgd_step(convnet->layers[i].bias, momentum->layers[i].bias, update_params->layers[i].bias, convnet->layers[i].net.convolutional.count, layer_params[i].bias.learn_rate, layer_params[i].bias.momentum, layer_params[i].bias.decay, batch);
				break;
			case CCV_CONVNET_FULL_CONNECT:
				_ccv_convnet_sgd_step(convnet->layers[i].w, momentum->layers[i].w, update_params->layers[i].w, convnet->layers[i].wnum, layer_params[i].w.learn_rate, layer_params[i].w.momentum, layer_params[i].w.decay, batch);
				_ccv_convnet_sgd_step(convnet->layers[i].bias, momentum->layers[i].bias, update_params->layers[i].bias, convnet->layers[i].net.full_connect.count, layer_params[i].bias.learn_rate, layer_params[i].bias.momentum, layer_params[i].bias.decay, batch);
				break;
		}
}
Unexecuted instantiation: convnet.tests.c:_ccv_convnet_update
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_update
1204
1205
/* Reset every accumulated gradient buffer in update_params to zero so the
 * next mini-batch starts from a clean slate. Only convolutional and
 * full-connect layers own gradient storage. */
static void _ccv_convnet_update_zero(ccv_convnet_t* update_params)
{
	int k;
	for (k = 0; k < update_params->count; k++)
	{
		ccv_convnet_layer_t* layer = update_params->layers + k;
		if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
		{
			memset(layer->w, 0, sizeof(float) * layer->wnum);
			memset(layer->bias, 0, sizeof(float) * layer->net.convolutional.count);
		} else if (layer->type == CCV_CONVNET_FULL_CONNECT) {
			assert(layer->wnum % layer->net.full_connect.count == 0);
			memset(layer->w, 0, sizeof(float) * layer->wnum);
			memset(layer->bias, 0, sizeof(float) * layer->net.full_connect.count);
		}
	}
}
convnet.tests.c:_ccv_convnet_update_zero
Line
Count
Source
1206
9
{
1207
9
  int i;
1208
20
  for (i = 0; i < update_params->count; 
i++11
)
1209
11
    switch (update_params->layers[i].type)
1210
11
    {
1211
7
      case CCV_CONVNET_CONVOLUTIONAL:
1212
7
        memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
1213
7
        memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.convolutional.count);
1214
7
        break;
1215
3
      case CCV_CONVNET_FULL_CONNECT:
1216
3
        assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0);
1217
3
        memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
1218
3
        memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.full_connect.count);
1219
3
        break;
1220
11
    }
1221
9
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_update_zero
1222
1223
/* Allocate a gradient-accumulator twin of convnet: same layer topology, but
 * with zeroed weight/bias buffers (for gradient accumulation) and no denoms /
 * mean activity. Caller owns the result and frees it with ccv_convnet_free. */
static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet)
{
	// one contiguous allocation: header, then layer descriptors, then the act pointer table
	ccv_convnet_t* update_params = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(ccv_dense_matrix_t*) * convnet->count);
	update_params->reserved = 0;
	update_params->layers = (ccv_convnet_layer_t*)(update_params + 1);
	update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count);
	memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count);
	update_params->denoms = 0; // gradient twin never needs rnorm denominators
	update_params->input = convnet->input;
	update_params->rows = convnet->rows;
	update_params->cols = convnet->cols;
	update_params->count = convnet->count;
	update_params->channels = convnet->channels;
	update_params->mean_activity = 0;
	int k;
	for (k = 0; k < convnet->count; k++)
	{
		ccv_convnet_layer_t* layer = update_params->layers + k;
		layer->type = convnet->layers[k].type;
		layer->input = convnet->layers[k].input;
		layer->net = convnet->layers[k].net;
		layer->wnum = convnet->layers[k].wnum;
		layer->reserved = 0;
		switch (layer->type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				// weights and biases live in one zero-initialized slab; bias trails the weights
				layer->w = (float*)cccalloc(layer->wnum + layer->net.convolutional.count, sizeof(float));
				layer->bias = layer->w + layer->wnum;
				break;
			case CCV_CONVNET_FULL_CONNECT:
				assert(layer->wnum % layer->net.full_connect.count == 0);
				layer->w = (float*)cccalloc(layer->wnum + layer->net.full_connect.count, sizeof(float));
				layer->bias = layer->w + layer->wnum;
				break;
			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
			case CCV_CONVNET_MAX_POOL:
			case CCV_CONVNET_AVERAGE_POOL:
				// these layer types have no trainable parameters
				layer->w = 0;
				layer->bias = 0;
				break;
		}
	}
	return update_params;
}
convnet.tests.c:_ccv_convnet_update_new
Line
Count
Source
1224
8
{
1225
8
  ccv_convnet_t* update_params = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(ccv_dense_matrix_t*) * convnet->count);
1226
8
  update_params->reserved = 0;
1227
8
  update_params->layers = (ccv_convnet_layer_t*)(update_params + 1);
1228
8
  update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count);
1229
8
  memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count);
1230
8
  update_params->denoms = 0;
1231
8
  update_params->input = convnet->input;
1232
8
  update_params->rows = convnet->rows;
1233
8
  update_params->cols = convnet->cols;
1234
8
  update_params->count = convnet->count;
1235
8
  update_params->channels = convnet->channels;
1236
8
  update_params->mean_activity = 0;
1237
8
  int i;
1238
18
  for (i = 0; i < convnet->count; 
i++10
)
1239
10
  {
1240
10
    update_params->layers[i].type = convnet->layers[i].type;
1241
10
    update_params->layers[i].input = convnet->layers[i].input;
1242
10
    update_params->layers[i].net = convnet->layers[i].net;
1243
10
    update_params->layers[i].wnum = convnet->layers[i].wnum;
1244
10
    update_params->layers[i].reserved = 0;
1245
10
    switch (update_params->layers[i].type)
1246
10
    {
1247
6
      case CCV_CONVNET_CONVOLUTIONAL:
1248
6
        update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count, sizeof(float));
1249
6
        update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
1250
6
        break;
1251
3
      case CCV_CONVNET_FULL_CONNECT:
1252
3
        assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0);
1253
3
        update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count, sizeof(float));
1254
3
        update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
1255
3
        break;
1256
1
      case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1257
1
      case CCV_CONVNET_MAX_POOL:
1258
1
      case CCV_CONVNET_AVERAGE_POOL:
1259
1
        update_params->layers[i].w = 0;
1260
1
        update_params->layers[i].bias = 0;
1261
1
        break;
1262
10
    }
1263
10
  }
1264
8
  return update_params;
1265
8
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_update_new
1266
1267
/* Numerically-stable softmax over every element of a, written into *b
 * (which is renewed to a's geometry; a and *b may alias).
 * a must be CCV_32F. The `type` parameter is currently unused; it is kept
 * for signature consistency with other matrix transforms.
 * Fix: the loop bound a->rows * a->cols * ch was recomputed on every
 * iteration of all three loops; it is loop-invariant and now hoisted. */
static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
{
	int ch = CCV_GET_CHANNEL(a->type);
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0);
	int i;
	const int count = a->rows * a->cols * ch; // hoisted loop-invariant element count
	float* aptr = a->data.f32;
	float* bptr = db->data.f32;
	// subtract the maximum before expf so large activations cannot overflow
	double max = aptr[0];
	for (i = 1; i < count; i++)
		if (aptr[i] > max)
			max = aptr[i];
	double tt = 0;
	for (i = 0; i < count; i++)
		tt += (bptr[i] = expf(aptr[i] - max));
	tt = 1.0 / tt; // normalize so the outputs sum to 1
	for (i = 0; i < count; i++)
		bptr[i] *= tt;
}
convnet.tests.c:_ccv_convnet_compute_softmax
Line
Count
Source
1268
5.26k
{
1269
5.26k
  int ch = CCV_GET_CHANNEL(a->type);
1270
5.26k
  assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
1271
5.26k
  ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0);
1272
5.26k
  int i;
1273
5.26k
  float* aptr = a->data.f32;
1274
5.26k
  float* bptr = db->data.f32;
1275
5.26k
  double max = aptr[0];
1276
7.85M
  for (i = 1; i < a->rows * a->cols * ch; 
i++7.84M
)
1277
7.84M
    if (aptr[i] > max)
1278
231k
      max = aptr[i];
1279
5.26k
  double tt = 0;
1280
7.85M
  for (i = 0; i < a->rows * a->cols * ch; 
i++7.85M
)
1281
7.85M
    tt += (bptr[i] = expf(aptr[i] - max));
1282
5.26k
  tt = 1.0 / tt;
1283
7.85M
  for (i = 0; i < a->rows * a->cols * ch; 
i++7.85M
)
1284
7.85M
    bptr[i] *= tt;
1285
5.26k
}
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_compute_softmax
1286
1287
/* Run a forward pass on a single input and write the argmax class index of
 * the final layer's output into labels[0]. Only batch == 1 is supported. */
static void _ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch)
{
	assert(batch == 1);
	ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1);
	ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1];
	int i;
	int best = 0;
	float maxc = b->data.f32[0];
	// linear scan for the highest-scoring class
	for (i = 1; i < b->rows; i++)
	{
		if (b->data.f32[i] > maxc)
		{
			maxc = b->data.f32[i];
			best = i;
		}
	}
	labels[0] = best;
}
Unexecuted instantiation: convnet.tests.c:_ccv_convnet_classify
Unexecuted instantiation: ccv_convnet.c:_ccv_convnet_classify
1299
1300
#endif
1301
1302
#ifndef CASE_TESTS
1303
1304
/* Supervised mini-batch SGD training loop (CPU path; dispatches to the CWC/CUDA
 * implementation when use_cwc_accel is set and CUDA is compiled in).
 * categorizeds : labelled training samples (ccv_categorized_t)
 * tests        : labelled validation samples, scored after every epoch
 * filename     : passed through to the CWC path; unused on the CPU path here
 * params       : epochs, mini-batch size, and per-layer SGD hyper-parameters
 * Requires GSL; without it this function asserts. */
void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params)
{
#ifdef HAVE_GSL
#ifdef HAVE_CUDA
	if (convnet->use_cwc_accel)
		cwc_convnet_supervised_train(convnet, categorizeds, tests, filename, params);
	else {
#endif
	int i, j, t;
	gsl_rng_env_setup();
	gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
	// truncate the epoch to a whole number of mini-batches; the remainder
	// (aligned_padding) is rotated back in between epochs below
	int aligned_padding = categorizeds->rnum % params.mini_batch;
	int aligned_rnum = categorizeds->rnum - aligned_padding;
	// idx has aligned_padding extra slots at the end, used as scratch space
	// for the inter-epoch rotation memcpy/memmove sequence
	int* idx = (int*)ccmalloc(sizeof(int) * (categorizeds->rnum + aligned_padding));
	for (i = 0; i < categorizeds->rnum; i++)
		idx[i] = i;
	gsl_ran_shuffle(rng, idx, categorizeds->rnum, sizeof(int));
	// the last layer has to be full connect, thus we can use it as softmax layer
	assert(convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT);
	int category_count = convnet->layers[convnet->count - 1].net.full_connect.count;
	// two parameter-shaped twins: one accumulates gradients, one holds momentum
	ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet);
	ccv_convnet_t* momentum = _ccv_convnet_update_new(convnet);
	for (t = 0; t < params.max_epoch; t++)
	{
		for (i = 0; i < aligned_rnum; i++)
		{
			// dropout the first hidden layer
			// NOTE(review): no dropout is visible in this CPU path; the comment
			// above looks stale — confirm against the CWC implementation
			ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx[i]);
			ccv_convnet_encode(convnet, &categorized->matrix, convnet->acts + convnet->count - 1, 1);
			ccv_dense_matrix_t* softmax = convnet->acts[convnet->count - 1];
			float* dloss = softmax->data.f32;
			// in-place: softmax's buffer is reused to hold the loss gradient
			_ccv_convnet_compute_softmax(softmax, &softmax, 0);
			assert(softmax->rows == category_count && softmax->cols == 1);
			// this mashes softmax and logistic regression together
			// also, it gives you -D[loss w.r.t. to x_i] (note the negative sign)
			for (j = 0; j < category_count; j++)
				dloss[j] = (j == categorized->c) - dloss[j];
			_ccv_convnet_propagate_loss(convnet, categorized->matrix, softmax, update_params);
			// apply the accumulated gradients once per full mini-batch
			if ((i + 1) % params.mini_batch == 0)
			{
				FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => stochastic gradient descent at %d / %d", t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum / params.mini_batch);
				// update weights
				_ccv_convnet_update(convnet, params.mini_batch, momentum, update_params, params.layer_params);
				_ccv_convnet_update_zero(update_params);
				// compact the convnet to avoid any staled temporary resource
				ccv_convnet_compact(convnet);
			}
		}
		// end-of-epoch evaluation: count misclassifications on the test set
		int miss = 0;
		for (i = 0; i < tests->rnum; i++)
		{
			FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => going through %d / %d for tests", t + 1, params.max_epoch, i + 1, tests->rnum);
			ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i);
			int c = 0;
			_ccv_convnet_classify(convnet, &test->matrix, &c, 1);
			if (c != test->c)
				++miss;
		}
		FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => with miss rate %.2f%%\n", t + 1, params.max_epoch, miss * 100.0f / tests->rnum);
		if (t + 1 < params.max_epoch)
		{
			// reshuffle the parts we visited and move the rest to the beginning
			memcpy(idx + categorizeds->rnum, idx + aligned_rnum, sizeof(int) * aligned_padding);
			memmove(idx + aligned_padding, idx, sizeof(int) * aligned_rnum);
			memcpy(idx, idx + categorizeds->rnum, sizeof(int) * aligned_padding);
			gsl_ran_shuffle(rng, idx + aligned_padding, aligned_rnum, sizeof(int));
		}
	}
	ccfree(idx);
	ccv_convnet_free(momentum);
	ccv_convnet_free(update_params);
	gsl_rng_free(rng);
#ifdef HAVE_CUDA
	}
#endif
#else
	assert(0 && "ccv_convnet_supervised_train requires GSL library support");
#endif
}
1383
1384
/* Free every cached per-layer activation and rnorm-denominator matrix, plus
 * any SIMD-packed weight copy held in layer->reserved. The trained weights
 * themselves are left intact. */
void ccv_convnet_compact(ccv_convnet_t* convnet)
{
#ifdef HAVE_CUDA
	cwc_convnet_compact(convnet);
#endif
	int k;
	for (k = 0; k < convnet->count; k++)
	{
		ccv_dense_matrix_t* act = convnet->acts[k];
		if (act)
			ccv_matrix_free(act);
		convnet->acts[k] = 0;
		// denoms may be absent entirely (e.g. the gradient twin from
		// _ccv_convnet_update_new sets denoms = 0)
		if (convnet->denoms)
		{
			if (convnet->denoms[k])
				ccv_matrix_free(convnet->denoms[k]);
			convnet->denoms[k] = 0;
		}
		if (SIMD(convnet->layers + k))
		{
			ccfree(convnet->layers[k].reserved);
			convnet->layers[k].reserved = 0;
		}
	}
}
1408
1409
/* Serialize a convnet into a SQLite database at filename.
 * Three tables are written:
 *   layer_params   - per-layer type, input geometry, and output hyper-parameters
 *   layer_data     - weight and bias blobs (optionally half precision)
 *   convnet_params - input geometry plus the mean-activity image
 * params.half_precision stores weights/biases as 16-bit halves.
 * Fix: per the SQLite documentation, sqlite3_open may allocate a handle even
 * when it fails; that handle must still be passed to sqlite3_close or it
 * leaks. The original returned without closing on the failure path. */
void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params)
{
	sqlite3* db = 0;
	if (SQLITE_OK == sqlite3_open(filename, &db))
	{
		const char layer_create_table_qs[] =
			"CREATE TABLE IF NOT EXISTS layer_params "
			"(layer INTEGER PRIMARY KEY ASC, type INTEGER, "
			"input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, "
			"output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, "
			"output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);"
			"CREATE TABLE IF NOT EXISTS convnet_params "
			"(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);"
			"CREATE TABLE IF NOT EXISTS layer_data "
			"(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB, half_precision INTEGER);";
		assert(SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0));
		const char layer_params_insert_qs[] = 
			"REPLACE INTO layer_params "
			"(layer, type, "
			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, "
			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, "
			"output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES "
			"($layer, $type, " // 1
			"$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6
			"$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13
			"$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18
		sqlite3_stmt* layer_params_insert_stmt = 0;
		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0));
		const char layer_data_insert_qs[] =
			"REPLACE INTO layer_data "
			"(layer, weight, bias, half_precision) VALUES ($layer, $weight, $bias, $half_precision);";
		sqlite3_stmt* layer_data_insert_stmt = 0;
		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0));
		int i;
		for (i = 0; i < convnet->count; i++)
		{
			ccv_convnet_layer_t* layer = convnet->layers + i;
			// insert layer params; only the columns relevant to this layer
			// type are bound, the rest stay NULL
			sqlite3_bind_int(layer_params_insert_stmt, 1, i);
			sqlite3_bind_int(layer_params_insert_stmt, 2, layer->type);
			sqlite3_bind_int(layer_params_insert_stmt, 3, layer->input.matrix.rows);
			sqlite3_bind_int(layer_params_insert_stmt, 4, layer->input.matrix.cols);
			sqlite3_bind_int(layer_params_insert_stmt, 5, layer->input.matrix.channels);
			sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.matrix.partition);
			sqlite3_bind_int(layer_params_insert_stmt, 7, layer->input.node.count);
			switch (layer->type)
			{
				case CCV_CONVNET_CONVOLUTIONAL:
					sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.rows);
					sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.cols);
					sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.channels);
					sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.partition);
					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.count);
					sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.convolutional.strides);
					sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.convolutional.border);
					break;
				case CCV_CONVNET_FULL_CONNECT:
					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count);
					sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu);
					break;
				case CCV_CONVNET_MAX_POOL:
				case CCV_CONVNET_AVERAGE_POOL:
					sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.strides);
					sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.pool.border);
					sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.pool.size);
					break;
				case CCV_CONVNET_LOCAL_RESPONSE_NORM:
					sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.rnorm.size);
					sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.kappa);
					sqlite3_bind_double(layer_params_insert_stmt, 17, layer->net.rnorm.alpha);
					sqlite3_bind_double(layer_params_insert_stmt, 18, layer->net.rnorm.beta);
					break;
			}
			assert(SQLITE_DONE == sqlite3_step(layer_params_insert_stmt));
			sqlite3_reset(layer_params_insert_stmt);
			sqlite3_clear_bindings(layer_params_insert_stmt);
			// insert layer data (only layers that carry weights)
			if (layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT)
			{
				sqlite3_bind_int(layer_data_insert_stmt, 1, i);
				if (params.half_precision)
				{
					// temporary half-precision copies; ownership passes to
					// SQLite via the ccfree destructor argument
					uint16_t* w = (uint16_t*)ccmalloc(sizeof(uint16_t) * layer->wnum);
					ccv_float_to_half_precision(layer->w, w, layer->wnum);
					uint16_t* bias = (uint16_t*)ccmalloc(sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
					ccv_float_to_half_precision(layer->bias, bias, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
					sqlite3_bind_blob(layer_data_insert_stmt, 2, w, sizeof(uint16_t) * layer->wnum, ccfree);
					sqlite3_bind_blob(layer_data_insert_stmt, 3, bias, sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), ccfree);
				} else {
					// SQLITE_STATIC is safe: the layer buffers outlive the step below
					sqlite3_bind_blob(layer_data_insert_stmt, 2, layer->w, sizeof(float) * layer->wnum, SQLITE_STATIC);
					sqlite3_bind_blob(layer_data_insert_stmt, 3, layer->bias, sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), SQLITE_STATIC);
				}
				sqlite3_bind_int(layer_data_insert_stmt, 4, params.half_precision);
				assert(SQLITE_DONE == sqlite3_step(layer_data_insert_stmt));
				sqlite3_reset(layer_data_insert_stmt);
				sqlite3_clear_bindings(layer_data_insert_stmt);
			}
		}
		// insert convnet related params
		const char convnet_params_insert_qs[] =
			"REPLACE INTO convnet_params "
			"(convnet, mean_activity, input_height, input_width) VALUES (0, $mean_activity, $input_height, $input_width);";
		sqlite3_stmt* convnet_params_insert_stmt = 0;
		assert(SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0));
		assert(convnet->mean_activity->rows == convnet->input.height);
		assert(convnet->mean_activity->cols == convnet->input.width);
		assert(CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels);
		assert(CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F);
		sqlite3_bind_blob(convnet_params_insert_stmt, 1, convnet->mean_activity->data.f32, sizeof(float) * convnet->input.height * convnet->input.width * convnet->channels, SQLITE_STATIC);
		sqlite3_bind_int(convnet_params_insert_stmt, 2, convnet->input.height);
		sqlite3_bind_int(convnet_params_insert_stmt, 3, convnet->input.width);
		assert(SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt));
		sqlite3_reset(convnet_params_insert_stmt);
		sqlite3_clear_bindings(convnet_params_insert_stmt);

		sqlite3_finalize(layer_params_insert_stmt);
		sqlite3_finalize(layer_data_insert_stmt);
		sqlite3_finalize(convnet_params_insert_stmt);
		sqlite3_close(db);
	} else if (db)
		// sqlite3_open can allocate a handle even on failure; close it so the
		// resources are not leaked
		sqlite3_close(db);
}
1530
1531
// Deserialize a convnet previously persisted into the SQLite database at
// filename. Layer geometry comes from the layer_params table, weights and
// biases from layer_data (optionally stored in half precision), and the input
// size plus mean activity from convnet_params. Returns a newly allocated
// convnet (release with ccv_convnet_free), or 0 when the database cannot be
// opened. NOTE(review): if the layer_params query cannot be prepared, convnet
// stays 0 and 0 is returned after closing the database.
ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename)
{
	sqlite3* db = 0;
	if (SQLITE_OK == sqlite3_open(filename, &db))
	{
		ccv_convnet_t* convnet = 0;
		sqlite3_stmt* layer_params_stmt = 0;
		// load layer params
		const char layer_params_qs[] =
			"SELECT type, " // 1
			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6
			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13
			"output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18
		if (SQLITE_OK == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0))
		{
			ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0);
			while (sqlite3_step(layer_params_stmt) == SQLITE_ROW)
			{
				ccv_convnet_layer_param_t layer_param;
				layer_param.type = sqlite3_column_int(layer_params_stmt, 0);
				layer_param.input.matrix.rows = sqlite3_column_int(layer_params_stmt, 1);
				layer_param.input.matrix.cols = sqlite3_column_int(layer_params_stmt, 2);
				layer_param.input.matrix.channels = sqlite3_column_int(layer_params_stmt, 3);
				layer_param.input.matrix.partition = sqlite3_column_int(layer_params_stmt, 4);
				layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 5);
				layer_param.bias = layer_param.glorot = 0; // this is irrelevant to read convnet
				// only the columns meaningful for this layer type are read; an
				// unknown type leaves the output union untouched
				switch (layer_param.type)
				{
					case CCV_CONVNET_CONVOLUTIONAL:
						layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 6);
						layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 7);
						layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 8);
						layer_param.output.convolutional.partition = sqlite3_column_int(layer_params_stmt, 9);
						layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 10);
						layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 11);
						layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 12);
						break;
					case CCV_CONVNET_FULL_CONNECT:
						layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10);
						layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17);
						break;
					case CCV_CONVNET_MAX_POOL:
					case CCV_CONVNET_AVERAGE_POOL:
						layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 11);
						layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 12);
						layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 13);
						break;
					case CCV_CONVNET_LOCAL_RESPONSE_NORM:
						layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 13);
						layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 14);
						layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 15);
						layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 16);
						break;
				}
				ccv_array_push(layer_params, &layer_param);
			}
			sqlite3_finalize(layer_params_stmt);
			sqlite3_stmt* convnet_params_input_stmt = 0;
			// load convnet params for input
			const char convnet_params_input_qs[] =
				"SELECT input_height, input_width FROM convnet_params WHERE convnet = 0;";
			ccv_size_t input = ccv_size(0, 0);
			if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_input_qs, sizeof(convnet_params_input_qs), &convnet_params_input_stmt, 0))
			{
				if (sqlite3_step(convnet_params_input_stmt) == SQLITE_ROW)
				{
					input.height = sqlite3_column_int(convnet_params_input_stmt, 0);
					input.width = sqlite3_column_int(convnet_params_input_stmt, 1);
				}
				sqlite3_finalize(convnet_params_input_stmt);
			}
			// a valid database must carry the input geometry; NOTE(review): with
			// NDEBUG a corrupt/empty convnet_params table would proceed with 0x0
			assert(input.height != 0 && input.width != 0);
			convnet = ccv_convnet_new(use_cwc_accel, input, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0), layer_params->rnum);
			ccv_array_free(layer_params);
			// load layer data
			sqlite3_stmt* layer_data_stmt = 0;
			const char layer_data_qs[] =
				"SELECT layer, weight, bias, half_precision FROM layer_data;";
			if (SQLITE_OK == sqlite3_prepare_v2(db, layer_data_qs, sizeof(layer_data_qs), &layer_data_stmt, 0))
			{
				while (sqlite3_step(layer_data_stmt) == SQLITE_ROW)
				{
					ccv_convnet_layer_t* layer = convnet->layers + sqlite3_column_int(layer_data_stmt, 0);
					int half_precision = sqlite3_column_int(layer_data_stmt, 3);
					// element count is implied by blob size and storage precision
					int wnum = sqlite3_column_bytes(layer_data_stmt, 1) / (half_precision ? sizeof(uint16_t) : sizeof(float));
					// if weights available, load weights
					if (wnum == layer->wnum)
					{
						const void* w = sqlite3_column_blob(layer_data_stmt, 1);
						if (half_precision)
						{
							// decode the fp16 blob into a temporary float buffer; w is
							// repointed at it and freed after the memcpy below
							float* f = (float*)ccmalloc(sizeof(float) * layer->wnum);
							ccv_half_precision_to_float((uint16_t*)w, f, layer->wnum);
							w = f;
						}
						// only convolutional / full-connect layers own weights
						switch (layer->type)
						{
							case CCV_CONVNET_CONVOLUTIONAL:
								memcpy(layer->w, w, sizeof(float) * layer->wnum);
								break;
							case CCV_CONVNET_FULL_CONNECT:
								memcpy(layer->w, w, sizeof(float) * layer->wnum);
								break;
						}
						if (half_precision)
							ccfree((void*)w);
					}
					int bnum = sqlite3_column_bytes(layer_data_stmt, 2) / (half_precision ? sizeof(uint16_t) : sizeof(float));
					// if bias available, load bias
					if (bnum == (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count))
					{
						const void* bias = sqlite3_column_blob(layer_data_stmt, 2);
						if (half_precision)
						{
							// same fp16 -> fp32 decode as for the weights above
							float* f = (float*)ccmalloc(sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
							ccv_half_precision_to_float((uint16_t*)bias, f, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
							bias = f;
						}
						switch (layer->type)
						{
							case CCV_CONVNET_CONVOLUTIONAL:
								memcpy(layer->bias, bias, sizeof(float) * layer->net.convolutional.count);
								break;
							case CCV_CONVNET_FULL_CONNECT:
								memcpy(layer->bias, bias, sizeof(float) * layer->net.full_connect.count);
								break;
						}
						if (half_precision)
							ccfree((void*)bias);
					}
				}
				sqlite3_finalize(layer_data_stmt);
			}
			sqlite3_stmt* convnet_params_mean_activity_stmt = 0;
			// load convnet params for mean activity
			const char convnet_params_mean_activity_qs[] =
				"SELECT mean_activity FROM convnet_params WHERE convnet = 0;";
			if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_mean_activity_qs, sizeof(convnet_params_mean_activity_qs), &convnet_params_mean_activity_stmt, 0))
			{
				if (sqlite3_step(convnet_params_mean_activity_stmt) == SQLITE_ROW)
				{
					int elems = sqlite3_column_bytes(convnet_params_mean_activity_stmt, 0) / sizeof(float);
					// only accept the blob when its size matches the expected geometry
					if (elems == convnet->input.height * convnet->input.width * convnet->channels)
						memcpy(convnet->mean_activity->data.f32, sqlite3_column_blob(convnet_params_mean_activity_stmt, 0), sizeof(float) * elems);
				}
				sqlite3_finalize(convnet_params_mean_activity_stmt);
			}
		}
		sqlite3_close(db);
		return convnet;
	}
	return 0;
}
1684
1685
// Resample one of the convnet input dimensions; factored out so the two
// ccv_resample branches below share the (identical) scale-factor formulas.
// The ccv_max clamp keeps neither target dimension below the requested size.
static void _ccv_convnet_input_resample(ccv_size_t input, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int interpolation)
{
	double rows_scale = (double)ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)) / (double)a->rows;
	double cols_scale = (double)ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)) / (double)a->cols;
	ccv_resample(a, b, CCV_32F, rows_scale, cols_scale, interpolation);
}

// Bring matrix a to the convnet's expected input geometry, producing a 32-bit
// float matrix in *b: area interpolation when strictly shrinking, cubic when
// any dimension needs to grow, otherwise a plain type conversion via shift.
void ccv_convnet_input_formation(ccv_size_t input, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	if (a->rows > input.height && a->cols > input.width)
		_ccv_convnet_input_resample(input, a, b, CCV_INTER_AREA);
	else if (a->rows < input.height || a->cols < input.width)
		_ccv_convnet_input_resample(input, a, b, CCV_INTER_CUBIC);
	else
		ccv_shift(a, (ccv_matrix_t**)b, CCV_32F, 0, 0); // converting to 32f
}
1694
1695
// Tear down a convnet: compact it first (presumably dropping cached/derived
// state — defined elsewhere), free each layer's weight buffer, the mean
// activity matrix if present, and finally the convnet itself. The struct,
// layer array and act/denom pointer tables live in one allocation, so a
// single ccfree releases them all.
void ccv_convnet_free(ccv_convnet_t* convnet)
{
	ccv_convnet_compact(convnet);
	int k;
	ccv_convnet_layer_t* layer = convnet->layers;
	for (k = 0; k < convnet->count; k++, layer++)
		if (layer->w)
			ccfree(layer->w);
	if (convnet->mean_activity)
		ccv_matrix_free(convnet->mean_activity);
	ccfree(convnet);
}
1706
1707
#endif