/home/liu/actions-runner/_work/ccv/ccv/lib/ccv_convnet.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #if defined(HAVE_SSE2) |
4 | | #include <xmmintrin.h> |
5 | | #elif defined(HAVE_NEON) |
6 | | #include <arm_neon.h> |
7 | | #endif |
8 | | #ifdef HAVE_GSL |
9 | | #include <gsl/gsl_rng.h> |
10 | | #include <gsl/gsl_randist.h> |
11 | | #endif |
12 | | #ifdef USE_OPENMP |
13 | | #include <omp.h> |
14 | | #endif |
15 | | #ifdef USE_DISPATCH |
16 | | #include <dispatch/dispatch.h> |
17 | | #endif |
18 | | #ifdef HAVE_CUDA |
19 | | #include "cuda/cwc.h" |
20 | | #endif |
21 | | #include "3rdparty/sqlite3/sqlite3.h" |
22 | | #include "inc/ccv_convnet_internal.h" |
23 | | |
24 | | #ifndef CASE_TESTS |
25 | | |
26 | | ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count) |
27 | 29 | { |
28 | 29 | ccv_convnet_t* convnet = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2); |
29 | 29 | convnet->use_cwc_accel = use_cwc_accel; |
30 | 29 | #ifdef HAVE_GSL |
31 | 29 | gsl_rng_env_setup(); |
32 | 29 | gsl_rng* rng = gsl_rng_alloc(gsl_rng_default); |
33 | 29 | gsl_rng_set(rng, (unsigned long int)convnet); |
34 | 29 | #endif |
35 | 29 | convnet->reserved = 0; |
36 | 29 | convnet->layers = (ccv_convnet_layer_t*)(convnet + 1); |
37 | 29 | convnet->acts = (ccv_dense_matrix_t**)(convnet->layers + count); |
38 | 29 | memset(convnet->acts, 0, sizeof(ccv_dense_matrix_t*) * count); |
39 | 29 | convnet->denoms = (ccv_dense_matrix_t**)(convnet->acts + count); |
40 | 29 | memset(convnet->denoms, 0, sizeof(ccv_dense_matrix_t*) * count); |
41 | 29 | convnet->count = count; |
42 | 29 | convnet->input = input; |
43 | 29 | convnet->rows = params[0].input.matrix.rows; |
44 | 29 | convnet->cols = params[0].input.matrix.cols; |
45 | 29 | convnet->channels = params[0].input.matrix.channels; |
46 | 29 | convnet->mean_activity = ccv_dense_matrix_new(convnet->input.height, convnet->input.width, convnet->channels | CCV_32F, 0, 0); |
47 | 29 | ccv_zero(convnet->mean_activity); |
48 | 29 | ccv_convnet_layer_t* layers = convnet->layers; |
49 | 29 | int i, j; |
50 | 100 | for (i = 0; i < count; i++) |
51 | 71 | { |
52 | 71 | layers[i].type = params[i].type; |
53 | 71 | layers[i].input = params[i].input; |
54 | 71 | layers[i].net = params[i].output; |
55 | 71 | layers[i].reserved = 0; |
56 | 71 | switch (params[i].type) |
57 | 71 | { |
58 | 38 | case CCV_CONVNET_CONVOLUTIONAL: |
59 | 38 | assert(params[i].input.matrix.channels % params[i].input.matrix.partition == 0); |
60 | 38 | assert(params[i].output.convolutional.count % params[i].output.convolutional.partition == 0); |
61 | 38 | assert(params[i].output.convolutional.partition % params[i].input.matrix.partition == 0); |
62 | 38 | assert(params[i].output.convolutional.partition >= params[i].input.matrix.partition); |
63 | 38 | layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition * params[i].output.convolutional.count; |
64 | 38 | layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.convolutional.count)); |
65 | 38 | layers[i].bias = layers[i].w + layers[i].wnum; |
66 | 38 | #ifdef HAVE_GSL |
67 | 29.4M | for (j = 0; j < layers[i].wnum; j++) |
68 | 29.4M | layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition + params[i].output.convolutional.count); |
69 | | #else |
70 | | for (j = 0; j < layers[i].wnum; j++) |
71 | | layers[i].w[j] = 0; |
72 | | #endif |
73 | 8.57k | for (j = 0; j < params[i].output.convolutional.count; j++) |
74 | 8.53k | layers[i].bias[j] = params[i].bias; |
75 | 38 | break; |
76 | 10 | case CCV_CONVNET_FULL_CONNECT: |
77 | 10 | layers[i].wnum = params[i].input.node.count * params[i].output.full_connect.count; |
78 | 10 | layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.full_connect.count)); |
79 | 10 | layers[i].bias = layers[i].w + layers[i].wnum; |
80 | 10 | #ifdef HAVE_GSL |
81 | 237M | for (j = 0; j < layers[i].wnum; j++) |
82 | 237M | layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].input.node.count + params[i].output.full_connect.count); |
83 | | #else |
84 | | for (j = 0; j < layers[i].wnum; j++) |
85 | | layers[i].w[j] = 0; |
86 | | #endif |
87 | 20.4k | for (j = 0; j < params[i].output.full_connect.count; j++) |
88 | 20.4k | layers[i].bias[j] = params[i].bias; |
89 | 10 | break; |
90 | 23 | default: |
91 | 23 | layers[i].wnum = 0; |
92 | 23 | layers[i].w = 0; |
93 | 23 | layers[i].bias = 0; |
94 | 23 | break; |
95 | 71 | } |
96 | 71 | } |
97 | 29 | #ifdef HAVE_GSL |
98 | 29 | gsl_rng_free(rng); |
99 | 29 | #endif |
100 | 29 | return convnet; |
101 | 29 | } |
102 | | |
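The loops above draw each weight uniformly from (-1, 1) and scale it by glorot / sqrtf(fan_in + fan_out), a Glorot/Xavier-style initialization. A minimal standalone sketch of the same scheme, using rand() instead of GSL (fan_in, fan_out and glorot are illustrative stand-ins for the per-layer values):

#include <math.h>
#include <stdlib.h>

// Sketch only: Glorot-style uniform initialization as in ccv_convnet_new above.
static void glorot_init(float* w, int n, int fan_in, int fan_out, float glorot)
{
	float scale = glorot / sqrtf((float)(fan_in + fan_out));
	int i;
	for (i = 0; i < n; i++)
		w[i] = ((float)rand() / RAND_MAX * 2 - 1) * scale; // uniform in (-scale, scale)
}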
103 | | int ccv_convnet_verify(ccv_convnet_t* convnet, int output) |
104 | 0 | { |
105 | 0 | int i, out_rows, out_cols, out_partition, out_channels; |
106 | 0 | if (convnet->count < 1) |
107 | 0 | return -1; |
108 | | // the last layer has to be full connect |
109 | 0 | if (convnet->layers[convnet->count - 1].type != CCV_CONVNET_FULL_CONNECT) |
110 | 0 | return -1; |
111 | | // you cannot enable relu on the last layer |
112 | 0 | if (convnet->layers[convnet->count - 1].net.full_connect.relu) |
113 | 0 | return -1; |
114 | 0 | out_channels = 3; |
115 | 0 | for (i = 0; i < convnet->count; i++) |
116 | 0 | { |
117 | 0 | ccv_convnet_layer_t* layer = convnet->layers + i; |
118 | 0 | if (i > 0 && (out_rows != layer->input.matrix.rows || out_cols != layer->input.matrix.cols)) |
119 | 0 | return -1; |
120 | | // the input channels should be equal to the previous output channels, skip this check for full connect as it is meaningless |
121 | 0 | if (out_channels != layer->input.matrix.channels && layer->type != CCV_CONVNET_FULL_CONNECT) |
122 | 0 | return -1; |
123 | 0 | ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); |
124 | 0 | if (layer->type == CCV_CONVNET_CONVOLUTIONAL) |
125 | 0 | { |
126 | | // check to see if the input matrix channel is equal to the expected input of the convolutional layer filters |
127 | 0 | if (layer->input.matrix.channels != layer->net.convolutional.channels) |
128 | 0 | return -1; |
129 | | // if this layer is a convolutional layer, its filter output count should equal the next layer's input channels |
130 | 0 | out_channels = layer->net.convolutional.count; |
131 | 0 | } |
132 | 0 | } |
133 | 0 | if (out_rows * out_cols != output) |
134 | 0 | return -1; |
135 | 0 | int count = 0; |
136 | 0 | for (i = 0; i < convnet->count; i++) |
137 | 0 | { |
138 | 0 | ccv_convnet_layer_t* layer = convnet->layers + i; |
139 | 0 | if (layer->type == CCV_CONVNET_FULL_CONNECT) |
140 | 0 | { |
141 | 0 | count = i; |
142 | 0 | break; |
143 | 0 | } |
144 | 0 | } |
145 | | // all the layers after the first full connect layer should only be full connect layers |
146 | 0 | for (i = count; i < convnet->count; i++) |
147 | 0 | if (convnet->layers[i].type != CCV_CONVNET_FULL_CONNECT || |
148 | 0 | convnet->layers[i].input.matrix.rows * convnet->layers[i].input.matrix.cols * convnet->layers[i].input.matrix.channels != convnet->layers[i].input.node.count) |
149 | 0 | return -1; |
150 | 0 | return 0; |
151 | 0 | } |
152 | | |
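As a usage sketch (the layer parameters and the 1000-way output are hypothetical, not from this file), a network is created and validated before use; a non-zero return from ccv_convnet_verify means the layer shapes do not chain together:

// Sketch, assuming params[] holds `count` valid layer definitions.
ccv_convnet_t* convnet = ccv_convnet_new(0 /* no CWC acceleration */, ccv_size(225, 225), params, count);
if (ccv_convnet_verify(convnet, 1000) != 0) // e.g. a 1000-way classifier
{
	ccv_convnet_free(convnet); // shapes are inconsistent; give up
	return -1;
}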
153 | | #endif |
154 | | |
155 | | #if defined(HAVE_SSE2) || defined(HAVE_NEON) |
156 | | |
157 | | static void _ccv_convnet_layer_simd_alloc_reserved(ccv_convnet_layer_t* layer) |
158 | 2.37k | { |
159 | 2.37k | if (layer->reserved) |
160 | 1 | return; |
161 | 2.37k | int partition = layer->input.matrix.partition; |
162 | 2.37k | int ch = layer->net.convolutional.channels; |
163 | 2.37k | int count = layer->net.convolutional.count; |
164 | 2.37k | int kernel_rows = layer->net.convolutional.rows; |
165 | 2.37k | int kernel_cols = layer->net.convolutional.cols; |
166 | 2.37k | int ch_per_partition = ch / partition; |
167 | 2.37k | int count_per_4 = count / 4; |
168 | 2.37k | float* simd_w = (float*)ccmalloc(sizeof(float) * layer->wnum); |
169 | 2.37k | int i, j, k, c; |
170 | 6.84k | for (k = 0; k < count_per_4; k++) |
171 | 77.7k | for (i = 0; i < kernel_rows * kernel_cols; i++) |
172 | 7.56M | for (j = 0; j < ch_per_partition; j++) |
173 | 37.4M | for (c = 0; c < 4; c++) |
174 | 29.9M | simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j]; |
175 | 2.37k | layer->reserved = simd_w; |
176 | 2.37k | } |
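The transposition above regroups the weights so that the same kernel tap of four consecutive filters sits in four adjacent floats, letting one 4-wide SIMD load cover four filters at once. Written with a single tap index t over T = kernel_rows * kernel_cols * ch_per_partition taps per filter, the same mapping is (a sketch, not the library's code):

// simd_w[(k * T + t) * 4 + c] takes its value from w[(k * 4 + c) * T + t]
static void interleave_by_4(float* simd_w, const float* w, int count_per_4, int T)
{
	int k, t, c;
	for (k = 0; k < count_per_4; k++)
		for (t = 0; t < T; t++)
			for (c = 0; c < 4; c++)
				simd_w[(k * T + t) * 4 + c] = w[(k * 4 + c) * T + t];
}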
177 | | |
178 | | #endif |
179 | | |
180 | 8.00k | #define SIMD(x) ((float*)((x)->reserved)) |
181 | | |
182 | | #if defined(HAVE_SSE2) |
183 | | static inline void _ccv_convnet_convolutional_forward_propagate_sse2(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition) |
184 | 2.37k | { |
185 | 2.37k | assert(SIMD(layer)); |
186 | 2.37k | #define main_for(block) \ |
187 | 4.47k | parallel_for(k, (count >> 2)) { \ |
188 | 4.47k | int i, j, x, y, c; \ |
189 | 4.47k | int p = k * 4 / count_per_partition; \ |
190 | 4.47k | float* ap = a->data.f32 + p * ch_per_partition; \ |
191 | 4.47k | float* bp = db->data.f32 + k * 4; \ |
192 | 4.47k | float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \ |
193 | 4.47k | float bias[4] __attribute__ ((__aligned__(16))); \ |
194 | 4.47k | memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \ |
195 | | /* 4 accumulators */ \ |
196 | 4.47k | __m128 z4 = _mm_setzero_ps(); \ |
197 | 150k | for (i = 0; i < db->rows; i++) \ |
198 | 145k | { \ |
199 | 145k | int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \ |
200 | 145k | int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \ |
201 | 145k | comy *= ch_per_partition * kernel_cols; \ |
202 | 8.74M | for (j = 0; j < db->cols; j++) \ |
203 | 8.59M | { \ |
204 | 8.59M | __m128 v40 = _mm_load_ps(bias); \ |
205 | 8.59M | __m128 v41 = _mm_setzero_ps(); \ |
206 | 8.59M | __m128 v42 = _mm_setzero_ps(); \ |
207 | 8.59M | __m128 v43 = _mm_setzero_ps(); \ |
208 | 8.59M | int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \ |
209 | 8.59M | int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \ |
210 | 8.59M | float* w = layer_w + (comx * ch_per_partition + comy) * 4; \ |
211 | 8.59M | float* apz = ap + ccv_max(j * strides - border, 0) * ch; \ |
212 | | /* when we have border, we simply do zero padding */ \ |
213 | 37.8M | for (y = 0; y < maxy; y++) \ |
214 | 29.2M | { \ |
215 | | /* special casing for these cases to speed up SIMD computation */ \ |
216 | 134M | for (x = 0; x < maxx; x++) \ |
217 | 104M | { \ |
218 | 104M | c = 0; \ |
219 | 1.85G | for (; c < ch_per_partition - 3; c += 4) \ |
220 | 1.75G | { \ |
221 | 1.75G | __m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \ |
222 | 1.75G | __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ |
223 | 1.75G | __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \ |
224 | 1.75G | __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \ |
225 | 1.75G | __m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \ |
226 | 1.75G | __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \ |
227 | 1.75G | __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \ |
228 | 1.75G | __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \ |
229 | 1.75G | __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \ |
230 | 1.75G | v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ |
231 | 1.75G | v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \ |
232 | 1.75G | v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \ |
233 | 1.75G | v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \ |
234 | 1.75G | } \ |
235 | 104M | block /* insert executions for tail partition */ \ |
236 | 104M | } \ |
237 | 29.2M | w += kernel_cols * ch_per_partition * 4; \ |
238 | 29.2M | apz += a->cols * ch; \ |
239 | 29.2M | } \ |
240 | 8.59M | __m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \ |
241 | 8.59M | _mm_storeu_ps(bp + j * count, v4); /* ReLU */ \ |
242 | 8.59M | } \ |
243 | 145k | bp += db->cols * count; \ |
244 | 145k | ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \ |
245 | 145k | } \ |
246 | 4.47k | } parallel_endfor |
247 | 2.37k | if (ch_per_partition % 4 == 0) |
248 | 24 | { |
249 | 24 | main_for(); |
250 | 2.35k | } else if (ch_per_partition % 4 == 3) { // unroll the last for-loops |
251 | 1.22k | #define block \ |
252 | 1.22k | __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \ |
253 | 1.22k | __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \ |
254 | 1.22k | __m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \ |
255 | 1.22k | __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ |
256 | 1.22k | __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \ |
257 | 1.22k | __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \ |
258 | 1.22k | v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ |
259 | 1.22k | v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \ |
260 | 1.22k | v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42); |
261 | 1.22k | main_for(block); |
262 | 1.22k | #undef block |
263 | 1.22k | } else if (ch_per_partition % 4 == 2) { // unroll the last for-loops |
264 | 1.12k | #define block \ |
265 | 1.12k | __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \ |
266 | 1.12k | __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \ |
267 | 1.12k | __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ |
268 | 1.12k | __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \ |
269 | 1.12k | v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ |
270 | 1.12k | v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); |
271 | 1.12k | main_for(block); |
272 | 1.12k | #undef block |
273 | 1.12k | } else { |
274 | 3 | #define block \ |
275 | 3 | __m128 apz4 = _mm_load1_ps(apz + x * ch + c); \ |
276 | 3 | __m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ |
277 | 3 | v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40); |
278 | 3 | main_for(block); |
279 | 3 | #undef block |
280 | 3 | } |
281 | 2.37k | #undef main_for |
282 | 2.37k | } |
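In the kernel above, comy counts the kernel rows clipped off by the top border (the zero-padding region) and maxy the kernel rows that actually overlap the image; comx/maxx do the same horizontally. A sketch of that arithmetic in isolation (clip_window is a hypothetical helper, not in the library):

static void clip_window(int i, int strides, int border, int kernel_rows, int rows, int* comy, int* maxy)
{
	int top = i * strides - border; // top of the kernel window in image coordinates
	*comy = (top < 0 ? 0 : top) - top; // rows clipped off above the image
	int bottom = i * strides + kernel_rows; // one past the last kernel row
	int over = bottom - (rows + border < bottom ? rows + border : bottom); // rows clipped off below
	*maxy = kernel_rows - *comy - over; // rows that overlap the image
}
// e.g. strides = 2, border = 1, kernel_rows = 5, i = 0, rows = 32 -> comy = 1, maxy = 4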
283 | | #elif defined(HAVE_NEON) |
284 | | static inline void _ccv_convnet_convolutional_forward_propagate_neon(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition) |
285 | | { |
286 | | assert(SIMD(layer)); |
287 | | #define main_for(block) \ |
288 | | parallel_for(k, (count >> 2)) { \ |
289 | | int i, j, x, y, c; \ |
290 | | int p = k * 4 / count_per_partition; \ |
291 | | float* ap = a->data.f32 + p * ch_per_partition; \ |
292 | | float* bp = db->data.f32 + k * 4; \ |
293 | | float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \ |
294 | | float bias[4] __attribute__ ((__aligned__(16))); \ |
295 | | memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \ |
296 | | float32x4_t z4 = vmovq_n_f32(0); \ |
297 | | for (i = 0; i < db->rows; i++) \ |
298 | | { \ |
299 | | int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \ |
300 | | int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \ |
301 | | comy *= ch_per_partition * kernel_cols; \ |
302 | | for (j = 0; j < db->cols; j++) \ |
303 | | { \ |
304 | | float32x4_t v40 = vld1q_f32(bias); \ |
305 | | float32x4_t v41 = vmovq_n_f32(0); \ |
306 | | int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \ |
307 | | int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \ |
308 | | float* w = layer_w + (comx * ch_per_partition + comy) * 4; \ |
309 | | float* apz = ap + ccv_max(j * strides - border, 0) * ch; \ |
310 | | /* when we have border, we simply do zero padding */ \ |
311 | | for (y = 0; y < maxy; y++) \ |
312 | | { \ |
313 | | for (x = 0; x < maxx; x++) \ |
314 | | { \ |
315 | | c = 0; \ |
316 | | for (; c < ch_per_partition - 1; c += 2) \ |
317 | | { \ |
318 | | float32x2_t apz4 = vld1_f32(apz + x * ch + c); \ |
319 | | float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \ |
320 | | float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \ |
321 | | float32x4_t w40 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \ |
322 | | float32x4_t w41 = vld1q_f32(w + (x * ch_per_partition + c + 1) * 4); \ |
323 | | v40 = vmlaq_f32(v40, w40, apz40); \ |
324 | | v41 = vmlaq_f32(v41, w41, apz41); \ |
325 | | } \ |
326 | | block /* insert executions for tail partition */ \ |
327 | | } \ |
328 | | w += kernel_cols * ch_per_partition * 4; \ |
329 | | apz += a->cols * ch; \ |
330 | | } \ |
331 | | float32x4_t v4 = vmaxq_f32(z4, vaddq_f32(v40, v41)); \ |
332 | | vst1q_f32(bp + j * count, v4); /* ReLU */ \ |
333 | | } \ |
334 | | bp += db->cols * count; \ |
335 | | ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \ |
336 | | } \ |
337 | | } parallel_endfor |
338 | | if (ch_per_partition % 2 == 0) |
339 | | { |
340 | | main_for(); |
341 | | } else { // unroll the last for-loops |
342 | | #define block \ |
343 | | float32x4_t apz4 = vmovq_n_f32(apz[x * ch + c]); \ |
344 | | float32x4_t w4 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \ |
345 | | v40 = vmlaq_f32(v40, w4, apz4); |
346 | | main_for(block); |
347 | | #undef block |
348 | | } |
349 | | #undef main_for |
350 | | } |
351 | | #else |
352 | | static inline void _ccv_convnet_convolutional_forward_propagate_fallback(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition) |
353 | | { |
354 | | parallel_for(k, count) { |
355 | | int i, j, x, y, c; |
356 | | int p = k / count_per_partition; |
357 | | float* ap = a->data.f32 + p * ch_per_partition; |
358 | | float* bp = db->data.f32 + k; |
359 | | float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition; |
360 | | float bias = layer->bias[k]; |
361 | | for (i = 0; i < db->rows; i++) |
362 | | { |
363 | | int comy = ccv_max(i * strides - border, 0) - (i * strides - border); |
364 | | int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); |
365 | | comy *= ch_per_partition * kernel_cols; |
366 | | for (j = 0; j < db->cols; j++) |
367 | | { |
368 | | float v = bias; |
369 | | int comx = ccv_max(j * strides - border, 0) - (j * strides - border); |
370 | | int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); |
371 | | float* w = layer_w + comx * ch_per_partition + comy; |
372 | | float* apz = ap + ccv_max(j * strides - border, 0) * ch; |
373 | | // when we have border, we simply do zero padding |
374 | | for (y = 0; y < maxy; y++) |
375 | | { |
376 | | for (x = 0; x < maxx; x++) |
377 | | for (c = 0; c < ch_per_partition; c++) |
378 | | v += w[x * ch_per_partition + c] * apz[x * ch + c]; |
379 | | w += kernel_cols * ch_per_partition; |
380 | | apz += a->cols * ch; |
381 | | } |
382 | | bp[j * count] = ccv_max(0, v); // ReLU |
383 | | } |
384 | | bp += db->cols * count; |
385 | | ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); |
386 | | } |
387 | | } parallel_endfor |
388 | | } |
389 | | #endif |
390 | | |
391 | | static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) |
392 | 2.37k | { |
393 | 2.37k | int rows, cols, partition; |
394 | 2.37k | ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition); |
395 | 2.37k | int ch = layer->net.convolutional.channels; |
396 | 2.37k | int count = layer->net.convolutional.count; |
397 | 2.37k | int strides = layer->net.convolutional.strides; |
398 | 2.37k | int border = layer->net.convolutional.border; |
399 | 2.37k | int kernel_rows = layer->net.convolutional.rows; |
400 | 2.37k | int kernel_cols = layer->net.convolutional.cols; |
401 | 2.37k | int type = CCV_32F | count; |
402 | 2.37k | assert(CCV_GET_CHANNEL(a->type) == ch); |
403 | 2.37k | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
404 | 2.37k | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); |
405 | 2.37k | int ch_per_partition = ch / partition; |
406 | 2.37k | int count_per_partition = count / partition; |
407 | 2.37k | assert(count_per_partition % 4 == 0); |
408 | 2.37k | #if defined(HAVE_SSE2) || defined(HAVE_NEON) |
409 | 2.37k | _ccv_convnet_layer_simd_alloc_reserved(layer); |
410 | 2.37k | #endif |
411 | 2.37k | #if defined(HAVE_SSE2) |
412 | 2.37k | _ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition); |
413 | | #elif defined(HAVE_NEON) |
414 | | _ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition); |
415 | | #else |
416 | | _ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition); |
417 | | #endif |
418 | 2.37k | } |
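ccv_convnet_make_output (defined elsewhere) derives the output grid from the input size, kernel, stride and border. A hedged sketch of the conventional relation for strided convolution with symmetric zero padding, which the assertions here rely on (the library's own rounding may differ):

static void conv_output_size(int rows, int cols, int kernel_rows, int kernel_cols,
	int strides, int border, int* out_rows, int* out_cols)
{
	*out_rows = (rows + 2 * border - kernel_rows) / strides + 1;
	*out_cols = (cols + 2 * border - kernel_cols) / strides + 1;
}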
419 | | |
420 | | static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) |
421 | 3.23k | { |
422 | 3.23k | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
423 | 3.23k | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0); |
424 | 3.23k | int ch = CCV_GET_CHANNEL(a->type); |
425 | 3.23k | int rows = a->rows, cols = a->cols; |
426 | | // reshape a for gemm |
427 | 3.23k | assert(a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch); |
428 | 3.23k | a->rows = rows * cols * ch, a->cols = 1, a->type = (a->type - ch) | CCV_C1; |
429 | 3.23k | assert(a->rows * db->rows == layer->wnum); |
430 | 3.23k | a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type); |
431 | 3.23k | int i; |
432 | 3.23k | float* bptr = db->data.f32; |
433 | 55.9k | for (i = 0; i < db->rows; i++) |
434 | 52.7k | bptr[i] = layer->bias[i]; |
435 | 3.23k | ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0); |
436 | 3.23k | ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supplying db as matrix C is allowed |
437 | 3.23k | if (layer->net.full_connect.relu) |
438 | 16.3k | for (i = 0; i < db->rows; i++) |
439 | 16.3k | bptr[i] = ccv_max(0, bptr[i]); // relu |
440 | 3.23k | a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch; |
441 | 3.23k | a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type); |
442 | 3.23k | } |
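The reshape above treats the activation as a flat column vector x of length rows * cols * ch, so the ccv_gemm call computes b = W x + bias, followed by an optional ReLU. The same product written as plain loops (a sketch; W is row-major, count x n):

static void fc_forward(const float* W, const float* bias, const float* x,
	float* out, int count, int n, int relu)
{
	int i, j;
	for (i = 0; i < count; i++)
	{
		float v = bias[i];
		for (j = 0; j < n; j++)
			v += W[i * n + j] * x[j]; // dot product of row i with the flattened input
		out[i] = (relu && v < 0) ? 0 : v;
	}
}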
443 | | |
444 | | static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms) |
445 | 823 | { |
446 | 823 | int rows, cols, partition; |
447 | 823 | ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition); |
448 | 823 | int size = layer->net.rnorm.size; |
449 | 823 | float kappa = layer->net.rnorm.kappa; |
450 | 823 | float alpha = layer->net.rnorm.alpha; |
451 | 823 | float beta = layer->net.rnorm.beta; |
452 | 823 | int way = size / 2; |
453 | 823 | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
454 | 823 | int ch = CCV_GET_CHANNEL(a->type); |
455 | 823 | int type = CCV_32F | ch; |
456 | 823 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); |
457 | 823 | int i, j, k, x, p; |
458 | 823 | float* ap = a->data.f32; |
459 | 823 | float* bp = db->data.f32; |
460 | 823 | int ch_per_partition = ch / partition; |
461 | 823 | if (denoms) |
462 | 823 | { |
463 | 823 | ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0); |
464 | 823 | float* dp = ddenoms->data.f32; |
465 | 26.3k | for (i = 0; i < db->rows; i++) |
466 | 25.4k | { |
467 | 815k | for (j = 0; j < db->cols; j++) |
468 | 1.58M | for (p = 0; p < partition; p++) |
469 | 3.95M | for (k = 0; k < ch_per_partition; k++) |
470 | 3.16M | { |
471 | 3.16M | float v = ap[j * ch + p * ch_per_partition + k]; |
472 | 3.16M | float denom = 0; |
473 | 11.0M | for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++) |
474 | 7.90M | denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x]; |
475 | 3.16M | denom = kappa + alpha * denom; |
476 | 3.16M | dp[j * ch + p * ch_per_partition + k] = denom; |
477 | 3.16M | bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta); |
478 | 3.16M | } |
479 | 25.4k | ap += a->cols * ch; |
480 | 25.4k | dp += ddenoms->cols * ch; |
481 | 25.4k | bp += db->cols * ch; |
482 | 25.4k | } |
483 | 823 | } else { |
484 | 0 | for (i = 0; i < db->rows; i++) |
485 | 0 | { |
486 | 0 | for (j = 0; j < db->cols; j++) |
487 | 0 | for (p = 0; p < partition; p++) |
488 | 0 | for (k = 0; k < ch_per_partition; k++) |
489 | 0 | { |
490 | 0 | float v = ap[j * ch + p * ch_per_partition + k]; |
491 | 0 | float denom = 0; |
492 | 0 | for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++) |
493 | 0 | denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x]; |
494 | 0 | denom = kappa + alpha * denom; |
495 | 0 | bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta); |
496 | 0 | } |
497 | 0 | ap += a->cols * ch; |
498 | 0 | bp += db->cols * ch; |
499 | 0 | } |
500 | 0 | } |
501 | 823 | } |
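Per channel k, the code above computes b_k = a_k * (kappa + alpha * sum of a_x^2 over x in [k - way, k + way])^(-beta), i.e. local response normalization across channels. A sketch for a single spatial position (assumes <math.h>; lrn_at is a hypothetical helper):

static void lrn_at(const float* a, float* b, int ch, int way, float kappa, float alpha, float beta)
{
	int k, x;
	for (k = 0; k < ch; k++)
	{
		float denom = 0;
		int lo = k - way < 0 ? 0 : k - way; // clamp the window to valid channels
		int hi = k + way > ch - 1 ? ch - 1 : k + way;
		for (x = lo; x <= hi; x++)
			denom += a[x] * a[x];
		b[k] = a[k] * powf(kappa + alpha * denom, -beta);
	}
}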
502 | | |
503 | | static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) |
504 | 14 | { |
505 | 14 | int rows, cols, partition; |
506 | 14 | ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition); |
507 | 14 | int size = layer->net.pool.size; |
508 | 14 | int strides = layer->net.pool.strides; |
509 | 14 | int border = layer->net.pool.border; |
510 | 14 | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
511 | 14 | int ch = CCV_GET_CHANNEL(a->type); |
512 | 14 | int type = CCV_32F | ch; |
513 | 14 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); |
514 | 14 | int i, j, k, x, y; |
515 | 14 | float* ap = a->data.f32; |
516 | 14 | float* bp = db->data.f32; |
517 | 526 | for (i = 0; i < db->rows; i++) |
518 | 512 | { |
519 | 512 | const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border); |
520 | 512 | const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border); |
521 | 35.1k | for (j = 0; j < db->cols; j++) |
522 | 34.6k | { |
523 | 34.6k | const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border); |
524 | 34.6k | const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border); |
525 | 2.97M | for (k = 0; k < ch; k++) |
526 | 2.93M | { |
527 | 2.93M | float v = 0; |
528 | 11.7M | for (y = start_y; y < end_y; y++) |
529 | 35.2M | for (x = start_x; x < end_x; x++) |
530 | 26.4M | if (x == start_x && y == start_y) |
531 | 2.93M | v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k]; |
532 | 23.4M | else if (ap[(j * strides - border + x + (y - border) * a->cols) * ch + k] > v) |
533 | 3.80M | v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k]; |
534 | 2.93M | bp[j * ch + k] = v; |
535 | 2.93M | } |
536 | 34.6k | } |
537 | 512 | ap += a->cols * ch * strides; |
538 | 512 | bp += db->cols * ch; |
539 | 512 | } |
540 | 14 | } |
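Each output cell above is the maximum over a size x size window advanced by strides, with start/end bounds pre-clipped at the borders. The inner scan for one cell and one channel, in isolation (a sketch; single channel, so the channel stride is dropped):

static float max_pool_cell(const float* ap, int cols, int start_y, int end_y, int start_x, int end_x)
{
	float v = ap[start_y * cols + start_x];
	int x, y;
	for (y = start_y; y < end_y; y++)
		for (x = start_x; x < end_x; x++)
			if (ap[y * cols + x] > v)
				v = ap[y * cols + x];
	return v; // e.g. a 2x2 window over {1, 2, 3, 4} yields 4
}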
541 | | |
542 | | static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) |
543 | 3 | { |
544 | 3 | int rows, cols, partition; |
545 | 3 | ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition); |
546 | 3 | int size = layer->net.pool.size; |
547 | 3 | int strides = layer->net.pool.strides; |
548 | 3 | int border = layer->net.pool.border; |
549 | 3 | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
550 | 3 | int ch = CCV_GET_CHANNEL(a->type); |
551 | 3 | int type = CCV_32F | ch; |
552 | 3 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); |
553 | 3 | int i, j, k, x, y; |
554 | 3 | float* ap = a->data.f32; |
555 | 3 | float* bp = db->data.f32; |
556 | 76 | for (i = 0; i < db->rows; i++) |
557 | 73 | { |
558 | 73 | const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border); |
559 | 73 | const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border); |
560 | 1.89k | for (j = 0; j < db->cols; j++) |
561 | 1.81k | { |
562 | 1.81k | const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border); |
563 | 1.81k | const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border); |
564 | 3.63k | for (k = 0; k < ch; k++) |
565 | 1.81k | { |
566 | 1.81k | float v = 0; |
567 | 6.54k | for (y = start_y; y < end_y; y++) |
568 | 17.4k | for (x = start_x; x < end_x; x++) |
569 | 12.7k | v += ap[(j * strides - border + x + (y - border) * a->cols) * ch + k]; |
570 | 1.81k | bp[j * ch + k] = v / ((end_x - start_x) * (end_y - start_y)); |
571 | 1.81k | } |
572 | 1.81k | } |
573 | 73 | ap += a->cols * ch * strides; |
574 | 73 | bp += db->cols * ch; |
575 | 73 | } |
576 | 3 | } |
577 | | |
578 | | static void _ccv_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms) |
579 | 6.45k | { |
580 | 6.45k | switch(layer->type) |
581 | 6.45k | { |
582 | 2.37k | case CCV_CONVNET_CONVOLUTIONAL: |
583 | 2.37k | _ccv_convnet_convolutional_forward_propagate(layer, a, b); |
584 | 2.37k | break; |
585 | 3.23k | case CCV_CONVNET_FULL_CONNECT: |
586 | 3.23k | _ccv_convnet_full_connect_forward_propagate(layer, a, b); |
587 | 3.23k | break; |
588 | 823 | case CCV_CONVNET_LOCAL_RESPONSE_NORM: |
589 | 823 | _ccv_convnet_rnorm_forward_propagate(layer, a, b, denoms); |
590 | 823 | break; |
591 | 14 | case CCV_CONVNET_MAX_POOL: |
592 | 14 | _ccv_convnet_max_pool_forward_propagate(layer, a, b); |
593 | 14 | break; |
594 | 3 | case CCV_CONVNET_AVERAGE_POOL: |
595 | 3 | _ccv_convnet_average_pool_forward_propagate(layer, a, b); |
596 | 3 | break; |
597 | 6.45k | } |
598 | 6.45k | } |
599 | | |
600 | | static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) |
601 | 0 | { |
602 | 0 | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
603 | 0 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, layer->net.full_connect.count, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0); |
604 | | // reshape a for gemm |
605 | 0 | int i, j; |
606 | 0 | float* bptr = db->data.f32; |
607 | 0 | for (i = 0; i < db->rows; i++) |
608 | 0 | { |
609 | 0 | for (j = 0; j < db->cols; j++) |
610 | 0 | bptr[j] = layer->bias[j]; |
611 | 0 | bptr += db->cols; |
612 | 0 | } |
613 | 0 | ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0); |
614 | 0 | ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supplying db as matrix C is allowed |
615 | 0 | bptr = db->data.f32; |
616 | 0 | if (layer->net.full_connect.relu) |
617 | 0 | for (i = 0; i < db->rows; i++) |
618 | 0 | { |
619 | 0 | for (j = 0; j < db->cols; j++) |
620 | 0 | bptr[j] = ccv_max(0, bptr[j]); // relu |
621 | 0 | bptr += db->cols; |
622 | 0 | } |
623 | 0 | } |
624 | | |
625 | | static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type) |
626 | 0 | { |
627 | 0 | assert(CCV_GET_CHANNEL(a->type) == CCV_C1); |
628 | 0 | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
629 | 0 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, 1, a->cols, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0); |
630 | 0 | ccv_zero(db); |
631 | 0 | int i, j; |
632 | 0 | float* aptr = a->data.f32; |
633 | 0 | float* bptr = db->data.f32; |
634 | 0 | float* cptr = (float*)ccmalloc(sizeof(float) * a->cols); |
635 | 0 | for (i = 0; i < a->rows; i++) |
636 | 0 | { |
637 | 0 | double max = aptr[0]; |
638 | 0 | for (j = 1; j < a->cols; j++) |
639 | 0 | if (aptr[j] > max) |
640 | 0 | max = aptr[j]; |
641 | 0 | double tt = 0; |
642 | 0 | for (j = 0; j < a->cols; j++) |
643 | 0 | tt += (cptr[j] = expf(aptr[j] - max)); |
644 | 0 | tt = 1.0 / tt; |
645 | 0 | for (j = 0; j < a->cols; j++) |
646 | 0 | bptr[j] += cptr[j] * tt; |
647 | 0 | aptr += a->cols; |
648 | 0 | } |
649 | 0 | ccfree(cptr); |
650 | 0 | } |
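Subtracting the per-row maximum before expf keeps the exponentials from overflowing; each row sums to 1 after division, and the loop above accumulates the rows into an averaged softmax. One row in isolation (a sketch; assumes <math.h>):

static void softmax_row(const float* a, float* b, int n)
{
	int j;
	float max = a[0], tt = 0;
	for (j = 1; j < n; j++)
		if (a[j] > max)
			max = a[j];
	for (j = 0; j < n; j++)
		tt += (b[j] = expf(a[j] - max)); // shifted exponentials
	for (j = 0; j < n; j++)
		b[j] /= tt; // normalize to a probability distribution
}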
651 | | |
652 | | #ifndef CASE_TESTS |
653 | | |
654 | | void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch) |
655 | 5.28k | { |
656 | 5.28k | #ifdef HAVE_CUDA |
657 | 5.28k | if (convnet->use_cwc_accel) |
658 | 0 | cwc_convnet_encode(convnet, a, b, batch); |
659 | 5.28k | else { |
660 | 5.28k | #endif |
661 | 5.28k | assert(batch == 1); |
662 | 5.28k | assert(CCV_GET_CHANNEL((*a)->type) == convnet->channels); |
663 | 5.28k | assert((*a)->rows == convnet->rows); |
664 | 5.28k | assert((*a)->cols == convnet->cols); |
665 | 5.28k | int i; |
666 | | // save the last layer of neuron cache in case we encode to a different matrix |
667 | 5.28k | ccv_dense_matrix_t* out_neuron = convnet->acts[convnet->count - 1]; |
668 | 5.28k | convnet->acts[convnet->count - 1] = *b; |
669 | 5.28k | _ccv_convnet_layer_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms); |
670 | 6.45k | for (i = 1; i < convnet->count; i++) |
671 | 1.16k | _ccv_convnet_layer_forward_propagate(convnet->layers + i, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i); |
672 | 5.28k | if (convnet->acts + convnet->count - 1 != b) |
673 | 5.28k | { |
674 | 5.28k | *b = convnet->acts[convnet->count - 1]; |
675 | | // restore the last layer of neuron cache |
676 | 5.28k | convnet->acts[convnet->count - 1] = out_neuron; |
677 | 5.28k | } |
678 | 5.28k | #ifdef HAVE_CUDA |
679 | 5.28k | } |
680 | 5.28k | #endif |
681 | 5.28k | } |
682 | | |
683 | | // find the layer for scanning (it is the last convolutional layer) |
684 | | static int _ccv_convnet_find_scan(ccv_convnet_t* convnet) |
685 | 0 | { |
686 | 0 | int i; |
687 | 0 | ccv_convnet_layer_t* layers = convnet->layers; |
688 | 0 | for (i = convnet->count - 1; i >= 0; i--) |
689 | 0 | if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) |
690 | 0 | return i; |
691 | 0 | return -1; |
692 | 0 | } |
693 | | |
694 | | static int _ccv_convnet_derive_scale(ccv_convnet_t* convnet, int scan) |
695 | 0 | { |
696 | 0 | int i, scale = 1; |
697 | 0 | for (i = scan; i >= 0; i--) |
698 | 0 | { |
699 | 0 | ccv_convnet_layer_t* layer = convnet->layers + i; |
700 | 0 | switch (layer->type) |
701 | 0 | { |
702 | 0 | case CCV_CONVNET_CONVOLUTIONAL: |
703 | 0 | scale *= layer->net.convolutional.strides; |
704 | 0 | break; |
705 | 0 | case CCV_CONVNET_MAX_POOL: |
706 | 0 | case CCV_CONVNET_AVERAGE_POOL: |
707 | 0 | scale *= layer->net.pool.strides; |
708 | 0 | break; |
709 | 0 | } |
710 | 0 | } |
711 | 0 | return scale; |
712 | 0 | } |
713 | | |
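Because every convolution or pooling stride downsamples the grid, the scan scale is simply the product of the strides up to the scan layer: shifting the input by scale pixels shifts the scan layer's output by one cell. A sketch of that accumulation (derive_scale and the stride values are illustrative):

static int derive_scale(const int* strides, int n)
{
	int i, scale = 1;
	for (i = 0; i < n; i++)
		scale *= strides[i];
	return scale; // e.g. conv stride 4 then pool stride 2 -> 8
}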
714 | | static int _ccv_convnet_find_full_connect(ccv_convnet_t* convnet) |
715 | 0 | { |
716 | 0 | int i; |
717 | 0 | for (i = 0; i < convnet->count; i++) |
718 | 0 | if (convnet->layers[i].type == CCV_CONVNET_FULL_CONNECT) |
719 | 0 | return i; |
720 | 0 | return -1; |
721 | 0 | } |
722 | | |
723 | | void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch) |
724 | 0 | { |
725 | 0 | #ifdef HAVE_CUDA |
726 | 0 | if (convnet->use_cwc_accel) |
727 | 0 | cwc_convnet_classify(convnet, a, symmetric, ranks, tops, batch); |
728 | 0 | else { |
729 | 0 | #endif |
730 | 0 | int i, j, k, t; |
731 | 0 | ccv_dense_matrix_t** b = (ccv_dense_matrix_t**)alloca(sizeof(ccv_dense_matrix_t*) * (convnet->count + 1)); |
732 | 0 | int scan = _ccv_convnet_find_scan(convnet); |
733 | 0 | int scale = _ccv_convnet_derive_scale(convnet, scan); |
734 | 0 | int full_connect = _ccv_convnet_find_full_connect(convnet); |
735 | 0 | assert(scan >= 0 && scan < convnet->count); |
736 | 0 | assert(full_connect >= 0 && full_connect < convnet->count); |
737 | 0 | memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1)); |
738 | 0 | for (i = 0; i < batch; i++) |
739 | 0 | { |
740 | 0 | assert(CCV_GET_CHANNEL(a[i]->type) == convnet->channels); |
741 | 0 | assert(a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width); |
742 | 0 | assert(a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width); |
743 | | // find optimal rows and cols to slice to |
744 | 0 | int rows = convnet->rows + ((a[i]->rows - convnet->rows) / scale) * scale; |
745 | 0 | int cols = convnet->cols + ((a[i]->cols - convnet->cols) / scale) * scale; |
746 | 0 | assert(rows == convnet->input.height || cols == convnet->input.width); |
747 | 0 | assert(rows <= a[i]->rows && cols <= a[i]->cols); |
748 | 0 | ccv_dense_matrix_t* slice = 0; |
749 | 0 | ccv_slice(a[i], (ccv_matrix_t**)&slice, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols); |
750 | 0 | ccv_dense_matrix_t* mean_activity = 0; |
751 | | // scale mean activity up so it can be subtracted (from this point on, the CPU implementation is an approximation of the GPU implementation) |
752 | 0 | ccv_resample(convnet->mean_activity, &mean_activity, 0, (double)rows / (double)convnet->mean_activity->rows, (double)cols / (double)convnet->mean_activity->cols, CCV_INTER_CUBIC); |
753 | 0 | ccv_subtract(slice, mean_activity, (ccv_matrix_t**)b, CCV_32F); |
754 | 0 | ccv_matrix_free(mean_activity); |
755 | 0 | ccv_matrix_free(slice); |
756 | | // forward propagate the first few layers, up to and including the scan layer |
757 | 0 | int out_rows, out_cols, out_partition; |
758 | 0 | ccv_dense_matrix_t* c = ccv_dense_matrix_new(5 * (!!symmetric + 1), convnet->layers[full_connect].input.node.count, CCV_32F | CCV_C1, 0, 0); |
759 | 0 | for (t = 0; t <= !!symmetric; t++) |
760 | 0 | { |
761 | 0 | rows = b[0]->rows, cols = b[0]->cols; |
762 | 0 | for (j = 0; j < scan + 1; j++) |
763 | 0 | { |
764 | 0 | ccv_convnet_layer_t* layer = convnet->layers + j; |
765 | 0 | ccv_convnet_make_output(layer, rows, cols, &out_rows, &out_cols, &out_partition); |
766 | 0 | _ccv_convnet_layer_forward_propagate(layer, b[j], b + j + 1, 0); |
767 | 0 | assert(b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols); |
768 | 0 | if (j > 0) |
769 | 0 | ccv_matrix_free(b[j]); |
770 | 0 | rows = out_rows, cols = out_cols; |
771 | 0 | } |
772 | 0 | int offsets[5][2] = { |
773 | 0 | {0, 0}, |
774 | 0 | {cols - convnet->layers[scan + 1].input.matrix.cols, 0}, |
775 | 0 | {(cols - convnet->layers[scan + 1].input.matrix.cols) / 2, (rows - convnet->layers[scan + 1].input.matrix.rows) / 2}, |
776 | 0 | {0, rows - convnet->layers[scan + 1].input.matrix.rows}, |
777 | 0 | {cols - convnet->layers[scan + 1].input.matrix.cols, rows - convnet->layers[scan + 1].input.matrix.rows}, |
778 | 0 | }; |
779 | 0 | for (k = 0; k < 5; k++) |
780 | 0 | { |
781 | 0 | ccv_dense_matrix_t* input = 0; |
782 | 0 | ccv_convnet_layer_t* layer = convnet->layers + scan + 1; |
783 | 0 | ccv_slice(b[scan + 1], (ccv_matrix_t**)&input, CCV_32F, offsets[k][1], offsets[k][0], layer->input.matrix.rows, layer->input.matrix.cols); |
784 | | // copy the last layer for full connect compute |
785 | 0 | b[full_connect] = ccv_dense_matrix_new(convnet->layers[full_connect].input.matrix.rows, convnet->layers[full_connect].input.matrix.cols, CCV_NO_DATA_ALLOC | CCV_32F | convnet->layers[full_connect].input.matrix.channels, c->data.f32 + (t * 5 + k) * convnet->layers[full_connect].input.node.count, 0); |
786 | 0 | for (j = scan + 1; j < full_connect; j++) |
787 | 0 | { |
788 | 0 | layer = convnet->layers + j; |
789 | 0 | _ccv_convnet_layer_forward_propagate(layer, j > scan + 1 ? b[j] : input, b + j + 1, 0); |
790 | 0 | if (j > scan + 1) |
791 | 0 | ccv_matrix_free(b[j]); |
792 | 0 | else |
793 | 0 | ccv_matrix_free(input); |
794 | 0 | } |
795 | 0 | ccv_matrix_free(b[full_connect]); |
796 | | // set it to 0 |
797 | 0 | memset(b + scan + 2, 0, sizeof(ccv_dense_matrix_t*) * (full_connect - scan - 1)); |
798 | 0 | } |
799 | 0 | ccv_matrix_free(b[scan + 1]); |
800 | 0 | memset(b + 1, 0, sizeof(ccv_dense_matrix_t*) * (scan + 1)); |
801 | 0 | if (t < !!symmetric) |
802 | 0 | ccv_flip(b[0], &b[0], 0, CCV_FLIP_X); |
803 | 0 | } |
804 | 0 | ccv_matrix_free(b[0]); |
805 | | // now have everything in c, do the last full connect propagate |
806 | 0 | b[full_connect] = c; |
807 | 0 | for (j = full_connect; j < convnet->count; j++) |
808 | 0 | { |
809 | 0 | ccv_convnet_layer_t* layer = convnet->layers + j; |
810 | 0 | assert(layer->type == CCV_CONVNET_FULL_CONNECT); |
811 | 0 | _ccv_convnet_full_connect_forward_propagate_parallel(layer, b[j], b + j + 1); |
812 | 0 | ccv_matrix_free(b[j]); |
813 | 0 | } |
814 | 0 | ccv_dense_matrix_t* softmax = 0; |
815 | 0 | _ccv_convnet_compute_softmax_parallel(b[convnet->count], &softmax, 0); |
816 | 0 | ccv_matrix_free(b[convnet->count]); |
817 | 0 | ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0); |
818 | 0 | float* r = softmax->data.f32; |
819 | 0 | assert(tops <= softmax->cols); |
820 | 0 | for (j = 0; j < tops; j++) |
821 | 0 | { |
822 | 0 | float max_val = -1; |
823 | 0 | int max_idx = -1; |
824 | 0 | for (k = 0; k < softmax->cols; k++) |
825 | 0 | if (r[k] >= 0 && r[k] > max_val) |
826 | 0 | max_val = r[k], max_idx = k; |
827 | 0 | assert(max_idx >= 0); |
828 | 0 | r[max_idx] = -1; |
829 | 0 | ccv_classification_t classification = { |
830 | 0 | .id = max_idx, |
831 | 0 | .confidence = max_val / ((!!symmetric + 1) * 5), |
832 | 0 | }; |
833 | 0 | ccv_array_push(ranks[i], &classification); |
834 | 0 | } |
835 | 0 | ccv_matrix_free(softmax); |
836 | 0 | memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1)); |
837 | 0 | } |
838 | 0 | #ifdef HAVE_CUDA |
839 | 0 | } |
840 | 0 | #endif |
841 | 0 | } |
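ccv_convnet_classify above averages (!!symmetric + 1) * 5 views per image — five crops of the scanned feature map (four corners plus center), optionally doubled by a horizontal flip — which is why each confidence is divided by that count before being pushed into ranks. A minimal usage sketch (assuming "ccv.h" and <stdio.h> are included, `convnet` has been loaded, and `image` is a ccv_dense_matrix_t* already resized to cover the convnet input size):

ccv_array_t* rank = 0;
ccv_convnet_classify(convnet, &image, 1 /* symmetric */, &rank, 5 /* tops */, 1 /* batch */);
int i;
for (i = 0; i < rank->rnum; i++)
{
	ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(rank, i);
	printf("%d: class %d, confidence %f\n", i + 1, classification->id, classification->confidence);
}
ccv_array_free(rank);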
842 | | |
843 | | #endif |
844 | | |
845 | | #ifdef HAVE_GSL |
846 | | |
847 | | // compute back propagated gradient & weight update delta |
848 | | static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params) |
849 | 7 | { |
850 | | // a is the input gradient (for back prop), n is the output (from forward prop), |
851 | | // m is the input (for forward prop), b is the output gradient (also known as the propagated error) |
852 | | // n is only needed to gate ReLU: where the forward output was zero, no gradient flows |
853 | 7 | int rows, cols, partition; |
854 | 7 | ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition); |
855 | 7 | int ch = layer->net.convolutional.channels; |
856 | 7 | int count = layer->net.convolutional.count; |
857 | 7 | int strides = layer->net.convolutional.strides; |
858 | 7 | int border = layer->net.convolutional.border; |
859 | 7 | int kernel_rows = layer->net.convolutional.rows; |
860 | 7 | int kernel_cols = layer->net.convolutional.cols; |
861 | 7 | assert(a->rows == rows); |
862 | 7 | assert(a->cols == cols); |
863 | 7 | assert(CCV_GET_CHANNEL(a->type) == count); |
864 | 7 | int a_rows = a->rows, a_cols = a->cols, a_ch = CCV_GET_CHANNEL(a->type); |
865 | 7 | a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count; |
866 | 7 | assert(CCV_GET_CHANNEL(m->type) == ch); |
867 | 7 | assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F); |
868 | 7 | int count_per_partition = count / partition; |
869 | 7 | int ch_per_partition = ch / partition; |
870 | | // update weight gradient |
871 | 60 | parallel_for7 (k, count) { |
872 | 60 | int i, j, x, y, c; |
873 | 60 | int p = k / count_per_partition; |
874 | 60 | float* mp = m->data.f32 + p * ch_per_partition; |
875 | 60 | float* ap = a->data.f32 + k; |
876 | 60 | float* np = n->data.f32 + k; |
877 | 60 | float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition; |
878 | 60 | float bias = 0; |
879 | 1.81k | for (i = 0; i < rows; i++1.75k ) |
880 | 1.75k | { |
881 | 1.75k | int comy = ccv_max(i * strides - border, 0) - (i * strides - border); |
882 | 1.75k | int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows)); |
883 | 1.75k | comy *= ch_per_partition * kernel_cols; |
884 | 55.6k | for (j = 0; j < cols; j++53.9k ) |
885 | 53.9k | { |
886 | 53.9k | if (np[j * count] > 0) |
887 | 43.5k | { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */ |
888 | 43.5k | float v = ap[j * count]; |
889 | 43.5k | bias += v; |
890 | 43.5k | int comx = ccv_max(j * strides - border, 0) - (j * strides - border); |
891 | 43.5k | int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols)); |
892 | 43.5k | float* w = update_w + comx * ch_per_partition + comy; |
893 | 43.5k | float* mpz = mp + ccv_max(j * strides - border, 0) * ch; |
894 | | /* when we have border, we simply do zero padding */ |
895 | 252k | for (y = 0; y < maxy; y++209k ) |
896 | 209k | { |
897 | 1.21M | for (x = 0; x < maxx; x++1.00M ) |
898 | 3.77M | for (c = 0; 1.00M c < ch_per_partition; c++2.76M ) |
899 | 2.76M | w[x * ch_per_partition + c] += v * mpz[x * ch + c]; |
900 | 209k | w += kernel_cols * ch_per_partition; |
901 | 209k | mpz += m->cols * ch; |
902 | 209k | } |
903 | 43.5k | } |
904 | 53.9k | } |
905 | 1.75k | ap += a->cols * count; |
906 | 1.75k | np += n->cols * count; |
907 | 1.75k | mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); |
908 | 1.75k | } |
909 | 60 | update_params->bias[k] += bias; |
910 | 60 | } parallel_endfor |
911 | 7 | if (b) |
912 | 6 | { |
913 | 6 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0); |
914 | | // clear it up before propagate result |
915 | 6 | ccv_zero(db); |
916 | 6 | int k; |
917 | 62 | for (k = 0; k < count; k++56 ) |
918 | 56 | { |
919 | 56 | int i, j, x, y, c; |
920 | 56 | int p = k / count_per_partition; |
921 | 56 | float* bp = db->data.f32 + p * ch_per_partition; |
922 | 56 | float* ap = a->data.f32 + k; |
923 | 56 | float* np = n->data.f32 + k; |
924 | 56 | float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition; |
925 | 1.79k | for (i = 0; i < rows; i++1.73k ) |
926 | 1.73k | { |
927 | 1.73k | int comy = ccv_max(i * strides - border, 0) - (i * strides - border); |
928 | 1.73k | int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows)); |
929 | 1.73k | comy *= ch_per_partition * kernel_cols; |
930 | 55.5k | for (j = 0; j < cols; j++53.8k ) |
931 | 53.8k | { |
932 | 53.8k | if (np[j * count] > 0) |
933 | 43.4k | { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */ |
934 | 43.4k | float v = ap[j * count]; |
935 | 43.4k | int comx = ccv_max(j * strides - border, 0) - (j * strides - border); |
936 | 43.4k | int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols)); |
937 | 43.4k | float* w = layer_w + comx * ch_per_partition + comy; |
938 | 43.4k | float* bpz = bp + ccv_max(j * strides - border, 0) * ch; |
939 | | /* when we have border, we simply do zero padding */ |
940 | 252k | for (y = 0; y < maxy; y++209k ) |
941 | 209k | { |
942 | 1.21M | for (x = 0; x < maxx; x++1.00M ) |
943 | 3.76M | for (c = 0; 1.00M c < ch_per_partition; c++2.76M ) |
944 | 2.76M | bpz[x * ch + c] += v * w[x * ch_per_partition + c]; |
945 | 209k | w += kernel_cols * ch_per_partition; |
946 | 209k | bpz += db->cols * ch; |
947 | 209k | } |
948 | 43.4k | } |
949 | 53.8k | } |
950 | 1.73k | ap += a->cols * count; |
951 | 1.73k | np += n->cols * count; |
952 | 1.73k | bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); |
953 | 1.73k | } |
954 | 56 | } |
955 | 6 | } |
956 | 7 | a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch; |
957 | 7 | } |
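In math form, with stride s, border p, and ReLU gating by the forward output n (no gradient flows where n_k(i,j) <= 0), the two loop nests above accumulate

\[
\Delta W_k[y,x,c] \mathrel{+}= \sum_{(i,j):\,n_k(i,j)>0} a_k(i,j)\, m(is - p + y,\ js - p + x,\ c), \qquad \Delta \mathrm{bias}_k \mathrel{+}= \sum_{(i,j):\,n_k(i,j)>0} a_k(i,j),
\]

into update_params, and propagate the error into b via

\[
\frac{\partial L}{\partial m}(is - p + y,\ js - p + x,\ c) \mathrel{+}= a_k(i,j)\, W_k[y,x,c],
\]

with out-of-range taps simply dropped (the zero-padding case).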
958 | | |
959 | | static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params) |
960 | 3 | { |
961 | | // a is the input gradient (for back prop), y is the output (from forward prop) |
962 | | // x is the input (for forward prop), b is the output gradient (also known as the propagated error) |
963 | 3 | ccv_dense_matrix_t* db = 0; |
964 | 3 | if (b) |
965 | 3 | db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0); |
966 | 3 | int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type); |
967 | 3 | x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1; |
968 | 3 | x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type); |
969 | 3 | int i; |
970 | 3 | if (layer->net.full_connect.relu) |
971 | 0 | for (i = 0; i < y->rows; i++) |
972 | 0 | if (y->data.f32[i] <= 0) |
973 | 0 | a->data.f32[i] = 0; |
974 | 3 | ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0); |
975 | 3 | ccv_dense_matrix_t* dw = &w; |
976 | | // compute bias gradient |
977 | 3 | ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0); |
978 | 3 | ccv_dense_matrix_t* dbias = &bias; |
979 | 3 | ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0); |
980 | | // compute weight gradient |
981 | 3 | ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0); |
982 | 3 | w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0); |
983 | | // propagate error |
984 | 3 | if (db) |
985 | 3 | { |
986 | 3 | db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1; |
987 | 3 | db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type); |
988 | 3 | ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0); |
989 | 3 | db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch; |
990 | 3 | db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type); |
991 | 3 | } |
992 | 3 | x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)) | x_ch; |
993 | 3 | x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type) * CCV_GET_CHANNEL(x->type); |
994 | 3 | } |
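The calls above are the standard fully-connected gradients. With incoming gradient a, forward input x flattened to a column vector, and weight matrix W, and after first zeroing a_i wherever the ReLU output y_i was non-positive:

\[
\Delta b \mathrel{+}= a, \qquad \Delta W \mathrel{+}= a\, x^{\top}, \qquad \frac{\partial L}{\partial x} = W^{\top} a,
\]

which is exactly the ccv_add and the two ccv_gemm calls (B-transpose for the outer product, A-transpose for the error propagation).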
995 | | |
996 | | static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t* denoms, ccv_dense_matrix_t** b) |
997 | 4 | { |
998 | 4 | int rows, cols, partition; |
999 | 4 | ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition); |
1000 | 4 | int size = layer->net.rnorm.size; |
1001 | 4 | float alpha = layer->net.rnorm.alpha; |
1002 | 4 | float beta = layer->net.rnorm.beta; |
1003 | 4 | int way = size / 2; |
1004 | 4 | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
1005 | 4 | int ch = CCV_GET_CHANNEL(a->type); |
1006 | 4 | int type = CCV_32F | ch; |
1007 | 4 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); |
1008 | 4 | int i, j, k, x, p; |
1009 | 4 | float* ap = a->data.f32; |
1010 | 4 | float* np = n->data.f32; |
1011 | 4 | float* mp = m->data.f32; |
1012 | 4 | float* dp = denoms->data.f32; |
1013 | 4 | float* bp = db->data.f32; |
1014 | 4 | int ch_per_partition = ch / partition; |
1015 | 116 | for (i = 0; i < db->rows; i++112 ) |
1016 | 112 | { |
1017 | 3.26k | for (j = 0; j < db->cols; j++3.14k ) |
1018 | 7.02k | for (p = 0; 3.14k p < partition; p++3.87k ) |
1019 | 16.4k | for (k = 0; 3.87k k < ch_per_partition; k++12.5k ) |
1020 | 12.5k | { |
1021 | 12.5k | float nom = 0; |
1022 | 42.6k | for (x = ccv_max12.5k (k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++30.0k ) |
1023 | 30.0k | nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition]; |
1024 | 12.5k | bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta); |
1025 | 12.5k | } |
1026 | 112 | ap += a->cols * ch; |
1027 | 112 | np += n->cols * ch; |
1028 | 112 | mp += m->cols * ch; |
1029 | 112 | dp += denoms->cols * ch; |
1030 | 112 | bp += db->cols * ch; |
1031 | 112 | } |
1032 | 4 | } |
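Given the forward definition n_k = m_k d_k^{-\beta} with d_k = \kappa + \alpha \sum_{x \in \Omega(k)} m_x^2 (the cached denoms), where \Omega(k) is the cross-channel window of width `size` around channel k, the inner loop above evaluates

\[
\frac{\partial L}{\partial m_k} \;=\; a_k\, d_k^{-\beta} \;-\; 2\alpha\beta\, m_k \sum_{x \in \Omega(k)} \frac{a_x\, n_x}{d_x},
\]

where the sum is the `nom` accumulator.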
1033 | | |
1034 | | static void _ccv_convnet_max_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b) |
1035 | 1 | { |
1036 | | // a is the input gradient (for back prop), n is the output (from forward prop), |
1037 | | // m is the input (for forward prop), b is the output gradient (also known as the propagated error) |
1038 | | // the pooling layer doesn't need dropout |
1039 | 1 | if (b) |
1040 | 1 | { |
1041 | 1 | assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type)); |
1042 | 1 | assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type)); |
1043 | 1 | int ch = CCV_GET_CHANNEL(a->type); |
1044 | 1 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0); |
1045 | 1 | ccv_zero(db); |
1046 | 1 | int size = layer->net.pool.size; |
1047 | 1 | int strides = layer->net.pool.strides; |
1048 | 1 | int border = layer->net.pool.border; |
1049 | 1 | int i, j, k, x, y; |
1050 | 1 | float* ap = a->data.f32; |
1051 | 1 | float* bp = db->data.f32; |
1052 | 1 | float* np = n->data.f32; |
1053 | 1 | float* mp = m->data.f32; |
1054 | 16 | for (i = 0; i < a->rows; i++15 ) |
1055 | 15 | { |
1056 | 15 | const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border); |
1057 | 15 | const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border); |
1058 | 240 | for (j = 0; j < a->cols; j++225 ) |
1059 | 225 | { |
1060 | 225 | const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border); |
1061 | 225 | const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border); |
1062 | 675 | for (k = 0; k < ch; k++450 ) |
1063 | 450 | { |
1064 | 450 | float v = np[j * ch + k]; |
1065 | 450 | float u = ap[j * ch + k]; |
1066 | 1.80k | for (y = start_y; y < end_y; y++1.35k ) |
1067 | 5.40k | for (x = start_x; 1.35k x < end_x; x++4.05k ) |
1068 | | // we have to do a direct comparison, otherwise it would contribute to too many cells |
1069 | | // and the propagation wouldn't work. But the CPU will have a different result compared with the GPU |
1070 | 4.05k | if (mp[(j * strides - border + x + (y - border) * m->cols) * ch + k] == v) |
1071 | 450 | bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u; |
1072 | 450 | } |
1073 | 225 | } |
1074 | 15 | ap += a->cols * ch; |
1075 | 15 | np += n->cols * ch; |
1076 | 15 | bp += db->cols * ch * strides; |
1077 | 15 | mp += m->cols * ch * strides; |
1078 | 15 | } |
1079 | 1 | } |
1080 | 1 | } |
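Max pooling has no weights; its backward pass just routes each incoming gradient value to the input cells that hold the pooled maximum:

\[
\frac{\partial L}{\partial m_q} \mathrel{+}= \sum_{(i,j)\,:\,q \in \mathrm{win}(i,j)} a(i,j)\, \big[\, m_q = n(i,j) \,\big],
\]

which is the equality test in the innermost loop.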
1081 | | |
1082 | | static void _ccv_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b) |
1083 | 1 | { |
1084 | | // a is the input gradient (for back prop), m is the input (from forward prop), |
1085 | | // b is the output gradient (also known as the propagated error) |
1086 | | // the pooling layer doesn't need dropout |
1087 | 1 | if (b) |
1088 | 1 | { |
1089 | 1 | assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type)); |
1090 | 1 | int ch = CCV_GET_CHANNEL(a->type); |
1091 | 1 | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0); |
1092 | 1 | ccv_zero(db); |
1093 | 1 | int size = layer->net.pool.size; |
1094 | 1 | int strides = layer->net.pool.strides; |
1095 | 1 | int border = layer->net.pool.border; |
1096 | 1 | int i, j, k, x, y; |
1097 | 1 | float* ap = a->data.f32; |
1098 | 1 | float* bp = db->data.f32; |
1099 | 16 | for (i = 0; i < a->rows; i++15 ) |
1100 | 15 | { |
1101 | 15 | const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border); |
1102 | 15 | const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border); |
1103 | 240 | for (j = 0; j < a->cols; j++225 ) |
1104 | 225 | { |
1105 | 225 | const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border); |
1106 | 225 | const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border); |
1107 | 675 | for (k = 0; k < ch; k++450 ) |
1108 | 450 | { |
1109 | 450 | float u = ap[j * ch + k] / ((end_x - start_x) * (end_y - start_y)); |
1110 | 1.80k | for (y = start_y; y < end_y; y++1.35k ) |
1111 | 5.40k | for (x = start_x; 1.35k x < end_x; x++4.05k ) |
1112 | 4.05k | bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u; |
1113 | 450 | } |
1114 | 225 | } |
1115 | 15 | ap += a->cols * ch; |
1116 | 15 | bp += db->cols * ch * strides; |
1117 | 15 | } |
1118 | 1 | } |
1119 | 1 | } |
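Average pooling spreads each incoming gradient value uniformly over its (border-clipped) window:

\[
\frac{\partial L}{\partial m_q} \mathrel{+}= \sum_{(i,j)\,:\,q \in \mathrm{win}(i,j)} \frac{a(i,j)}{|\mathrm{win}(i,j)|},
\]

with |win(i,j)| = (end_x - start_x) * (end_y - start_y) as computed above.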
1120 | | |
1121 | | static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, ccv_dense_matrix_t* dloss, ccv_convnet_t* update_params) |
1122 | 1 | { |
1123 | 1 | int i; |
1124 | 1 | ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1; |
1125 | 1 | assert(layer->type == CCV_CONVNET_FULL_CONNECT); // the last layer has to be a full connect one to generate the softmax result |
1126 | 1 | _ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 00 , update_params->layers + convnet->count - 1); |
1127 | 2 | for (i = convnet->count - 2; i >= 0; i--1 ) |
1128 | 1 | { |
1129 | 1 | layer = convnet->layers + i; |
1130 | 1 | switch (layer->type) |
1131 | 1 | { |
1132 | 1 | case CCV_CONVNET_CONVOLUTIONAL: |
1133 | 1 | _ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1]0 : a, i > 0 ? update_params->acts + i - 10 : 0, update_params->layers + i); |
1134 | 1 | break; |
1135 | 0 | case CCV_CONVNET_FULL_CONNECT: |
1136 | 0 | _ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i); |
1137 | 0 | break; |
1138 | 0 | case CCV_CONVNET_LOCAL_RESPONSE_NORM: |
1139 | 0 | _ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? update_params->acts + i - 1 : 0); |
1140 | 0 | break; |
1141 | 0 | case CCV_CONVNET_MAX_POOL: |
1142 | 0 | _ccv_convnet_max_pool_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0); |
1143 | 0 | break; |
1144 | 0 | case CCV_CONVNET_AVERAGE_POOL: |
1145 | 0 | _ccv_convnet_average_pool_backward_propagate(layer, update_params->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0); |
1146 | 0 | break; |
1147 | 1 | } |
1148 | 1 | } |
1149 | 1 | } |
1150 | | |
1151 | | static void _ccv_convnet_update(ccv_convnet_t* convnet, int batch, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params) |
1152 | 0 | { |
1153 | 0 | int i, j; |
1154 | 0 | float learn_rate; |
1155 | 0 | for (i = 0; i < convnet->count; i++) |
1156 | 0 | switch (update_params->layers[i].type) |
1157 | 0 | { |
1158 | 0 | case CCV_CONVNET_CONVOLUTIONAL: |
1159 | 0 | { |
1160 | 0 | float* w = convnet->layers[i].w; |
1161 | 0 | float* vw = momentum->layers[i].w; |
1162 | 0 | float* dw = update_params->layers[i].w; |
1163 | 0 | learn_rate = layer_params[i].w.learn_rate / batch; |
1164 | 0 | for (j = 0; j < convnet->layers[i].wnum; j++) |
1165 | 0 | { |
1166 | 0 | vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j]; |
1167 | 0 | w[j] += vw[j]; |
1168 | 0 | } |
1169 | 0 | float* bias = convnet->layers[i].bias; |
1170 | 0 | float* vbias = momentum->layers[i].bias; |
1171 | 0 | float* dbias = update_params->layers[i].bias; |
1172 | 0 | learn_rate = layer_params[i].bias.learn_rate / batch; |
1173 | 0 | for (j = 0; j < convnet->layers[i].net.convolutional.count; j++) |
1174 | 0 | { |
1175 | 0 | vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j]; |
1176 | 0 | bias[j] += vbias[j]; |
1177 | 0 | } |
1178 | 0 | break; |
1179 | 0 | } |
1180 | 0 | case CCV_CONVNET_FULL_CONNECT: |
1181 | 0 | { |
1182 | 0 | float* w = convnet->layers[i].w; |
1183 | 0 | float* vw = momentum->layers[i].w; |
1184 | 0 | float* dw = update_params->layers[i].w; |
1185 | 0 | learn_rate = layer_params[i].w.learn_rate / batch; |
1186 | 0 | for (j = 0; j < convnet->layers[i].wnum; j++) |
1187 | 0 | { |
1188 | 0 | vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j]; |
1189 | 0 | w[j] += vw[j]; |
1190 | 0 | } |
1191 | 0 | float* bias = convnet->layers[i].bias; |
1192 | 0 | float* vbias = momentum->layers[i].bias; |
1193 | 0 | float* dbias = update_params->layers[i].bias; |
1194 | 0 | learn_rate = layer_params[i].bias.learn_rate / batch; |
1195 | 0 | for (j = 0; j < convnet->layers[i].net.full_connect.count; j++) |
1196 | 0 | { |
1197 | 0 | vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j]; |
1198 | 0 | bias[j] += vbias[j]; |
1199 | 0 | } |
1200 | 0 | break; |
1201 | 0 | } |
1202 | 0 | } |
1203 | 0 | } |
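Both the convolutional and the full connect branches above apply the same momentum SGD rule with weight decay. With momentum \mu, decay \lambda, learning rate \eta, mini-batch size B, and g the delta accumulated over the mini-batch:

\[
v \leftarrow \mu v \;-\; \eta \lambda\, w \;+\; \frac{\eta}{B}\, g, \qquad w \leftarrow w + v,
\]

and likewise for the biases with their own hyperparameters. Since the training loop seeds the backward pass with the negated loss derivative (see _ccv_convnet_supervised_train below), the accumulated g already points in the descent direction, so it is added rather than subtracted.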
1204 | | |
1205 | | static void _ccv_convnet_update_zero(ccv_convnet_t* update_params) |
1206 | 9 | { |
1207 | 9 | int i; |
1208 | 20 | for (i = 0; i < update_params->count; i++11 ) |
1209 | 11 | switch (update_params->layers[i].type) |
1210 | 11 | { |
1211 | 7 | case CCV_CONVNET_CONVOLUTIONAL: |
1212 | 7 | memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum); |
1213 | 7 | memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.convolutional.count); |
1214 | 7 | break; |
1215 | 3 | case CCV_CONVNET_FULL_CONNECT: |
1216 | 3 | assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0); |
1217 | 3 | memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum); |
1218 | 3 | memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.full_connect.count); |
1219 | 3 | break; |
1220 | 11 | } |
1221 | 9 | } |
1222 | | |
1223 | | static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet) |
1224 | 8 | { |
1225 | 8 | ccv_convnet_t* update_params = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(ccv_dense_matrix_t*) * convnet->count); |
1226 | 8 | update_params->reserved = 0; |
1227 | 8 | update_params->layers = (ccv_convnet_layer_t*)(update_params + 1); |
1228 | 8 | update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count); |
1229 | 8 | memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count); |
1230 | 8 | update_params->denoms = 0; |
1231 | 8 | update_params->input = convnet->input; |
1232 | 8 | update_params->rows = convnet->rows; |
1233 | 8 | update_params->cols = convnet->cols; |
1234 | 8 | update_params->count = convnet->count; |
1235 | 8 | update_params->channels = convnet->channels; |
1236 | 8 | update_params->mean_activity = 0; |
1237 | 8 | int i; |
1238 | 18 | for (i = 0; i < convnet->count; i++10 ) |
1239 | 10 | { |
1240 | 10 | update_params->layers[i].type = convnet->layers[i].type; |
1241 | 10 | update_params->layers[i].input = convnet->layers[i].input; |
1242 | 10 | update_params->layers[i].net = convnet->layers[i].net; |
1243 | 10 | update_params->layers[i].wnum = convnet->layers[i].wnum; |
1244 | 10 | update_params->layers[i].reserved = 0; |
1245 | 10 | switch (update_params->layers[i].type) |
1246 | 10 | { |
1247 | 6 | case CCV_CONVNET_CONVOLUTIONAL: |
1248 | 6 | update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count, sizeof(float)); |
1249 | 6 | update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum; |
1250 | 6 | break; |
1251 | 3 | case CCV_CONVNET_FULL_CONNECT: |
1252 | 3 | assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0); |
1253 | 3 | update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count, sizeof(float)); |
1254 | 3 | update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum; |
1255 | 3 | break; |
1256 | 1 | case CCV_CONVNET_LOCAL_RESPONSE_NORM: |
1257 | 1 | case CCV_CONVNET_MAX_POOL: |
1258 | 1 | case CCV_CONVNET_AVERAGE_POOL: |
1259 | 1 | update_params->layers[i].w = 0; |
1260 | 1 | update_params->layers[i].bias = 0; |
1261 | 1 | break; |
1262 | 10 | } |
1263 | 10 | } |
1264 | 8 | return update_params; |
1265 | 8 | } |
1266 | | |
1267 | | static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type) |
1268 | 5.26k | { |
1269 | 5.26k | int ch = CCV_GET_CHANNEL(a->type); |
1270 | 5.26k | assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); |
1271 | 5.26k | ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0); |
1272 | 5.26k | int i; |
1273 | 5.26k | float* aptr = a->data.f32; |
1274 | 5.26k | float* bptr = db->data.f32; |
1275 | 5.26k | double max = aptr[0]; |
1276 | 7.85M | for (i = 1; i < a->rows * a->cols * ch; i++7.84M ) |
1277 | 7.84M | if (aptr[i] > max) |
1278 | 231k | max = aptr[i]; |
1279 | 5.26k | double tt = 0; |
1280 | 7.85M | for (i = 0; i < a->rows * a->cols * ch; i++7.85M ) |
1281 | 7.85M | tt += (bptr[i] = expf(aptr[i] - max)); |
1282 | 5.26k | tt = 1.0 / tt; |
1283 | 7.85M | for (i = 0; i < a->rows * a->cols * ch; i++7.85M ) |
1284 | 7.85M | bptr[i] *= tt; |
1285 | 5.26k | } |
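Subtracting the row maximum before expf is the usual overflow guard; the shared factor e^{-max} cancels between numerator and denominator, so the result is unchanged:

\[
\operatorname{softmax}(a)_i \;=\; \frac{e^{\,a_i - \max_j a_j}}{\sum_k e^{\,a_k - \max_j a_j}}.
\]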
1286 | | |
1287 | | static void _ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch) |
1288 | 0 | { |
1289 | 0 | assert(batch == 1); |
1290 | 0 | ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1); |
1291 | 0 | int i, c = 0; |
1292 | 0 | ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1]; |
1293 | 0 | float maxc = b->data.f32[0]; |
1294 | 0 | for (i = 1; i < b->rows; i++) |
1295 | 0 | if (b->data.f32[i] > maxc) |
1296 | 0 | maxc = b->data.f32[i], c = i; |
1297 | 0 | labels[0] = c; |
1298 | 0 | } |
1299 | | |
1300 | | #endif |
1301 | | |
1302 | | #ifndef CASE_TESTS |
1303 | | |
1304 | | void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params) |
1305 | 0 | { |
1306 | 0 | #ifdef HAVE_GSL |
1307 | 0 | #ifdef HAVE_CUDA |
1308 | 0 | if (convnet->use_cwc_accel) |
1309 | 0 | cwc_convnet_supervised_train(convnet, categorizeds, tests, filename, params); |
1310 | 0 | else { |
1311 | 0 | #endif |
1312 | 0 | int i, j, t; |
1313 | 0 | gsl_rng_env_setup(); |
1314 | 0 | gsl_rng* rng = gsl_rng_alloc(gsl_rng_default); |
1315 | 0 | int aligned_padding = categorizeds->rnum % params.mini_batch; |
1316 | 0 | int aligned_rnum = categorizeds->rnum - aligned_padding; |
1317 | 0 | int* idx = (int*)ccmalloc(sizeof(int) * (categorizeds->rnum + aligned_padding)); |
1318 | 0 | for (i = 0; i < categorizeds->rnum; i++) |
1319 | 0 | idx[i] = i; |
1320 | 0 | gsl_ran_shuffle(rng, idx, categorizeds->rnum, sizeof(int)); |
1321 | | // the last layer has to be full connect, thus we can use it as the softmax layer |
1322 | 0 | assert(convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT); |
1323 | 0 | int category_count = convnet->layers[convnet->count - 1].net.full_connect.count; |
1324 | 0 | ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); |
1325 | 0 | ccv_convnet_t* momentum = _ccv_convnet_update_new(convnet); |
1326 | 0 | for (t = 0; t < params.max_epoch; t++) |
1327 | 0 | { |
1328 | 0 | for (i = 0; i < aligned_rnum; i++) |
1329 | 0 | { |
1330 | | // dropout the first hidden layer |
1331 | 0 | ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx[i]); |
1332 | 0 | ccv_convnet_encode(convnet, &categorized->matrix, convnet->acts + convnet->count - 1, 1); |
1333 | 0 | ccv_dense_matrix_t* softmax = convnet->acts[convnet->count - 1]; |
1334 | 0 | float* dloss = softmax->data.f32; |
1335 | 0 | _ccv_convnet_compute_softmax(softmax, &softmax, 0); |
1336 | 0 | assert(softmax->rows == category_count && softmax->cols == 1); |
1337 | | // this mashes softmax and logistic regression together |
1338 | | // also, it gives you -D[loss w.r.t. x_i] (note the negative sign) |
1339 | 0 | for (j = 0; j < category_count; j++) |
1340 | 0 | dloss[j] = (j == categorized->c) - dloss[j]; |
1341 | 0 | _ccv_convnet_propagate_loss(convnet, categorized->matrix, softmax, update_params); |
1342 | 0 | if ((i + 1) % params.mini_batch == 0) |
1343 | 0 | { |
1344 | 0 | FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => stochastic gradient descent at %d / %d", t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum / params.mini_batch); |
1345 | | // update weights |
1346 | 0 | _ccv_convnet_update(convnet, params.mini_batch, momentum, update_params, params.layer_params); |
1347 | 0 | _ccv_convnet_update_zero(update_params); |
1348 | | // compact the convnet to avoid any stale temporary resources |
1349 | 0 | ccv_convnet_compact(convnet); |
1350 | 0 | } |
1351 | 0 | } |
1352 | 0 | int miss = 0; |
1353 | 0 | for (i = 0; i < tests->rnum; i++) |
1354 | 0 | { |
1355 | 0 | FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => going through %d / %d for tests", t + 1, params.max_epoch, i + 1, tests->rnum); |
1356 | 0 | ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i); |
1357 | 0 | int c = 0; |
1358 | 0 | _ccv_convnet_classify(convnet, &test->matrix, &c, 1); |
1359 | 0 | if (c != test->c) |
1360 | 0 | ++miss; |
1361 | 0 | } |
1362 | 0 | FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => with miss rate %.2f%%\n", t + 1, params.max_epoch, miss * 100.0f / tests->rnum); |
1363 | 0 | if (t + 1 < params.max_epoch) |
1364 | 0 | { |
1365 | | // reshuffle the parts we visited and move the rest to the beginning |
1366 | 0 | memcpy(idx + categorizeds->rnum, idx + aligned_rnum, sizeof(int) * aligned_padding); |
1367 | 0 | memmove(idx + aligned_padding, idx, sizeof(int) * aligned_rnum); |
1368 | 0 | memcpy(idx, idx + categorizeds->rnum, sizeof(int) * aligned_padding); |
1369 | 0 | gsl_ran_shuffle(rng, idx + aligned_padding, aligned_rnum, sizeof(int)); |
1370 | 0 | } |
1371 | 0 | } |
1372 | 0 | ccfree(idx); |
1373 | 0 | ccv_convnet_free(momentum); |
1374 | 0 | ccv_convnet_free(update_params); |
1375 | 0 | gsl_rng_free(rng); |
1376 | 0 | #ifdef HAVE_CUDA |
1377 | 0 | } |
1378 | 0 | #endif |
1379 | | #else |
1380 | | assert(0 && "ccv_convnet_supervised_train requires GSL library support"); |
1381 | | #endif |
1382 | 0 | } |
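The dloss seeding in the epoch loop above is the classic softmax-plus-cross-entropy shortcut: for L = -\log p_c with p = softmax(x),

\[
\frac{\partial L}{\partial x_j} \;=\; p_j - \mathbb{1}[j = c], \qquad \texttt{dloss}_j \;=\; \mathbb{1}[j = c] - p_j,
\]

and storing the negated derivative is what lets _ccv_convnet_update add the accumulated deltas instead of subtracting them.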
1383 | | |
1384 | | void ccv_convnet_compact(ccv_convnet_t* convnet) |
1385 | 2.37k | { |
1386 | 2.37k | #ifdef HAVE_CUDA |
1387 | 2.37k | cwc_convnet_compact(convnet); |
1388 | 2.37k | #endif |
1389 | 2.37k | int i; |
1390 | 5.91k | for (i = 0; i < convnet->count; i++3.53k ) |
1391 | 3.53k | { |
1392 | 3.53k | if (convnet->acts[i]) |
1393 | 1.16k | ccv_matrix_free(convnet->acts[i]); |
1394 | 3.53k | convnet->acts[i] = 0; |
1395 | 3.53k | if (convnet->denoms) |
1396 | 3.52k | { |
1397 | 3.52k | if (convnet->denoms[i]) |
1398 | 821 | ccv_matrix_free(convnet->denoms[i]); |
1399 | 3.52k | convnet->denoms[i] = 0; |
1400 | 3.52k | } |
1401 | 3.53k | if (SIMD(convnet->layers + i)) |
1402 | 2.37k | { |
1403 | 2.37k | ccfree(convnet->layers[i].reserved); |
1404 | 2.37k | convnet->layers[i].reserved = 0; |
1405 | 2.37k | } |
1406 | 3.53k | } |
1407 | 2.37k | } |
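ccv_convnet_compact releases the cached per-layer activations and denominators (and, when compiled with HAVE_CUDA, the cwc scratch), so a caller running inference in bursts can reclaim memory between them. A hedged usage sketch, with a hypothetical file name and input preparation elided:

    ccv_convnet_t* convnet = ccv_convnet_read(0 /* no CUDA */, "network.sqlite3"); // hypothetical path
    if (convnet)
    {
    	ccv_dense_matrix_t* input = 0;
    	/* ... build a 32F input of convnet->rows x convnet->cols, mean-subtracted ... */
    	ccv_dense_matrix_t* output = 0;
    	ccv_convnet_encode(convnet, &input, &output, 1); // forward pass, batch of 1
    	/* ... consume output before compacting; intermediate acts are cached inside convnet ... */
    	ccv_convnet_compact(convnet); // frees cached acts / denoms / SIMD scratch
    	ccv_convnet_free(convnet); // compacts again, then frees the weights
    }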
1408 | | |
1409 | | void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params) |
1410 | 0 | { |
1411 | 0 | sqlite3* db = 0; |
1412 | 0 | if (SQLITE_OK == sqlite3_open(filename, &db)) |
1413 | 0 | { |
1414 | 0 | const char layer_create_table_qs[] = |
1415 | 0 | "CREATE TABLE IF NOT EXISTS layer_params " |
1416 | 0 | "(layer INTEGER PRIMARY KEY ASC, type INTEGER, " |
1417 | 0 | "input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, " |
1418 | 0 | "output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, " |
1419 | 0 | "output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);" |
1420 | 0 | "CREATE TABLE IF NOT EXISTS convnet_params " |
1421 | 0 | "(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);" |
1422 | 0 | "CREATE TABLE IF NOT EXISTS layer_data " |
1423 | 0 | "(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB, half_precision INTEGER);"; |
1424 | 0 | assert(SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0)); |
1425 | 0 | const char layer_params_insert_qs[] = |
1426 | 0 | "REPLACE INTO layer_params " |
1427 | 0 | "(layer, type, " |
1428 | 0 | "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " |
1429 | 0 | "output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " |
1430 | 0 | "output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES " |
1431 | 0 | "($layer, $type, " // 1 |
1432 | 0 | "$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6 |
1433 | 0 | "$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13 |
1434 | 0 | "$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18 |
1435 | 0 | sqlite3_stmt* layer_params_insert_stmt = 0; |
1436 | 0 | assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0)); |
1437 | 0 | const char layer_data_insert_qs[] = |
1438 | 0 | "REPLACE INTO layer_data " |
1439 | 0 | "(layer, weight, bias, half_precision) VALUES ($layer, $weight, $bias, $half_precision);"; |
1440 | 0 | sqlite3_stmt* layer_data_insert_stmt = 0; |
1441 | 0 | assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0)); |
1442 | 0 | int i; |
1443 | 0 | for (i = 0; i < convnet->count; i++) |
1444 | 0 | { |
1445 | 0 | ccv_convnet_layer_t* layer = convnet->layers + i; |
1446 | | // insert layer params |
1447 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 1, i); |
1448 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 2, layer->type); |
1449 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 3, layer->input.matrix.rows); |
1450 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 4, layer->input.matrix.cols); |
1451 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 5, layer->input.matrix.channels); |
1452 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.matrix.partition); |
1453 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 7, layer->input.node.count); |
1454 | 0 | switch (layer->type) |
1455 | 0 | { |
1456 | 0 | case CCV_CONVNET_CONVOLUTIONAL: |
1457 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.rows); |
1458 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.cols); |
1459 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.channels); |
1460 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.partition); |
1461 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.count); |
1462 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.convolutional.strides); |
1463 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.convolutional.border); |
1464 | 0 | break; |
1465 | 0 | case CCV_CONVNET_FULL_CONNECT: |
1466 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count); |
1467 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu); |
1468 | 0 | break; |
1469 | 0 | case CCV_CONVNET_MAX_POOL: |
1470 | 0 | case CCV_CONVNET_AVERAGE_POOL: |
1471 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.strides); |
1472 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.pool.border); |
1473 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.pool.size); |
1474 | 0 | break; |
1475 | 0 | case CCV_CONVNET_LOCAL_RESPONSE_NORM: |
1476 | 0 | sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.rnorm.size); |
1477 | 0 | sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.kappa); |
1478 | 0 | sqlite3_bind_double(layer_params_insert_stmt, 17, layer->net.rnorm.alpha); |
1479 | 0 | sqlite3_bind_double(layer_params_insert_stmt, 18, layer->net.rnorm.beta); |
1480 | 0 | break; |
1481 | 0 | } |
1482 | 0 | assert(SQLITE_DONE == sqlite3_step(layer_params_insert_stmt)); |
1483 | 0 | sqlite3_reset(layer_params_insert_stmt); |
1484 | 0 | sqlite3_clear_bindings(layer_params_insert_stmt); |
1485 | | // insert layer data |
1486 | 0 | if (layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT) |
1487 | 0 | { |
1488 | 0 | sqlite3_bind_int(layer_data_insert_stmt, 1, i); |
1489 | 0 | if (params.half_precision) |
1490 | 0 | { |
1491 | 0 | uint16_t* w = (uint16_t*)ccmalloc(sizeof(uint16_t) * layer->wnum); |
1492 | 0 | ccv_float_to_half_precision(layer->w, w, layer->wnum); |
1493 | 0 | uint16_t* bias = (uint16_t*)ccmalloc(sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count)); |
1494 | 0 | ccv_float_to_half_precision(layer->bias, bias, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count); |
1495 | 0 | sqlite3_bind_blob(layer_data_insert_stmt, 2, w, sizeof(uint16_t) * layer->wnum, ccfree); |
1496 | 0 | sqlite3_bind_blob(layer_data_insert_stmt, 3, bias, sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), ccfree); |
1497 | 0 | } else { |
1498 | 0 | sqlite3_bind_blob(layer_data_insert_stmt, 2, layer->w, sizeof(float) * layer->wnum, SQLITE_STATIC); |
1499 | 0 | sqlite3_bind_blob(layer_data_insert_stmt, 3, layer->bias, sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), SQLITE_STATIC); |
1500 | 0 | } |
1501 | 0 | sqlite3_bind_int(layer_data_insert_stmt, 4, params.half_precision); |
1502 | 0 | assert(SQLITE_DONE == sqlite3_step(layer_data_insert_stmt)); |
1503 | 0 | sqlite3_reset(layer_data_insert_stmt); |
1504 | 0 | sqlite3_clear_bindings(layer_data_insert_stmt); |
1505 | 0 | } |
1506 | 0 | } |
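The half_precision branch above stores weights and biases as 16-bit floats, halving file size at a small precision cost; reading converts back to 32-bit. A self-contained round-trip sketch using the same conversion helpers called in this file (sample values hypothetical):

    #include <stdio.h>
    #include <stdint.h>
    #include "ccv.h" // declares ccv_float_to_half_precision / ccv_half_precision_to_float

    int main(void)
    {
    	float w[4] = {0.1f, -1.5f, 3.14159f, 1e-4f};
    	uint16_t h[4];
    	float back[4];
    	int i;
    	// quantize, as ccv_convnet_write does before binding the blob
    	ccv_float_to_half_precision(w, h, 4);
    	// dequantize, as ccv_convnet_read does after reading the blob
    	ccv_half_precision_to_float(h, back, 4);
    	for (i = 0; i < 4; i++)
    		printf("%g -> %g (delta %g)\n", w[i], back[i], w[i] - back[i]);
    	return 0;
    }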
1507 | | // insert convnet related params |
1508 | 0 | const char convnet_params_insert_qs[] = |
1509 | 0 | "REPLACE INTO convnet_params " |
1510 | 0 | "(convnet, mean_activity, input_height, input_width) VALUES (0, $mean_activity, $input_height, $input_width);"; |
1511 | 0 | sqlite3_stmt* convnet_params_insert_stmt = 0; |
1512 | 0 | assert(SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0)); |
1513 | 0 | assert(convnet->mean_activity->rows == convnet->input.height); |
1514 | 0 | assert(convnet->mean_activity->cols == convnet->input.width); |
1515 | 0 | assert(CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels); |
1516 | 0 | assert(CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F); |
1517 | 0 | sqlite3_bind_blob(convnet_params_insert_stmt, 1, convnet->mean_activity->data.f32, sizeof(float) * convnet->input.height * convnet->input.width * convnet->channels, SQLITE_STATIC); |
1518 | 0 | sqlite3_bind_int(convnet_params_insert_stmt, 2, convnet->input.height); |
1519 | 0 | sqlite3_bind_int(convnet_params_insert_stmt, 3, convnet->input.width); |
1520 | 0 | assert(SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt)); |
1521 | 0 | sqlite3_reset(convnet_params_insert_stmt); |
1522 | 0 | sqlite3_clear_bindings(convnet_params_insert_stmt); |
1523 | |
1524 | 0 | sqlite3_finalize(layer_params_insert_stmt); |
1525 | 0 | sqlite3_finalize(layer_data_insert_stmt); |
1526 | 0 | sqlite3_finalize(convnet_params_insert_stmt); |
1527 | 0 | sqlite3_close(db); |
1528 | 0 | } |
1529 | 0 | } |
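Because the serialized network is an ordinary SQLite database, the tables written above can be inspected with plain sqlite3 calls. A minimal sketch that lists the stored layers (file name hypothetical; ccv bundles its own SQLite under 3rdparty/sqlite3/, a system <sqlite3.h> works equally well):

    #include <stdio.h>
    #include <sqlite3.h>

    int main(void)
    {
    	sqlite3* db = 0;
    	if (SQLITE_OK != sqlite3_open("network.sqlite3", &db)) // hypothetical path
    		return -1;
    	sqlite3_stmt* stmt = 0;
    	const char qs[] = "SELECT layer, type, output_count FROM layer_params ORDER BY layer ASC;";
    	if (SQLITE_OK == sqlite3_prepare_v2(db, qs, sizeof(qs), &stmt, 0))
    	{
    		while (sqlite3_step(stmt) == SQLITE_ROW)
    			printf("layer %d: type %d, output_count %d\n",
    				sqlite3_column_int(stmt, 0),
    				sqlite3_column_int(stmt, 1),
    				sqlite3_column_int(stmt, 2));
    		sqlite3_finalize(stmt);
    	}
    	sqlite3_close(db);
    	return 0;
    }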
1530 | | |
1531 | | ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename) |
1532 | 2 | { |
1533 | 2 | sqlite3* db = 0; |
1534 | 2 | if (SQLITE_OK == sqlite3_open(filename, &db)) |
1535 | 2 | { |
1536 | 2 | ccv_convnet_t* convnet = 0; |
1537 | 2 | sqlite3_stmt* layer_params_stmt = 0; |
1538 | | // load layer params |
1539 | 2 | const char layer_params_qs[] = |
1540 | 2 | "SELECT type, " // 1 |
1541 | 2 | "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6 |
1542 | 2 | "output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13 |
1543 | 2 | "output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18 |
1544 | 2 | if (SQLITE_OK == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0)) |
1545 | 2 | { |
1546 | 2 | ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0); |
1547 | 44 | while (sqlite3_step(layer_params_stmt) == SQLITE_ROW) |
1548 | 42 | { |
1549 | 42 | ccv_convnet_layer_param_t layer_param; |
1550 | 42 | layer_param.type = sqlite3_column_int(layer_params_stmt, 0); |
1551 | 42 | layer_param.input.matrix.rows = sqlite3_column_int(layer_params_stmt, 1); |
1552 | 42 | layer_param.input.matrix.cols = sqlite3_column_int(layer_params_stmt, 2); |
1553 | 42 | layer_param.input.matrix.channels = sqlite3_column_int(layer_params_stmt, 3); |
1554 | 42 | layer_param.input.matrix.partition = sqlite3_column_int(layer_params_stmt, 4); |
1555 | 42 | layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 5); |
1556 | 42 | layer_param.bias = layer_param.glorot = 0; // these are irrelevant when reading a convnet
1557 | 42 | switch (layer_param.type) |
1558 | 42 | { |
1559 | 26 | case CCV_CONVNET_CONVOLUTIONAL: |
1560 | 26 | layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 6); |
1561 | 26 | layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 7); |
1562 | 26 | layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 8); |
1563 | 26 | layer_param.output.convolutional.partition = sqlite3_column_int(layer_params_stmt, 9); |
1564 | 26 | layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 10); |
1565 | 26 | layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 11); |
1566 | 26 | layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 12); |
1567 | 26 | break; |
1568 | 6 | case CCV_CONVNET_FULL_CONNECT: |
1569 | 6 | layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10); |
1570 | 6 | layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17); |
1571 | 6 | break; |
1572 | 10 | case CCV_CONVNET_MAX_POOL: |
1573 | 10 | case CCV_CONVNET_AVERAGE_POOL: |
1574 | 10 | layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 11); |
1575 | 10 | layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 12); |
1576 | 10 | layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 13); |
1577 | 10 | break; |
1578 | 0 | case CCV_CONVNET_LOCAL_RESPONSE_NORM: |
1579 | 0 | layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 13); |
1580 | 0 | layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 14); |
1581 | 0 | layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 15); |
1582 | 0 | layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 16); |
1583 | 0 | break; |
1584 | 42 | } |
1585 | 42 | ccv_array_push(layer_params, &layer_param); |
1586 | 42 | } |
1587 | 2 | sqlite3_finalize(layer_params_stmt); |
1588 | 2 | sqlite3_stmt* convnet_params_input_stmt = 0; |
1589 | | // load convnet params for input |
1590 | 2 | const char convnet_params_input_qs[] = |
1591 | 2 | "SELECT input_height, input_width FROM convnet_params WHERE convnet = 0;"; |
1592 | 2 | ccv_size_t input = ccv_size(0, 0); |
1593 | 2 | if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_input_qs, sizeof(convnet_params_input_qs), &convnet_params_input_stmt, 0)) |
1594 | 2 | { |
1595 | 2 | if (sqlite3_step(convnet_params_input_stmt) == SQLITE_ROW) |
1596 | 2 | { |
1597 | 2 | input.height = sqlite3_column_int(convnet_params_input_stmt, 0); |
1598 | 2 | input.width = sqlite3_column_int(convnet_params_input_stmt, 1); |
1599 | 2 | } |
1600 | 2 | sqlite3_finalize(convnet_params_input_stmt); |
1601 | 2 | } |
1602 | 2 | assert(input.height != 0 && input.width != 0); |
1603 | 2 | convnet = ccv_convnet_new(use_cwc_accel, input, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0), layer_params->rnum); |
1604 | 2 | ccv_array_free(layer_params); |
1605 | | // load layer data |
1606 | 2 | sqlite3_stmt* layer_data_stmt = 0; |
1607 | 2 | const char layer_data_qs[] = |
1608 | 2 | "SELECT layer, weight, bias, half_precision FROM layer_data;"; |
1609 | 2 | if (SQLITE_OK == sqlite3_prepare_v2(db, layer_data_qs, sizeof(layer_data_qs), &layer_data_stmt, 0)) |
1610 | 2 | { |
1611 | 34 | while (sqlite3_step(layer_data_stmt) == SQLITE_ROW) |
1612 | 32 | { |
1613 | 32 | ccv_convnet_layer_t* layer = convnet->layers + sqlite3_column_int(layer_data_stmt, 0); |
1614 | 32 | int half_precision = sqlite3_column_int(layer_data_stmt, 3); |
1615 | 32 | int wnum = sqlite3_column_bytes(layer_data_stmt, 1) / (half_precision ? sizeof(uint16_t) : sizeof(float)0 ); |
1616 | | // load the weights only if the stored count matches this layer's wnum
1617 | 32 | if (wnum == layer->wnum) |
1618 | 32 | { |
1619 | 32 | const void* w = sqlite3_column_blob(layer_data_stmt, 1); |
1620 | 32 | if (half_precision) |
1621 | 32 | { |
1622 | 32 | float* f = (float*)ccmalloc(sizeof(float) * layer->wnum); |
1623 | 32 | ccv_half_precision_to_float((uint16_t*)w, f, layer->wnum); |
1624 | 32 | w = f; |
1625 | 32 | } |
1626 | 32 | switch (layer->type) |
1627 | 32 | { |
1628 | 26 | case CCV_CONVNET_CONVOLUTIONAL: |
1629 | 26 | memcpy(layer->w, w, sizeof(float) * layer->wnum); |
1630 | 26 | break; |
1631 | 6 | case CCV_CONVNET_FULL_CONNECT: |
1632 | 6 | memcpy(layer->w, w, sizeof(float) * layer->wnum); |
1633 | 6 | break; |
1634 | 32 | } |
1635 | 32 | if (half_precision) |
1636 | 32 | ccfree((void*)w); |
1637 | 32 | } |
1638 | 32 | int bnum = sqlite3_column_bytes(layer_data_stmt, 2) / (half_precision ? sizeof(uint16_t) : sizeof(float)0 ); |
1639 | | // load the bias only if the stored count matches this layer's bias count
1640 | 32 | if (bnum == (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count26 : layer->net.full_connect.count6 )) |
1641 | 32 | { |
1642 | 32 | const void* bias = sqlite3_column_blob(layer_data_stmt, 2); |
1643 | 32 | if (half_precision) |
1644 | 32 | { |
1645 | 32 | float* f = (float*)ccmalloc(sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count26 : layer->net.full_connect.count6 )); |
1646 | 32 | ccv_half_precision_to_float((uint16_t*)bias, f, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count26 : layer->net.full_connect.count6 ); |
1647 | 32 | bias = f; |
1648 | 32 | } |
1649 | 32 | switch (layer->type) |
1650 | 32 | { |
1651 | 26 | case CCV_CONVNET_CONVOLUTIONAL: |
1652 | 26 | memcpy(layer->bias, bias, sizeof(float) * layer->net.convolutional.count); |
1653 | 26 | break; |
1654 | 6 | case CCV_CONVNET_FULL_CONNECT: |
1655 | 6 | memcpy(layer->bias, bias, sizeof(float) * layer->net.full_connect.count); |
1656 | 6 | break; |
1657 | 32 | } |
1658 | 32 | if (half_precision) |
1659 | 32 | ccfree((void*)bias); |
1660 | 32 | } |
1661 | 32 | } |
1662 | 2 | sqlite3_finalize(layer_data_stmt); |
1663 | 2 | } |
1664 | 2 | sqlite3_stmt* convnet_params_mean_activity_stmt = 0; |
1665 | | // load convnet params for mean activity |
1666 | 2 | const char convnet_params_mean_activity_qs[] = |
1667 | 2 | "SELECT mean_activity FROM convnet_params WHERE convnet = 0;"; |
1668 | 2 | if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_mean_activity_qs, sizeof(convnet_params_mean_activity_qs), &convnet_params_mean_activity_stmt, 0)) |
1669 | 2 | { |
1670 | 2 | if (sqlite3_step(convnet_params_mean_activity_stmt) == SQLITE_ROW) |
1671 | 2 | { |
1672 | 2 | int elems = sqlite3_column_bytes(convnet_params_mean_activity_stmt, 0) / sizeof(float); |
1673 | 2 | if (elems == convnet->input.height * convnet->input.width * convnet->channels) |
1674 | 2 | memcpy(convnet->mean_activity->data.f32, sqlite3_column_blob(convnet_params_mean_activity_stmt, 0), sizeof(float) * elems); |
1675 | 2 | } |
1676 | 2 | sqlite3_finalize(convnet_params_mean_activity_stmt); |
1677 | 2 | } |
1678 | 2 | } |
1679 | 2 | sqlite3_close(db); |
1680 | 2 | return convnet; |
1681 | 2 | } |
1682 | 0 | return 0; |
1683 | 2 | } |
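A minimal read-side sketch (path hypothetical): a database that cannot be opened yields 0, so check before use; as the size checks above show, stored weights or biases whose counts mismatch the reconstructed layer are silently skipped, leaving that layer's random initialization in place.

    ccv_convnet_t* convnet = ccv_convnet_read(0 /* no CUDA */, "image-net.sqlite3"); // hypothetical path
    if (!convnet)
    	return -1; // could not open or parse the database
    /* ... ccv_convnet_encode / inference ... */
    ccv_convnet_free(convnet);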
1684 | | |
1685 | | void ccv_convnet_input_formation(ccv_size_t input, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) |
1686 | 2 | { |
1687 | 2 | if (a->rows > input.height && a->cols > input.width) |
1688 | 2 | ccv_resample(a, b, CCV_32F, (double)ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)) / (double)a->rows, (double)ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)) / (double)a->cols, CCV_INTER_AREA); |
1689 | 0 | else if (a->rows < input.height || a->cols < input.width) |
1690 | 0 | ccv_resample(a, b, CCV_32F, (double)ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)) / (double)a->rows, (double)ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)) / (double)a->cols, CCV_INTER_CUBIC); |
1691 | 0 | else |
1692 | 0 | ccv_shift(a, (ccv_matrix_t**)b, CCV_32F, 0, 0); // converting to 32f |
1693 | 2 | } |
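The two scale factors above look asymmetric but agree: each is max(target, aspect-preserving size) over the source dimension, so the shorter relative side snaps to the target while the other keeps the aspect ratio. Worked example with hypothetical sizes, a 480-row x 640-col image into a 225x225 input: row scale max(225, round(480*225/640)) / 480 = 225/480 and col scale max(225, round(640*225/480)) / 640 = 300/640, both 0.46875, i.e. a 225x300 result.

    #include <stdio.h>
    #include "ccv.h"

    int main(void)
    {
    	// hypothetical 480x640 source, 225x225 network input
    	ccv_dense_matrix_t* a = ccv_dense_matrix_new(480, 640, CCV_8U | CCV_C3, 0, 0);
    	ccv_dense_matrix_t* b = 0;
    	ccv_convnet_input_formation(ccv_size(225, 225), a, &b);
    	printf("%d x %d\n", b->rows, b->cols); // expect 225 x 300: shorter side snapped
    	ccv_matrix_free(a);
    	ccv_matrix_free(b);
    	return 0;
    }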
1694 | | |
1695 | | void ccv_convnet_free(ccv_convnet_t* convnet) |
1696 | 37 | { |
1697 | 37 | ccv_convnet_compact(convnet); |
1698 | 37 | int i; |
1699 | 118 | for (i = 0; i < convnet->count; i++81 ) |
1700 | 81 | if (convnet->layers[i].w) |
1701 | 57 | ccfree(convnet->layers[i].w); |
1702 | 37 | if (convnet->mean_activity) |
1703 | 29 | ccv_matrix_free(convnet->mean_activity); |
1704 | 37 | ccfree(convnet); |
1705 | 37 | } |
1706 | | |
1707 | | #endif |