ccv_convnet.c

Bug Summary

File:	ccv_convnet.c
Warning:	line 974, column 25: 4th function call argument is an uninitialized value
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_convnet.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -menable-no-infs -menable-no-nans -fapprox-func -funsafe-math-optimizations -fno-signed-zeros -mreassociate -freciprocal-math -ffp-contract=fast -fno-rounding-math -ffast-math -ffinite-math-only -complex-range=basic -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib -resource-dir /usr/local/lib/clang/19 -I . 
-I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2024-11-08-093000-160011-1 -x c ccv_convnet.c
1#include "ccv.h"
2#include "ccv_internal.h"
3#if defined(HAVE_SSE21)
4#include <xmmintrin.h>
5#elif defined(HAVE_NEON)
6#include <arm_neon.h>
7#endif
8#ifdef HAVE_GSL1
9#include <gsl/gsl_rng.h>
10#include <gsl/gsl_randist.h>
11#endif
12#ifdef USE_OPENMP
13#include <omp.h>
14#endif
15#ifdef USE_DISPATCH
16#include <dispatch/dispatch.h>
17#endif
18#ifdef HAVE_CUDA1
19#include "cuda/cwc.h"
20#endif
21#include "3rdparty/sqlite3/sqlite3.h"
22#include "inc/ccv_convnet_internal.h"
23 
24#ifndef CASE_TESTS
25 
26ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count)
27{
28	ccv_convnet_t* convnet = (ccv_convnet_t*)ccmallocmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2);
29	convnet->use_cwc_accel = use_cwc_accel;
30#ifdef HAVE_GSL1
31	gsl_rng_env_setup();
32	gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
33	gsl_rng_set(rng, (unsigned long int)convnet);
34#endif
35	convnet->reserved = 0;
36	convnet->layers = (ccv_convnet_layer_t*)(convnet + 1);
37	convnet->acts = (ccv_dense_matrix_t**)(convnet->layers + count);
38	memset(convnet->acts, 0, sizeof(ccv_dense_matrix_t*) * count);
39	convnet->denoms = (ccv_dense_matrix_t**)(convnet->acts + count);
40	memset(convnet->denoms, 0, sizeof(ccv_dense_matrix_t*) * count);
41	convnet->count = count;
42	convnet->input = input;
43	convnet->rows = params[0].input.matrix.rows;
44	convnet->cols = params[0].input.matrix.cols;
45	convnet->channels = params[0].input.matrix.channels;
46	convnet->mean_activity = ccv_dense_matrix_new(convnet->input.height, convnet->input.width, convnet->channels | CCV_32F, 0, 0);
47	ccv_zero(convnet->mean_activity);
48	ccv_convnet_layer_t* layers = convnet->layers;
49	int i, j;
50	for (i = 0; i < count; i++)
51	{
52		layers[i].type = params[i].type;
53		layers[i].input = params[i].input;
54		layers[i].net = params[i].output;
55		layers[i].reserved = 0;
56		switch (params[i].type)
57		{
58			case CCV_CONVNET_CONVOLUTIONAL:
59				assert(params[i].input.matrix.channels % params[i].input.matrix.partition == 0)((void) sizeof ((params[i].input.matrix.channels % params[i].
input.matrix.partition == 0) ? 1 : 0), __extension__ ({ if (params
[i].input.matrix.channels % params[i].input.matrix.partition ==
 0) ; else __assert_fail ("params[i].input.matrix.channels % params[i].input.matrix.partition == 0"
, "ccv_convnet.c", 59, __extension__ __PRETTY_FUNCTION__); })
);
60				assert(params[i].output.convolutional.count % params[i].output.convolutional.partition == 0)((void) sizeof ((params[i].output.convolutional.count % params
[i].output.convolutional.partition == 0) ? 1 : 0), __extension__
 ({ if (params[i].output.convolutional.count % params[i].output
.convolutional.partition == 0) ; else __assert_fail ("params[i].output.convolutional.count % params[i].output.convolutional.partition == 0"
, "ccv_convnet.c", 60, __extension__ __PRETTY_FUNCTION__); })
);
61				assert(params[i].output.convolutional.partition % params[i].input.matrix.partition == 0)((void) sizeof ((params[i].output.convolutional.partition % params
[i].input.matrix.partition == 0) ? 1 : 0), __extension__ ({ if
 (params[i].output.convolutional.partition % params[i].input.
matrix.partition == 0) ; else __assert_fail ("params[i].output.convolutional.partition % params[i].input.matrix.partition == 0"
, "ccv_convnet.c", 61, __extension__ __PRETTY_FUNCTION__); })
);
62				assert(params[i].output.convolutional.partition >= params[i].input.matrix.partition)((void) sizeof ((params[i].output.convolutional.partition >=
 params[i].input.matrix.partition) ? 1 : 0), __extension__ ({
 if (params[i].output.convolutional.partition >= params[i]
.input.matrix.partition) ; else __assert_fail ("params[i].output.convolutional.partition >= params[i].input.matrix.partition"
, "ccv_convnet.c", 62, __extension__ __PRETTY_FUNCTION__); })
);
63				layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition * params[i].output.convolutional.count;
64				layers[i].w = (float*)ccmallocmalloc(sizeof(float) * (layers[i].wnum + params[i].output.convolutional.count));
65				layers[i].bias = layers[i].w + layers[i].wnum;
66#ifdef HAVE_GSL1
67				for (j = 0; j < layers[i].wnum; j++)
68					layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition + params[i].output.convolutional.count);
69#else
70				for (j = 0; j < layers[i].wnum; j++)
71					layers[i].w[j] = 0;
72#endif
73				for (j = 0; j < params[i].output.convolutional.count; j++)
74					layers[i].bias[j] = params[i].bias;
75				break;
76			case CCV_CONVNET_FULL_CONNECT:
77				layers[i].wnum = params[i].input.node.count * params[i].output.full_connect.count;
78				layers[i].w = (float*)ccmallocmalloc(sizeof(float) * (layers[i].wnum + params[i].output.full_connect.count));
79				layers[i].bias = layers[i].w + layers[i].wnum;
80#ifdef HAVE_GSL1
81				for (j = 0; j < layers[i].wnum; j++)
82					layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].input.node.count + params[i].output.full_connect.count);
83#else
84				for (j = 0; j < layers[i].wnum; j++)
85					layers[i].w[j] = 0;
86#endif
87				for (j = 0; j < params[i].output.full_connect.count; j++)
88					layers[i].bias[j] = params[i].bias;
89				break;
90			default:
91				layers[i].wnum = 0;
92				layers[i].w = 0;
93				layers[i].bias = 0;
94				break;
95		}
96	}
97#ifdef HAVE_GSL1
98	gsl_rng_free(rng);
99#endif
100	return convnet;
101}
102 
103int ccv_convnet_verify(ccv_convnet_t* convnet, int output)
104{
105	int i, out_rows, out_cols, out_partition, out_channels;
106	if (convnet->count < 1)
107		return -1;
108	// the last layer has to be full connect
109	if (convnet->layers[convnet->count - 1].type != CCV_CONVNET_FULL_CONNECT)
110		return -1;
111	// you cannot enable relu on the last layer
112	if (convnet->layers[convnet->count - 1].net.full_connect.relu)
113		return -1;
114	out_channels = 3;
115	for (i = 0; i < convnet->count; i++)
116	{
117		ccv_convnet_layer_t* layer = convnet->layers + i;
118		if (i > 0 && (out_rows != layer->input.matrix.rows || out_cols != layer->input.matrix.cols))
119			return -1;
120		// the input channels should be equal to the previous output channels, skip this check for full connect as it is meaningless
121		if (out_channels != layer->input.matrix.channels && layer->type != CCV_CONVNET_FULL_CONNECT)
122			return -1;
123		ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition);
124		if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
125		{
126			// check to see if the input matrix channel is equal to the expected input of the convolutional layer filters
127			if (layer->input.matrix.channels != layer->net.convolutional.channels)
128				return -1;
129			// if this layer is convolutional layer, its filter output should equal to next layer's channel input
130			out_channels = layer->net.convolutional.count;
131		}
132	}
133	if (out_rows * out_cols != output)
134		return -1;
135	int count = 0;
136	for (i = 0; i < convnet->count; i++)
137	{
138		ccv_convnet_layer_t* layer = convnet->layers + i;
139		if (layer->type == CCV_CONVNET_FULL_CONNECT)
140		{
141			count = i;
142			break;
143		}
144	}
145	// all the layers after the first full connect layer should only be full connect layer
146	for (i = count; i < convnet->count; i++)
147		if (convnet->layers[i].type != CCV_CONVNET_FULL_CONNECT ||
148			convnet->layers[i].input.matrix.rows * convnet->layers[i].input.matrix.cols * convnet->layers[i].input.matrix.channels != convnet->layers[i].input.node.count)
149			return -1;
150	return 0;
151}
152 
153#endif
154 
155#if defined(HAVE_SSE21) || defined(HAVE_NEON)
156 
157static void _ccv_convnet_layer_simd_alloc_reserved(ccv_convnet_layer_t* layer)
158{
159	if (layer->reserved)
160		return;
161	int partition = layer->input.matrix.partition;
162	int ch = layer->net.convolutional.channels;
163	int count = layer->net.convolutional.count;
164	int kernel_rows = layer->net.convolutional.rows;
165	int kernel_cols = layer->net.convolutional.cols;
166	int ch_per_partition = ch / partition;
167	int count_per_4 = count / 4;
168	float* simd_w = (float*)ccmallocmalloc(sizeof(float) * layer->wnum);
169	int i, j, k, c;
170	for (k = 0; k < count_per_4; k++)
171		for (i = 0; i < kernel_rows * kernel_cols; i++)
172			for (j = 0; j < ch_per_partition; j++)
173				for (c = 0; c < 4; c++)
174					simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j];
175	layer->reserved = simd_w;
176}
177 
178#endif
179 
/* View the `reserved` pointer of x as a float buffer (the SIMD-layout weights of a layer). */
#define SIMD(x) ((float*)((x)->reserved))
181 
182#if defined(HAVE_SSE21)
183static inline void _ccv_convnet_convolutional_forward_propagate_sse2(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
184{
185	assert(SIMD(layer))((void) sizeof ((((float*)((layer)->reserved))) ? 1 : 0), __extension__
 ({ if (((float*)((layer)->reserved))) ; else __assert_fail
 ("SIMD(layer)", "ccv_convnet.c", 185, __extension__ __PRETTY_FUNCTION__
); }));
186#define main_for(block) \
187	parallel_for(k, (count >> 2)){ int k; for ((k) = 0; (k) < ((count >> 2)); (k)++) { { \
188		int i, j, x, y, c; \
189		int p = k * 4 / count_per_partition; \
190		float* ap = a->data.f32 + p * ch_per_partition; \
191		float* bp = db->data.f32 + k * 4; \
192		float* layer_w = SIMD(layer)((float*)((layer)->reserved)) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
193		float bias[4] __attribute__ ((__aligned__(16))); \
194		memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
195		/* 4 accumulators */ \
196		__m128 z4 = _mm_setzero_ps(); \
197		for (i = 0; i < db->rows; i++) \
198		{ \
199			int comy = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border); \
200			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)({ typeof (a->rows + border) _a = (a->rows + border); typeof
 (i * strides + kernel_rows) _b = (i * strides + kernel_rows)
; (_a < _b) ? _a : _b; })); \
201			comy *= ch_per_partition * kernel_cols; \
202			for (j = 0; j < db->cols; j++) \
203			{ \
204				__m128 v40 = _mm_load_ps(bias); \
205				__m128 v41 = _mm_setzero_ps(); \
206				__m128 v42 = _mm_setzero_ps(); \
207				__m128 v43 = _mm_setzero_ps(); \
208				int comx = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border); \
209				int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)({ typeof (a->cols + border) _a = (a->cols + border); typeof
 (j * strides + kernel_cols) _b = (j * strides + kernel_cols)
; (_a < _b) ? _a : _b; })); \
210				float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
211				float* apz = ap + ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) * ch; \
212				/* when we have border, we simply do zero padding */ \
213				for (y = 0; y < maxy; y++) \
214				{ \
215					/* special casing for these cases to speed up SIMD computation */ \
216					for (x = 0; x < maxx; x++) \
217					{ \
218						c = 0; \
219						for (; c < ch_per_partition - 3; c += 4) \
220						{ \
221							__m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \
222							__m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
223							__m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
224							__m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
225							__m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \
226							__m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00)((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(apz4), (__v4sf
)(__m128)(apz4), (int)(0x00))); \
227							__m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55)((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(apz4), (__v4sf
)(__m128)(apz4), (int)(0x55))); \
228							__m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA)((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(apz4), (__v4sf
)(__m128)(apz4), (int)(0xAA))); \
229							__m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF)((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(apz4), (__v4sf
)(__m128)(apz4), (int)(0xFF))); \
230							v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
231							v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
232							v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
233							v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
234						} \
235						block /* insert executions for tail partition */ \
236					} \
237					w += kernel_cols * ch_per_partition * 4; \
238					apz += a->cols * ch; \
239				} \
240				__m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \
241				_mm_storeu_ps(bp + j * count, v4); /* ReLU */ \
242			} \
243			bp += db->cols * count; \
244			ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0)({ typeof ((i + 1) * strides - border) _a = ((i + 1) * strides
 - border); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; })); \
245		} \
246	} parallel_endfor} }
247	if (ch_per_partition % 4 == 0)
248	{
249		main_for();
250	} else if (ch_per_partition % 4 == 3) { // unroll the last for-loops
251#define block \
252		__m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
253		__m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
254		__m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \
255		__m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
256		__m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
257		__m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
258		v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
259		v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
260		v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
261		main_for(block);
262#undef block
263	} else if (ch_per_partition % 4 == 2) { // unroll the last for-loops
264#define block \
265		__m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
266		__m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
267		__m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
268		__m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
269		v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
270		v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
271		main_for(block);
272#undef block
273	} else {
274#define block \
275		__m128 apz4 = _mm_load1_ps(apz + x * ch + c); \
276		__m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
277		v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
278		main_for(block);
279#undef block
280	}
281#undef main_for
282}
283#elif defined(HAVE_NEON)
284static inline void _ccv_convnet_convolutional_forward_propagate_neon(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
285{
286	assert(SIMD(layer))((void) sizeof ((((float*)((layer)->reserved))) ? 1 : 0), __extension__
 ({ if (((float*)((layer)->reserved))) ; else __assert_fail
 ("SIMD(layer)", "ccv_convnet.c", 286, __extension__ __PRETTY_FUNCTION__
); }));
287#define main_for(block) \
288	parallel_for(k, (count >> 2)){ int k; for ((k) = 0; (k) < ((count >> 2)); (k)++) { { \
289		int i, j, x, y, c; \
290		int p = k * 4 / count_per_partition; \
291		float* ap = a->data.f32 + p * ch_per_partition; \
292		float* bp = db->data.f32 + k * 4; \
293		float* layer_w = SIMD(layer)((float*)((layer)->reserved)) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
294		float bias[4] __attribute__ ((__aligned__(16))); \
295		memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
296		float32x4_t z4 = vmovq_n_f32(0); \
297		for (i = 0; i < db->rows; i++) \
298		{ \
299			int comy = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border); \
300			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)({ typeof (a->rows + border) _a = (a->rows + border); typeof
 (i * strides + kernel_rows) _b = (i * strides + kernel_rows)
; (_a < _b) ? _a : _b; })); \
301			comy *= ch_per_partition * kernel_cols; \
302			for (j = 0; j < db->cols; j++) \
303			{ \
304				float32x4_t v40 = vld1q_f32(bias); \
305				float32x4_t v41 = vmovq_n_f32(0); \
306				int comx = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border); \
307				int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)({ typeof (a->cols + border) _a = (a->cols + border); typeof
 (j * strides + kernel_cols) _b = (j * strides + kernel_cols)
; (_a < _b) ? _a : _b; })); \
308				float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
309				float* apz = ap + ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) * ch; \
310				/* when we have border, we simply do zero padding */ \
311				for (y = 0; y < maxy; y++) \
312				{ \
313					for (x = 0; x < maxx; x++) \
314					{ \
315						c = 0; \
316						for (; c < ch_per_partition - 1; c += 2) \
317						{ \
318							float32x2_t apz4 = vld1_f32(apz + x * ch + c); \
319							float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
320							float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
321							float32x4_t w40 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
322							float32x4_t w41 = vld1q_f32(w + (x * ch_per_partition + c + 1) * 4); \
323							v40 = vmlaq_f32(v40, w40, apz40); \
324							v41 = vmlaq_f32(v41, w41, apz41); \
325						} \
326						block /* insert executions for tail partition */ \
327					} \
328					w += kernel_cols * ch_per_partition * 4; \
329					apz += a->cols * ch; \
330				} \
331				float32x4_t v4 = vmaxq_f32(z4, vaddq_f32(v40, v41)); \
332				vst1q_f32(bp + j * count, v4); /* ReLU */ \
333			} \
334			bp += db->cols * count; \
335			ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0)({ typeof ((i + 1) * strides - border) _a = ((i + 1) * strides
 - border); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; })); \
336		} \
337	} parallel_endfor} }
338	if (ch_per_partition % 2 == 0)
339	{
340		main_for();
341	} else { // unroll the last for-loops
342#define block \
343		float32x4_t apz4 = vmovq_n_f32(apz[x * ch + c]); \
344		float32x4_t w4 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
345		v40 = vmlaq_f32(v40, w4, apz4);
346		main_for(block);
347#undef block
348	}
349#undef main_for
350}
351#else
352static inline void _ccv_convnet_convolutional_forward_propagate_fallback(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
353{
354	parallel_for(k, count){ int k; for ((k) = 0; (k) < (count); (k)++) { {
355		int i, j, x, y, c;
356		int p = k / count_per_partition;
357		float* ap = a->data.f32 + p * ch_per_partition;
358		float* bp = db->data.f32 + k;
359		float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
360		float bias = layer->bias[k];
361		for (i = 0; i < db->rows; i++)
362		{
363			int comy = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
364			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)({ typeof (a->rows + border) _a = (a->rows + border); typeof
 (i * strides + kernel_rows) _b = (i * strides + kernel_rows)
; (_a < _b) ? _a : _b; }));
365			comy *= ch_per_partition * kernel_cols;
366			for (j = 0; j < db->cols; j++)
367			{
368				float v = bias;
369				int comx = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
370				int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)({ typeof (a->cols + border) _a = (a->cols + border); typeof
 (j * strides + kernel_cols) _b = (j * strides + kernel_cols)
; (_a < _b) ? _a : _b; }));
371				float* w = layer_w + comx * ch_per_partition + comy;
372				float* apz = ap + ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) * ch;
373				// when we have border, we simply do zero padding
374				for (y = 0; y < maxy; y++)
375				{
376					for (x = 0; x < maxx; x++)
377						for (c = 0; c < ch_per_partition; c++)
378							v += w[x * ch_per_partition + c] * apz[x * ch + c];
379					w += kernel_cols * ch_per_partition;
380					apz += a->cols * ch;
381				}
382				bp[j * count] = ccv_max(0, v)({ typeof (0) _a = (0); typeof (v) _b = (v); (_a > _b) ? _a
 : _b; }); // ReLU
383			}
384			bp += db->cols * count;
385			ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0)({ typeof ((i + 1) * strides - border) _a = ((i + 1) * strides
 - border); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }));
386		}
387	} parallel_endfor} }
388}
389#endif
390 
391static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
392{
393	int rows, cols, partition;
394	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
395	int ch = layer->net.convolutional.channels;
396	int count = layer->net.convolutional.count;
397	int strides = layer->net.convolutional.strides;
398	int border = layer->net.convolutional.border;
399	int kernel_rows = layer->net.convolutional.rows;
400	int kernel_cols = layer->net.convolutional.cols;
401	int type = CCV_32F | count;
402	assert(CCV_GET_CHANNEL(a->type) == ch)((void) sizeof ((((a->type) & 0xFFF) == ch) ? 1 : 0), __extension__
 ({ if (((a->type) & 0xFFF) == ch) ; else __assert_fail
 ("CCV_GET_CHANNEL(a->type) == ch", "ccv_convnet.c", 402, __extension__
 __PRETTY_FUNCTION__); }));
403	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 403, __extension__ __PRETTY_FUNCTION__); }
));
404	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
405	int ch_per_partition = ch / partition;
406	int count_per_partition = count / partition;
407	assert(count_per_partition % 4 == 0)((void) sizeof ((count_per_partition % 4 == 0) ? 1 : 0), __extension__
 ({ if (count_per_partition % 4 == 0) ; else __assert_fail ("count_per_partition % 4 == 0"
, "ccv_convnet.c", 407, __extension__ __PRETTY_FUNCTION__); }
));
408#if defined(HAVE_SSE21) || defined(HAVE_NEON)
409	_ccv_convnet_layer_simd_alloc_reserved(layer);
410#endif
411#if defined(HAVE_SSE21)
412	_ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
413#elif defined(HAVE_NEON)
414	_ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
415#else
416	_ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
417#endif
418}
419 
420static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
421{
422	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 422, __extension__ __PRETTY_FUNCTION__); }
));
423	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
424	int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
425	int rows = a->rows, cols = a->cols;
426	// reshape a for gemm
427	assert(a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch)((void) sizeof ((a->step == a->cols * _ccv_get_data_type_size
[((a->type) & 0xFF000) >> 12] * ch) ? 1 : 0), __extension__
 ({ if (a->step == a->cols * _ccv_get_data_type_size[((
a->type) & 0xFF000) >> 12] * ch) ; else __assert_fail
 ("a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch"
, "ccv_convnet.c", 427, __extension__ __PRETTY_FUNCTION__); }
));
428	a->rows = rows * cols * ch, a->cols = 1, a->type = (a->type - ch) | CCV_C1;
429	assert(a->rows * db->rows == layer->wnum)((void) sizeof ((a->rows * db->rows == layer->wnum) ?
 1 : 0), __extension__ ({ if (a->rows * db->rows == layer
->wnum) ; else __assert_fail ("a->rows * db->rows == layer->wnum"
, "ccv_convnet.c", 429, __extension__ __PRETTY_FUNCTION__); }
));
430	a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type)_ccv_get_data_type_size[((a->type) & 0xFF000) >>
 12];
431	int i;
432	float* bptr = db->data.f32;
433	for (i = 0; i < db->rows; i++)
434		bptr[i] = layer->bias[i];
435	ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0);
436	ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
437	if (layer->net.full_connect.relu)
438		for (i = 0; i < db->rows; i++)
439			bptr[i] = ccv_max(0, bptr[i])({ typeof (0) _a = (0); typeof (bptr[i]) _b = (bptr[i]); (_a >
 _b) ? _a : _b; }); // relu
440	a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF)) | ch;
441	a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type)_ccv_get_data_type_size[((a->type) & 0xFF000) >>
 12] * CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
442}
443 
444static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
445{
446	int rows, cols, partition;
447	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
448	int size = layer->net.rnorm.size;
449	float kappa = layer->net.rnorm.kappa;
450	float alpha = layer->net.rnorm.alpha;
451	float beta = layer->net.rnorm.beta;
452	int way = size / 2;
453	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 453, __extension__ __PRETTY_FUNCTION__); }
));
454	int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
455	int type = CCV_32F | ch;
456	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
457	int i, j, k, x, p;
458	float* ap = a->data.f32;
459	float* bp = db->data.f32;
460	int ch_per_partition = ch / partition;
461	if (denoms)
462	{
463		ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0);
464		float* dp = ddenoms->data.f32;
465		for (i = 0; i < db->rows; i++)
466		{
467			for (j = 0; j < db->cols; j++)
468				for (p = 0; p < partition; p++)
469					for (k = 0; k < ch_per_partition; k++)
470					{
471						float v = ap[j * ch + p * ch_per_partition + k];
472						float denom = 0;
473						for (x = ccv_max(k - way, 0)({ typeof (k - way) _a = (k - way); typeof (0) _b = (0); (_a >
 _b) ? _a : _b; }); x <= ccv_min(k + way, ch_per_partition - 1)({ typeof (k + way) _a = (k + way); typeof (ch_per_partition -
 1) _b = (ch_per_partition - 1); (_a < _b) ? _a : _b; }); x++)
474							denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
475						denom = kappa + alpha * denom;
476						dp[j * ch + p * ch_per_partition + k] = denom;
477						bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
478					}
479			ap += a->cols * ch;
480			dp += ddenoms->cols * ch;
481			bp += db->cols * ch;
482		}
483	} else {
484		for (i = 0; i < db->rows; i++)
485		{
486			for (j = 0; j < db->cols; j++)
487				for (p = 0; p < partition; p++)
488					for (k = 0; k < ch_per_partition; k++)
489					{
490						float v = ap[j * ch + p * ch_per_partition + k];
491						float denom = 0;
492						for (x = ccv_max(k - way, 0)({ typeof (k - way) _a = (k - way); typeof (0) _b = (0); (_a >
 _b) ? _a : _b; }); x <= ccv_min(k + way, ch_per_partition - 1)({ typeof (k + way) _a = (k + way); typeof (ch_per_partition -
 1) _b = (ch_per_partition - 1); (_a < _b) ? _a : _b; }); x++)
493							denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
494						denom = kappa + alpha * denom;
495						bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
496					}
497			ap += a->cols * ch;
498			bp += db->cols * ch;
499		}
500	}
501}
502 
503static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
504{
505	int rows, cols, partition;
506	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
507	int size = layer->net.pool.size;
508	int strides = layer->net.pool.strides;
509	int border = layer->net.pool.border;
510	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 510, __extension__ __PRETTY_FUNCTION__); }
));
511	int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
512	int type = CCV_32F | ch;
513	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
514	int i, j, k, x, y;
515	float* ap = a->data.f32;
516	float* bp = db->data.f32;
517	for (i = 0; i < db->rows; i++)
518	{
519		const int start_y = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
520		const int end_y = size + ccv_min(i * strides + size - border, a->rows)({ typeof (i * strides + size - border) _a = (i * strides + size
 - border); typeof (a->rows) _b = (a->rows); (_a < _b
) ? _a : _b; }) - (i * strides + size - border);
521		for (j = 0; j < db->cols; j++)
522		{
523			const int start_x = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
524			const int end_x = size + ccv_min(j * strides + size - border, a->cols)({ typeof (j * strides + size - border) _a = (j * strides + size
 - border); typeof (a->cols) _b = (a->cols); (_a < _b
) ? _a : _b; }) - (j * strides + size - border);
525			for (k = 0; k < ch; k++)
526			{
527				float v = 0;
528				for (y = start_y; y < end_y; y++)
529					for (x = start_x; x < end_x; x++)
530						if (x == start_x && y == start_y)
531							v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
532						else if (ap[(j * strides - border + x + (y - border) * a->cols) * ch + k] > v)
533							v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
534				bp[j * ch + k] = v;
535			}
536		}
537		ap += a->cols * ch * strides;
538		bp += db->cols * ch;
539	}
540}
541 
542static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
543{
544	int rows, cols, partition;
545	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
546	int size = layer->net.pool.size;
547	int strides = layer->net.pool.strides;
548	int border = layer->net.pool.border;
549	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 549, __extension__ __PRETTY_FUNCTION__); }
));
550	int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
551	int type = CCV_32F | ch;
552	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
553	int i, j, k, x, y;
554	float* ap = a->data.f32;
555	float* bp = db->data.f32;
556	for (i = 0; i < db->rows; i++)
557	{
558		const int start_y = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
559		const int end_y = size + ccv_min(i * strides + size - border, a->rows)({ typeof (i * strides + size - border) _a = (i * strides + size
 - border); typeof (a->rows) _b = (a->rows); (_a < _b
) ? _a : _b; }) - (i * strides + size - border);
560		for (j = 0; j < db->cols; j++)
561		{
562			const int start_x = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
563			const int end_x = size + ccv_min(j * strides + size - border, a->cols)({ typeof (j * strides + size - border) _a = (j * strides + size
 - border); typeof (a->cols) _b = (a->cols); (_a < _b
) ? _a : _b; }) - (j * strides + size - border);
564			for (k = 0; k < ch; k++)
565			{
566				float v = 0;
567				for (y = start_y; y < end_y; y++)
568					for (x = start_x; x < end_x; x++)
569						v += ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
570				bp[j * ch + k] = v / ((end_x - start_x) * (end_y - start_y));
571			}
572		}
573		ap += a->cols * ch * strides;
574		bp += db->cols * ch;
575	}
576}
577 
578static void _ccv_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
579{
580	switch(layer->type)
581	{
582		case CCV_CONVNET_CONVOLUTIONAL:
583			_ccv_convnet_convolutional_forward_propagate(layer, a, b);
584			break;
585		case CCV_CONVNET_FULL_CONNECT:
586			_ccv_convnet_full_connect_forward_propagate(layer, a, b);
587			break;
588		case CCV_CONVNET_LOCAL_RESPONSE_NORM:
589			_ccv_convnet_rnorm_forward_propagate(layer, a, b, denoms);
590			break;
591		case CCV_CONVNET_MAX_POOL:
592			_ccv_convnet_max_pool_forward_propagate(layer, a, b);
593			break;
594		case CCV_CONVNET_AVERAGE_POOL:
595			_ccv_convnet_average_pool_forward_propagate(layer, a, b);
596			break;
597	}
598}
599 
600static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
601{
602	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 602, __extension__ __PRETTY_FUNCTION__); }
));
603	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, layer->net.full_connect.count, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
604	// reshape a for gemm
605	int i, j;
606	float* bptr = db->data.f32;
607	for (i = 0; i < db->rows; i++)
608	{
609		for (j = 0; j < db->cols; j++)
610			bptr[j] = layer->bias[j];
611		bptr += db->cols;
612	}
613	ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0);
614	ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
615	bptr = db->data.f32;
616	if (layer->net.full_connect.relu)
617		for (i = 0; i < db->rows; i++)
618		{
619			for (j = 0; j < db->cols; j++)
620				bptr[j] = ccv_max(0, bptr[j])({ typeof (0) _a = (0); typeof (bptr[j]) _b = (bptr[j]); (_a >
 _b) ? _a : _b; }); // relu
621			bptr += db->cols;
622		}
623}
624 
625static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
626{
627	assert(CCV_GET_CHANNEL(a->type) == CCV_C1)((void) sizeof ((((a->type) & 0xFFF) == CCV_C1) ? 1 : 0
), __extension__ ({ if (((a->type) & 0xFFF) == CCV_C1)
 ; else __assert_fail ("CCV_GET_CHANNEL(a->type) == CCV_C1"
, "ccv_convnet.c", 627, __extension__ __PRETTY_FUNCTION__); }
));
628	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 628, __extension__ __PRETTY_FUNCTION__); }
));
629	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, 1, a->cols, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
630	ccv_zero(db);
631	int i, j;
632	float* aptr = a->data.f32;
633	float* bptr = db->data.f32;
634	float* cptr = (float*)ccmallocmalloc(sizeof(float) * a->cols);
635	for (i = 0; i < a->rows; i++)
636	{
637		double max = aptr[0];
638		for (j = 1; j < a->cols; j++)
639			if (aptr[j] > max)
640				max = aptr[j];
641		double tt = 0;
642		for (j = 0; j < a->cols; j++)
643			tt += (cptr[j] = expf(aptr[j] - max));
644		tt = 1.0 / tt;
645		for (j = 0; j < a->cols; j++)
646			bptr[j] += cptr[j] * tt;
647		aptr += a->cols;
648	}
649	ccfreefree(cptr);
650}
651 
652#ifndef CASE_TESTS
653 
654void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch)
655{
656#ifdef HAVE_CUDA1
657	if (convnet->use_cwc_accel)
658		cwc_convnet_encode(convnet, a, b, batch);
659	else {
660#endif
661	assert(batch == 1)((void) sizeof ((batch == 1) ? 1 : 0), __extension__ ({ if (batch
 == 1) ; else __assert_fail ("batch == 1", "ccv_convnet.c", 661
, __extension__ __PRETTY_FUNCTION__); }));
662	assert(CCV_GET_CHANNEL((*a)->type) == convnet->channels)((void) sizeof (((((*a)->type) & 0xFFF) == convnet->
channels) ? 1 : 0), __extension__ ({ if ((((*a)->type) &
 0xFFF) == convnet->channels) ; else __assert_fail ("CCV_GET_CHANNEL((*a)->type) == convnet->channels"
, "ccv_convnet.c", 662, __extension__ __PRETTY_FUNCTION__); }
));
663	assert((*a)->rows == convnet->rows)((void) sizeof (((*a)->rows == convnet->rows) ? 1 : 0),
 __extension__ ({ if ((*a)->rows == convnet->rows) ; else
 __assert_fail ("(*a)->rows == convnet->rows", "ccv_convnet.c"
, 663, __extension__ __PRETTY_FUNCTION__); }));
664	assert((*a)->cols == convnet->cols)((void) sizeof (((*a)->cols == convnet->cols) ? 1 : 0),
 __extension__ ({ if ((*a)->cols == convnet->cols) ; else
 __assert_fail ("(*a)->cols == convnet->cols", "ccv_convnet.c"
, 664, __extension__ __PRETTY_FUNCTION__); }));
665	int i;
666	// save the last layer of neuron cache in case that we encode to a different matrix
667	ccv_dense_matrix_t* out_neuron = convnet->acts[convnet->count - 1];
668	convnet->acts[convnet->count - 1] = *b;
669	_ccv_convnet_layer_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms);
670	for (i = 1; i < convnet->count; i++)
671		_ccv_convnet_layer_forward_propagate(convnet->layers + i, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i);
672	if (convnet->acts + convnet->count - 1 != b)
673	{
674		*b = convnet->acts[convnet->count - 1];
675		// restore the last layer of neuron cache
676		convnet->acts[convnet->count - 1] = out_neuron;
677	}
678#ifdef HAVE_CUDA1
679	}
680#endif
681}
682 
683// find the layer for scanning (it is the last convolutional layer)
684static int _ccv_convnet_find_scan(ccv_convnet_t* convnet)
685{
686	int i;
687	ccv_convnet_layer_t* layers = convnet->layers;
688	for (i = convnet->count - 1; i >= 0; i--)
689		if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL)
690			return i;
691	return -1;
692}
693 
694static int _ccv_convnet_derive_scale(ccv_convnet_t* convnet, int scan)
695{
696	int i, scale = 1;
697	for (i = scan; i >= 0; i--)
698	{
699		ccv_convnet_layer_t* layer = convnet->layers + i;
700		switch (layer->type)
701		{
702			case CCV_CONVNET_CONVOLUTIONAL:
703				scale *= layer->net.convolutional.strides;
704				break;
705			case CCV_CONVNET_MAX_POOL:
706			case CCV_CONVNET_AVERAGE_POOL:
707				scale *= layer->net.pool.strides;
708				break;
709		}
710	}
711	return scale;
712}
713 
714static int _ccv_convnet_find_full_connect(ccv_convnet_t* convnet)
715{
716	int i;
717	for (i = 0; i < convnet->count; i++)
718		if (convnet->layers[i].type == CCV_CONVNET_FULL_CONNECT)
719			return i;
720	return -1;
721}
722 
723void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch)
724{
725#ifdef HAVE_CUDA1
726	if (convnet->use_cwc_accel)
727		cwc_convnet_classify(convnet, a, symmetric, ranks, tops, batch);
728	else {
729#endif
730	int i, j, k, t;
731	ccv_dense_matrix_t** b = (ccv_dense_matrix_t**)alloca(sizeof(ccv_dense_matrix_t*) * (convnet->count + 1))__builtin_alloca (sizeof(ccv_dense_matrix_t*) * (convnet->
count + 1));
732	int scan = _ccv_convnet_find_scan(convnet);
733	int scale = _ccv_convnet_derive_scale(convnet, scan);
734	int full_connect = _ccv_convnet_find_full_connect(convnet);
735	assert(scan >= 0 && scan < convnet->count)((void) sizeof ((scan >= 0 && scan < convnet->
count) ? 1 : 0), __extension__ ({ if (scan >= 0 &&
 scan < convnet->count) ; else __assert_fail ("scan >= 0 && scan < convnet->count"
, "ccv_convnet.c", 735, __extension__ __PRETTY_FUNCTION__); }
));
736	assert(full_connect >= 0 && full_connect < convnet->count)((void) sizeof ((full_connect >= 0 && full_connect
 < convnet->count) ? 1 : 0), __extension__ ({ if (full_connect
 >= 0 && full_connect < convnet->count) ; else
 __assert_fail ("full_connect >= 0 && full_connect < convnet->count"
, "ccv_convnet.c", 736, __extension__ __PRETTY_FUNCTION__); }
));
737	memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
738	for (i = 0; i < batch; i++)
739	{
740		assert(CCV_GET_CHANNEL(a[i]->type) == convnet->channels)((void) sizeof ((((a[i]->type) & 0xFFF) == convnet->
channels) ? 1 : 0), __extension__ ({ if (((a[i]->type) &
 0xFFF) == convnet->channels) ; else __assert_fail ("CCV_GET_CHANNEL(a[i]->type) == convnet->channels"
, "ccv_convnet.c", 740, __extension__ __PRETTY_FUNCTION__); }
));
741		assert(a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width)((void) sizeof ((a[i]->rows == convnet->input.height ||
 a[i]->cols == convnet->input.width) ? 1 : 0), __extension__
 ({ if (a[i]->rows == convnet->input.height || a[i]->
cols == convnet->input.width) ; else __assert_fail ("a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width"
, "ccv_convnet.c", 741, __extension__ __PRETTY_FUNCTION__); }
));
742		assert(a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width)((void) sizeof ((a[i]->rows >= convnet->input.height
 && a[i]->cols >= convnet->input.width) ? 1 :
 0), __extension__ ({ if (a[i]->rows >= convnet->input
.height && a[i]->cols >= convnet->input.width
) ; else __assert_fail ("a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width"
, "ccv_convnet.c", 742, __extension__ __PRETTY_FUNCTION__); }
));
743		// find optimal rows and cols to slice to
744		int rows = convnet->rows + ((a[i]->rows - convnet->rows) / scale) * scale;
745		int cols = convnet->cols + ((a[i]->cols - convnet->cols) / scale) * scale;
746		assert(rows == convnet->input.height || cols == convnet->input.width)((void) sizeof ((rows == convnet->input.height || cols == convnet
->input.width) ? 1 : 0), __extension__ ({ if (rows == convnet
->input.height || cols == convnet->input.width) ; else __assert_fail
 ("rows == convnet->input.height || cols == convnet->input.width"
, "ccv_convnet.c", 746, __extension__ __PRETTY_FUNCTION__); }
));
747		assert(rows <= a[i]->rows && cols <= a[i]->cols)((void) sizeof ((rows <= a[i]->rows && cols <=
 a[i]->cols) ? 1 : 0), __extension__ ({ if (rows <= a[i
]->rows && cols <= a[i]->cols) ; else __assert_fail
 ("rows <= a[i]->rows && cols <= a[i]->cols"
, "ccv_convnet.c", 747, __extension__ __PRETTY_FUNCTION__); }
));
748		ccv_dense_matrix_t* slice = 0;
749		ccv_slice(a[i], (ccv_matrix_t**)&slice, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols);
750		ccv_dense_matrix_t* mean_activity = 0;
751		// scale mean activity up to be substractable (from this one, the CPU implementation is an approximation of GPU implementation)
752		ccv_resample(convnet->mean_activity, &mean_activity, 0, (double)rows / (double)convnet->mean_activity->rows, (double)cols / (double)convnet->mean_activity->cols, CCV_INTER_CUBIC);
753		ccv_subtract(slice, mean_activity, (ccv_matrix_t**)b, CCV_32F);
754		ccv_matrix_free(mean_activity);
755		ccv_matrix_free(slice);
756		// doing the first few layers until the first scan layer
757		int out_rows, out_cols, out_partition;
758		ccv_dense_matrix_t* c = ccv_dense_matrix_new(5 * (!!symmetric + 1), convnet->layers[full_connect].input.node.count, CCV_32F | CCV_C1, 0, 0);
759		for (t = 0; t <= !!symmetric; t++)
760		{
761			rows = b[0]->rows, cols = b[0]->cols;
762			for (j = 0; j < scan + 1; j++)
763			{
764				ccv_convnet_layer_t* layer = convnet->layers + j;
765				ccv_convnet_make_output(layer, rows, cols, &out_rows, &out_cols, &out_partition);
766				_ccv_convnet_layer_forward_propagate(layer, b[j], b + j + 1, 0);
767				assert(b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols)((void) sizeof ((b[j + 1]->rows == out_rows && b[j
 + 1]->cols == out_cols) ? 1 : 0), __extension__ ({ if (b[
j + 1]->rows == out_rows && b[j + 1]->cols == out_cols
) ; else __assert_fail ("b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols"
, "ccv_convnet.c", 767, __extension__ __PRETTY_FUNCTION__); }
));
768				if (j > 0)
769					ccv_matrix_free(b[j]);
770				rows = out_rows, cols = out_cols;
771			}
772			int offsets[5][2] = {
773				{0, 0},
774				{cols - convnet->layers[scan + 1].input.matrix.cols, 0},
775				{(cols - convnet->layers[scan + 1].input.matrix.cols) / 2, (rows - convnet->layers[scan + 1].input.matrix.rows) / 2},
776				{0, rows - convnet->layers[scan + 1].input.matrix.rows},
777				{cols - convnet->layers[scan + 1].input.matrix.cols, rows - convnet->layers[scan + 1].input.matrix.rows},
778			};
779			for (k = 0; k < 5; k++)
780			{
781				ccv_dense_matrix_t* input = 0;
782				ccv_convnet_layer_t* layer = convnet->layers + scan + 1;
783				ccv_slice(b[scan + 1], (ccv_matrix_t**)&input, CCV_32F, offsets[k][1], offsets[k][0], layer->input.matrix.rows, layer->input.matrix.cols);
784				// copy the last layer for full connect compute
785				b[full_connect] = ccv_dense_matrix_new(convnet->layers[full_connect].input.matrix.rows, convnet->layers[full_connect].input.matrix.cols, CCV_NO_DATA_ALLOC | CCV_32F | convnet->layers[full_connect].input.matrix.channels, c->data.f32 + (t * 5 + k) * convnet->layers[full_connect].input.node.count, 0);
786				for (j = scan + 1; j < full_connect; j++)
787				{
788					layer = convnet->layers + j;
789					_ccv_convnet_layer_forward_propagate(layer, j > scan + 1 ? b[j] : input, b + j + 1, 0);
790					if (j > scan + 1)
791						ccv_matrix_free(b[j]);
792					else
793						ccv_matrix_free(input);
794				}
795				ccv_matrix_free(b[full_connect]);
796				// set it to 0
797				memset(b + scan + 2, 0, sizeof(ccv_dense_matrix_t*) * (full_connect - scan - 1));
798			}
799			ccv_matrix_free(b[scan + 1]);
800			memset(b + 1, 0, sizeof(ccv_dense_matrix_t*) * (scan + 1));
801			if (t < !!symmetric)
802				ccv_flip(b[0], &b[0], 0, CCV_FLIP_X);
803		}
804		ccv_matrix_free(b[0]);
805		// now have everything in c, do the last full connect propagate
806		b[full_connect] = c;
807		for (j = full_connect; j < convnet->count; j++)
808		{
809			ccv_convnet_layer_t* layer = convnet->layers + j;
810			assert(layer->type == CCV_CONVNET_FULL_CONNECT)((void) sizeof ((layer->type == CCV_CONVNET_FULL_CONNECT) ?
 1 : 0), __extension__ ({ if (layer->type == CCV_CONVNET_FULL_CONNECT
) ; else __assert_fail ("layer->type == CCV_CONVNET_FULL_CONNECT"
, "ccv_convnet.c", 810, __extension__ __PRETTY_FUNCTION__); }
));
811			_ccv_convnet_full_connect_forward_propagate_parallel(layer, b[j], b + j + 1);
812			ccv_matrix_free(b[j]);
813		}
814		ccv_dense_matrix_t* softmax = 0;
815		_ccv_convnet_compute_softmax_parallel(b[convnet->count], &softmax, 0);
816		ccv_matrix_free(b[convnet->count]);
817		ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0);
818		float* r = softmax->data.f32;
819		assert(tops <= softmax->cols)((void) sizeof ((tops <= softmax->cols) ? 1 : 0), __extension__
 ({ if (tops <= softmax->cols) ; else __assert_fail ("tops <= softmax->cols"
, "ccv_convnet.c", 819, __extension__ __PRETTY_FUNCTION__); }
));
820		for (j = 0; j < tops; j++)
821		{
822			float max_val = -1;
823			int max_idx = -1;
824			for (k = 0; k < softmax->cols; k++)
825				if (r[k] >= 0 && r[k] > max_val)
826					max_val = r[k], max_idx = k;
827			assert(max_idx >= 0)((void) sizeof ((max_idx >= 0) ? 1 : 0), __extension__ ({ if
 (max_idx >= 0) ; else __assert_fail ("max_idx >= 0", "ccv_convnet.c"
, 827, __extension__ __PRETTY_FUNCTION__); }));
828			r[max_idx] = -1;
829			ccv_classification_t classification = {
830				.id = max_idx,
831				.confidence = max_val / ((!!symmetric + 1) * 5),
832			};
833			ccv_array_push(ranks[i], &classification);
834		}
835		ccv_matrix_free(softmax);
836		memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
837	}
838#ifdef HAVE_CUDA1
839	}
840#endif
841}
842 
843#endif
844 
845#ifdef HAVE_GSL1
846 
847// compute back propagated gradient & weight update delta
848static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
849{
850	// a is the input gradient (for back prop).
851	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
852	// note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it
853	int rows, cols, partition;
854	ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
855	int ch = layer->net.convolutional.channels;
856	int count = layer->net.convolutional.count;
857	int strides = layer->net.convolutional.strides;
858	int border = layer->net.convolutional.border;
859	int kernel_rows = layer->net.convolutional.rows;
860	int kernel_cols = layer->net.convolutional.cols;
861	assert(a->rows == rows)((void) sizeof ((a->rows == rows) ? 1 : 0), __extension__ (
{ if (a->rows == rows) ; else __assert_fail ("a->rows == rows"
, "ccv_convnet.c", 861, __extension__ __PRETTY_FUNCTION__); }
));
862	assert(a->cols == cols)((void) sizeof ((a->cols == cols) ? 1 : 0), __extension__ (
{ if (a->cols == cols) ; else __assert_fail ("a->cols == cols"
, "ccv_convnet.c", 862, __extension__ __PRETTY_FUNCTION__); }
));
863	assert(CCV_GET_CHANNEL(a->type) == count)((void) sizeof ((((a->type) & 0xFFF) == count) ? 1 : 0
), __extension__ ({ if (((a->type) & 0xFFF) == count) ;
 else __assert_fail ("CCV_GET_CHANNEL(a->type) == count", "ccv_convnet.c"
, 863, __extension__ __PRETTY_FUNCTION__); }));
864	int a_rows = a->rows, a_cols = a->cols, a_ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
865	a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count;
866	assert(CCV_GET_CHANNEL(m->type) == ch)((void) sizeof ((((m->type) & 0xFFF) == ch) ? 1 : 0), __extension__
 ({ if (((m->type) & 0xFFF) == ch) ; else __assert_fail
 ("CCV_GET_CHANNEL(m->type) == ch", "ccv_convnet.c", 866, __extension__
 __PRETTY_FUNCTION__); }));
867	assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F)((void) sizeof ((((m->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((m->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(m->type) == CCV_32F"
, "ccv_convnet.c", 867, __extension__ __PRETTY_FUNCTION__); }
));
868	int count_per_partition = count / partition;
869	int ch_per_partition = ch / partition;
870	// update weight gradient
871	parallel_for(k, count){ int k; for ((k) = 0; (k) < (count); (k)++) { {
872		int i, j, x, y, c;
873		int p = k / count_per_partition;
874		float* mp = m->data.f32 + p * ch_per_partition;
875		float* ap = a->data.f32 + k;
876		float* np = n->data.f32 + k;
877		float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition;
878		float bias = 0;
879		for (i = 0; i < rows; i++)
880		{
881			int comy = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
882			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows)({ typeof (m->rows + border) _a = (m->rows + border); typeof
 (i * strides + kernel_rows) _b = (i * strides + kernel_rows)
; (_a < _b) ? _a : _b; }));
883			comy *= ch_per_partition * kernel_cols;
884			for (j = 0; j < cols; j++)
885			{
886				if (np[j * count] > 0)
887				{ /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
888					float v = ap[j * count];
889					bias += v;
890					int comx = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
891					int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols)({ typeof (m->cols + border) _a = (m->cols + border); typeof
 (j * strides + kernel_cols) _b = (j * strides + kernel_cols)
; (_a < _b) ? _a : _b; }));
892					float* w = update_w + comx * ch_per_partition + comy;
893					float* mpz = mp + ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) * ch;
894					/* when we have border, we simply do zero padding */
895					for (y = 0; y < maxy; y++)
896					{
897						for (x = 0; x < maxx; x++)
898							for (c = 0; c < ch_per_partition; c++)
899								w[x * ch_per_partition + c] += v * mpz[x * ch + c];
900						w += kernel_cols * ch_per_partition;
901						mpz += m->cols * ch;
902					}
903				}
904			}
905			ap += a->cols * count;
906			np += n->cols * count;
907			mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0)({ typeof ((i + 1) * strides - border) _a = ((i + 1) * strides
 - border); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }));
908		}
909		update_params->bias[k] += bias;
910	} parallel_endfor} }
911	if (b)
912	{
913		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type)((m->type) & 0xFFF), CCV_32F | CCV_GET_CHANNEL(m->type)((m->type) & 0xFFF), 0);
914		// clear it up before propagate result
915		ccv_zero(db);
916		int k;
917		for (k = 0; k < count; k++)
918		{
919			int i, j, x, y, c;
920			int p = k / count_per_partition;
921			float* bp = db->data.f32 + p * ch_per_partition;
922			float* ap = a->data.f32 + k;
923			float* np = n->data.f32 + k;
924			float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
925			for (i = 0; i < rows; i++)
926			{
927				int comy = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
928				int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows)({ typeof (db->rows + border) _a = (db->rows + border);
 typeof (i * strides + kernel_rows) _b = (i * strides + kernel_rows
); (_a < _b) ? _a : _b; }));
929				comy *= ch_per_partition * kernel_cols;
930				for (j = 0; j < cols; j++)
931				{
932					if (np[j * count] > 0)
933					{ /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
934						float v = ap[j * count];
935						int comx = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
936						int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols)({ typeof (db->cols + border) _a = (db->cols + border);
 typeof (j * strides + kernel_cols) _b = (j * strides + kernel_cols
); (_a < _b) ? _a : _b; }));
937						float* w = layer_w + comx * ch_per_partition + comy;
938						float* bpz = bp + ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) * ch;
939						/* when we have border, we simply do zero padding */
940						for (y = 0; y < maxy; y++)
941						{
942							for (x = 0; x < maxx; x++)
943								for (c = 0; c < ch_per_partition; c++)
944									bpz[x * ch + c] += v * w[x * ch_per_partition + c];
945							w += kernel_cols * ch_per_partition;
946							bpz += db->cols * ch;
947						}
948					}
949				}
950				ap += a->cols * count;
951				np += n->cols * count;
952				bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0)({ typeof ((i + 1) * strides - border) _a = ((i + 1) * strides
 - border); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }));
953			}
954		}
955	}
956	a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF)) | a_ch;
957}
958 
959static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
960{
961	// a is the input gradient (for back prop), y is the output (for forward prop)
962	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
963	ccv_dense_matrix_t* db = 0;
964	if (b26.1
'b' is null
)
27
←
Taking false branch→
965		db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type)((x->type) & 0xFFF), CCV_32F | CCV_GET_CHANNEL(x->type)((x->type) & 0xFFF), 0);
966	int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type)((x->type) & 0xFFF);
967	x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1;
968	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type)_ccv_get_data_type_size[((x->type) & 0xFF000) >>
 12];
969	int i;
970	if (layer->net.full_connect.relu)
28
←
Assuming field 'relu' is 0→
29
←
Taking false branch→
971		for (i = 0; i < y->rows; i++)
972			if (y->data.f32[i] <= 0)
973				a->data.f32[i] = 0;
974	ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0);
30
←
4th function call argument is an uninitialized value
975	ccv_dense_matrix_t* dw = &w;
976	// compute bias gradient
977	ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0);
978	ccv_dense_matrix_t* dbias = &bias;
979	ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0);
980	// compute weight gradient
981	ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0);
982	w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0);
983	// propagate error
984	if (db)
985	{
986		db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1;
987		db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type)_ccv_get_data_type_size[((db->type) & 0xFF000) >>
 12];
988		ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0);
989		db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)((db->type) & 0xFFF)) | x_ch;
990		db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type)_ccv_get_data_type_size[((db->type) & 0xFF000) >>
 12] * CCV_GET_CHANNEL(db->type)((db->type) & 0xFFF);
991	}
992	x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)((x->type) & 0xFFF)) | x_ch;
993	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type)_ccv_get_data_type_size[((x->type) & 0xFF000) >>
 12] * CCV_GET_CHANNEL(x->type)((x->type) & 0xFFF);
994}
995 
// Backward pass for a local response normalization layer.
// As called from _ccv_convnet_propagate_loss: a = update_params->acts[i] (the gradient written by the layer above),
// n = convnet->acts[i], m = convnet->acts[i - 1] or the network input, denoms = convnet->denoms[i].
// The result goes to *b, requested as CCV_32F with the rows x cols reported by ccv_convnet_make_output and the channel count of a.
// NOTE(review): b is dereferenced unconditionally below, yet the caller passes b == 0 when i == 0
// (both pooling variants guard with `if (b)`) -- confirm this layer type can never be the first layer.
996static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t* denoms, ccv_dense_matrix_t** b)
997{
998	int rows, cols, partition;
999	ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
1000	int size = layer->net.rnorm.size;
1001	float alpha = layer->net.rnorm.alpha;
1002	float beta = layer->net.rnorm.beta;
	// half-width of the cross-channel window scanned in the innermost loop
1003	int way = size / 2;
1004	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 1004, __extension__ __PRETTY_FUNCTION__); }
));
1005	int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
1006	int type = CCV_32F | ch;
1007	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
1008	int i, j, k, x, p;
1009	float* ap = a->data.f32;
1010	float* np = n->data.f32;
1011	float* mp = m->data.f32;
1012	float* dp = denoms->data.f32;
1013	float* bp = db->data.f32;
	// channels are split into `partition` groups; the window never crosses a group boundary
1014	int ch_per_partition = ch / partition;
1015	for (i = 0; i < db->rows; i++)
1016	{
1017		for (j = 0; j < db->cols; j++)
1018			for (p = 0; p < partition; p++)
1019				for (k = 0; k < ch_per_partition; k++)
1020				{
					// nom sums -2 * alpha * beta * a * n / denom over channels x in [k - way, k + way], clamped to this partition
1021					float nom = 0;
1022					for (x = ccv_max(k - way, 0)({ typeof (k - way) _a = (k - way); typeof (0) _b = (0); (_a >
 _b) ? _a : _b; }); x <= ccv_min(k + way, ch_per_partition - 1)({ typeof (k + way) _a = (k + way); typeof (ch_per_partition -
 1) _b = (ch_per_partition - 1); (_a < _b) ? _a : _b; }); x++)
1023						nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition];
					// output = m * nom + a * denom^(-beta) for channel k
1024					bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta);
1025				}
		// step every pointer forward by one row of its own matrix
1026		ap += a->cols * ch;
1027		np += n->cols * ch;
1028		mp += m->cols * ch;
1029		dp += denoms->cols * ch;
1030		bp += db->cols * ch;
1031	}
1032}
1033 
// Backward pass for a max pooling layer: for every pooled cell of a, adds its gradient to each
// cell of the pooling window in *b whose forward input value (in m) equals the pooled output value (in n).
// Does nothing when b is 0.
1034static void _ccv_convnet_max_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
1035{
1036	// a is the input gradient (for back prop), n is the output (from forward prop),
1037	// m is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
1038	// pooling layer doesn't need the dropout
1039	if (b)
1040	{
1041		assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type))((void) sizeof ((((a->type) & 0xFFF) == ((n->type) &
 0xFFF)) ? 1 : 0), __extension__ ({ if (((a->type) & 0xFFF
) == ((n->type) & 0xFFF)) ; else __assert_fail ("CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type)"
, "ccv_convnet.c", 1041, __extension__ __PRETTY_FUNCTION__); }
));
1042		assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type))((void) sizeof ((((a->type) & 0xFFF) == ((m->type) &
 0xFFF)) ? 1 : 0), __extension__ ({ if (((a->type) & 0xFFF
) == ((m->type) & 0xFFF)) ; else __assert_fail ("CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type)"
, "ccv_convnet.c", 1042, __extension__ __PRETTY_FUNCTION__); }
));
1043		int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
		// the result has the dimensions of m and starts from zero because contributions are accumulated with +=
1044		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
1045		ccv_zero(db);
1046		int size = layer->net.pool.size;
1047		int strides = layer->net.pool.strides;
1048		int border = layer->net.pool.border;
1049		int i, j, k, x, y;
1050		float* ap = a->data.f32;
1051		float* bp = db->data.f32;
1052		float* np = n->data.f32;
1053		float* mp = m->data.f32;
1054		for (i = 0; i < a->rows; i++)
1055		{
			// start_y / end_y are offsets inside the pooling window, trimmed so the window stays within the rows of db
1056			const int start_y = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
1057			const int end_y = size + ccv_min(i * strides + size - border, db->rows)({ typeof (i * strides + size - border) _a = (i * strides + size
 - border); typeof (db->rows) _b = (db->rows); (_a <
 _b) ? _a : _b; }) - (i * strides + size - border);
1058			for (j = 0; j < a->cols; j++)
1059			{
				// same trimming along the columns of db
1060				const int start_x = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
1061				const int end_x = size + ccv_min(j * strides + size - border, db->cols)({ typeof (j * strides + size - border) _a = (j * strides + size
 - border); typeof (db->cols) _b = (db->cols); (_a <
 _b) ? _a : _b; }) - (j * strides + size - border);
1062				for (k = 0; k < ch; k++)
1063				{
					// v: pooled forward value for this cell, u: gradient arriving for this cell
1064					float v = np[j * ch + k];
1065					float u = ap[j * ch + k];
1066					for (y = start_y; y < end_y; y++)
1067						for (x = start_x; x < end_x; x++)
1068							// we have to do direct comparison otherwise it will contribute to too many cells
1069							// and the propagation won't work. But CPU will have different result comparing with GPU
1070							if (mp[(j * strides - border + x + (y - border) * m->cols) * ch + k] == v)
1071								bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
1072				}
1073			}
			// a and n advance one row; bp and mp advance by strides rows for every row of a
1074			ap += a->cols * ch;
1075			np += n->cols * ch;
1076			bp += db->cols * ch * strides;
1077			mp += m->cols * ch * strides;
1078		}
1079	}
1080}
1081 
// Backward pass for an average pooling layer: each gradient cell of a is divided by the number of
// in-bounds cells of its pooling window and that share is added to every one of those cells in *b.
// Does nothing when b is 0.
1082static void _ccv_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
1083{
1084	// a is the input gradient (for back prop),
1085	// m is the input (for forward prop; only its type and dimensions are read here), b is the output gradient (gradient, or known as propagated error)
1086	// pooling layer doesn't need the dropout
1087	if (b)
1088	{
1089		assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type))((void) sizeof ((((a->type) & 0xFFF) == ((m->type) &
 0xFFF)) ? 1 : 0), __extension__ ({ if (((a->type) & 0xFFF
) == ((m->type) & 0xFFF)) ; else __assert_fail ("CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type)"
, "ccv_convnet.c", 1089, __extension__ __PRETTY_FUNCTION__); }
));
1090		int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
		// the result has the dimensions of m and starts from zero because contributions are accumulated with +=
1091		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
1092		ccv_zero(db);
1093		int size = layer->net.pool.size;
1094		int strides = layer->net.pool.strides;
1095		int border = layer->net.pool.border;
1096		int i, j, k, x, y;
1097		float* ap = a->data.f32;
1098		float* bp = db->data.f32;
1099		for (i = 0; i < a->rows; i++)
1100		{
			// start_y / end_y are offsets inside the pooling window, trimmed so the window stays within the rows of db
1101			const int start_y = ccv_max(i * strides - border, 0)({ typeof (i * strides - border) _a = (i * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (i * strides - border);
1102			const int end_y = size + ccv_min(i * strides + size - border, db->rows)({ typeof (i * strides + size - border) _a = (i * strides + size
 - border); typeof (db->rows) _b = (db->rows); (_a <
 _b) ? _a : _b; }) - (i * strides + size - border);
1103			for (j = 0; j < a->cols; j++)
1104			{
				// same trimming along the columns of db
1105				const int start_x = ccv_max(j * strides - border, 0)({ typeof (j * strides - border) _a = (j * strides - border);
 typeof (0) _b = (0); (_a > _b) ? _a : _b; }) - (j * strides - border);
1106				const int end_x = size + ccv_min(j * strides + size - border, db->cols)({ typeof (j * strides + size - border) _a = (j * strides + size
 - border); typeof (db->cols) _b = (db->cols); (_a <
 _b) ? _a : _b; }) - (j * strides + size - border);
1107				for (k = 0; k < ch; k++)
1108				{
					// u is the equal share of the gradient given to each in-bounds cell of the window
1109					float u = ap[j * ch + k] / ((end_x - start_x) * (end_y - start_y));
1110					for (y = start_y; y < end_y; y++)
1111						for (x = start_x; x < end_x; x++)
1112							bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
1113				}
1114			}
			// a advances one row; bp advances by strides rows for every row of a
1115			ap += a->cols * ch;
1116			bp += db->cols * ch * strides;
1117		}
1118	}
1119}
1120 
// Runs the backward pass over the whole network, from the last layer down to layer 0.
// dloss is handed to the last layer, which must be a full connect one. Every layer i below it consumes
// update_params->acts[i] and is given update_params->acts + i - 1 as the place to write the gradient for
// the layer beneath; layer 0 is given 0 there. a is the network input and stands in for convnet->acts[i - 1] at i == 0.
// The convolutional and full connect handlers also receive update_params->layers + i (presumably to accumulate weight and bias gradients).
// NOTE(review): the first call reads convnet->acts[convnet->count - 2] unconditionally; with a single-layer
// network that is acts[-1], whereas the loop below falls back to `a` when i == 0.
1121static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, ccv_dense_matrix_t* dloss, ccv_convnet_t* update_params)
1122{
1123	int i;
1124	ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1;
1125	assert(layer->type == CCV_CONVNET_FULL_CONNECT)((void) sizeof ((layer->type == CCV_CONVNET_FULL_CONNECT) ?
 1 : 0), __extension__ ({ if (layer->type == CCV_CONVNET_FULL_CONNECT
) ; else __assert_fail ("layer->type == CCV_CONVNET_FULL_CONNECT"
, "ccv_convnet.c", 1125, __extension__ __PRETTY_FUNCTION__); }
)); // the last layer has to be a full connect one to generate softmax result
22
←
Assuming field 'type' is equal to CCV_CONVNET_FULL_CONNECT→
23
←
Taking true branch→
1126	_ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 0, update_params->layers + convnet->count - 1);
24
←
Assuming the condition is false→
25
←
'?' condition is false→
26
←
Calling '_ccv_convnet_full_connect_backward_propagate'→
	// remaining layers, top to bottom; a layer type not listed in the switch is skipped
1127	for (i = convnet->count - 2; i >= 0; i--)
1128	{
1129		layer = convnet->layers + i;
1130		switch (layer->type)
1131		{
1132			case CCV_CONVNET_CONVOLUTIONAL:
1133				_ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
1134				break;
1135			case CCV_CONVNET_FULL_CONNECT:
1136				_ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
1137				break;
1138			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1139				_ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? update_params->acts + i - 1 : 0);
1140				break;
1141			case CCV_CONVNET_MAX_POOL:
1142				_ccv_convnet_max_pool_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
1143				break;
1144			case CCV_CONVNET_AVERAGE_POOL:
1145				_ccv_convnet_average_pool_backward_propagate(layer, update_params->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
1146				break;
1147		}
1148	}
1149}
1150 
// Applies one momentum update to the weights and biases of every convolutional and full connect layer.
// For each weight j: vw[j] = momentum * vw[j] - decay * learn_rate * w[j] + (learn_rate / batch) * dw[j], then w[j] += vw[j].
// Biases follow the same rule with the layer_params[i].bias settings.
// convnet holds the parameters being trained, momentum holds the running vw / vbias values, update_params holds dw / dbias.
// Layers of any other type are skipped (the switch has no default case).
// NOTE(review): the dw term is added, not subtracted -- presumably dw already holds the negated gradient; confirm against the caller.
1151static void _ccv_convnet_update(ccv_convnet_t* convnet, int batch, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params)
1152{
1153	int i, j;
1154	float learn_rate;
1155	for (i = 0; i < convnet->count; i++)
1156		switch (update_params->layers[i].type)
1157		{
1158			case CCV_CONVNET_CONVOLUTIONAL:
1159			{
1160				float* w = convnet->layers[i].w;
1161				float* vw = momentum->layers[i].w;
1162				float* dw = update_params->layers[i].w;
				// the accumulated dw is scaled by learn_rate / batch; the decay term uses the unscaled learn_rate
1163				learn_rate = layer_params[i].w.learn_rate / batch;
1164				for (j = 0; j < convnet->layers[i].wnum; j++)
1165				{
1166					vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j];
1167					w[j] += vw[j];
1168				}
1169				float* bias = convnet->layers[i].bias;
1170				float* vbias = momentum->layers[i].bias;
1171				float* dbias = update_params->layers[i].bias;
1172				learn_rate = layer_params[i].bias.learn_rate / batch;
				// one bias per net.convolutional.count entry
1173				for (j = 0; j < convnet->layers[i].net.convolutional.count; j++)
1174				{
1175					vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j];
1176					bias[j] += vbias[j];
1177				}
1178				break;
1179			}
1180			case CCV_CONVNET_FULL_CONNECT:
1181			{
1182				float* w = convnet->layers[i].w;
1183				float* vw = momentum->layers[i].w;
1184				float* dw = update_params->layers[i].w;
1185				learn_rate = layer_params[i].w.learn_rate / batch;
1186				for (j = 0; j < convnet->layers[i].wnum; j++)
1187				{
1188					vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j];
1189					w[j] += vw[j];
1190				}
1191				float* bias = convnet->layers[i].bias;
1192				float* vbias = momentum->layers[i].bias;
1193				float* dbias = update_params->layers[i].bias;
1194				learn_rate = layer_params[i].bias.learn_rate / batch;
				// one bias per net.full_connect.count entry
1195				for (j = 0; j < convnet->layers[i].net.full_connect.count; j++)
1196				{
1197					vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j];
1198					bias[j] += vbias[j];
1199				}
1200				break;
1201			}
1202		}
1203}
1204 
// Resets the w and bias arrays of every convolutional and full connect layer in update_params to zero.
// wnum floats of w are cleared, plus net.convolutional.count or net.full_connect.count floats of bias.
// Layers of any other type are left untouched.
1205static void _ccv_convnet_update_zero(ccv_convnet_t* update_params)
1206{
1207	int i;
1208	for (i = 0; i < update_params->count; i++)
1209		switch (update_params->layers[i].type)
1210		{
1211			case CCV_CONVNET_CONVOLUTIONAL:
1212				memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
1213				memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.convolutional.count);
1214				break;
1215			case CCV_CONVNET_FULL_CONNECT:
				// the weight count must be a whole multiple of the output count
1216				assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0)((void) sizeof ((update_params->layers[i].wnum % update_params
->layers[i].net.full_connect.count == 0) ? 1 : 0), __extension__
 ({ if (update_params->layers[i].wnum % update_params->
layers[i].net.full_connect.count == 0) ; else __assert_fail (
"update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0"
, "ccv_convnet.c", 1216, __extension__ __PRETTY_FUNCTION__); }
));
1217				memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
1218				memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.full_connect.count);
1219				break;
1220		}
1221}
1222 
// Builds a network-shaped container with the same layer layout as convnet and zero-filled parameters.
// One allocation holds the ccv_convnet_t header, then count layer records, then count activation pointers.
// Convolutional and full connect layers get one zeroed buffer of wnum + count floats; bias points just past the weights.
// The other listed layer types get w = bias = 0. Returns the new container.
// NOTE(review): neither allocation result is checked, and the switch has no default case, so a layer type
// outside the five listed keeps indeterminate w / bias pointers. When convnet->count is 0 the loop never runs;
// that is the path on which the analyzer reports the uninitialized w being read later.
1223static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet)
1224{
1225	ccv_convnet_t* update_params = (ccv_convnet_t*)ccmallocmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(ccv_dense_matrix_t*) * convnet->count);
8
←
Uninitialized value stored to field 'w'→
1226	update_params->reserved = 0;
	// layers and acts are carved out of the tail of the same allocation
1227	update_params->layers = (ccv_convnet_layer_t*)(update_params + 1);
1228	update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count);
1229	memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count);
1230	update_params->denoms = 0;
1231	update_params->input = convnet->input;
1232	update_params->rows = convnet->rows;
1233	update_params->cols = convnet->cols;
1234	update_params->count = convnet->count;
1235	update_params->channels = convnet->channels;
1236	update_params->mean_activity = 0;
1237	int i;
1238	for (i = 0; i < convnet->count; i++)
9
←
Assuming 'i' is >= field 'count'→
10
←
Loop condition is false. Execution continues on line 1264→
1239	{
		// copy the layer description; parameter storage is set up per type below
1240		update_params->layers[i].type = convnet->layers[i].type;
1241		update_params->layers[i].input = convnet->layers[i].input;
1242		update_params->layers[i].net = convnet->layers[i].net;
1243		update_params->layers[i].wnum = convnet->layers[i].wnum;
1244		update_params->layers[i].reserved = 0;
1245		switch (update_params->layers[i].type)
1246		{
1247			case CCV_CONVNET_CONVOLUTIONAL:
1248				update_params->layers[i].w = (float*)cccalloccalloc(update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count, sizeof(float));
1249				update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
1250				break;
1251			case CCV_CONVNET_FULL_CONNECT:
1252				assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0)((void) sizeof ((update_params->layers[i].wnum % update_params
->layers[i].net.full_connect.count == 0) ? 1 : 0), __extension__
 ({ if (update_params->layers[i].wnum % update_params->
layers[i].net.full_connect.count == 0) ; else __assert_fail (
"update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0"
, "ccv_convnet.c", 1252, __extension__ __PRETTY_FUNCTION__); }
));
1253				update_params->layers[i].w = (float*)cccalloccalloc(update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count, sizeof(float));
1254				update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
1255				break;
			// these layer types get no parameter storage
1256			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1257			case CCV_CONVNET_MAX_POOL:
1258			case CCV_CONVNET_AVERAGE_POOL:
1259				update_params->layers[i].w = 0;
1260				update_params->layers[i].bias = 0;
1261				break;
1262		}
1263	}
1264	return update_params;
1265}
1266 
// Computes a softmax over all rows * cols * channels elements of a and stores it in *b,
// which is requested as CCV_32F with the same dimensions and channel count as a.
// The largest element is subtracted before exponentiation, and the sum is accumulated in double.
// The `type` parameter is not read anywhere in the body.
// a must be CCV_32F (asserted). aptr[0] is read unconditionally, so a needs at least one element.
// The visible caller passes the same matrix as a and *b; each element is read before it is overwritten,
// which works in place provided ccv_dense_matrix_renew hands back the same buffer -- TODO confirm.
1267static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
1268{
1269	int ch = CCV_GET_CHANNEL(a->type)((a->type) & 0xFFF);
1270	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F)((void) sizeof ((((a->type) & 0xFF000) == CCV_32F) ? 1
 : 0), __extension__ ({ if (((a->type) & 0xFF000) == CCV_32F
) ; else __assert_fail ("CCV_GET_DATA_TYPE(a->type) == CCV_32F"
, "ccv_convnet.c", 1270, __extension__ __PRETTY_FUNCTION__); }
));
1271	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0);
1272	int i;
1273	float* aptr = a->data.f32;
1274	float* bptr = db->data.f32;
	// pass 1: find the maximum
1275	double max = aptr[0];
1276	for (i = 1; i < a->rows * a->cols * ch; i++)
1277		if (aptr[i] > max)
1278			max = aptr[i];
	// pass 2: exponentiate the shifted values and accumulate their sum
1279	double tt = 0;
1280	for (i = 0; i < a->rows * a->cols * ch; i++)
1281		tt += (bptr[i] = expf(aptr[i] - max));
	// pass 3: scale by the reciprocal of the sum
1282	tt = 1.0 / tt;
1283	for (i = 0; i < a->rows * a->cols * ch; i++)
1284		bptr[i] *= tt;
1285}
1286 
// Classifies one input: calls ccv_convnet_encode with the last activation slot as the output, then stores in
// labels[0] the row index of the largest value in that activation. On ties the lowest index wins
// because the comparison is strict. Only batch == 1 is accepted (asserted).
1287static void _ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch)
1288{
1289	assert(batch == 1)((void) sizeof ((batch == 1) ? 1 : 0), __extension__ ({ if (batch
 == 1) ; else __assert_fail ("batch == 1", "ccv_convnet.c", 1289
, __extension__ __PRETTY_FUNCTION__); }));
1290	ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1);
1291	int i, c = 0;
1292	ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1];
	// argmax over the rows of b, starting from row 0
1293	float maxc = b->data.f32[0];
1294	for (i = 1; i < b->rows; i++)
1295		if (b->data.f32[i] > maxc)
1296			maxc = b->data.f32[i], c = i;
1297	labels[0] = c;
1298}
1299 
1300#endif
1301 
1302#ifndef CASE_TESTS
1303 
// Trains convnet on the samples in categorizeds, one sample at a time, applying a momentum update every
// params.mini_batch samples, and prints the miss rate on tests after each of the params.max_epoch epochs.
// When convnet->use_cwc_accel is set (CUDA builds) the work is delegated to cwc_convnet_supervised_train.
// Without GSL the function only hits an assertion.
// filename is only forwarded to the cwc variant; the path below does not read it.
// NOTE(review): params.mini_batch is used as a divisor and modulus without a check for 0, and nothing rejects
// convnet->count == 0 before layers[count - 1] is read -- the analyzer path annotated below starts from that case.
1304void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params)
1305{
1306#ifdef HAVE_GSL1
1307#ifdef HAVE_CUDA1
1308	if (convnet->use_cwc_accel)
1
Assuming field 'use_cwc_accel' is 0→
2
←
Taking false branch→
1309		cwc_convnet_supervised_train(convnet, categorizeds, tests, filename, params);
1310	else {
1311#endif
1312	int i, j, t;
1313	gsl_rng_env_setup();
1314	gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
	// samples left over after the last full mini-batch; the epoch loop stops at aligned_rnum and never visits them
1315	int aligned_padding = categorizeds->rnum % params.mini_batch;
1316	int aligned_rnum = categorizeds->rnum - aligned_padding;
	// the extra aligned_padding ints at the end are scratch space for the rotation done at the bottom of each epoch
1317	int* idx = (int*)ccmallocmalloc(sizeof(int) * (categorizeds->rnum + aligned_padding));
1318	for (i = 0; i < categorizeds->rnum; i++)
3
←
Assuming 'i' is >= field 'rnum'→
4
←
Loop condition is false. Execution continues on line 1320→
1319		idx[i] = i;
1320	gsl_ran_shuffle(rng, idx, categorizeds->rnum, sizeof(int));
1321	// the last layer has to be full connect, thus we can use it as softmax layer
1322	assert(convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT)((void) sizeof ((convnet->layers[convnet->count - 1].type
 == CCV_CONVNET_FULL_CONNECT) ? 1 : 0), __extension__ ({ if (
convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT
) ; else __assert_fail ("convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT"
, "ccv_convnet.c", 1322, __extension__ __PRETTY_FUNCTION__); }
));
5
←
Assuming field 'type' is equal to CCV_CONVNET_FULL_CONNECT→
6
←
Taking true branch→
1323	int category_count = convnet->layers[convnet->count - 1].net.full_connect.count;
	// update_params accumulates per-sample updates; momentum keeps the running update applied by _ccv_convnet_update
1324	ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet);
7
←
Calling '_ccv_convnet_update_new'→
11
←
Returning from '_ccv_convnet_update_new'→
1325	ccv_convnet_t* momentum = _ccv_convnet_update_new(convnet);
1326	for (t = 0; t < params.max_epoch; t++)
12
←
Assuming 't' is < field 'max_epoch'→
13
←
Loop condition is true.  Entering loop body→
1327	{
1328		for (i = 0; i < aligned_rnum; i++)
14
←
Assuming 'i' is < 'aligned_rnum'→
15
←
Loop condition is true.  Entering loop body→
1329		{
			// NOTE(review): no dropout step is visible in this loop; confirm whether ccv_convnet_encode applies it
1330			// dropout the first hidden layer
1331			ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx[i])((void*)(((char*)((categorizeds)->data)) + (size_t)(categorizeds
)->rsize * (size_t)(idx[i])));
1332			ccv_convnet_encode(convnet, &categorized->matrix, convnet->acts + convnet->count - 1, 1);
1333			ccv_dense_matrix_t* softmax = convnet->acts[convnet->count - 1];
			// dloss is taken before the softmax call; it aliases the softmax values only if the renew inside keeps the same buffer -- TODO confirm
1334			float* dloss = softmax->data.f32;
1335			_ccv_convnet_compute_softmax(softmax, &softmax, 0);
1336			assert(softmax->rows == category_count && softmax->cols == 1)((void) sizeof ((softmax->rows == category_count &&
 softmax->cols == 1) ? 1 : 0), __extension__ ({ if (softmax
->rows == category_count && softmax->cols == 1)
 ; else __assert_fail ("softmax->rows == category_count && softmax->cols == 1"
, "ccv_convnet.c", 1336, __extension__ __PRETTY_FUNCTION__); }
));
16
←
Assuming 'category_count' is equal to field 'rows'→
17
←
Assuming field 'cols' is equal to 1→
18
←
Taking true branch→
1337			// this mashes softmax and logistic regression together
1338			// also, it gives you -D[loss w.r.t. x_i] (note the negative sign)
1339			for (j = 0; j < category_count; j++)
19
←
Assuming 'j' is >= 'category_count'→
20
←
Loop condition is false. Execution continues on line 1341→
1340				dloss[j] = (j == categorized->c) - dloss[j];
1341			_ccv_convnet_propagate_loss(convnet, categorized->matrix, softmax, update_params);
21
←
Calling '_ccv_convnet_propagate_loss'→
			// a mini-batch is complete: apply the accumulated update and clear the accumulator
1342			if ((i + 1) % params.mini_batch == 0)
1343			{
1344				FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => stochastic gradient descent at %d / %d", t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum / params.mini_batch)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { for
 (_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP < _CCV_PRINT_COUNT;
 _CCV_PRINT_LOOP++) printf("\b"); for (_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP
 < _CCV_PRINT_COUNT; _CCV_PRINT_LOOP++) printf(" "); for (
_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP < _CCV_PRINT_COUNT; _CCV_PRINT_LOOP
++) printf("\b"); _CCV_PRINT_COUNT = printf(" - at epoch %03d / %d => stochastic gradient descent at %d / %d"
, t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum
 / params.mini_batch); fflush(stdout); } } while (0);
1345				// update weights
1346				_ccv_convnet_update(convnet, params.mini_batch, momentum, update_params, params.layer_params);
1347				_ccv_convnet_update_zero(update_params);
1348				// compact the convnet to avoid any stale temporary resource
1349				ccv_convnet_compact(convnet);
1350			}
1351		}
		// evaluate on the test set and count misclassified samples
1352		int miss = 0;
1353		for (i = 0; i < tests->rnum; i++)
1354		{
1355			FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => going through %d / %d for tests", t + 1, params.max_epoch, i + 1, tests->rnum)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { for
 (_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP < _CCV_PRINT_COUNT;
 _CCV_PRINT_LOOP++) printf("\b"); for (_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP
 < _CCV_PRINT_COUNT; _CCV_PRINT_LOOP++) printf(" "); for (
_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP < _CCV_PRINT_COUNT; _CCV_PRINT_LOOP
++) printf("\b"); _CCV_PRINT_COUNT = printf(" - at epoch %03d / %d => going through %d / %d for tests"
, t + 1, params.max_epoch, i + 1, tests->rnum); fflush(stdout
); } } while (0);
1356			ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i)((void*)(((char*)((tests)->data)) + (size_t)(tests)->rsize
 * (size_t)(i)));
1357			int c = 0;
1358			_ccv_convnet_classify(convnet, &test->matrix, &c, 1);
1359			if (c != test->c)
1360				++miss;
1361		}
1362		FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => with miss rate %.2f%%\n", t + 1, params.max_epoch, miss * 100.0f / tests->rnum)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { for
 (_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP < _CCV_PRINT_COUNT;
 _CCV_PRINT_LOOP++) printf("\b"); for (_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP
 < _CCV_PRINT_COUNT; _CCV_PRINT_LOOP++) printf(" "); for (
_CCV_PRINT_LOOP = 0; _CCV_PRINT_LOOP < _CCV_PRINT_COUNT; _CCV_PRINT_LOOP
++) printf("\b"); _CCV_PRINT_COUNT = printf(" - at epoch %03d / %d => with miss rate %.2f%%\n"
, t + 1, params.max_epoch, miss * 100.0f / tests->rnum); fflush
(stdout); } } while (0);
1363		if (t + 1 < params.max_epoch)
1364		{
1365			// reshuffle the parts we visited and move the rest to the beginning
			// the unvisited tail is parked in the scratch area, the visited part is shifted up, then the tail is copied to the front
1366			memcpy(idx + categorizeds->rnum, idx + aligned_rnum, sizeof(int) * aligned_padding);
1367			memmove(idx + aligned_padding, idx, sizeof(int) * aligned_rnum);
1368			memcpy(idx, idx + categorizeds->rnum, sizeof(int) * aligned_padding);
1369			gsl_ran_shuffle(rng, idx + aligned_padding, aligned_rnum, sizeof(int));
1370		}
1371	}
1372	ccfreefree(idx);
1373	ccv_convnet_free(momentum);
1374	ccv_convnet_free(update_params);
1375	gsl_rng_free(rng);
1376#ifdef HAVE_CUDA1
1377	}
1378#endif
1379#else
1380	assert(0 && "ccv_convnet_supervised_train requires GSL library support")((void) sizeof ((0 && "ccv_convnet_supervised_train requires GSL library support"
) ? 1 : 0), __extension__ ({ if (0 && "ccv_convnet_supervised_train requires GSL library support"
) ; else __assert_fail ("0 && \"ccv_convnet_supervised_train requires GSL library support\""
, "ccv_convnet.c", 1380, __extension__ __PRETTY_FUNCTION__); }
));
1381#endif
1382}
1383 
// Releases the per-layer temporaries held by convnet: every non-null acts[i] matrix is freed and its slot cleared;
// when the denoms array exists, the same is done for denoms[i]; a non-null layers[i].reserved buffer is freed and reset to 0.
// In CUDA builds cwc_convnet_compact(convnet) is called first.
1384void ccv_convnet_compact(ccv_convnet_t* convnet)
1385{
1386#ifdef HAVE_CUDA1
1387	cwc_convnet_compact(convnet);
1388#endif
1389	int i;
1390	for (i = 0; i < convnet->count; i++)
1391	{
1392		if (convnet->acts[i])
1393			ccv_matrix_free(convnet->acts[i]);
1394		convnet->acts[i] = 0;
		// denoms itself may be absent
1395		if (convnet->denoms)
1396		{
1397			if (convnet->denoms[i])
1398				ccv_matrix_free(convnet->denoms[i]);
1399			convnet->denoms[i] = 0;
1400		}
		// SIMD(layer) expands to the layer's reserved pointer cast to float*
1401		if (SIMD(convnet->layers + i)((float*)((convnet->layers + i)->reserved)))
1402		{
1403			ccfreefree(convnet->layers[i].reserved);
1404			convnet->layers[i].reserved = 0;
1405		}
1406	}
1407}
1408 
1409void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params)
1410{
1411	sqlite3* db = 0;
1412	if (SQLITE_OK0 == sqlite3_open(filename, &db))
1413	{
1414		const char layer_create_table_qs[] =
1415			"CREATE TABLE IF NOT EXISTS layer_params "
1416			"(layer INTEGER PRIMARY KEY ASC, type INTEGER, "
1417			"input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, "
1418			"output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, "
1419			"output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);"
1420			"CREATE TABLE IF NOT EXISTS convnet_params "
1421			"(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);"
1422			"CREATE TABLE IF NOT EXISTS layer_data "
1423			"(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB, half_precision INTEGER);";
1424		assert(SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0))((void) sizeof ((0 == sqlite3_exec(db, layer_create_table_qs,
 0, 0, 0)) ? 1 : 0), __extension__ ({ if (0 == sqlite3_exec(db
, layer_create_table_qs, 0, 0, 0)) ; else __assert_fail ("SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0)"
, "ccv_convnet.c", 1424, __extension__ __PRETTY_FUNCTION__); }
));
1425		const char layer_params_insert_qs[] = 
1426			"REPLACE INTO layer_params "
1427			"(layer, type, "
1428			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, "
1429			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, "
1430			"output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES "
1431			"($layer, $type, " // 1
1432			"$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6
1433			"$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13
1434			"$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18
1435		sqlite3_stmt* layer_params_insert_stmt = 0;
1436		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0))((void) sizeof ((0 == sqlite3_prepare_v2(db, layer_params_insert_qs
, sizeof(layer_params_insert_qs), &layer_params_insert_stmt
, 0)) ? 1 : 0), __extension__ ({ if (0 == sqlite3_prepare_v2(
db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &
layer_params_insert_stmt, 0)) ; else __assert_fail ("SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0)"
, "ccv_convnet.c", 1436, __extension__ __PRETTY_FUNCTION__); }
));
1437		const char layer_data_insert_qs[] =
1438			"REPLACE INTO layer_data "
1439			"(layer, weight, bias, half_precision) VALUES ($layer, $weight, $bias, $half_precision);";
1440		sqlite3_stmt* layer_data_insert_stmt = 0;
1441		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0))((void) sizeof ((0 == sqlite3_prepare_v2(db, layer_data_insert_qs
, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0
)) ? 1 : 0), __extension__ ({ if (0 == sqlite3_prepare_v2(db,
 layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt
, 0)) ; else __assert_fail ("SQLITE_OK == sqlite3_prepare_v2(db, layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0)"
, "ccv_convnet.c", 1441, __extension__ __PRETTY_FUNCTION__); }
));
1442		int i;
1443		for (i = 0; i < convnet->count; i++)
1444		{
1445			ccv_convnet_layer_t* layer = convnet->layers + i;
1446			// insert layer params
1447			sqlite3_bind_int(layer_params_insert_stmt, 1, i);
1448			sqlite3_bind_int(layer_params_insert_stmt, 2, layer->type);
1449			sqlite3_bind_int(layer_params_insert_stmt, 3, layer->input.matrix.rows);
1450			sqlite3_bind_int(layer_params_insert_stmt, 4, layer->input.matrix.cols);
1451			sqlite3_bind_int(layer_params_insert_stmt, 5, layer->input.matrix.channels);
1452			sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.matrix.partition);
1453			sqlite3_bind_int(layer_params_insert_stmt, 7, layer->input.node.count);
1454			switch (layer->type)
1455			{
1456				case CCV_CONVNET_CONVOLUTIONAL:
1457					sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.rows);
1458					sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.cols);
1459					sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.channels);
1460					sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.partition);
1461					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.count);
1462					sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.convolutional.strides);
1463					sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.convolutional.border);
1464					break;
1465				case CCV_CONVNET_FULL_CONNECT:
1466					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count);
1467					sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu);
1468					break;
1469				case CCV_CONVNET_MAX_POOL:
1470				case CCV_CONVNET_AVERAGE_POOL:
1471					sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.strides);
1472					sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.pool.border);
1473					sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.pool.size);
1474					break;
1475				case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1476					sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.rnorm.size);
1477					sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.kappa);
1478					sqlite3_bind_double(layer_params_insert_stmt, 17, layer->net.rnorm.alpha);
1479					sqlite3_bind_double(layer_params_insert_stmt, 18, layer->net.rnorm.beta);
1480					break;
1481			}
1482			assert(SQLITE_DONE == sqlite3_step(layer_params_insert_stmt))((void) sizeof ((101 == sqlite3_step(layer_params_insert_stmt
)) ? 1 : 0), __extension__ ({ if (101 == sqlite3_step(layer_params_insert_stmt
)) ; else __assert_fail ("SQLITE_DONE == sqlite3_step(layer_params_insert_stmt)"
, "ccv_convnet.c", 1482, __extension__ __PRETTY_FUNCTION__); }
));
1483			sqlite3_reset(layer_params_insert_stmt);
1484			sqlite3_clear_bindings(layer_params_insert_stmt);
1485			// insert layer data
1486			if (layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT)
1487			{
1488				sqlite3_bind_int(layer_data_insert_stmt, 1, i);
1489				if (params.half_precision)
1490				{
1491					uint16_t* w = (uint16_t*)ccmallocmalloc(sizeof(uint16_t) * layer->wnum);
1492					ccv_float_to_half_precision(layer->w, w, layer->wnum);
1493					uint16_t* bias = (uint16_t*)ccmallocmalloc(sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
1494					ccv_float_to_half_precision(layer->bias, bias, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
1495					sqlite3_bind_blob(layer_data_insert_stmt, 2, w, sizeof(uint16_t) * layer->wnum, ccfreefree);
1496					sqlite3_bind_blob(layer_data_insert_stmt, 3, bias, sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), ccfreefree);
1497				} else {
1498					sqlite3_bind_blob(layer_data_insert_stmt, 2, layer->w, sizeof(float) * layer->wnum, SQLITE_STATIC((sqlite3_destructor_type)0));
1499					sqlite3_bind_blob(layer_data_insert_stmt, 3, layer->bias, sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), SQLITE_STATIC((sqlite3_destructor_type)0));
1500				}
1501				sqlite3_bind_int(layer_data_insert_stmt, 4, params.half_precision);
1502				assert(SQLITE_DONE == sqlite3_step(layer_data_insert_stmt))((void) sizeof ((101 == sqlite3_step(layer_data_insert_stmt))
 ? 1 : 0), __extension__ ({ if (101 == sqlite3_step(layer_data_insert_stmt
)) ; else __assert_fail ("SQLITE_DONE == sqlite3_step(layer_data_insert_stmt)"
, "ccv_convnet.c", 1502, __extension__ __PRETTY_FUNCTION__); }
));
1503				sqlite3_reset(layer_data_insert_stmt);
1504				sqlite3_clear_bindings(layer_data_insert_stmt);
1505			}
1506		}
1507		// insert convnet related params
1508		const char convnet_params_insert_qs[] =
1509			"REPLACE INTO convnet_params "
1510			"(convnet, mean_activity, input_height, input_width) VALUES (0, $mean_activity, $input_height, $input_width);";
1511		sqlite3_stmt* convnet_params_insert_stmt = 0;
1512		assert(SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0))((void) sizeof ((0 == sqlite3_prepare_v2(db, convnet_params_insert_qs
, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt
, 0)) ? 1 : 0), __extension__ ({ if (0 == sqlite3_prepare_v2(
db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs
), &convnet_params_insert_stmt, 0)) ; else __assert_fail (
"SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0)"
, "ccv_convnet.c", 1512, __extension__ __PRETTY_FUNCTION__); }
));
1513		assert(convnet->mean_activity->rows == convnet->input.height)((void) sizeof ((convnet->mean_activity->rows == convnet
->input.height) ? 1 : 0), __extension__ ({ if (convnet->
mean_activity->rows == convnet->input.height) ; else __assert_fail
 ("convnet->mean_activity->rows == convnet->input.height"
, "ccv_convnet.c", 1513, __extension__ __PRETTY_FUNCTION__); }
));
1514		assert(convnet->mean_activity->cols == convnet->input.width)((void) sizeof ((convnet->mean_activity->cols == convnet
->input.width) ? 1 : 0), __extension__ ({ if (convnet->
mean_activity->cols == convnet->input.width) ; else __assert_fail
 ("convnet->mean_activity->cols == convnet->input.width"
, "ccv_convnet.c", 1514, __extension__ __PRETTY_FUNCTION__); }
));
1515		assert(CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels)((void) sizeof ((((convnet->mean_activity->type) & 0xFFF
) == convnet->channels) ? 1 : 0), __extension__ ({ if (((convnet
->mean_activity->type) & 0xFFF) == convnet->channels
) ; else __assert_fail ("CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels"
, "ccv_convnet.c", 1515, __extension__ __PRETTY_FUNCTION__); }
));
1516		assert(CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F)((void) sizeof ((((convnet->mean_activity->type) & 0xFF000
) == CCV_32F) ? 1 : 0), __extension__ ({ if (((convnet->mean_activity
->type) & 0xFF000) == CCV_32F) ; else __assert_fail ("CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F"
, "ccv_convnet.c", 1516, __extension__ __PRETTY_FUNCTION__); }
));
1517		sqlite3_bind_blob(convnet_params_insert_stmt, 1, convnet->mean_activity->data.f32, sizeof(float) * convnet->input.height * convnet->input.width * convnet->channels, SQLITE_STATIC((sqlite3_destructor_type)0));
1518		sqlite3_bind_int(convnet_params_insert_stmt, 2, convnet->input.height);
1519		sqlite3_bind_int(convnet_params_insert_stmt, 3, convnet->input.width);
1520		assert(SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt))((void) sizeof ((101 == sqlite3_step(convnet_params_insert_stmt
)) ? 1 : 0), __extension__ ({ if (101 == sqlite3_step(convnet_params_insert_stmt
)) ; else __assert_fail ("SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt)"
, "ccv_convnet.c", 1520, __extension__ __PRETTY_FUNCTION__); }
));
1521		sqlite3_reset(convnet_params_insert_stmt);
1522		sqlite3_clear_bindings(convnet_params_insert_stmt);
1523 
1524		sqlite3_finalize(layer_params_insert_stmt);
1525		sqlite3_finalize(layer_data_insert_stmt);
1526		sqlite3_finalize(convnet_params_insert_stmt);
1527		sqlite3_close(db);
1528	}
1529}
1530 
1531ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename)
1532{
1533	sqlite3* db = 0;
1534	if (SQLITE_OK0 == sqlite3_open(filename, &db))
1535	{
1536		ccv_convnet_t* convnet = 0;
1537		sqlite3_stmt* layer_params_stmt = 0;
1538		// load layer params
1539		const char layer_params_qs[] =
1540			"SELECT type, " // 1
1541			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6
1542			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13
1543			"output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18
1544		if (SQLITE_OK0 == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0))
1545		{
1546			ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0);
1547			while (sqlite3_step(layer_params_stmt) == SQLITE_ROW100)
1548			{
1549				ccv_convnet_layer_param_t layer_param;
1550				layer_param.type = sqlite3_column_int(layer_params_stmt, 0);
1551				layer_param.input.matrix.rows = sqlite3_column_int(layer_params_stmt, 1);
1552				layer_param.input.matrix.cols = sqlite3_column_int(layer_params_stmt, 2);
1553				layer_param.input.matrix.channels = sqlite3_column_int(layer_params_stmt, 3);
1554				layer_param.input.matrix.partition = sqlite3_column_int(layer_params_stmt, 4);
1555				layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 5);
1556				layer_param.bias = layer_param.glorot = 0; // this is irrelevant to read convnet
1557				switch (layer_param.type)
1558				{
1559					case CCV_CONVNET_CONVOLUTIONAL:
1560						layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 6);
1561						layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 7);
1562						layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 8);
1563						layer_param.output.convolutional.partition = sqlite3_column_int(layer_params_stmt, 9);
1564						layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 10);
1565						layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 11);
1566						layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 12);
1567						break;
1568					case CCV_CONVNET_FULL_CONNECT:
1569						layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10);
1570						layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17);
1571						break;
1572					case CCV_CONVNET_MAX_POOL:
1573					case CCV_CONVNET_AVERAGE_POOL:
1574						layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 11);
1575						layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 12);
1576						layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 13);
1577						break;
1578					case CCV_CONVNET_LOCAL_RESPONSE_NORM:
1579						layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 13);
1580						layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 14);
1581						layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 15);
1582						layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 16);
1583						break;
1584				}
1585				ccv_array_push(layer_params, &layer_param);
1586			}
1587			sqlite3_finalize(layer_params_stmt);
1588			sqlite3_stmt* convnet_params_input_stmt = 0;
1589			// load convnet params for input
1590			const char convnet_params_input_qs[] =
1591				"SELECT input_height, input_width FROM convnet_params WHERE convnet = 0;";
1592			ccv_size_t input = ccv_size(0, 0);
1593			if (SQLITE_OK0 == sqlite3_prepare_v2(db, convnet_params_input_qs, sizeof(convnet_params_input_qs), &convnet_params_input_stmt, 0))
1594			{
1595				if (sqlite3_step(convnet_params_input_stmt) == SQLITE_ROW100)
1596				{
1597					input.height = sqlite3_column_int(convnet_params_input_stmt, 0);
1598					input.width = sqlite3_column_int(convnet_params_input_stmt, 1);
1599				}
1600				sqlite3_finalize(convnet_params_input_stmt);
1601			}
1602			assert(input.height != 0 && input.width != 0)((void) sizeof ((input.height != 0 && input.width != 0
) ? 1 : 0), __extension__ ({ if (input.height != 0 &&
 input.width != 0) ; else __assert_fail ("input.height != 0 && input.width != 0"
, "ccv_convnet.c", 1602, __extension__ __PRETTY_FUNCTION__); }
));
1603			convnet = ccv_convnet_new(use_cwc_accel, input, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0)((void*)(((char*)((layer_params)->data)) + (size_t)(layer_params
)->rsize * (size_t)(0))), layer_params->rnum);
1604			ccv_array_free(layer_params);
1605			// load layer data
1606			sqlite3_stmt* layer_data_stmt = 0;
1607			const char layer_data_qs[] =
1608				"SELECT layer, weight, bias, half_precision FROM layer_data;";
1609			if (SQLITE_OK0 == sqlite3_prepare_v2(db, layer_data_qs, sizeof(layer_data_qs), &layer_data_stmt, 0))
1610			{
1611				while (sqlite3_step(layer_data_stmt) == SQLITE_ROW100)
1612				{
1613					ccv_convnet_layer_t* layer = convnet->layers + sqlite3_column_int(layer_data_stmt, 0);
1614					int half_precision = sqlite3_column_int(layer_data_stmt, 3);
1615					int wnum = sqlite3_column_bytes(layer_data_stmt, 1) / (half_precision ? sizeof(uint16_t) : sizeof(float));
1616					// if weights available, load weights
1617					if (wnum == layer->wnum)
1618					{
1619						const void* w = sqlite3_column_blob(layer_data_stmt, 1);
1620						if (half_precision)
1621						{
1622							float* f = (float*)ccmallocmalloc(sizeof(float) * layer->wnum);
1623							ccv_half_precision_to_float((uint16_t*)w, f, layer->wnum);
1624							w = f;
1625						}
1626						switch (layer->type)
1627						{
1628							case CCV_CONVNET_CONVOLUTIONAL:
1629								memcpy(layer->w, w, sizeof(float) * layer->wnum);
1630								break;
1631							case CCV_CONVNET_FULL_CONNECT:
1632								memcpy(layer->w, w, sizeof(float) * layer->wnum);
1633								break;
1634						}
1635						if (half_precision)
1636							ccfreefree((void*)w);
1637					}
1638					int bnum = sqlite3_column_bytes(layer_data_stmt, 2) / (half_precision ? sizeof(uint16_t) : sizeof(float));
1639					// if bias available, load bias
1640					if (bnum == (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count))
1641					{
1642						const void* bias = sqlite3_column_blob(layer_data_stmt, 2);
1643						if (half_precision)
1644						{
1645							float* f = (float*)ccmallocmalloc(sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
1646							ccv_half_precision_to_float((uint16_t*)bias, f, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
1647							bias = f;
1648						}
1649						switch (layer->type)
1650						{
1651							case CCV_CONVNET_CONVOLUTIONAL:
1652								memcpy(layer->bias, bias, sizeof(float) * layer->net.convolutional.count);
1653								break;
1654							case CCV_CONVNET_FULL_CONNECT:
1655								memcpy(layer->bias, bias, sizeof(float) * layer->net.full_connect.count);
1656								break;
1657						}
1658						if (half_precision)
1659							ccfreefree((void*)bias);
1660					}
1661				}
1662				sqlite3_finalize(layer_data_stmt);
1663			}
1664			sqlite3_stmt* convnet_params_mean_activity_stmt = 0;
1665			// load convnet params for mean activity
1666			const char convnet_params_mean_activity_qs[] =
1667				"SELECT mean_activity FROM convnet_params WHERE convnet = 0;";
1668			if (SQLITE_OK0 == sqlite3_prepare_v2(db, convnet_params_mean_activity_qs, sizeof(convnet_params_mean_activity_qs), &convnet_params_mean_activity_stmt, 0))
1669			{
1670				if (sqlite3_step(convnet_params_mean_activity_stmt) == SQLITE_ROW100)
1671				{
1672					int elems = sqlite3_column_bytes(convnet_params_mean_activity_stmt, 0) / sizeof(float);
1673					if (elems == convnet->input.height * convnet->input.width * convnet->channels)
1674						memcpy(convnet->mean_activity->data.f32, sqlite3_column_blob(convnet_params_mean_activity_stmt, 0), sizeof(float) * elems);
1675				}
1676				sqlite3_finalize(convnet_params_mean_activity_stmt);
1677			}
1678		}
1679		sqlite3_close(db);
1680		return convnet;
1681	}
1682	return 0;
1683}
1684 
1685void ccv_convnet_input_formation(ccv_size_t input, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
1686{
1687	if (a->rows > input.height && a->cols > input.width)
1688		ccv_resample(a, b, CCV_32F, (double)ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5))({ typeof (input.height) _a = (input.height); typeof ((int)(a
->rows * (float)input.height / a->cols + 0.5)) _b = ((int
)(a->rows * (float)input.height / a->cols + 0.5)); (_a >
 _b) ? _a : _b; }) / (double)a->rows, (double)ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5))({ typeof (input.width) _a = (input.width); typeof ((int)(a->
cols * (float)input.width / a->rows + 0.5)) _b = ((int)(a->
cols * (float)input.width / a->rows + 0.5)); (_a > _b) ?
 _a : _b; }) / (double)a->cols, CCV_INTER_AREA);
1689	else if (a->rows < input.height || a->cols < input.width)
1690		ccv_resample(a, b, CCV_32F, (double)ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5))({ typeof (input.height) _a = (input.height); typeof ((int)(a
->rows * (float)input.height / a->cols + 0.5)) _b = ((int
)(a->rows * (float)input.height / a->cols + 0.5)); (_a >
 _b) ? _a : _b; }) / (double)a->rows, (double)ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5))({ typeof (input.width) _a = (input.width); typeof ((int)(a->
cols * (float)input.width / a->rows + 0.5)) _b = ((int)(a->
cols * (float)input.width / a->rows + 0.5)); (_a > _b) ?
 _a : _b; }) / (double)a->cols, CCV_INTER_CUBIC);
1691	else
1692		ccv_shift(a, (ccv_matrix_t**)b, CCV_32F, 0, 0); // converting to 32f
1693}
1694 
1695void ccv_convnet_free(ccv_convnet_t* convnet)
1696{
1697	ccv_convnet_compact(convnet);
1698	int i;
1699	for (i = 0; i < convnet->count; i++)
1700		if (convnet->layers[i].w)
1701			ccfreefree(convnet->layers[i].w);
1702	if (convnet->mean_activity)
1703		ccv_matrix_free(convnet->mean_activity);
1704	ccfreefree(convnet);
1705}
1706 
1707#endif