/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/cpu_opt/_ccv_nnc_conv_cpu_opt.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #if defined(HAVE_SSE2) |
7 | | #include <xmmintrin.h> |
8 | | #elif defined(HAVE_NEON) |
9 | | #include <arm_neon.h> |
10 | | #endif |
11 | | #ifdef USE_OPENMP |
12 | | #include <omp.h> |
13 | | #endif |
14 | | #ifdef USE_DISPATCH |
15 | | #include <dispatch/dispatch.h> |
16 | | #endif |
17 | | #include "../_ccv_nnc_conv_cpu_opt.h" |
18 | | |
19 | | #ifdef HAVE_SSE2 |
20 | | inline static void _ccv_nnc_x4w_sse2(const float* const w, const int* const dim, float* x4w) |
21 | 1.14k | { |
22 | 1.14k | int jump_dim = dim[0] / 4; |
23 | 16.9k | parallel_for(k, jump_dim) { |
24 | 16.9k | int i, j; |
25 | 16.9k | float* x4wz = x4w + k * dim[3] * dim[2] * dim[1] * 4; |
26 | 16.9k | const float* wz[] = { |
27 | 16.9k | w + (k * 4) * dim[3] * dim[2] * dim[1], |
28 | 16.9k | w + (k * 4 + 1) * dim[3] * dim[2] * dim[1], |
29 | 16.9k | w + (k * 4 + 2) * dim[3] * dim[2] * dim[1], |
30 | 16.9k | w + (k * 4 + 3) * dim[3] * dim[2] * dim[1], |
31 | 16.9k | }; |
32 | 337k | for (i = 0; i < dim[2] * dim[1]; i++) |
33 | 320k | { |
34 | 28.8M | for (j = 0; j < dim[3]; j++) |
35 | 28.5M | { |
36 | 28.5M | x4wz[j * 4] = wz[0][j]; |
37 | 28.5M | x4wz[j * 4 + 1] = wz[1][j]; |
38 | 28.5M | x4wz[j * 4 + 2] = wz[2][j]; |
39 | 28.5M | x4wz[j * 4 + 3] = wz[3][j]; |
40 | 28.5M | } |
41 | 320k | x4wz += dim[3] * 4; |
42 | 320k | wz[0] += dim[3]; |
43 | 320k | wz[1] += dim[3]; |
44 | 320k | wz[2] += dim[3]; |
45 | 320k | wz[3] += dim[3]; |
46 | 320k | } |
47 | 16.9k | } parallel_endfor |
48 | 1.14k | } |
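
The repacking above interleaves the weights of four consecutive filters so that, for every input channel, the four per-filter weights sit in one contiguous group of four floats. A minimal standalone sketch of the same transform, assuming the [filters][height][width][channels] weight layout implied by the indexing above (names are illustrative, not from the source):

	inline static void repack_4filters(const float* w, int filters, int kh, int kw, int ch, float* packed)
	{
		int g, s, c, f;
		for (g = 0; g < filters / 4; g++) /* one group of 4 filters */
			for (s = 0; s < kh * kw; s++) /* flattened spatial position */
				for (c = 0; c < ch; c++) /* input channel */
					for (f = 0; f < 4; f++) /* filter within the group */
						packed[((g * kh * kw + s) * ch + c) * 4 + f] =
							w[((g * 4 + f) * kh * kw + s) * ch + c];
	}

This layout is what lets the main loop below broadcast a single input value and advance four output channels with one multiply-add per 4-float group.
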
49 | | |
50 | | static int _ccv_nnc_conv_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b) |
51 | 1.14k | { |
52 | 1.14k | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
53 | 1.14k | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
54 | 1.14k | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1; |
55 | 1.14k | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
56 | 1.14k | assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2); |
57 | 1.14k | const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1; |
58 | 1.14k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
59 | 1.14k | ccv_nnc_tensor_view_get_stride(a, astride); |
60 | 1.14k | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
61 | 1.14k | ccv_nnc_tensor_view_get_stride(b, bstride); |
62 | 1.14k | assert(w->info.dim[0] % 4 == 0); |
63 | 1.14k | float* x4w = 0; |
64 | 1.14k | ccmemalign((void **)&x4w, 64, sizeof(float) * w->info.dim[3] * w->info.dim[2] * w->info.dim[1] * w->info.dim[0]); |
65 | 1.14k | if (!x4w) |
66 | 0 | return CCV_NNC_EXEC_OOM; |
67 | 1.14k | _ccv_nnc_x4w_sse2(w->data.f32, w->info.dim, x4w); |
68 | 1.14k | int jump_dim = w->info.dim[0] / 4; |
69 | | // Do naive tail partition unroll |
70 | 1.14k | #define main_for(tail_block) \ |
71 | 16.9k | parallel_for(k, jump_dim) { \ |
72 | 16.9k | int c; \ |
73 | 16.9k | const float* ap = a->data.f32; \ |
74 | 16.9k | float* bp = b->data.f32 + k * 4; \ |
75 | | /* kernel weight for one dim. */ \ |
76 | 16.9k | const float* const x4wp = x4w + k * 4 * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; \ |
77 | 16.9k | float biasval[4] __attribute__ ((__aligned__(16))) = {}; \ |
78 | 16.9k | if (bias) \ |
79 | 16.9k | { \ |
80 | 16.9k | biasval[0] = bias->data.f32[k * 4]; \ |
81 | 16.9k | biasval[1] = bias->data.f32[k * 4 + 1]; \ |
82 | 16.9k | biasval[2] = bias->data.f32[k * 4 + 2]; \ |
83 | 16.9k | biasval[3] = bias->data.f32[k * 4 + 3]; \ |
84 | 16.9k | } \ |
85 | | /* This block is executed inside each for-loop iteration, therefore you can use it to set up temporary variables. */ \ |
86 | 16.9k | int i[CCV_NNC_MAX_DIM]; \ |
87 | 16.9k | int n[CCV_NNC_MAX_DIM]; \ |
88 | 16.9k | int m[CCV_NNC_MAX_DIM]; \ |
89 | 16.9k | int j[CCV_NNC_MAX_DIM]; \ |
90 | 416k | for (i[0] = 0; i[0] < bdim[0]; i[0]++) \ |
91 | 399k | { \ |
92 | 399k | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m); \ |
93 | 399k | const float* wpu = x4wp + n[0] * w->info.dim[2] * w->info.dim[3] * 4; \ |
94 | 23.6M | for (i[1] = 0; i[1] < bdim[1]; i[1]++) \ |
95 | 23.2M | { \ |
96 | 23.2M | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m); \ |
97 | 23.2M | __m128 v40 = _mm_load_ps(biasval); \ |
98 | 23.2M | __m128 v41 = _mm_setzero_ps(); \ |
99 | 23.2M | __m128 v42 = _mm_setzero_ps(); \ |
100 | 23.2M | __m128 v43 = _mm_setzero_ps(); \ |
101 | 23.2M | const float* wpz = wpu + n[1] * w->info.dim[3] * 4; \ |
102 | 23.2M | const float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * astride[2]; \ |
103 | 98.1M | for (j[0] = 0; j[0] < m[0]; j[0]++) \ |
104 | 74.9M | { \ |
105 | 326M | for (j[1] = 0; j[1] < m[1]; j[1]++) \ |
106 | 251M | { \ |
107 | 5.64G | for (c = 0; c < adim[2] - 3; c += 4) \ |
108 | 5.39G | { \ |
109 | 5.39G | __m128 apz4 = _mm_loadu_ps(apz + j[1] * astride[2] + c); \ |
110 | 5.39G | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
111 | 5.39G | __m128 w40 = _mm_loadu_ps(wpzu); \ |
112 | 5.39G | __m128 w41 = _mm_loadu_ps(wpzu + 4); \ |
113 | 5.39G | __m128 w42 = _mm_loadu_ps(wpzu + 8); \ |
114 | 5.39G | __m128 w43 = _mm_loadu_ps(wpzu + 12); \ |
115 | 5.39G | __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \ |
116 | 5.39G | __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \ |
117 | 5.39G | __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \ |
118 | 5.39G | __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \ |
119 | 5.39G | v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ |
120 | 5.39G | v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \ |
121 | 5.39G | v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42); \ |
122 | 5.39G | v43 = _mm_add_ps(_mm_mul_ps(w43, apz43), v43); \ |
123 | 5.39G | } \ |
124 | 251M | tail_block /* insert executions for tail partition */ \ |
125 | 251M | } \ |
126 | 74.9M | wpz += w->info.dim[2] * w->info.dim[3] * 4; \ |
127 | 74.9M | apz += astride[1]; \ |
128 | 74.9M | } \ |
129 | 23.2M | __m128 v4 = _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43)); \ |
130 | 23.2M | _mm_stream_ps(bp + i[1] * bstride[2], v4); \ |
131 | 23.2M | } \ |
132 | 399k | bp += bstride[1]; \ |
133 | 399k | ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); \ |
134 | 399k | } \ |
135 | 16.9k | } parallel_endfor |
136 | 1.14k | if (w->info.dim[3] % 4 == 0) |
137 | 799 | { |
138 | 799 | main_for(); |
139 | 799 | } else if (342 w->info.dim[3] % 4 == 3342 ) { // unroll the last for-loops |
140 | 334 | #define tail_block \ |
141 | 334 | __m128 apz40 = _mm_load1_ps(apz + j[1] * astride[2] + c); \ |
142 | 334 | __m128 apz41 = _mm_load1_ps(apz + j[1] * astride[2] + c + 1); \ |
143 | 334 | __m128 apz42 = _mm_load1_ps(apz + j[1] * astride[2] + c + 2); \ |
144 | 334 | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
145 | 334 | __m128 w40 = _mm_loadu_ps(wpzu); \ |
146 | 334 | __m128 w41 = _mm_loadu_ps(wpzu + 4); \ |
147 | 334 | __m128 w42 = _mm_loadu_ps(wpzu + 8); \ |
148 | 334 | v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ |
149 | 334 | v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \ |
150 | 334 | v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42); |
151 | 334 | main_for(tail_block); |
152 | 334 | #undef tail_block |
153 | 334 | } else if (8 w->info.dim[3] % 4 == 28 ) { // unroll the last for-loops |
154 | 8 | #define tail_block \ |
155 | 8 | __m128 apz40 = _mm_load1_ps(apz + j[1] * astride[2] + c); \ |
156 | 8 | __m128 apz41 = _mm_load1_ps(apz + j[1] * astride[2] + c + 1); \ |
157 | 8 | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
158 | 8 | __m128 w40 = _mm_loadu_ps(wpzu); \ |
159 | 8 | __m128 w41 = _mm_loadu_ps(wpzu + 4); \ |
160 | 8 | v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ |
161 | 8 | v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); |
162 | 8 | main_for(tail_block); |
163 | 8 | #undef tail_block |
164 | 8 | } else { |
165 | 0 | #define tail_block \ |
166 | 0 | __m128 apz4 = _mm_load1_ps(apz + j[1] * astride[2] + c); \ |
167 | 0 | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
168 | 0 | __m128 w4 = _mm_loadu_ps(wpzu); \ |
169 | 0 | v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40); |
170 | 0 | main_for(tail_block); |
171 | 0 | #undef tail_block |
172 | 0 | } |
173 | 1.14k | #undef main_for |
174 | 1.14k | ccfree(x4w); |
175 | 1.14k | return CCV_NNC_EXEC_SUCCESS; |
176 | 1.14k | } |
177 | | #endif |
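
For reference, the arithmetic both SIMD kernels implement is ordinary NHWC cross-correlation with a per-output-channel bias. A scalar sketch, ignoring the border clipping that SET_BORDER_OFFSET_SIZE_FOR performs in the real code and using illustrative parameter names only:

	static void conv_forw_ref(const float* a, int aw, int ch,
		const float* w, int filters, int kh, int kw,
		const float* bias, int sy, int sx,
		float* b, int bh, int bw)
	{
		int y, x, k, dy, dx, c;
		for (y = 0; y < bh; y++)
			for (x = 0; x < bw; x++)
				for (k = 0; k < filters; k++)
				{
					float v = bias ? bias[k] : 0;
					for (dy = 0; dy < kh; dy++)
						for (dx = 0; dx < kw; dx++)
							for (c = 0; c < ch; c++)
								v += w[((k * kh + dy) * kw + dx) * ch + c] *
									a[((y * sy + dy) * aw + (x * sx + dx)) * ch + c];
					b[(y * bw + x) * filters + k] = v;
				}
	}

The SSE2 path above (and the NEON path below) differ from this only in blocking: each parallel_for iteration produces output channels 4k..4k+3 at once, the inner c loop consumes four input channels per step, and the tail_block variants cover the remaining dim[3] % 4 channels.
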
178 | | |
179 | | #ifdef HAVE_NEON |
180 | | inline static void _ccv_nnc_x4w_neon(const float* const w, const int* const dim, float* x4w) |
181 | | { |
182 | | int jump_dim = dim[0] / 4; |
183 | | parallel_for(k, jump_dim) { |
184 | | int i, j; |
185 | | float* x4wz = x4w + k * dim[3] * dim[2] * dim[1] * 4; |
186 | | const float* wz[] = { |
187 | | w + (k * 4) * dim[3] * dim[2] * dim[1], |
188 | | w + (k * 4 + 1) * dim[3] * dim[2] * dim[1], |
189 | | w + (k * 4 + 2) * dim[3] * dim[2] * dim[1], |
190 | | w + (k * 4 + 3) * dim[3] * dim[2] * dim[1], |
191 | | }; |
192 | | for (i = 0; i < dim[2] * dim[1]; i++) |
193 | | { |
194 | | for (j = 0; j < dim[3]; j++) |
195 | | { |
196 | | x4wz[j * 4] = wz[0][j]; |
197 | | x4wz[j * 4 + 1] = wz[1][j]; |
198 | | x4wz[j * 4 + 2] = wz[2][j]; |
199 | | x4wz[j * 4 + 3] = wz[3][j]; |
200 | | } |
201 | | x4wz += dim[3] * 4; |
202 | | wz[0] += dim[3]; |
203 | | wz[1] += dim[3]; |
204 | | wz[2] += dim[3]; |
205 | | wz[3] += dim[3]; |
206 | | } |
207 | | } parallel_endfor |
208 | | } |
209 | | |
210 | | static int _ccv_nnc_conv_forw_neon(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b) |
211 | | { |
212 | | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
213 | | assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2); |
214 | | const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1; |
215 | | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
216 | | assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2); |
217 | | const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1; |
218 | | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
219 | | ccv_nnc_tensor_view_get_stride(a, astride); |
220 | | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
221 | | ccv_nnc_tensor_view_get_stride(b, bstride); |
222 | | assert(w->info.dim[0] % 4 == 0); |
223 | | float* x4w = 0; |
224 | | ccmemalign((void **)&x4w, 64, sizeof(float) * w->info.dim[3] * w->info.dim[2] * w->info.dim[1] * w->info.dim[0]); |
225 | | if (!x4w) |
226 | | return CCV_NNC_EXEC_OOM; |
227 | | _ccv_nnc_x4w_neon(w->data.f32, w->info.dim, x4w); |
228 | | int jump_dim = w->info.dim[0] / 4; |
229 | | #define main_for(tail_block) \ |
230 | | parallel_for(k, jump_dim) { \ |
231 | | int c; \ |
232 | | const float* ap = a->data.f32; \ |
233 | | float* bp = b->data.f32 + k * 4; \ |
234 | | /* kernel weight for one dim. */ \ |
235 | | const float* const x4wp = x4w + k * 4 * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; \ |
236 | | float biasval[4] __attribute__ ((__aligned__(16))) = {}; \ |
237 | | if (bias) \ |
238 | | { \ |
239 | | biasval[0] = bias->data.f32[k * 4]; \ |
240 | | biasval[1] = bias->data.f32[k * 4 + 1]; \ |
241 | | biasval[2] = bias->data.f32[k * 4 + 2]; \ |
242 | | biasval[3] = bias->data.f32[k * 4 + 3]; \ |
243 | | } \ |
244 | | /* This block is executed inside each for-loop iteration, therefore you can use it to set up temporary variables. */ \ |
245 | | int i[CCV_NNC_MAX_DIM]; \ |
246 | | int n[CCV_NNC_MAX_DIM]; \ |
247 | | int m[CCV_NNC_MAX_DIM]; \ |
248 | | int j[CCV_NNC_MAX_DIM]; \ |
249 | | for (i[0] = 0; i[0] < bdim[0]; i[0]++) \ |
250 | | { \ |
251 | | SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m); \ |
252 | | const float* wpu = x4wp + n[0] * w->info.dim[2] * w->info.dim[3] * 4; \ |
253 | | for (i[1] = 0; i[1] < bdim[1]; i[1]++) \ |
254 | | { \ |
255 | | SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m); \ |
256 | | float32x4_t v40 = vld1q_f32(biasval); \ |
257 | | float32x4_t v41 = vmovq_n_f32(0); \ |
258 | | float32x4_t v42 = vmovq_n_f32(0); \ |
259 | | float32x4_t v43 = vmovq_n_f32(0); \ |
260 | | const float* wpz = wpu + n[1] * w->info.dim[3] * 4; \ |
261 | | const float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * astride[2]; \ |
262 | | for (j[0] = 0; j[0] < m[0]; j[0]++) \ |
263 | | { \ |
264 | | for (j[1] = 0; j[1] < m[1]; j[1]++) \ |
265 | | { \ |
266 | | for (c = 0; c < adim[2] - 3; c += 4) \ |
267 | | { \ |
268 | | float32x2x2_t apz4 = vld2_f32(apz + j[1] * astride[2] + c); \ |
269 | | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
270 | | float32x4_t apz40 = vdupq_lane_f32(apz4.val[0], 0); \ |
271 | | float32x4_t apz41 = vdupq_lane_f32(apz4.val[1], 0); \ |
272 | | float32x4_t apz42 = vdupq_lane_f32(apz4.val[0], 1); \ |
273 | | float32x4_t apz43 = vdupq_lane_f32(apz4.val[1], 1); \ |
274 | | float32x4_t w40 = vld1q_f32(wpzu); \ |
275 | | float32x4_t w41 = vld1q_f32(wpzu + 4); \ |
276 | | float32x4_t w42 = vld1q_f32(wpzu + 8); \ |
277 | | float32x4_t w43 = vld1q_f32(wpzu + 12); \ |
278 | | v40 = vmlaq_f32(v40, w40, apz40); \ |
279 | | v41 = vmlaq_f32(v41, w41, apz41); \ |
280 | | v42 = vmlaq_f32(v42, w42, apz42); \ |
281 | | v43 = vmlaq_f32(v43, w43, apz43); \ |
282 | | } \ |
283 | | tail_block /* insert executions for tail partition */ \ |
284 | | } \ |
285 | | wpz += w->info.dim[2] * w->info.dim[3] * 4; \ |
286 | | apz += astride[1]; \ |
287 | | } \ |
288 | | v40 = vaddq_f32(v40, v41); \ |
289 | | v42 = vaddq_f32(v42, v43); \ |
290 | | vst1q_f32(bp + i[1] * bstride[2], vaddq_f32(v40, v42)); \ |
291 | | } \ |
292 | | bp += bstride[1]; \ |
293 | | ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); \ |
294 | | } \ |
295 | | } parallel_endfor |
296 | | if (w->info.dim[3] % 4 == 0) |
297 | | { |
298 | | main_for(); |
299 | | } else if (w->info.dim[3] % 4 == 3) { // unroll the last for-loops |
300 | | #define tail_block \ |
301 | | float32x2_t apz4 = vld1_f32(apz + j[1] * astride[2] + c); \ |
302 | | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
303 | | float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \ |
304 | | float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \ |
305 | | float32x4_t apz42 = vld1q_dup_f32(apz + j[1] * astride[2] + c + 2); \ |
306 | | float32x4_t w40 = vld1q_f32(wpzu); \ |
307 | | float32x4_t w41 = vld1q_f32(wpzu + 4); \ |
308 | | float32x4_t w42 = vld1q_f32(wpzu + 8); \ |
309 | | v40 = vmlaq_f32(v40, w40, apz40); \ |
310 | | v41 = vmlaq_f32(v41, w41, apz41); \ |
311 | | v42 = vmlaq_f32(v42, w42, apz42); |
312 | | main_for(tail_block); |
313 | | #undef tail_block |
314 | | } else if (w->info.dim[3] % 4 == 2) { // unroll the last for-loops |
315 | | #define tail_block \ |
316 | | float32x2_t apz4 = vld1_f32(apz + j[1] * astride[2] + c); \ |
317 | | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
318 | | float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \ |
319 | | float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \ |
320 | | float32x4_t w40 = vld1q_f32(wpzu); \ |
321 | | float32x4_t w41 = vld1q_f32(wpzu + 4); \ |
322 | | v40 = vmlaq_f32(v40, w40, apz40); \ |
323 | | v41 = vmlaq_f32(v41, w41, apz41); |
324 | | main_for(tail_block); |
325 | | #undef tail_block |
326 | | } else { // unroll the last for-loops |
327 | | #define tail_block \ |
328 | | float32x4_t apz4 = vld1q_dup_f32(apz + j[1] * astride[2] + c); \ |
329 | | const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \ |
330 | | float32x4_t w4 = vld1q_f32(wpzu); \ |
331 | | v40 = vmlaq_f32(v40, w4, apz4); |
332 | | main_for(tail_block); |
333 | | #undef tail_block |
334 | | } |
335 | | #undef main_for |
336 | | ccfree(x4w); |
337 | | return CCV_NNC_EXEC_SUCCESS; |
338 | | } |
339 | | #endif |
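
The NEON kernel mirrors the SSE2 one; the only non-obvious step is how vld2_f32 de-interleaves four consecutive floats, which is what makes the lane picks above recover channels c..c+3 in order. A small sketch of just that step (helper name is illustrative, not from the source):

	#include <arm_neon.h>

	/* vld2_f32 loads {x[0], x[1], x[2], x[3]} de-interleaved: val[0] = {x[0], x[2]},
	 * val[1] = {x[1], x[3]}, so the duplicated lanes below broadcast x[0]..x[3],
	 * matching the per-channel broadcasts the SSE2 path gets from _mm_shuffle_ps. */
	inline static void broadcast4(const float* x, float32x4_t out[4])
	{
		float32x2x2_t p = vld2_f32(x);
		out[0] = vdupq_lane_f32(p.val[0], 0); /* all lanes = x[0] */
		out[1] = vdupq_lane_f32(p.val[1], 0); /* all lanes = x[1] */
		out[2] = vdupq_lane_f32(p.val[0], 1); /* all lanes = x[2] */
		out[3] = vdupq_lane_f32(p.val[1], 1); /* all lanes = x[3] */
	}

vmlaq_f32(v, w, x) then plays the role of _mm_add_ps(_mm_mul_ps(w, x), v) in the SSE2 path.
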
340 | | |
341 | | int _ccv_nnc_conv_forw_cpu_opt(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b) |
342 | 1.14k | { |
343 | 1.14k | #if defined(HAVE_SSE2) |
344 | 1.14k | if (w->info.dim[0] % 4 == 0) |
345 | 1.14k | return _ccv_nnc_conv_forw_sse2(a, w, bias, hint, b); |
346 | | #elif defined(HAVE_NEON) |
347 | | if (w->info.dim[0] % 4 == 0) |
348 | | return _ccv_nnc_conv_forw_neon(a, w, bias, hint, b); |
349 | | #endif |
350 | 0 | return CCV_NNC_EXEC_INVALID; |
351 | 1.14k | } |
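
A hypothetical caller sketch (not from this file) of the dispatch contract: the optimized path only handles weight tensors whose output-channel count (w->info.dim[0]) is a multiple of 4, and only when built with SSE2 or NEON; anything else returns CCV_NNC_EXEC_INVALID, so the caller is expected to fall back to a generic kernel. generic_conv_forw below is a placeholder name:

	int status = _ccv_nnc_conv_forw_cpu_opt(a, w, bias, hint, b);
	if (status == CCV_NNC_EXEC_INVALID)
		status = generic_conv_forw(a, w, bias, hint, b); /* hypothetical fallback path */
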