Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/convolution/cpu_opt/_ccv_nnc_conv_cpu_opt.c
Line | Count  | Source
   1 |        | #include "ccv.h"
   2 |        | #include "ccv_internal.h"
   3 |        | #include "nnc/ccv_nnc.h"
   4 |        | #include "nnc/ccv_nnc_easy.h"
   5 |        | #include "nnc/ccv_nnc_internal.h"
   6 |        | #if defined(HAVE_SSE2)
   7 |        | #include <xmmintrin.h>
   8 |        | #elif defined(HAVE_NEON)
   9 |        | #include <arm_neon.h>
  10 |        | #endif
  11 |        | #ifdef USE_OPENMP
  12 |        | #include <omp.h>
  13 |        | #endif
  14 |        | #ifdef USE_DISPATCH
  15 |        | #include <dispatch/dispatch.h>
  16 |        | #endif
  17 |        | #include "../_ccv_nnc_conv_cpu_opt.h"
  18 |        |
  19 |        | #ifdef HAVE_SSE2
  20 |        | inline static void _ccv_nnc_x4w_sse2(const float* const w, const int* const dim, float* x4w)
  21 |  1.13k | {
  22 |  1.13k |   int jump_dim = dim[0] / 4;
  23 |  1.13k |   parallel_for(k, jump_dim) {
  24 |      0 |     int i, j;
  25 |      0 |     float* x4wz = x4w + k * dim[3] * dim[2] * dim[1] * 4;
  26 |      0 |     const float* wz[] = {
  27 |      0 |       w + (k * 4) * dim[3] * dim[2] * dim[1],
  28 |      0 |       w + (k * 4 + 1) * dim[3] * dim[2] * dim[1],
  29 |      0 |       w + (k * 4 + 2) * dim[3] * dim[2] * dim[1],
  30 |      0 |       w + (k * 4 + 3) * dim[3] * dim[2] * dim[1],
  31 |      0 |     };
  32 |   223k |     for (i = 0; i < dim[2] * dim[1]; i++)
  33 |   223k |     {
  34 |  2.16M |       for (j = 0; j < dim[3]; j++)
  35 |  1.93M |       {
  36 |  1.93M |         x4wz[j * 4] = wz[0][j];
  37 |  1.93M |         x4wz[j * 4 + 1] = wz[1][j];
  38 |  1.93M |         x4wz[j * 4 + 2] = wz[2][j];
  39 |  1.93M |         x4wz[j * 4 + 3] = wz[3][j];
  40 |  1.93M |       }
  41 |   223k |       x4wz += dim[3] * 4;
  42 |   223k |       wz[0] += dim[3];
  43 |   223k |       wz[1] += dim[3];
  44 |   223k |       wz[2] += dim[3];
  45 |   223k |       wz[3] += dim[3];
  46 |   223k |     }
  47 |  1.13k |   } parallel_endfor
  48 |  1.13k | }
  49 |        |
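The packing above interleaves each group of 4 output filters so that, for a given kernel tap and input channel, the 4 weights sit next to each other in memory and can be consumed by a single 128-bit load in the convolution loops below. A minimal scalar sketch of the same transform, assuming the weight layout is w[count][height][width][channels] (dim[0] filters, dim[1] x dim[2] kernel taps, dim[3] input channels); the function name is illustrative only, not part of the covered file:

    /* Scalar sketch of the x4w packing above; illustrative only. */
    static void x4w_pack_sketch(const float* w, const int* dim, float* x4w)
    {
      int k, s, j, f;
      for (k = 0; k < dim[0] / 4; k++) /* one group of 4 output filters */
        for (s = 0; s < dim[1] * dim[2]; s++) /* every spatial tap of the kernel */
          for (j = 0; j < dim[3]; j++) /* every input channel */
            for (f = 0; f < 4; f++) /* interleave the 4 filters */
              x4w[((k * dim[1] * dim[2] + s) * dim[3] + j) * 4 + f] =
                w[((k * 4 + f) * dim[1] * dim[2] + s) * dim[3] + j];
    }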
  50 |        | static int _ccv_nnc_conv_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
  51 |  1.13k | {
  52 |  1.13k |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
  53 |  1.13k |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
  54 |  1.13k |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
  55 |  1.13k |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
  56 |  1.13k |   assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
  57 |  1.13k |   const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
  58 |  1.13k |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;
  59 |  1.13k |   const int* binc = CCV_IS_TENSOR_VIEW(b) ? ((b_nd == CCV_NNC_MAX_DIM + 1) ? b->inc : b->inc + 1) : bdim;
  60 |  1.13k |   assert(w->info.dim[0] % 4 == 0);
  61 |  1.13k |   float* x4w = 0;
  62 |  1.13k |   ccmemalign((void **)&x4w, 16, sizeof(float) * w->info.dim[3] * w->info.dim[2] * w->info.dim[1] * w->info.dim[0]);
  63 |  1.13k |   if (!x4w)
  64 |      0 |     return CCV_NNC_EXEC_OOM;
  65 |  1.13k |   _ccv_nnc_x4w_sse2(w->data.f32, w->info.dim, x4w);
  66 |  1.13k |   int jump_dim = w->info.dim[0] / 4;
  67 |  1.13k |   // Do naive tail partition unroll
  68 |  1.13k | #define main_for(tail_block) \
  69 |  1.13k |   parallel_for(k, jump_dim) { \
  70 |      0 |     int c; \
  71 |      0 |     const float* ap = a->data.f32; \
  72 |      0 |     float* bp = b->data.f32 + k * 4; \
  73 |      0 |     /* kernel weight for one dim. */ \
  74 |      0 |     const float* const x4wp = x4w + k * 4 * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; \
  75 |      0 |     float biasval[4] __attribute__ ((__aligned__(16))) = {}; \
  76 |      0 |     if (bias) \
  77 |  8.47k |     { \
  78 |  8.47k |       biasval[0] = bias->data.f32[k * 4]; \
  79 |  8.47k |       biasval[1] = bias->data.f32[k * 4 + 1]; \
  80 |  8.47k |       biasval[2] = bias->data.f32[k * 4 + 2]; \
  81 |  8.47k |       biasval[3] = bias->data.f32[k * 4 + 3]; \
  82 |  8.47k |     } \
  83 |      0 |     /* This block will be executed in each for-loop, therefore, you can use it to generate some temporary variables. */ \
  84 |      0 |     int i[CCV_NNC_MAX_DIM]; \
  85 |      0 |     int n[CCV_NNC_MAX_DIM]; \
  86 |      0 |     int m[CCV_NNC_MAX_DIM]; \
  87 |      0 |     int j[CCV_NNC_MAX_DIM]; \
  88 |   390k |     for (i[0] = 0; i[0] < bdim[0]; i[0]++) \
  89 |   390k |     { \
  90 |   392k |       SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m); \
  91 |   390k |       const float* wpu = x4wp + n[0] * w->info.dim[2] * w->info.dim[3] * 4; \
  92 |  22.0M |       for (i[1] = 0; i[1] < bdim[1]; i[1]++) \
  93 |  21.6M |       { \
  94 |  21.6M |         SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m); \
  95 |  21.6M |         __m128 v40 = _mm_load_ps(biasval); \
  96 |  21.6M |         __m128 v41 = _mm_setzero_ps(); \
  97 |  21.6M |         __m128 v42 = _mm_setzero_ps(); \
  98 |  21.6M |         __m128 v43 = _mm_setzero_ps(); \
  99 |  21.6M |         const float* wpz = wpu + n[1] * w->info.dim[3] * 4; \
 100 |  21.6M |         const float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[2]; \
 101 |  84.8M |         for (j[0] = 0; j[0] < m[0]; j[0]++) \
 102 |  63.2M |         { \
 103 |   232M |           for (j[1] = 0; j[1] < m[1]; j[1]++) \
 104 |   169M |           { \
 105 |   888M |             for (c = 0; c < adim[2] - 3; c += 4) \
 106 |   719M |             { \
 107 |   719M |               __m128 apz4 = _mm_loadu_ps(apz + j[1] * ainc[2] + c); \
 108 |   719M |               const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 109 |   719M |               __m128 w40 = _mm_loadu_ps(wpzu); \
 110 |   719M |               __m128 w41 = _mm_loadu_ps(wpzu + 4); \
 111 |   719M |               __m128 w42 = _mm_loadu_ps(wpzu + 8); \
 112 |   719M |               __m128 w43 = _mm_loadu_ps(wpzu + 12); \
 113 |   719M |               __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \
 114 |   719M |               __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \
 115 |   719M |               __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \
 116 |   719M |               __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \
 117 |   719M |               v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
 118 |   719M |               v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
 119 |   719M |               v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
 120 |   719M |               v43 = _mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
 121 |   719M |             } \
 122 |   169M |             tail_block /* insert executions for tail partition */ \
 123 |   169M |           } \
 124 |  63.2M |           wpz += w->info.dim[2] * w->info.dim[3] * 4; \
 125 |  63.2M |           apz += ainc[1] * ainc[2]; \
 126 |  63.2M |         } \
 127 |  21.6M |         __m128 v4 = _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43)); \
 128 |  21.6M |         _mm_stream_ps(bp + i[1] * binc[2], v4); \
 129 |  21.6M |       } \
 130 |   390k |       bp += binc[1] * binc[2]; \
 131 |   390k |       ap += ainc[1] * ainc[2] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); \
 132 |   390k |     } \
 133 |  1.13k |   } parallel_endfor
Unexecuted instantiation: _ccv_nnc_conv_cpu_opt.c:.omp_outlined._debug__
Unexecuted instantiation: _ccv_nnc_conv_cpu_opt.c:.omp_outlined._debug__.4
Unexecuted instantiation: _ccv_nnc_conv_cpu_opt.c:.omp_outlined._debug__.6
Unexecuted instantiation: _ccv_nnc_conv_cpu_opt.c:.omp_outlined._debug__.8
 134 |  1.13k |   if (w->info.dim[3] % 4 == 0)
 135 |    796 |   {
 136 |    796 |     main_for();
 137 |    796 |   } else if (w->info.dim[3] % 4 == 3) { // unroll the last for-loops
 138 |    334 | #define tail_block \
 139 |    334 |     __m128 apz40 = _mm_load1_ps(apz + j[1] * ainc[2] + c); \
 140 |    334 |     __m128 apz41 = _mm_load1_ps(apz + j[1] * ainc[2] + c + 1); \
 141 |    334 |     __m128 apz42 = _mm_load1_ps(apz + j[1] * ainc[2] + c + 2); \
 142 |    334 |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 143 |    334 |     __m128 w40 = _mm_loadu_ps(wpzu); \
 144 |    334 |     __m128 w41 = _mm_loadu_ps(wpzu + 4); \
 145 |    334 |     __m128 w42 = _mm_loadu_ps(wpzu + 8); \
 146 |    334 |     v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
 147 |    334 |     v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
 148 |    334 |     v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
 149 |    334 |     main_for(tail_block);
 150 |    334 | #undef tail_block
 151 |    334 |   } else if (w->info.dim[3] % 4 == 2) { // unroll the last for-loops
 152 |      0 | #define tail_block \
 153 |      0 |     __m128 apz40 = _mm_load1_ps(apz + j[1] * ainc[2] + c); \
 154 |      0 |     __m128 apz41 = _mm_load1_ps(apz + j[1] * ainc[2] + c + 1); \
 155 |      0 |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 156 |      0 |     __m128 w40 = _mm_loadu_ps(wpzu); \
 157 |      0 |     __m128 w41 = _mm_loadu_ps(wpzu + 4); \
 158 |      0 |     v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
 159 |      0 |     v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
 160 |      0 |     main_for(tail_block);
 161 |      0 | #undef tail_block
 162 |      0 |   } else {
 163 |      0 | #define tail_block \
 164 |      0 |     __m128 apz4 = _mm_load1_ps(apz + j[1] * ainc[2] + c); \
 165 |      0 |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 166 |      0 |     __m128 w4 = _mm_loadu_ps(wpzu); \
 167 |      0 |     v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
 168 |      0 |     main_for(tail_block);
 169 |      0 | #undef tail_block
 170 |      0 |   }
 171 |  1.13k | #undef main_for
 172 |  1.13k |   ccfree(x4w);
 173 |  1.13k |   return CCV_NNC_EXEC_SUCCESS;
 174 |  1.13k | }
 175 |        | #endif
 176 |        |
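For reference, each main_for instantiation above computes, for its group of 4 output channels, a plain 2-D convolution over the kernel window clipped to the input border (SET_BORDER_OFFSET_SIZE_FOR produces the clipped offset n[] and size m[]), accumulating 4 input channels per step and leaving the remaining 1-3 channels to the tail_block variants. A minimal scalar sketch of the same arithmetic, assuming contiguous NHWC layouts and un-packed weights; names and the flat-array indexing are illustrative, not the library's API:

    /* Scalar reference for the computation the SSE2 path above performs.
       Purely illustrative: assumes contiguous NHWC input/output and weights
       stored as w[k][ky][kx][c] (before the x4w interleave). */
    static void conv_forw_scalar_sketch(const float* a, int ah, int aw, int ac,
      const float* w, int kn, int kh, int kw, const float* bias,
      int stride_y, int stride_x, int border_y, int border_x,
      float* b, int bh, int bw)
    {
      int y, x, k, ky, kx, c;
      for (y = 0; y < bh; y++)
        for (x = 0; x < bw; x++)
          for (k = 0; k < kn; k++)
          {
            float acc = bias ? bias[k] : 0;
            for (ky = 0; ky < kh; ky++)
              for (kx = 0; kx < kw; kx++)
              {
                const int iy = y * stride_y - border_y + ky;
                const int ix = x * stride_x - border_x + kx;
                if (iy < 0 || iy >= ah || ix < 0 || ix >= aw)
                  continue; /* outside the border: no contribution */
                for (c = 0; c < ac; c++)
                  acc += w[((k * kh + ky) * kw + kx) * ac + c] * a[(iy * aw + ix) * ac + c];
              }
            b[(y * bw + x) * kn + k] = acc;
          }
    }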
 177 |        | #ifdef HAVE_NEON
 178 |        | inline static void _ccv_nnc_x4w_neon(const float* const w, const int* const dim, float* x4w)
 179 |        | {
 180 |        |   int jump_dim = dim[0] / 4;
 181 |        |   parallel_for(k, jump_dim) {
 182 |        |     int i, j;
 183 |        |     float* x4wz = x4w + k * dim[3] * dim[2] * dim[1] * 4;
 184 |        |     const float* wz[] = {
 185 |        |       w + (k * 4) * dim[3] * dim[2] * dim[1],
 186 |        |       w + (k * 4 + 1) * dim[3] * dim[2] * dim[1],
 187 |        |       w + (k * 4 + 2) * dim[3] * dim[2] * dim[1],
 188 |        |       w + (k * 4 + 3) * dim[3] * dim[2] * dim[1],
 189 |        |     };
 190 |        |     for (i = 0; i < dim[2] * dim[1]; i++)
 191 |        |     {
 192 |        |       for (j = 0; j < dim[3]; j++)
 193 |        |       {
 194 |        |         x4wz[j * 4] = wz[0][j];
 195 |        |         x4wz[j * 4 + 1] = wz[1][j];
 196 |        |         x4wz[j * 4 + 2] = wz[2][j];
 197 |        |         x4wz[j * 4 + 3] = wz[3][j];
 198 |        |       }
 199 |        |       x4wz += dim[3] * 4;
 200 |        |       wz[0] += dim[3];
 201 |        |       wz[1] += dim[3];
 202 |        |       wz[2] += dim[3];
 203 |        |       wz[3] += dim[3];
 204 |        |     }
 205 |        |   } parallel_endfor
 206 |        | }
 207 |        |
 208 |        | static int _ccv_nnc_conv_forw_neon(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
 209 |        | {
 210 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 211 |        |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
 212 |        |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
 213 |        |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
 214 |        |   assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
 215 |        |   const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
 216 |        |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;
 217 |        |   const int* binc = CCV_IS_TENSOR_VIEW(b) ? ((b_nd == CCV_NNC_MAX_DIM + 1) ? b->inc : b->inc + 1) : bdim;
 218 |        |   assert(w->info.dim[0] % 4 == 0);
 219 |        |   float* x4w = 0;
 220 |        |   ccmemalign((void **)&x4w, 16, sizeof(float) * w->info.dim[3] * w->info.dim[2] * w->info.dim[1] * w->info.dim[0]);
 221 |        |   if (!x4w)
 222 |        |     return CCV_NNC_EXEC_OOM;
 223 |        |   _ccv_nnc_x4w_neon(w->data.f32, w->info.dim, x4w);
 224 |        |   int jump_dim = w->info.dim[0] / 4;
 225 |        | #define main_for(tail_block) \
 226 |        |   parallel_for(k, jump_dim) { \
 227 |        |     int c; \
 228 |        |     const float* ap = a->data.f32; \
 229 |        |     float* bp = b->data.f32 + k * 4; \
 230 |        |     /* kernel weight for one dim. */ \
 231 |        |     const float* const x4wp = x4w + k * 4 * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; \
 232 |        |     float biasval[4] __attribute__ ((__aligned__(16))) = {}; \
 233 |        |     if (bias) \
 234 |        |     { \
 235 |        |       biasval[0] = bias->data.f32[k * 4]; \
 236 |        |       biasval[1] = bias->data.f32[k * 4 + 1]; \
 237 |        |       biasval[2] = bias->data.f32[k * 4 + 2]; \
 238 |        |       biasval[3] = bias->data.f32[k * 4 + 3]; \
 239 |        |     } \
 240 |        |     /* This block will be executed in each for-loop, therefore, you can use it to generate some temporary variables. */ \
 241 |        |     int i[CCV_NNC_MAX_DIM]; \
 242 |        |     int n[CCV_NNC_MAX_DIM]; \
 243 |        |     int m[CCV_NNC_MAX_DIM]; \
 244 |        |     int j[CCV_NNC_MAX_DIM]; \
 245 |        |     for (i[0] = 0; i[0] < bdim[0]; i[0]++) \
 246 |        |     { \
 247 |        |       SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m); \
 248 |        |       const float* wpu = x4wp + n[0] * w->info.dim[2] * w->info.dim[3] * 4; \
 249 |        |       for (i[1] = 0; i[1] < bdim[1]; i[1]++) \
 250 |        |       { \
 251 |        |         SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m); \
 252 |        |         float32x4_t v40 = vld1q_f32(biasval); \
 253 |        |         float32x4_t v41 = vmovq_n_f32(0); \
 254 |        |         float32x4_t v42 = vmovq_n_f32(0); \
 255 |        |         float32x4_t v43 = vmovq_n_f32(0); \
 256 |        |         const float* wpz = wpu + n[1] * w->info.dim[3] * 4; \
 257 |        |         const float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[2]; \
 258 |        |         for (j[0] = 0; j[0] < m[0]; j[0]++) \
 259 |        |         { \
 260 |        |           for (j[1] = 0; j[1] < m[1]; j[1]++) \
 261 |        |           { \
 262 |        |             for (c = 0; c < adim[2] - 3; c += 4) \
 263 |        |             { \
 264 |        |               float32x2x2_t apz4 = vld2_f32(apz + j[1] * ainc[2] + c); \
 265 |        |               const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 266 |        |               float32x4_t apz40 = vdupq_lane_f32(apz4.val[0], 0); \
 267 |        |               float32x4_t apz41 = vdupq_lane_f32(apz4.val[1], 0); \
 268 |        |               float32x4_t apz42 = vdupq_lane_f32(apz4.val[0], 1); \
 269 |        |               float32x4_t apz43 = vdupq_lane_f32(apz4.val[1], 1); \
 270 |        |               float32x4_t w40 = vld1q_f32(wpzu); \
 271 |        |               float32x4_t w41 = vld1q_f32(wpzu + 4); \
 272 |        |               float32x4_t w42 = vld1q_f32(wpzu + 8); \
 273 |        |               float32x4_t w43 = vld1q_f32(wpzu + 12); \
 274 |        |               v40 = vmlaq_f32(v40, w40, apz40); \
 275 |        |               v41 = vmlaq_f32(v41, w41, apz41); \
 276 |        |               v42 = vmlaq_f32(v42, w42, apz42); \
 277 |        |               v43 = vmlaq_f32(v43, w43, apz43); \
 278 |        |             } \
 279 |        |             tail_block /* insert executions for tail partition */ \
 280 |        |           } \
 281 |        |           wpz += w->info.dim[2] * w->info.dim[3] * 4; \
 282 |        |           apz += ainc[1] * ainc[2]; \
 283 |        |         } \
 284 |        |         v40 = vaddq_f32(v40, v41); \
 285 |        |         v42 = vaddq_f32(v42, v43); \
 286 |        |         vst1q_f32(bp + i[1] * binc[2], vaddq_f32(v40, v42)); \
 287 |        |       } \
 288 |        |       bp += binc[1] * binc[2]; \
 289 |        |       ap += ainc[1] * ainc[2] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); \
 290 |        |     } \
 291 |        |   } parallel_endfor
 292 |        |   if (w->info.dim[3] % 4 == 0)
 293 |        |   {
 294 |        |     main_for();
 295 |        |   } else if (w->info.dim[3] % 4 == 3) { // unroll the last for-loops
 296 |        | #define tail_block \
 297 |        |     float32x2_t apz4 = vld1_f32(apz + j[1] * ainc[2] + c); \
 298 |        |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 299 |        |     float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
 300 |        |     float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
 301 |        |     float32x4_t apz42 = vld1q_dup_f32(apz + j[1] * ainc[2] + c + 2); \
 302 |        |     float32x4_t w40 = vld1q_f32(wpzu); \
 303 |        |     float32x4_t w41 = vld1q_f32(wpzu + 4); \
 304 |        |     float32x4_t w42 = vld1q_f32(wpzu + 8); \
 305 |        |     v40 = vmlaq_f32(v40, w40, apz40); \
 306 |        |     v41 = vmlaq_f32(v41, w41, apz41); \
 307 |        |     v42 = vmlaq_f32(v42, w42, apz42);
 308 |        |     main_for(tail_block);
 309 |        | #undef tail_block
 310 |        |   } else if (w->info.dim[3] % 4 == 2) { // unroll the last for-loops
 311 |        | #define tail_block \
 312 |        |     float32x2_t apz4 = vld1_f32(apz + j[1] * ainc[2] + c); \
 313 |        |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 314 |        |     float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
 315 |        |     float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
 316 |        |     float32x4_t w40 = vld1q_f32(wpzu); \
 317 |        |     float32x4_t w41 = vld1q_f32(wpzu + 4); \
 318 |        |     v40 = vmlaq_f32(v40, w40, apz40); \
 319 |        |     v41 = vmlaq_f32(v41, w41, apz41);
 320 |        |     main_for(tail_block);
 321 |        | #undef tail_block
 322 |        |   } else { // unroll the last for-loops
 323 |        | #define tail_block \
 324 |        |     float32x4_t apz4 = vld1q_dup_f32(apz + j[1] * ainc[2] + c); \
 325 |        |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
 326 |        |     float32x4_t w4 = vld1q_f32(wpzu); \
 327 |        |     v40 = vmlaq_f32(v40, w4, apz4);
 328 |        |     main_for(tail_block);
 329 |        | #undef tail_block
 330 |        |   }
 331 |        | #undef main_for
 332 |        |   ccfree(x4w);
 333 |        |   return CCV_NNC_EXEC_SUCCESS;
 334 |        | }
 335 |        | #endif
 336 |        |
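The NEON path above mirrors the SSE2 path line for line; the main structural difference in the inner loop is how the 4 consecutive input values are broadcast: where SSE2 uses _mm_loadu_ps plus _mm_shuffle_ps, NEON uses vld2_f32 to de-interleave them into even/odd pairs and vdupq_lane_f32 to splat each lane. A small stand-alone sketch of just that broadcast step (illustrative, not part of the covered file):

    #include <arm_neon.h>

    /* Illustrative only: broadcast 4 consecutive floats into four q-registers
       the way the NEON inner loop above does. vld2_f32 de-interleaves, so
       v.val[0] = {p[0], p[2]} and v.val[1] = {p[1], p[3]}. */
    static inline void broadcast4_sketch(const float* p, float32x4_t out[4])
    {
      float32x2x2_t v = vld2_f32(p);
      out[0] = vdupq_lane_f32(v.val[0], 0); /* p[0] in all four lanes */
      out[1] = vdupq_lane_f32(v.val[1], 0); /* p[1] */
      out[2] = vdupq_lane_f32(v.val[0], 1); /* p[2] */
      out[3] = vdupq_lane_f32(v.val[1], 1); /* p[3] */
    }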
 337 |        | int _ccv_nnc_conv_forw_cpu_opt(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
 338 |  1.13k | {
 339 |  1.13k | #if defined(HAVE_SSE2)
 340 |  1.13k |   if (w->info.dim[0] % 4 == 0)
 341 |  1.13k |     return _ccv_nnc_conv_forw_sse2(a, w, bias, hint, b);
 342 |        | #elif defined(HAVE_NEON)
 343 |        |   if (w->info.dim[0] % 4 == 0)
 344 |        |     return _ccv_nnc_conv_forw_neon(a, w, bias, hint, b);
 345 |        | #endif
 346 |      0 |   return CCV_NNC_EXEC_INVALID;
 347 |      0 | }
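The dispatcher only takes the optimized path when the filter count is a multiple of 4 and SSE2 or NEON is available; otherwise it returns CCV_NNC_EXEC_INVALID and the caller is expected to fall back. A minimal sketch of that caller-side pattern; conv_forw_reference is a hypothetical generic implementation, not anything defined in this file:

    /* Hypothetical wrapper: try the optimized kernel first, fall back on
       CCV_NNC_EXEC_INVALID. conv_forw_reference is made up for illustration. */
    static int conv_forw_with_fallback(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w,
      const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
    {
      const int status = _ccv_nnc_conv_forw_cpu_opt(a, w, bias, hint, b);
      if (status != CCV_NNC_EXEC_INVALID)
        return status; /* success, or a real error such as CCV_NNC_EXEC_OOM */
      return conv_forw_reference(a, w, bias, hint, b); /* hypothetical fallback */
    }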