Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/cpu_opt/_ccv_nnc_conv_cpu_opt.c
Line | Count  | Source
  1 |        | #include "ccv.h"
  2 |        | #include "ccv_internal.h"
  3 |        | #include "nnc/ccv_nnc.h"
  4 |        | #include "nnc/ccv_nnc_easy.h"
  5 |        | #include "nnc/ccv_nnc_internal.h"
  6 |        | #if defined(HAVE_SSE2)
  7 |        | #include <xmmintrin.h>
  8 |        | #elif defined(HAVE_NEON)
  9 |        | #include <arm_neon.h>
 10 |        | #endif
 11 |        | #ifdef USE_OPENMP
 12 |        | #include <omp.h>
 13 |        | #endif
 14 |        | #ifdef USE_DISPATCH
 15 |        | #include <dispatch/dispatch.h>
 16 |        | #endif
 17 |        | #include "../_ccv_nnc_conv_cpu_opt.h"
 18 |        |
 19 |        | #ifdef HAVE_SSE2
 20 |        | inline static void _ccv_nnc_x4w_sse2(const float* const w, const int* const dim, float* x4w)
 21 |  1.14k | {
 22 |  1.14k |   int jump_dim = dim[0] / 4;
 23 |  16.9k |   parallel_for(k, jump_dim) {
 24 |  16.9k |     int i, j;
 25 |  16.9k |     float* x4wz = x4w + k * dim[3] * dim[2] * dim[1] * 4;
 26 |  16.9k |     const float* wz[] = {
 27 |  16.9k |       w + (k * 4) * dim[3] * dim[2] * dim[1],
 28 |  16.9k |       w + (k * 4 + 1) * dim[3] * dim[2] * dim[1],
 29 |  16.9k |       w + (k * 4 + 2) * dim[3] * dim[2] * dim[1],
 30 |  16.9k |       w + (k * 4 + 3) * dim[3] * dim[2] * dim[1],
 31 |  16.9k |     };
 32 |   337k |     for (i = 0; i < dim[2] * dim[1]; i++)
 33 |   320k |     {
 34 |  28.8M |       for (j = 0; j < dim[3]; j++)
 35 |  28.5M |       {
 36 |  28.5M |         x4wz[j * 4] = wz[0][j];
 37 |  28.5M |         x4wz[j * 4 + 1] = wz[1][j];
 38 |  28.5M |         x4wz[j * 4 + 2] = wz[2][j];
 39 |  28.5M |         x4wz[j * 4 + 3] = wz[3][j];
 40 |  28.5M |       }
 41 |   320k |       x4wz += dim[3] * 4;
 42 |   320k |       wz[0] += dim[3];
 43 |   320k |       wz[1] += dim[3];
 44 |   320k |       wz[2] += dim[3];
 45 |   320k |       wz[3] += dim[3];
 46 |   320k |     }
 47 |  16.9k |   } parallel_endfor
 48 |  1.14k | }
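
Editorial note: the repack above interleaves the weights of four consecutive output filters so that each group of four floats holds the same (spatial position, input channel) tap from filters k*4 .. k*4+3, which is what lets the SIMD kernels below accumulate four output channels per vector load. A minimal sketch of the resulting index mapping, using hypothetical helper names that are not part of the ccv build:

    /* For illustration only. After _ccv_nnc_x4w_sse2 these two accessors
       return the same value for any valid index. */
    static inline float w_at(const float* w, const int* dim, int k, int s, int c)
    {
        /* original layout: filter k, spatial offset s in [0, dim[1]*dim[2]), input channel c */
        return w[(k * dim[1] * dim[2] + s) * dim[3] + c];
    }

    static inline float x4w_at(const float* x4w, const int* dim, int k4, int s, int c, int lane)
    {
        /* repacked layout: the four filters k4*4 .. k4*4+3 are interleaved, lane picks one of them */
        return x4w[((k4 * dim[1] * dim[2] + s) * dim[3] + c) * 4 + lane];
    }

    /* invariant after the repack: x4w_at(x4w, dim, k / 4, s, c, k % 4) == w_at(w, dim, k, s, c) */
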
 49 |        |
 50 |        | static int _ccv_nnc_conv_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
 51 |  1.14k | {
 52 |  1.14k |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 53 |  1.14k |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
 54 |  1.14k |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
 55 |  1.14k |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
 56 |  1.14k |   assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
 57 |  1.14k |   const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
 58 |  1.14k |   int astride[CCV_NNC_MAX_DIM_ALLOC];
 59 |  1.14k |   ccv_nnc_tensor_view_get_stride(a, astride);
 60 |  1.14k |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
 61 |  1.14k |   ccv_nnc_tensor_view_get_stride(b, bstride);
 62 |  1.14k |   assert(w->info.dim[0] % 4 == 0);
 63 |  1.14k |   float* x4w = 0;
 64 |  1.14k |   ccmemalign((void **)&x4w, 64, sizeof(float) * w->info.dim[3] * w->info.dim[2] * w->info.dim[1] * w->info.dim[0]);
 65 |  1.14k |   if (!x4w)
 66 |      0 |     return CCV_NNC_EXEC_OOM;
 67 |  1.14k |   _ccv_nnc_x4w_sse2(w->data.f32, w->info.dim, x4w);
 68 |  1.14k |   int jump_dim = w->info.dim[0] / 4;
 69 |        |   // Do naive tail partition unroll
 70 |  1.14k | #define main_for(tail_block) \
 71 |  16.9k |   parallel_for(k, jump_dim) { \
 72 |  16.9k |     int c; \
 73 |  16.9k |     const float* ap = a->data.f32; \
 74 |  16.9k |     float* bp = b->data.f32 + k * 4; \
 75 |        |     /* kernel weight for one dim. */ \
 76 |  16.9k |     const float* const x4wp = x4w + k * 4 * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; \
 77 |  16.9k |     float biasval[4] __attribute__ ((__aligned__(16))) = {}; \
 78 |  16.9k |     if (bias) \
 79 |  16.9k |     { \
 80 |  16.9k |       biasval[0] = bias->data.f32[k * 4]; \
 81 |  16.9k |       biasval[1] = bias->data.f32[k * 4 + 1]; \
 82 |  16.9k |       biasval[2] = bias->data.f32[k * 4 + 2]; \
 83 |  16.9k |       biasval[3] = bias->data.f32[k * 4 + 3]; \
 84 |  16.9k |     } \
 85 |        |     /* This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables. */ \
 86 |  16.9k |     int i[CCV_NNC_MAX_DIM]; \
 87 |  16.9k |     int n[CCV_NNC_MAX_DIM]; \
 88 |  16.9k |     int m[CCV_NNC_MAX_DIM]; \
 89 |  16.9k |     int j[CCV_NNC_MAX_DIM]; \
 90 |   416k |     for (i[0] = 0; i[0] < bdim[0]; i[0]++) \
 91 |   399k |     { \
 92 |   399k |       SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m); \
 93 |   399k |       const float* wpu = x4wp + n[0] * w->info.dim[2] * w->info.dim[3] * 4; \
 94 |  23.6M |       for (i[1] = 0; i[1] < bdim[1]; i[1]++) \
 95 |  23.2M |       { \
 96 |  23.2M |         SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m); \
 97 |  23.2M |         __m128 v40 = _mm_load_ps(biasval); \
 98 |  23.2M |         __m128 v41 = _mm_setzero_ps(); \
 99 |  23.2M |         __m128 v42 = _mm_setzero_ps(); \
100 |  23.2M |         __m128 v43 = _mm_setzero_ps(); \
101 |  23.2M |         const float* wpz = wpu + n[1] * w->info.dim[3] * 4; \
102 |  23.2M |         const float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * astride[2]; \
103 |  98.1M |         for (j[0] = 0; j[0] < m[0]; j[0]++) \
104 |  74.9M |         { \
105 |   326M |           for (j[1] = 0; j[1] < m[1]; j[1]++) \
106 |   251M |           { \
107 |  5.64G |             for (c = 0; c < adim[2] - 3; c += 4) \
108 |  5.39G |             { \
109 |  5.39G |               __m128 apz4 = _mm_loadu_ps(apz + j[1] * astride[2] + c); \
110 |  5.39G |               const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
111 |  5.39G |               __m128 w40 = _mm_loadu_ps(wpzu); \
112 |  5.39G |               __m128 w41 = _mm_loadu_ps(wpzu + 4); \
113 |  5.39G |               __m128 w42 = _mm_loadu_ps(wpzu + 8); \
114 |  5.39G |               __m128 w43 = _mm_loadu_ps(wpzu + 12); \
115 |  5.39G |               __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \
116 |  5.39G |               __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \
117 |  5.39G |               __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \
118 |  5.39G |               __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \
119 |  5.39G |               v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
120 |  5.39G |               v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
121 |  5.39G |               v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
122 |  5.39G |               v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
123 |  5.39G |             } \
124 |   251M |             tail_block /* insert executions for tail partition */ \
125 |   251M |           } \
126 |  74.9M |           wpz += w->info.dim[2] * w->info.dim[3] * 4; \
127 |  74.9M |           apz += astride[1]; \
128 |  74.9M |         } \
129 |  23.2M |         __m128 v4 = _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43)); \
130 |  23.2M |         _mm_stream_ps(bp + i[1] * bstride[2], v4); \
131 |  23.2M |       } \
132 |   399k |       bp += bstride[1]; \
133 |   399k |       ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); \
134 |   399k |     } \
135 |  16.9k |   } parallel_endfor
136 |  1.14k |   if (w->info.dim[3] % 4 == 0)
137 |    799 |   {
138 |    799 |     main_for();
139 |    799 |   } else if (w->info.dim[3] % 4 == 3) { // unroll the last for-loops
140 |    334 | #define tail_block \
141 |    334 |     __m128 apz40 = _mm_load1_ps(apz + j[1] * astride[2] + c); \
142 |    334 |     __m128 apz41 = _mm_load1_ps(apz + j[1] * astride[2] + c + 1); \
143 |    334 |     __m128 apz42 = _mm_load1_ps(apz + j[1] * astride[2] + c + 2); \
144 |    334 |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
145 |    334 |     __m128 w40 = _mm_loadu_ps(wpzu); \
146 |    334 |     __m128 w41 = _mm_loadu_ps(wpzu + 4); \
147 |    334 |     __m128 w42 = _mm_loadu_ps(wpzu + 8); \
148 |    334 |     v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
149 |    334 |     v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
150 |    334 |     v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
151 |    334 |     main_for(tail_block);
152 |    334 | #undef tail_block
153 |    334 |   } else if (w->info.dim[3] % 4 == 2) { // unroll the last for-loops
154 |      8 | #define tail_block \
155 |      8 |     __m128 apz40 = _mm_load1_ps(apz + j[1] * astride[2] + c); \
156 |      8 |     __m128 apz41 = _mm_load1_ps(apz + j[1] * astride[2] + c + 1); \
157 |      8 |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
158 |      8 |     __m128 w40 = _mm_loadu_ps(wpzu); \
159 |      8 |     __m128 w41 = _mm_loadu_ps(wpzu + 4); \
160 |      8 |     v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
161 |      8 |     v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
162 |      8 |     main_for(tail_block);
163 |      8 | #undef tail_block
164 |      8 |   } else {
165 |      0 | #define tail_block \
166 |      0 |     __m128 apz4 = _mm_load1_ps(apz + j[1] * astride[2] + c); \
167 |      0 |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
168 |      0 |     __m128 w4 = _mm_loadu_ps(wpzu); \
169 |      0 |     v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
170 |      0 |     main_for(tail_block);
171 |      0 | #undef tail_block
172 |      0 |   }
173 |  1.14k | #undef main_for
174 |  1.14k |   ccfree(x4w);
175 |  1.14k |   return CCV_NNC_EXEC_SUCCESS;
176 |  1.14k | }
177 |        | #endif
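
Editorial note: for reference, here is a scalar sketch of what one `main_for` iteration accumulates for a single output position and one group of four output channels, assuming the repacked `x4w` layout described above. The helper name and parameter list are hypothetical; border clipping (SET_BORDER_OFFSET_SIZE_FOR) is elided.

    /* Scalar equivalent of the SSE2/NEON inner loops: wpz/apz point at the
       first overlapping kernel tap, m0 x m1 is the clipped kernel window,
       channels is adim[2]. */
    static void conv_tap_scalar(float out[4], const float bias4[4],
        const float* wpz, const float* apz,
        int m0, int m1, int channels, int wdim2, int wdim3, int astride1, int astride2)
    {
        int lane, j0, j1, c;
        for (lane = 0; lane < 4; lane++)
            out[lane] = bias4[lane];
        for (j0 = 0; j0 < m0; j0++)
        {
            for (j1 = 0; j1 < m1; j1++)
                for (c = 0; c < channels; c++)
                    for (lane = 0; lane < 4; lane++)
                        /* x4w keeps the four filters' taps adjacent, hence the "* 4 + lane" */
                        out[lane] += wpz[(j1 * wdim3 + c) * 4 + lane] * apz[j1 * astride2 + c];
            wpz += wdim2 * wdim3 * 4;
            apz += astride1;
        }
    }

Because the scalar loop runs c over all input channels, it needs no tail handling; the vectorized versions consume channels four at a time and patch up the remaining 1-3 channels with the per-remainder `tail_block` macros selected above.
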
178 |        |
179 |        | #ifdef HAVE_NEON
180 |        | inline static void _ccv_nnc_x4w_neon(const float* const w, const int* const dim, float* x4w)
181 |        | {
182 |        |   int jump_dim = dim[0] / 4;
183 |        |   parallel_for(k, jump_dim) {
184 |        |     int i, j;
185 |        |     float* x4wz = x4w + k * dim[3] * dim[2] * dim[1] * 4;
186 |        |     const float* wz[] = {
187 |        |       w + (k * 4) * dim[3] * dim[2] * dim[1],
188 |        |       w + (k * 4 + 1) * dim[3] * dim[2] * dim[1],
189 |        |       w + (k * 4 + 2) * dim[3] * dim[2] * dim[1],
190 |        |       w + (k * 4 + 3) * dim[3] * dim[2] * dim[1],
191 |        |     };
192 |        |     for (i = 0; i < dim[2] * dim[1]; i++)
193 |        |     {
194 |        |       for (j = 0; j < dim[3]; j++)
195 |        |       {
196 |        |         x4wz[j * 4] = wz[0][j];
197 |        |         x4wz[j * 4 + 1] = wz[1][j];
198 |        |         x4wz[j * 4 + 2] = wz[2][j];
199 |        |         x4wz[j * 4 + 3] = wz[3][j];
200 |        |       }
201 |        |       x4wz += dim[3] * 4;
202 |        |       wz[0] += dim[3];
203 |        |       wz[1] += dim[3];
204 |        |       wz[2] += dim[3];
205 |        |       wz[3] += dim[3];
206 |        |     }
207 |        |   } parallel_endfor
208 |        | }
209 |        |
210 |        | static int _ccv_nnc_conv_forw_neon(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
211 |        | {
212 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
213 |        |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
214 |        |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
215 |        |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
216 |        |   assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
217 |        |   const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
218 |        |   int astride[CCV_NNC_MAX_DIM_ALLOC];
219 |        |   ccv_nnc_tensor_view_get_stride(a, astride);
220 |        |   int bstride[CCV_NNC_MAX_DIM_ALLOC];
221 |        |   ccv_nnc_tensor_view_get_stride(b, bstride);
222 |        |   assert(w->info.dim[0] % 4 == 0);
223 |        |   float* x4w = 0;
224 |        |   ccmemalign((void **)&x4w, 64, sizeof(float) * w->info.dim[3] * w->info.dim[2] * w->info.dim[1] * w->info.dim[0]);
225 |        |   if (!x4w)
226 |        |     return CCV_NNC_EXEC_OOM;
227 |        |   _ccv_nnc_x4w_neon(w->data.f32, w->info.dim, x4w);
228 |        |   int jump_dim = w->info.dim[0] / 4;
229 |        | #define main_for(tail_block) \
230 |        |   parallel_for(k, jump_dim) { \
231 |        |     int c; \
232 |        |     const float* ap = a->data.f32; \
233 |        |     float* bp = b->data.f32 + k * 4; \
234 |        |     /* kernel weight for one dim. */ \
235 |        |     const float* const x4wp = x4w + k * 4 * w->info.dim[1] * w->info.dim[2] * w->info.dim[3]; \
236 |        |     float biasval[4] __attribute__ ((__aligned__(16))) = {}; \
237 |        |     if (bias) \
238 |        |     { \
239 |        |       biasval[0] = bias->data.f32[k * 4]; \
240 |        |       biasval[1] = bias->data.f32[k * 4 + 1]; \
241 |        |       biasval[2] = bias->data.f32[k * 4 + 2]; \
242 |        |       biasval[3] = bias->data.f32[k * 4 + 3]; \
243 |        |     } \
244 |        |     /* This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables. */ \
245 |        |     int i[CCV_NNC_MAX_DIM]; \
246 |        |     int n[CCV_NNC_MAX_DIM]; \
247 |        |     int m[CCV_NNC_MAX_DIM]; \
248 |        |     int j[CCV_NNC_MAX_DIM]; \
249 |        |     for (i[0] = 0; i[0] < bdim[0]; i[0]++) \
250 |        |     { \
251 |        |       SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m); \
252 |        |       const float* wpu = x4wp + n[0] * w->info.dim[2] * w->info.dim[3] * 4; \
253 |        |       for (i[1] = 0; i[1] < bdim[1]; i[1]++) \
254 |        |       { \
255 |        |         SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m); \
256 |        |         float32x4_t v40 = vld1q_f32(biasval); \
257 |        |         float32x4_t v41 = vmovq_n_f32(0); \
258 |        |         float32x4_t v42 = vmovq_n_f32(0); \
259 |        |         float32x4_t v43 = vmovq_n_f32(0); \
260 |        |         const float* wpz = wpu + n[1] * w->info.dim[3] * 4; \
261 |        |         const float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * astride[2]; \
262 |        |         for (j[0] = 0; j[0] < m[0]; j[0]++) \
263 |        |         { \
264 |        |           for (j[1] = 0; j[1] < m[1]; j[1]++) \
265 |        |           { \
266 |        |             for (c = 0; c < adim[2] - 3; c += 4) \
267 |        |             { \
268 |        |               float32x2x2_t apz4 = vld2_f32(apz + j[1] * astride[2] + c); \
269 |        |               const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
270 |        |               float32x4_t apz40 = vdupq_lane_f32(apz4.val[0], 0); \
271 |        |               float32x4_t apz41 = vdupq_lane_f32(apz4.val[1], 0); \
272 |        |               float32x4_t apz42 = vdupq_lane_f32(apz4.val[0], 1); \
273 |        |               float32x4_t apz43 = vdupq_lane_f32(apz4.val[1], 1); \
274 |        |               float32x4_t w40 = vld1q_f32(wpzu); \
275 |        |               float32x4_t w41 = vld1q_f32(wpzu + 4); \
276 |        |               float32x4_t w42 = vld1q_f32(wpzu + 8); \
277 |        |               float32x4_t w43 = vld1q_f32(wpzu + 12); \
278 |        |               v40 = vmlaq_f32(v40, w40, apz40); \
279 |        |               v41 = vmlaq_f32(v41, w41, apz41); \
280 |        |               v42 = vmlaq_f32(v42, w42, apz42); \
281 |        |               v43 = vmlaq_f32(v43, w43, apz43); \
282 |        |             } \
283 |        |             tail_block /* insert executions for tail partition */ \
284 |        |           } \
285 |        |           wpz += w->info.dim[2] * w->info.dim[3] * 4; \
286 |        |           apz += astride[1]; \
287 |        |         } \
288 |        |         v40 = vaddq_f32(v40, v41); \
289 |        |         v42 = vaddq_f32(v42, v43); \
290 |        |         vst1q_f32(bp + i[1] * bstride[2], vaddq_f32(v40, v42)); \
291 |        |       } \
292 |        |       bp += bstride[1]; \
293 |        |       ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0)); \
294 |        |     } \
295 |        |   } parallel_endfor
296 |        |   if (w->info.dim[3] % 4 == 0)
297 |        |   {
298 |        |     main_for();
299 |        |   } else if (w->info.dim[3] % 4 == 3) { // unroll the last for-loops
300 |        | #define tail_block \
301 |        |     float32x2_t apz4 = vld1_f32(apz + j[1] * astride[2] + c); \
302 |        |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
303 |        |     float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
304 |        |     float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
305 |        |     float32x4_t apz42 = vld1q_dup_f32(apz + j[1] * astride[2] + c + 2); \
306 |        |     float32x4_t w40 = vld1q_f32(wpzu); \
307 |        |     float32x4_t w41 = vld1q_f32(wpzu + 4); \
308 |        |     float32x4_t w42 = vld1q_f32(wpzu + 8); \
309 |        |     v40 = vmlaq_f32(v40, w40, apz40); \
310 |        |     v41 = vmlaq_f32(v41, w41, apz41); \
311 |        |     v42 = vmlaq_f32(v42, w42, apz42);
312 |        |     main_for(tail_block);
313 |        | #undef tail_block
314 |        |   } else if (w->info.dim[3] % 4 == 2) { // unroll the last for-loops
315 |        | #define tail_block \
316 |        |     float32x2_t apz4 = vld1_f32(apz + j[1] * astride[2] + c); \
317 |        |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
318 |        |     float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
319 |        |     float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
320 |        |     float32x4_t w40 = vld1q_f32(wpzu); \
321 |        |     float32x4_t w41 = vld1q_f32(wpzu + 4); \
322 |        |     v40 = vmlaq_f32(v40, w40, apz40); \
323 |        |     v41 = vmlaq_f32(v41, w41, apz41);
324 |        |     main_for(tail_block);
325 |        | #undef tail_block
326 |        |   } else { // unroll the last for-loops
327 |        | #define tail_block \
328 |        |     float32x4_t apz4 = vld1q_dup_f32(apz + j[1] * astride[2] + c); \
329 |        |     const float* const wpzu = wpz + (j[1] * w->info.dim[3] + c) * 4; \
330 |        |     float32x4_t w4 = vld1q_f32(wpzu); \
331 |        |     v40 = vmlaq_f32(v40, w4, apz4);
332 |        |     main_for(tail_block);
333 |        | #undef tail_block
334 |        |   }
335 |        | #undef main_for
336 |        |   ccfree(x4w);
337 |        |   return CCV_NNC_EXEC_SUCCESS;
338 |        | }
339 |        | #endif
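
Editorial note: one NEON-specific detail in the loop above is that vld2_f32 de-interleaves four consecutive floats a[0..3] into {a[0], a[2]} and {a[1], a[3]}, so the four vdupq_lane_f32 calls broadcast a[0], a[1], a[2] and a[3] respectively; this plays the same role as the _mm_shuffle_ps broadcasts in the SSE2 path. A minimal standalone illustration, with a hypothetical helper name:

    #include <arm_neon.h>

    /* Broadcast each of four consecutive floats into its own vector, the same
       way the NEON inner loop prepares apz40..apz43. */
    static void broadcast4(const float* a, float32x4_t out[4])
    {
        float32x2x2_t p = vld2_f32(a);        /* p.val[0] = {a[0], a[2]}, p.val[1] = {a[1], a[3]} */
        out[0] = vdupq_lane_f32(p.val[0], 0); /* {a[0], a[0], a[0], a[0]} */
        out[1] = vdupq_lane_f32(p.val[1], 0); /* {a[1], a[1], a[1], a[1]} */
        out[2] = vdupq_lane_f32(p.val[0], 1); /* {a[2], a[2], a[2], a[2]} */
        out[3] = vdupq_lane_f32(p.val[1], 1); /* {a[3], a[3], a[3], a[3]} */
    }
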
340 |        |
341 |        | int _ccv_nnc_conv_forw_cpu_opt(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_t* const w, const ccv_nnc_tensor_t* const bias, const ccv_nnc_hint_t hint, ccv_nnc_tensor_view_t* const b)
342 |  1.14k | {
343 |  1.14k | #if defined(HAVE_SSE2)
344 |  1.14k |   if (w->info.dim[0] % 4 == 0)
345 |  1.14k |     return _ccv_nnc_conv_forw_sse2(a, w, bias, hint, b);
346 |        | #elif defined(HAVE_NEON)
347 |        |   if (w->info.dim[0] % 4 == 0)
348 |        |     return _ccv_nnc_conv_forw_neon(a, w, bias, hint, b);
349 |        | #endif
350 |      0 |   return CCV_NNC_EXEC_INVALID;
351 |  1.14k | }