Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/blas/cpu_opt/_ccv_nnc_gemm_cpu_opt.c
Line |  Count | Source
  1 |        | #include <ccv.h>
  2 |        | #include <ccv_internal.h>
  3 |        | #include <nnc/ccv_nnc.h>
  4 |        | #include <nnc/ccv_nnc_easy.h>
  5 |        | #include <nnc/ccv_nnc_internal.h>
  6 |        | #if defined(HAVE_SSE2)
  7 |        | #include <xmmintrin.h>
  8 |        | #elif defined(HAVE_NEON)
  9 |        | #include <arm_neon.h>
 10 |        | #endif
 11 |        | #ifdef USE_OPENMP
 12 |        | #include <omp.h>
 13 |        | #endif
 14 |        | #ifdef USE_DISPATCH
 15 |        | #include <dispatch/dispatch.h>
 16 |        | #endif
 17 |        | #include "../_ccv_nnc_gemm_cpu_opt.h"
 18 |        |
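A note on the parallel_for(...) { ... } parallel_endfor pairs used throughout the kernels below: these are macros defined elsewhere in ccv (the conditional omp.h and dispatch/dispatch.h includes above suggest OpenMP or libdispatch backends), and the way profile counters are attributed to the expanded block is presumably why the first few lines inside each parallel_for body report a count of 0 while the inner loops report thousands or millions of executions. As a reading aid only, here is a minimal serial stand-in for such a macro pair; it is an assumption, not the library's actual definition:

/* Illustrative serial stand-in for ccv's parallel_for/parallel_endfor.
 * The real macros live in the library headers and may dispatch the body
 * across threads; this version just runs it once per index. */
#define parallel_for(x, n) { \
  size_t x; \
  for (x = 0; x < (size_t)(n); x++) {
#define parallel_endfor } }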
 19 |        | #ifdef HAVE_SSE2
 20 |        | static int _ccv_nnc_gemm_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
 21 |     66 | {
 22 |     66 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 23 |     66 |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
 24 |     66 |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
 25 |     66 |   const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
 26 |     66 |   assert(!bias || bdim[0] == bias->info.dim[0]);
 27 |     66 |   assert(bdim[0] == w->info.dim[0]);
 28 |     66 |   assert(adim[0] == w->info.dim[1]);
 29 |     66 |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
 30 |     66 |   assert(batch_size == (b_nd == 1) ? 1 : ccv_max(1, b->info.dim[0]));
 31 |     66 |   const int a_batch_inc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc[0] : a->inc[1]) : adim[0];
 32 |     66 |   const int b_batch_inc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc[0] : b->inc[1]) : bdim[0];
 33 |     66 |   const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
 34 |     66 |   int i;
 35 |     66 |   if (bias)
 36 |     66 |   {
 37 |    132 |     for (i = 0; i < batch_size; i++)
 38 |     66 |     {
 39 |     66 |       const float* const ap = a->data.f32 + i * a_batch_inc;
 40 |     66 |       float* const bp = b->data.f32 + i * b_batch_inc;
 41 |     66 |       parallel_for(j, bdim[0]) {
 42 |      0 |         const float* const wp = w->data.f32 + j * winc[1];
 43 |      0 |         int k;
 44 |      0 |         __m128 v40 = _mm_set_ss(bias->data.f32[j]);
 45 |      0 |         __m128 v41 = _mm_setzero_ps();
 46 |  6.02M |         for (k = 0; k < adim[0] - 7; k += 8)
 47 |  6.02M |         {
 48 |  6.02M |           __m128 ap40 = _mm_load_ps(ap + k);
 49 |  6.02M |           __m128 ap41 = _mm_load_ps(ap + k + 4);
 50 |  6.02M |           __m128 w40 = _mm_load_ps(wp + k);
 51 |  6.02M |           __m128 w41 = _mm_load_ps(wp + k + 4);
 52 |  6.02M |           v40 = _mm_add_ps(_mm_mul_ps(w40, ap40), v40);
 53 |  6.02M |           v41 = _mm_add_ps(_mm_mul_ps(w41, ap41), v41);
 54 |  6.02M |         }
 55 |      0 |         v40 = _mm_add_ps(v40, v41);
 56 |      0 |         v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
 57 |      0 |         v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
 58 |      0 |         _mm_store_ss(bp + j, v40);
 59 |     66 |       } parallel_endfor
 60 |     66 |     }
 61 |     66 |   } else {
 62 |      0 |     for (i = 0; i < batch_size; i++)
 63 |      0 |     {
 64 |      0 |       const float* const ap = a->data.f32 + i * a_batch_inc;
 65 |      0 |       float* const bp = b->data.f32 + i * b_batch_inc;
 66 |      0 |       parallel_for(j, bdim[0]) {
 67 |      0 |         const float* const wp = w->data.f32 + j * winc[1];
 68 |      0 |         int k;
 69 |      0 |         __m128 v40 = _mm_setzero_ps();
 70 |      0 |         __m128 v41 = _mm_setzero_ps();
 71 |      0 |         for (k = 0; k < adim[0] - 7; k += 8)
 72 |      0 |         {
 73 |      0 |           __m128 ap40 = _mm_load_ps(ap + k);
 74 |      0 |           __m128 ap41 = _mm_load_ps(ap + k + 4);
 75 |      0 |           __m128 w40 = _mm_load_ps(wp + k);
 76 |      0 |           __m128 w41 = _mm_load_ps(wp + k + 4);
 77 |      0 |           v40 = _mm_add_ps(_mm_mul_ps(w40, ap40), v40);
 78 |      0 |           v41 = _mm_add_ps(_mm_mul_ps(w41, ap41), v41);
 79 |      0 |         }
 80 |      0 |         v40 = _mm_add_ps(v40, v41);
 81 |      0 |         v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
 82 |      0 |         v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
 83 |      0 |         _mm_store_ss(bp + j, v40);
 84 |      0 |       } parallel_endfor
 85 |      0 |     }
 86 |      0 |   }
 87 |     66 |   return CCV_NNC_EXEC_SUCCESS;
 88 |     66 | }
 89 |        |
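Per batch row, the kernel above computes b = W * a + bias: each output b[j] is a dot product of row j of w (row stride winc[1] when w is a tensor view) with the input a, vectorized eight floats at a time with two accumulators that are reduced to a scalar in the epilogue; adim[0] must be a multiple of 8, which is exactly what the dispatcher at the bottom of the file checks. One detail worth flagging: because == binds tighter than ?:, the assert on line 30 parses as (batch_size == (b_nd == 1)) ? 1 : ccv_max(1, b->info.dim[0]) and so never fires; presumably the intent was batch_size == ((b_nd == 1) ? 1 : ccv_max(1, b->info.dim[0])). The NEON version on line 194 has the same shape. A scalar restatement of the kernel, with hypothetical parameter names standing in for the strides derived above (an illustration, not part of the library):

/* Scalar reference for _ccv_nnc_gemm_forw_sse2: per batch row, b = W * a + bias.
 * Strides mirror the ones computed above; names here are illustrative only. */
static void gemm_forw_ref(const float* a, const float* w, const float* bias, float* b,
  int batch_size, int adim0, int bdim0, int a_batch_inc, int b_batch_inc, int winc1)
{
  int i, j, k;
  for (i = 0; i < batch_size; i++)
    for (j = 0; j < bdim0; j++)
    {
      float v = bias ? bias[j] : 0;
      for (k = 0; k < adim0; k++)
        v += w[j * winc1 + k] * a[i * a_batch_inc + k];
      b[i * b_batch_inc + j] = v;
    }
}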
 90 |        | static int _ccv_nnc_gemm_back_sse2(const ccv_nnc_tensor_view_t* const g, const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, ccv_nnc_tensor_view_t* const dw, ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const h, const int flags)
 91 |      9 | {
 92 |      9 |   const int* dwinc = CCV_IS_TENSOR_VIEW(dw) ? dw->inc : dw->info.dim;
 93 |      9 |   if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
 94 |      9 |   {
 95 |      9 |     memset(dw->data.u8, 0, sizeof(float) * dwinc[1] * dw->info.dim[0]);
 96 |      9 |     if (bias)
 97 |      9 |       memset(bias->data.u8, 0, sizeof(float) * bias->info.dim[0]);
 98 |      9 |   }
 99 |      9 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
100 |      9 |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
101 |      9 |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
102 |      9 |   const int* gdim = (g_nd == 1) ? g->info.dim : g->info.dim + 1;
103 |      9 |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
104 |      9 |   int i, j;
105 |      9 |   float* gp = g->data.f32;
106 |      9 |   const int g_batch_inc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == 1) ? g->inc[0] : g->inc[1]) : gdim[0];
107 |      9 |   if (bias)
108 |      9 |   {
109 |      9 |     float* bp = bias->data.f32;
110 |      9 |     assert(bias->info.dim[0] == gdim[0]);
111 |     18 |     for (i = 0; i < batch_size; i++)
112 |      9 |     {
113 |    585 |       for (j = 0; j < gdim[0] - 3; j += 4)
114 |    576 |       {
115 |    576 |         __m128 g4 = _mm_load_ps(gp + j);
116 |    576 |         __m128 b4 = _mm_load_ps(bp + j);
117 |    576 |         _mm_stream_ps(bp + j, _mm_add_ps(b4, g4));
118 |    576 |       }
119 |      9 |       gp += g_batch_inc;
120 |      9 |     }
121 |      9 |   }
122 |      9 |   assert(gdim[0] == dw->info.dim[0]);
123 |      9 |   assert(adim[0] == dw->info.dim[1]);
124 |      9 |   const int a_batch_inc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == 1) ? a->inc[0] : a->inc[1]) : adim[0];
125 |     18 |   for (i = 0; i < batch_size; i++)
126 |      9 |   {
127 |      9 |     const float* const gp = g->data.f32 + i * g_batch_inc;
128 |      9 |     const float* const ap = a->data.f32 + i * a_batch_inc;
129 |      9 |     parallel_for(j, gdim[0]) {
130 |      0 |       float* const dwp = dw->data.f32 + j * dwinc[1];
131 |      0 |       __m128 g4 = _mm_set1_ps(gp[j]);
132 |      0 |       int k;
133 |  24.4k |       for (k = 0; k < adim[0] - 3; k += 4)
134 |  24.4k |       {
135 |  24.4k |         __m128 a4 = _mm_load_ps(ap + k);
136 |  24.4k |         __m128 dw4 = _mm_load_ps(dwp + k);
137 |  24.4k |         _mm_stream_ps(dwp + k, _mm_add_ps(dw4, _mm_mul_ps(a4, g4)));
138 |  24.4k |       }
139 |      9 |     } parallel_endfor
140 |      9 |   }
141 |      9 |   if (h && w)
142 |      9 |   {
143 |      9 |     const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
144 |      9 |     const int* hdim = (h_nd == 1) ? h->info.dim : h->info.dim + 1;
145 |      9 |     assert(hdim[0] == adim[0]);
146 |      9 |     const int h_batch_inc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == 1) ? h->inc[0] : h->inc[1]) : hdim[0];
147 |      9 |     const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
148 |     18 |     for (i = 0; i < batch_size; i++)
149 |      9 |     {
150 |      9 |       const float* const gp = g->data.f32 + i * g_batch_inc;
151 |      9 |       float* const hp = h->data.f32 + i * h_batch_inc;
152 |      9 |       parallel_for(y, hdim[0] / 4) {
153 |      0 |         const int j = y * 4;
154 |      0 |         const float* const wp = w->data.f32 + j;
155 |      0 |         __m128 v40 = _mm_setzero_ps();
156 |      0 |         __m128 v41 = _mm_setzero_ps();
157 |      0 |         __m128 v42 = _mm_setzero_ps();
158 |      0 |         __m128 v43 = _mm_setzero_ps();
159 |      0 |         int k;
160 |  11.3k |         for (k = 0; k < gdim[0]; k += 4)
161 |  11.3k |         {
162 |  11.3k |           __m128 g4 = _mm_load_ps(gp + k);
163 |  11.3k |           __m128 w40 = _mm_load_ps(wp + k * winc[1]);
164 |  11.3k |           __m128 w41 = _mm_load_ps(wp + (k + 1) * winc[1]);
165 |  11.3k |           __m128 w42 = _mm_load_ps(wp + (k + 2) * winc[1]);
166 |  11.3k |           __m128 w43 = _mm_load_ps(wp + (k + 3) * winc[1]);
167 |  11.3k |           __m128 g40 = _mm_shuffle_ps(g4, g4, 0x00);
168 |  11.3k |           __m128 g41 = _mm_shuffle_ps(g4, g4, 0x55);
169 |  11.3k |           __m128 g42 = _mm_shuffle_ps(g4, g4, 0xAA);
170 |  11.3k |           __m128 g43 = _mm_shuffle_ps(g4, g4, 0xFF);
171 |  11.3k |           v40 = _mm_add_ps(_mm_mul_ps(g40, w40), v40);
172 |  11.3k |           v41 = _mm_add_ps(_mm_mul_ps(g41, w41), v41);
173 |  11.3k |           v42 = _mm_add_ps(_mm_mul_ps(g42, w42), v42);
174 |  11.3k |           v43 = _mm_add_ps(_mm_mul_ps(g43, w43), v43);
175 |  11.3k |         }
176 |      0 |         v40 = _mm_add_ps(v40, v41);
177 |      0 |         v42 = _mm_add_ps(v42, v43);
178 |      0 |         _mm_stream_ps(hp + j, _mm_add_ps(v40, v42));
179 |      9 |       } parallel_endfor
180 |      9 |     }
181 |      9 |   }
182 |      9 |   return CCV_NNC_EXEC_SUCCESS;
183 |      9 | }
184 |        | #endif
185 |        |
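The backward kernel above produces up to three outputs: the bias gradient (g summed over the batch, four lanes at a time), the weight gradient (dw accumulates the outer product of g and a for each batch row), and, when both h and w are supplied, the input gradient h = W^T * g. A scalar restatement under the same layout assumptions, run after the optional zeroing of dw and bias; parameter names are illustrative, not the library's (sketch only):

/* Scalar reference for _ccv_nnc_gemm_back_sse2, after dw/bias have been
 * zeroed when CCV_NNC_ACCUMULATE_OUTPUT is not set. Illustration only. */
static void gemm_back_ref(const float* g, const float* a, const float* w,
  float* dw, float* dbias, float* h, int batch_size, int adim0, int gdim0,
  int a_batch_inc, int g_batch_inc, int h_batch_inc, int dwinc1, int winc1)
{
  int i, j, k;
  for (i = 0; i < batch_size; i++)
  {
    const float* const gp = g + i * g_batch_inc;
    const float* const ap = a + i * a_batch_inc;
    for (j = 0; j < gdim0; j++)
    {
      if (dbias)
        dbias[j] += gp[j]; /* bias gradient: g summed over the batch */
      for (k = 0; k < adim0; k++)
        dw[j * dwinc1 + k] += gp[j] * ap[k]; /* dw += outer product of g and a */
    }
    if (h && w)
      for (k = 0; k < adim0; k++)
      {
        float v = 0;
        for (j = 0; j < gdim0; j++)
          v += w[j * winc1 + k] * gp[j]; /* h = W^T * g */
        h[i * h_batch_inc + k] = v;
      }
  }
}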
186 |        | #ifdef HAVE_NEON
187 |        | static int _ccv_nnc_gemm_forw_neon(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
188 |        | {
189 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
190 |        |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
191 |        |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
192 |        |   const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
193 |        |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
194 |        |   assert(batch_size == (b_nd == 1) ? 1 : ccv_max(1, b->info.dim[0]));
195 |        |   const int a_batch_inc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc[0] : a->inc[1]) : adim[0];
196 |        |   const int b_batch_inc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc[0] : b->inc[1]) : bdim[0];
197 |        |   const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
198 |        |   int i;
199 |        |   if (bias)
200 |        |   {
201 |        |     for (i = 0; i < batch_size; i++)
202 |        |     {
203 |        |       const float* const ap = a->data.f32 + i * a_batch_inc;
204 |        |       float* const bp = b->data.f32 + i * b_batch_inc;
205 |        |       parallel_for(j, bdim[0]) {
206 |        |         const float* const wp = w->data.f32 + j * winc[1];
207 |        |         int k;
208 |        |         float32x4_t v41 = vmovq_n_f32(0);
209 |        |         float32x4_t v40 = vld1q_lane_f32(bias->data.f32 + j, v41, 0);
210 |        |         for (k = 0; k < adim[0] - 7; k += 8)
211 |        |         {
212 |        |           float32x4_t ap40 = vld1q_f32(ap + k);
213 |        |           float32x4_t ap41 = vld1q_f32(ap + k + 4);
214 |        |           float32x4_t w40 = vld1q_f32(wp + k);
215 |        |           float32x4_t w41 = vld1q_f32(wp + k + 4);
216 |        |           v40 = vmlaq_f32(v40, w40, ap40);
217 |        |           v41 = vmlaq_f32(v41, w41, ap41);
218 |        |         }
219 |        |         v40 = vaddq_f32(v40, v41);
220 |        |         float32x2_t v2 = vpadd_f32(vget_high_f32(v40), vget_low_f32(v40));
221 |        |         bp[j] = vget_lane_f32(vpadd_f32(v2, v2), 0);
222 |        |       } parallel_endfor
223 |        |     }
224 |        |   } else {
225 |        |     for (i = 0; i < batch_size; i++)
226 |        |     {
227 |        |       const float* const ap = a->data.f32 + i * a_batch_inc;
228 |        |       float* const bp = b->data.f32 + i * b_batch_inc;
229 |        |       parallel_for(j, bdim[0]) {
230 |        |         const float* const wp = w->data.f32 + j * winc[1];
231 |        |         int k;
232 |        |         float32x4_t v41 = vmovq_n_f32(0);
233 |        |         float32x4_t v40 = vmovq_n_f32(0);
234 |        |         for (k = 0; k < adim[0] - 7; k += 8)
235 |        |         {
236 |        |           float32x4_t ap40 = vld1q_f32(ap + k);
237 |        |           float32x4_t ap41 = vld1q_f32(ap + k + 4);
238 |        |           float32x4_t w40 = vld1q_f32(wp + k);
239 |        |           float32x4_t w41 = vld1q_f32(wp + k + 4);
240 |        |           v40 = vmlaq_f32(v40, w40, ap40);
241 |        |           v41 = vmlaq_f32(v41, w41, ap41);
242 |        |         }
243 |        |         v40 = vaddq_f32(v40, v41);
244 |        |         float32x2_t v2 = vpadd_f32(vget_high_f32(v40), vget_low_f32(v40));
245 |        |         bp[j] = vget_lane_f32(vpadd_f32(v2, v2), 0);
246 |        |       } parallel_endfor
247 |        |     }
248 |        |   }
249 |        |   return CCV_NNC_EXEC_SUCCESS;
250 |        | }
251 |        |
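The NEON forward kernel is a line-for-line counterpart of the SSE2 one (its empty count column simply reflects that this build ran on x86); only the horizontal reduction in the epilogue differs: SSE2 uses _mm_movehl_ps/_mm_shuffle_ps, NEON uses two vpadd_f32. Both reduce the combined 4-lane accumulator to the sum of its lanes, merely pairing the lanes in a different order. A scalar equivalent of that reduction (illustration only):

/* Horizontal sum of a 4-lane accumulator, as computed by both epilogues
 * (the two instruction sequences pair the lanes differently, which only
 * affects floating-point rounding in the last bit). */
static inline float hsum4(const float v[4])
{
  return v[0] + v[1] + v[2] + v[3];
}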
252 |        | static int _ccv_nnc_gemm_back_neon(const ccv_nnc_tensor_view_t* const g, const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, ccv_nnc_tensor_view_t* const dw, ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const h, const int flags)
253 |        | {
254 |        |   const int* dwinc = CCV_IS_TENSOR_VIEW(dw) ? dw->inc : dw->info.dim;
255 |        |   if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
256 |        |   {
257 |        |     memset(dw->data.u8, 0, sizeof(float) * dwinc[1] * dw->info.dim[0]);
258 |        |     if (bias)
259 |        |       memset(bias->data.u8, 0, sizeof(float) * bias->info.dim[0]);
260 |        |   }
261 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
262 |        |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
263 |        |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
264 |        |   const int* gdim = (g_nd == 1) ? g->info.dim : g->info.dim + 1;
265 |        |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
266 |        |   int i, j;
267 |        |   float* gp = g->data.f32;
268 |        |   const int g_batch_inc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == 1) ? g->inc[0] : g->inc[1]) : gdim[0];
269 |        |   if (bias)
270 |        |   {
271 |        |     float* bp = bias->data.f32;
272 |        |     for (i = 0; i < batch_size; i++)
273 |        |     {
274 |        |       for (j = 0; j < gdim[0] - 3; j += 4)
275 |        |       {
276 |        |         float32x4_t g4 = vld1q_f32(gp + j);
277 |        |         float32x4_t b4 = vld1q_f32(bp + j);
278 |        |         vst1q_f32(bp + j, vaddq_f32(b4, g4));
279 |        |       }
280 |        |       gp += g_batch_inc;
281 |        |     }
282 |        |   }
283 |        |   const int a_batch_inc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == 1) ? a->inc[0] : a->inc[1]) : adim[0];
284 |        |   for (i = 0; i < batch_size; i++)
285 |        |   {
286 |        |     const float* const gp = g->data.f32 + i * g_batch_inc;
287 |        |     const float* const ap = a->data.f32 + i * a_batch_inc;
288 |        |     parallel_for(j, gdim[0]) {
289 |        |       float* const dwp = dw->data.f32 + j * dwinc[1];
290 |        |       float32x4_t g4 = vld1q_dup_f32(gp + j);
291 |        |       int k;
292 |        |       for (k = 0; k < adim[0] - 3; k += 4)
293 |        |       {
294 |        |         float32x4_t a4 = vld1q_f32(ap + k);
295 |        |         float32x4_t dw4 = vld1q_f32(dwp + k);
296 |        |         vst1q_f32(dwp + k, vmlaq_f32(dw4, a4, g4));
297 |        |       }
298 |        |     } parallel_endfor
299 |        |   }
300 |        |   if (h && w)
301 |        |   {
302 |        |     const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
303 |        |     const int* hdim = (h_nd == 1) ? h->info.dim : h->info.dim + 1;
304 |        |     const int h_batch_inc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == 1) ? h->inc[0] : h->inc[1]) : hdim[0];
305 |        |     const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
306 |        |     for (i = 0; i < batch_size; i++)
307 |        |     {
308 |        |       const float* const gp = g->data.f32 + i * g_batch_inc;
309 |        |       float* const hp = h->data.f32 + i * h_batch_inc;
310 |        |       parallel_for(y, hdim[0] / 4) {
311 |        |         const int j = y * 4;
312 |        |         const float* const wp = w->data.f32 + j;
313 |        |         float32x4_t v40 = vmovq_n_f32(0);
314 |        |         float32x4_t v41 = vmovq_n_f32(0);
315 |        |         float32x4_t v42 = vmovq_n_f32(0);
316 |        |         float32x4_t v43 = vmovq_n_f32(0);
317 |        |         int k;
318 |        |         for (k = 0; k < gdim[0]; k += 4)
319 |        |         {
320 |        |           float32x2x2_t g4 = vld2_f32(gp + k);
321 |        |           float32x4_t w40 = vld1q_f32(wp + k * winc[1]);
322 |        |           float32x4_t w41 = vld1q_f32(wp + (k + 1) * winc[1]);
323 |        |           float32x4_t w42 = vld1q_f32(wp + (k + 2) * winc[1]);
324 |        |           float32x4_t w43 = vld1q_f32(wp + (k + 3) * winc[1]);
325 |        |           float32x4_t g40 = vdupq_lane_f32(g4.val[0], 0);
326 |        |           float32x4_t g41 = vdupq_lane_f32(g4.val[1], 0);
327 |        |           float32x4_t g42 = vdupq_lane_f32(g4.val[0], 1);
328 |        |           float32x4_t g43 = vdupq_lane_f32(g4.val[1], 1);
329 |        |           v40 = vmlaq_f32(v40, g40, w40);
330 |        |           v41 = vmlaq_f32(v41, g41, w41);
331 |        |           v42 = vmlaq_f32(v42, g42, w42);
332 |        |           v43 = vmlaq_f32(v43, g43, w43);
333 |        |         }
334 |        |         v40 = vaddq_f32(v40, v41);
335 |        |         v42 = vaddq_f32(v42, v43);
336 |        |         vst1q_f32(hp + j, vaddq_f32(v40, v42));
337 |        |       } parallel_endfor
338 |        |     }
339 |        |   }
340 |        |   return CCV_NNC_EXEC_SUCCESS;
341 |        | }
342 |        | #endif
343 |        |
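In the h = W^T * g loop of the NEON backward kernel above, vld2_f32 loads four gradients deinterleaved, so g4.val[0] holds {gp[k], gp[k+2]} and g4.val[1] holds {gp[k+1], gp[k+3]}; the four vdupq_lane_f32 calls therefore broadcast gp[k], gp[k+1], gp[k+2] and gp[k+3] into full vectors, matching the SSE2 version's _mm_shuffle_ps broadcasts with masks 0x00, 0x55, 0xAA and 0xFF. A small standalone helper making that mapping explicit (illustration only, not part of this file):

#include <arm_neon.h>

/* Broadcast p[0..3] into four q registers the way the NEON backward kernel
 * does it: vld2_f32 deinterleaves, then each lane is duplicated. */
static inline void broadcast4_f32(const float* p, float32x4_t out[4])
{
  float32x2x2_t g4 = vld2_f32(p);
  out[0] = vdupq_lane_f32(g4.val[0], 0); /* p[0] */
  out[1] = vdupq_lane_f32(g4.val[1], 0); /* p[1] */
  out[2] = vdupq_lane_f32(g4.val[0], 1); /* p[2] */
  out[3] = vdupq_lane_f32(g4.val[1], 1); /* p[3] */
}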
344 |        | int _ccv_nnc_gemm_forw_cpu_opt(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
345 |    105 | {
346 |    105 | #if defined(HAVE_SSE2) || defined(HAVE_NEON)
347 |    105 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
348 |    105 |   const int adim = (a_nd == 1) ? a->info.dim[0] : a->info.dim[1];
349 |    105 | #endif
350 |    105 | #if defined(HAVE_SSE2)
351 |    105 |   if (adim % 8 == 0)
352 |     66 |     return _ccv_nnc_gemm_forw_sse2(a, w, bias, b);
353 |        | #elif defined(HAVE_NEON)
354 |        |   if (adim % 8 == 0)
355 |        |     return _ccv_nnc_gemm_forw_neon(a, w, bias, b);
356 |        | #endif
357 |     39 |   return CCV_NNC_EXEC_INVALID;
358 |     39 | }
359 |        |
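Both public entry points decline shapes the SIMD kernels cannot handle and return CCV_NNC_EXEC_INVALID instead, which is what the counts of 39 above and 27 below correspond to (adim must be a multiple of 8 for the forward pass; adim, gdim and, if h is given, hdim must be multiples of 4 for the backward pass); the surrounding ccv_nnc command implementation is then expected to fall back to a generic GEMM path. A hypothetical sketch of that caller-side pattern, assuming the same headers as this file (gemm_forw_generic is a made-up name for such a fallback, not a library function):

/* Hypothetical dispatch wrapper: prefer the optimized kernel, fall back
 * to a generic implementation when the shape is declined. */
static int gemm_forw_generic(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b);

static int gemm_forw_dispatch(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
{
  const int status = _ccv_nnc_gemm_forw_cpu_opt(a, w, bias, b);
  if (status != CCV_NNC_EXEC_INVALID)
    return status;
  return gemm_forw_generic(a, w, bias, b);
}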
360 |        | int _ccv_nnc_gemm_back_cpu_opt(const ccv_nnc_tensor_view_t* const g, const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, ccv_nnc_tensor_view_t* const dw, ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const h, const int flags)
361 |     36 | {
362 |     36 | #if defined(HAVE_SSE2) || defined(HAVE_NEON)
363 |     36 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
364 |     36 |   const int adim = (a_nd == 1) ? a->info.dim[0] : a->info.dim[1];
365 |     36 |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
366 |     36 |   const int gdim = (g_nd == 1) ? g->info.dim[0] : g->info.dim[1];
367 |     36 |   const int h_nd = h ? ccv_nnc_tensor_nd(h->info.dim) : 0;
368 |     36 |   const int hdim = h ? ((h_nd == 1) ? h->info.dim[0] : h->info.dim[1]) : 0;
369 |     36 | #endif
370 |     36 | #if defined(HAVE_SSE2)
371 |     36 |   if (gdim % 4 == 0 && adim % 4 == 0 && (!h || hdim % 4 == 0))
372 |      9 |     return _ccv_nnc_gemm_back_sse2(g, a, w, dw, bias, h, flags);
373 |        | #elif defined(HAVE_NEON)
374 |        |   if (gdim % 4 == 0 && adim % 4 == 0 && (!h || hdim % 4 == 0))
375 |        |     return _ccv_nnc_gemm_back_neon(g, a, w, dw, bias, h, flags);
376 |        | #endif
377 |     27 |   return CCV_NNC_EXEC_INVALID;
378 |     27 | }