Coverage Report

Created: 2017-11-12 13:27

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/blas/cpu_opt/_ccv_nnc_gemm_cpu_opt.c
Line |  Count | Source
   1 |        | #include <ccv.h>
   2 |        | #include <ccv_internal.h>
   3 |        | #include <nnc/ccv_nnc.h>
   4 |        | #include <nnc/ccv_nnc_easy.h>
   5 |        | #include <nnc/ccv_nnc_internal.h>
   6 |        | #if defined(HAVE_SSE2)
   7 |        | #include <xmmintrin.h>
   8 |        | #elif defined(HAVE_NEON)
   9 |        | #include <arm_neon.h>
  10 |        | #endif
  11 |        | #ifdef USE_OPENMP
  12 |        | #include <omp.h>
  13 |        | #endif
  14 |        | #ifdef USE_DISPATCH
  15 |        | #include <dispatch/dispatch.h>
  16 |        | #endif
  17 |        | #include "../_ccv_nnc_gemm_cpu_opt.h"
  18 |        |
  19 |        | #ifdef HAVE_SSE2
  20 |        | static int _ccv_nnc_gemm_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
  21 |     18 | {
  22 |     18 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
  23 |     18 |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
  24 |     18 |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
  25 |     18 |   const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
  26 |     18 |   assert(bdim[0] == bias->info.dim[0]);
  27 |     18 |   assert(bdim[0] == w->info.dim[0]);
  28 |     18 |   assert(adim[0] == w->info.dim[1]);
  29 |     18 |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc : a->inc + 1) : adim;
  30 |     18 |   const int* binc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc : b->inc + 1) : bdim;
  31 |     18 |   const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
  32 |     18 |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
  33 |     18 |   int i;
  34 |     36 |   for (i = 0; i < batch_size; i++)
  35 |     18 |   {
  36 |     18 |     const float* const ap = a->data.f32 + i * ainc[0];
  37 |     18 |     float* const bp = b->data.f32 + i * binc[0];
  38 |     18 |     parallel_for(j, bdim[0]) {
  39 |      0 |       const float* const wp = w->data.f32 + j * winc[1];
  40 |      0 |       int k;
  41 |      0 |       __m128 v40 = _mm_set_ss(bias->data.f32[j]);
  42 |      0 |       __m128 v41 = _mm_setzero_ps();
  43 |  18.0M |       for (k = 0; k < adim[0] - 7; k += 8)
  44 |  18.0M |       {
  45 |  18.0M |         __m128 ap40 = _mm_load_ps(ap + k);
  46 |  18.0M |         __m128 ap41 = _mm_load_ps(ap + k + 4);
  47 |  18.0M |         __m128 w40 = _mm_load_ps(wp + k);
  48 |  18.0M |         __m128 w41 = _mm_load_ps(wp + k + 4);
  49 |  18.0M |         v40 =_mm_add_ps(_mm_mul_ps(w40, ap40), v40);
  50 |  18.0M |         v41 =_mm_add_ps(_mm_mul_ps(w41, ap41), v41);
  51 |  18.0M |       }
  52 |      0 |       v40 = _mm_add_ps(v40, v41);
  53 |      0 |       v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
  54 |      0 |       v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
  55 |      0 |       _mm_store_ss(bp + j, v40);
  56 |     18 |     } parallel_endfor
  57 |     18 |   }
  58 |     18 |   return CCV_NNC_EXEC_SUCCESS;
  59 |     18 | }
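
Each parallel_for iteration above accumulates bias[j] plus the dot product of one row of a with row j of w in two 4-wide accumulators; lines 52-55 then combine the accumulators, fold the four lanes to a scalar, and store it. Below is a minimal standalone sketch of that horizontal reduction using the same SSE intrinsics; the helper name hsum_ps is illustrative and not part of ccv.

#include <xmmintrin.h>

/* Fold the four lanes of an __m128 into one float, mirroring lines 53-55. */
static float hsum_ps(__m128 v)
{
  __m128 hi = _mm_movehl_ps(v, v);                                /* {v2, v3, v2, v3} */
  __m128 pair = _mm_add_ps(v, hi);                                /* {v0+v2, v1+v3, ...} */
  __m128 total = _mm_add_ss(pair, _mm_shuffle_ps(pair, pair, 1)); /* lane 0 = v0+v1+v2+v3 */
  float out;
  _mm_store_ss(&out, total);
  return out;
}
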
  60 |        |
  61 |        | static int _ccv_nnc_gemm_back_sse2(const ccv_nnc_tensor_view_t* const g, const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, ccv_nnc_tensor_view_t* const dw, ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const h, const int flags)
  62 |      0 | {
  63 |      0 |   const int* dwinc = CCV_IS_TENSOR_VIEW(dw) ? dw->inc : dw->info.dim;
  64 |      0 |   if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
  65 |      0 |   {
  66 |      0 |     memset(dw->data.u8, 0, sizeof(float) * dwinc[1] * dw->info.dim[0]);
  67 |      0 |     memset(bias->data.u8, 0, sizeof(float) * bias->info.dim[0]);
  68 |      0 |   }
  69 |      0 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
  70 |      0 |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
  71 |      0 |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
  72 |      0 |   const int* gdim = (g_nd == 1) ? g->info.dim : g->info.dim + 1;
  73 |      0 |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
  74 |      0 |   int i, j;
  75 |      0 |   float* gp = g->data.f32;
  76 |      0 |   float* bp = bias->data.f32;
  77 |      0 |   assert(bias->info.dim[0] == gdim[0]);
  78 |      0 |   const int* ginc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == 1) ? g->inc : g->inc + 1) : gdim;
  79 |      0 |   for (i = 0; i < batch_size; i++)
  80 |      0 |   {
  81 |      0 |     for (j = 0; j < gdim[0] - 3; j += 4)
  82 |      0 |     {
  83 |      0 |       __m128 g4 = _mm_load_ps(gp + j);
  84 |      0 |       __m128 b4 = _mm_load_ps(bp + j);
  85 |      0 |       _mm_stream_ps(bp + j, _mm_add_ps(b4, g4));
  86 |      0 |     }
  87 |      0 |     gp += ginc[0];
  88 |      0 |   }
  89 |      0 |   assert(gdim[0] == dw->info.dim[0]);
  90 |      0 |   assert(adim[0] == dw->info.dim[1]);
  91 |      0 |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == 1) ? a->inc : a->inc + 1) : adim;
  92 |      0 |   for (i = 0; i < batch_size; i++)
  93 |      0 |   {
  94 |      0 |     const float* const gp = g->data.f32 + i * ginc[0];
  95 |      0 |     const float* const ap = a->data.f32 + i * ainc[0];
  96 |      0 |     parallel_for(j, gdim[0]) {
  97 |      0 |       float* const dwp = dw->data.f32 + j * dwinc[1];
  98 |      0 |       __m128 g4 = _mm_set1_ps(gp[j]);
  99 |      0 |       int k;
 100 |      0 |       for (k = 0; k < adim[0] - 3; k+= 4)
 101 |      0 |       {
 102 |      0 |         __m128 a4 = _mm_load_ps(ap + k);
 103 |      0 |         __m128 dw4 = _mm_load_ps(dwp + k);
 104 |      0 |         _mm_stream_ps(dwp + k, _mm_add_ps(dw4, _mm_mul_ps(a4, g4)));
 105 |      0 |       }
 106 |      0 |     } parallel_endfor
 107 |      0 |   }
 108 |      0 |   if (h && w)
 109 |      0 |   {
 110 |      0 |     const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
 111 |      0 |     const int* hdim = (h_nd == 1) ? h->info.dim : h->info.dim + 1;
 112 |      0 |     assert(hdim[0] == adim[0]);
 113 |      0 |     const int* hinc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == 1) ? h->inc : h->inc + 1) : hdim;
 114 |      0 |     const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
 115 |      0 |     for (i = 0; i < batch_size; i++)
 116 |      0 |     {
 117 |      0 |       const float* const gp = g->data.f32 + i * ginc[0];
 118 |      0 |       float* const hp = h->data.f32 + i * hinc[0];
 119 |      0 |       parallel_for(y, hdim[0] / 4) {
 120 |      0 |         const int j = y * 4;
 121 |      0 |         const float* const wp = w->data.f32 + j;
 122 |      0 |         __m128 v40 = _mm_setzero_ps();
 123 |      0 |         __m128 v41 = _mm_setzero_ps();
 124 |      0 |         __m128 v42 = _mm_setzero_ps();
 125 |      0 |         __m128 v43 = _mm_setzero_ps();
 126 |      0 |         int k;
 127 |      0 |         for (k = 0; k < gdim[0]; k += 4)
 128 |      0 |         {
 129 |      0 |           __m128 g4 = _mm_load_ps(gp + k);
 130 |      0 |           __m128 w40 = _mm_load_ps(wp + k * winc[1]);
 131 |      0 |           __m128 w41 = _mm_load_ps(wp + (k + 1) * winc[1]);
 132 |      0 |           __m128 w42 = _mm_load_ps(wp + (k + 2) * winc[1]);
 133 |      0 |           __m128 w43 = _mm_load_ps(wp + (k + 3) * winc[1]);
 134 |      0 |           __m128 g40 = _mm_shuffle_ps(g4, g4, 0x00);
 135 |      0 |           __m128 g41 = _mm_shuffle_ps(g4, g4, 0x55);
 136 |      0 |           __m128 g42 = _mm_shuffle_ps(g4, g4, 0xAA);
 137 |      0 |           __m128 g43 = _mm_shuffle_ps(g4, g4, 0xFF);
 138 |      0 |           v40 = _mm_add_ps(_mm_mul_ps(g40, w40), v40);
 139 |      0 |           v41 = _mm_add_ps(_mm_mul_ps(g41, w41), v41);
 140 |      0 |           v42 = _mm_add_ps(_mm_mul_ps(g42, w42), v42);
 141 |      0 |           v43 = _mm_add_ps(_mm_mul_ps(g43, w43), v43);
 142 |      0 |         }
 143 |      0 |         v40 = _mm_add_ps(v40, v41);
 144 |      0 |         v42 = _mm_add_ps(v42, v43);
 145 |      0 |         _mm_stream_ps(hp + j, _mm_add_ps(v40, v42));
 146 |      0 |       } parallel_endfor
 147 |      0 |     }
 148 |      0 |   }
 149 |      0 |   return CCV_NNC_EXEC_SUCCESS;
 150 |      0 | }
 151 |        | #endif
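
In scalar terms, the backward kernel above does three things: it accumulates the bias gradient as a running sum of g, accumulates the weight gradient as the outer products g[j] * a[k], and, when h and w are given, computes the input gradient h = w^T g. The plain-C sketch below states that math for one batch item with contiguous (non-view) tensors and w laid out gdim x adim row-major; the helper is illustrative and not part of ccv.

/* Scalar reference:
 *   dbias[j] += g[j]
 *   dw[j][k] += g[j] * a[k]
 *   h[k]      = sum_j g[j] * w[j][k]   (only when h is requested) */
static void gemm_back_ref(const float* g, const float* a, const float* w,
  float* dw, float* dbias, float* h, int gdim, int adim)
{
  int j, k;
  for (j = 0; j < gdim; j++)
  {
    dbias[j] += g[j];
    for (k = 0; k < adim; k++)
      dw[j * adim + k] += g[j] * a[k];
  }
  if (h)
    for (k = 0; k < adim; k++)
    {
      float v = 0;
      for (j = 0; j < gdim; j++)
        v += g[j] * w[j * adim + k];
      h[k] = v;
    }
}
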
 152 |        |
 153 |        | #ifdef HAVE_NEON
 154 |        | static int _ccv_nnc_gemm_forw_neon(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
 155 |        | {
 156 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 157 |        |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
 158 |        |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
 159 |        |   const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
 160 |        |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc : a->inc + 1) : adim;
 161 |        |   const int* binc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc : b->inc + 1) : bdim;
 162 |        |   const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
 163 |        |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
 164 |        |   int i;
 165 |        |   for (i = 0; i < batch_size; i++)
 166 |        |   {
 167 |        |     const float* const ap = a->data.f32 + i * ainc[0];
 168 |        |     float* const bp = b->data.f32 + i * binc[0];
 169 |        |     parallel_for(j, bdim[0]) {
 170 |        |       const float* const wp = w->data.f32 + j * winc[1];
 171 |        |       int k;
 172 |        |       float32x4_t v41 = vmovq_n_f32(0);
 173 |        |       float32x4_t v40 = vld1q_lane_f32(bias->data.f32 + j, v41, 0);
 174 |        |       for (k = 0; k < adim[0] - 7; k += 8)
 175 |        |       {
 176 |        |         float32x4_t ap40 = vld1q_f32(ap + k);
 177 |        |         float32x4_t ap41 = vld1q_f32(ap + k + 4);
 178 |        |         float32x4_t w40 = vld1q_f32(wp + k);
 179 |        |         float32x4_t w41 = vld1q_f32(wp + k + 4);
 180 |        |         v40 = vmlaq_f32(v40, w40, ap40);
 181 |        |         v41 = vmlaq_f32(v41, w41, ap41);
 182 |        |       }
 183 |        |       v40 = vaddq_f32(v40, v41);
 184 |        |       float32x2_t v2 = vpadd_f32(vget_high_f32(v40), vget_low_f32(v40));
 185 |        |       bp[j] = vget_lane_f32(vpadd_f32(v2, v2), 0);
 186 |        |     } parallel_endfor
 187 |        |   }
 188 |        |   return CCV_NNC_EXEC_SUCCESS;
 189 |        | }
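
The NEON forward path mirrors the SSE2 one; the main structural difference is the horizontal reduction on lines 184-185, which uses pairwise adds instead of shuffles. A standalone sketch of that reduction follows; the helper name hsum_f32x4 is illustrative and not part of ccv.

#include <arm_neon.h>

/* Fold the four lanes of a float32x4_t into one float, as on lines 184-185. */
static float hsum_f32x4(float32x4_t v)
{
  float32x2_t pair = vpadd_f32(vget_high_f32(v), vget_low_f32(v)); /* {v2+v3, v0+v1} */
  return vget_lane_f32(vpadd_f32(pair, pair), 0);                  /* v0+v1+v2+v3 */
}
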
 190 |        |
 191 |        | static int _ccv_nnc_gemm_back_neon(const ccv_nnc_tensor_view_t* const g, const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, ccv_nnc_tensor_view_t* const dw, ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const h, const int flags)
 192 |        | {
 193 |        |   const int* dwinc = CCV_IS_TENSOR_VIEW(dw) ? dw->inc : dw->info.dim;
 194 |        |   if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
 195 |        |   {
 196 |        |     memset(dw->data.u8, 0, sizeof(float) * dwinc[1] * dw->info.dim[0]);
 197 |        |     memset(bias->data.u8, 0, sizeof(float) * bias->info.dim[0]);
 198 |        |   }
 199 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 200 |        |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
 201 |        |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
 202 |        |   const int* gdim = (g_nd == 1) ? g->info.dim : g->info.dim + 1;
 203 |        |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
 204 |        |   int i, j;
 205 |        |   float* gp = g->data.f32;
 206 |        |   float* bp = bias->data.f32;
 207 |        |   const int* ginc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == 1) ? g->inc : g->inc + 1) : gdim;
 208 |        |   for (i = 0; i < batch_size; i++)
 209 |        |   {
 210 |        |     for (j = 0; j < gdim[0] - 3; j += 4)
 211 |        |     {
 212 |        |       float32x4_t g4 = vld1q_f32(gp + j);
 213 |        |       float32x4_t b4 = vld1q_f32(bp + j);
 214 |        |       vst1q_f32(bp + j, vaddq_f32(b4, g4));
 215 |        |     }
 216 |        |     gp += ginc[0];
 217 |        |   }
 218 |        |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == 1) ? a->inc : a->inc + 1) : adim;
 219 |        |   for (i = 0; i < batch_size; i++)
 220 |        |   {
 221 |        |     const float* const gp = g->data.f32 + i * ginc[0];
 222 |        |     const float* const ap = a->data.f32 + i * ainc[0];
 223 |        |     parallel_for(j, gdim[0]) {
 224 |        |       float* const dwp = dw->data.f32 + j * dwinc[1];
 225 |        |       float32x4_t g4 = vld1q_dup_f32(gp + j);
 226 |        |       int k;
 227 |        |       for (k = 0; k < adim[0] - 3; k+= 4)
 228 |        |       {
 229 |        |         float32x4_t a4 = vld1q_f32(ap + k);
 230 |        |         float32x4_t dw4 = vld1q_f32(dwp + k);
 231 |        |         vst1q_f32(dwp + k, vmlaq_f32(dw4, a4, g4));
 232 |        |       }
 233 |        |     } parallel_endfor
 234 |        |   }
 235 |        |   if (h && w)
 236 |        |   {
 237 |        |     const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
 238 |        |     const int* hdim = (h_nd == 1) ? h->info.dim : h->info.dim + 1;
 239 |        |     const int* hinc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == 1) ? h->inc : h->inc + 1) : hdim;
 240 |        |     const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
 241 |        |     for (i = 0; i < batch_size; i++)
 242 |        |     {
 243 |        |       const float* const gp = g->data.f32 + i * ginc[0];
 244 |        |       float* const hp = h->data.f32 + i * hinc[0];
 245 |        |       parallel_for(y, hdim[0] / 4) {
 246 |        |         const int j = y * 4;
 247 |        |         const float* const wp = w->data.f32 + j;
 248 |        |         float32x4_t v40 = vmovq_n_f32(0);
 249 |        |         float32x4_t v41 = vmovq_n_f32(0);
 250 |        |         float32x4_t v42 = vmovq_n_f32(0);
 251 |        |         float32x4_t v43 = vmovq_n_f32(0);
 252 |        |         int k;
 253 |        |         for (k = 0; k < gdim[0]; k += 4)
 254 |        |         {
 255 |        |           float32x2x2_t g4 = vld2_f32(gp + k);
 256 |        |           float32x4_t w40 = vld1q_f32(wp + k * winc[1]);
 257 |        |           float32x4_t w41 = vld1q_f32(wp + (k + 1) * winc[1]);
 258 |        |           float32x4_t w42 = vld1q_f32(wp + (k + 2) * winc[1]);
 259 |        |           float32x4_t w43 = vld1q_f32(wp + (k + 3) * winc[1]);
 260 |        |           float32x4_t g40 = vdupq_lane_f32(g4.val[0], 0);
 261 |        |           float32x4_t g41 = vdupq_lane_f32(g4.val[1], 0);
 262 |        |           float32x4_t g42 = vdupq_lane_f32(g4.val[0], 1);
 263 |        |           float32x4_t g43 = vdupq_lane_f32(g4.val[1], 1);
 264 |        |           v40 = vmlaq_f32(v40, g40, w40);
 265 |        |           v41 = vmlaq_f32(v41, g41, w41);
 266 |        |           v42 = vmlaq_f32(v42, g42, w42);
 267 |        |           v43 = vmlaq_f32(v43, g43, w43);
 268 |        |         }
 269 |        |         v40 = vaddq_f32(v40, v41);
 270 |        |         v42 = vaddq_f32(v42, v43);
 271 |        |         vst1q_f32(hp + j, vaddq_f32(v40, v42));
 272 |        |       } parallel_endfor
 273 |        |     }
 274 |        |   }
 275 |        |   return CCV_NNC_EXEC_SUCCESS;
 276 |        | }
 277 |        | #endif
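
One detail in the h-gradient loop above: vld2_f32 on line 255 loads g[k..k+3] deinterleaved, so val[0] holds {g[k], g[k+2]} and val[1] holds {g[k+1], g[k+3]}; the four vdupq_lane_f32 calls on lines 260-263 therefore broadcast g[k], g[k+1], g[k+2] and g[k+3] in that order. Below is a small sketch of just that mapping; the helper is illustrative and not part of ccv.

#include <arm_neon.h>

/* Broadcast g[0..3] into four q-registers via a deinterleaving load,
 * matching lines 255 and 260-263. */
static void broadcast4(const float* g, float32x4_t out[4])
{
  float32x2x2_t g4 = vld2_f32(g);        /* val[0] = {g[0], g[2]}, val[1] = {g[1], g[3]} */
  out[0] = vdupq_lane_f32(g4.val[0], 0); /* g[0] */
  out[1] = vdupq_lane_f32(g4.val[1], 0); /* g[1] */
  out[2] = vdupq_lane_f32(g4.val[0], 1); /* g[2] */
  out[3] = vdupq_lane_f32(g4.val[1], 1); /* g[3] */
}
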
 278 |        |
 279 |        | int _ccv_nnc_gemm_forw_cpu_opt(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
 280 |     18 | {
 281 |     18 | #if defined(HAVE_SSE2) || defined(HAVE_NEON)
 282 |     18 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 283 |     18 |   const int adim = (a_nd == 1) ? a->info.dim[0] : a->info.dim[1];
 284 |     18 | #endif
 285 |     18 | #if defined(HAVE_SSE2)
 286 |     18 |   if (adim % 8 == 0)
 287 |     18 |     return _ccv_nnc_gemm_forw_sse2(a, w, bias, b);
 288 |     18 | #elif defined(HAVE_NEON)
 289 |        |   if (adim % 8 == 0)
 290 |        |     return _ccv_nnc_gemm_forw_neon(a, w, bias, b);
 291 |        | #endif
 292 |      0 |   return CCV_NNC_EXEC_INVALID;
 293 |     18 | }
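
In scalar form, both forward kernels compute b[j] = bias[j] + sum_k w[j][k] * a[k] for each output j; the dispatcher above only hands the work to them when adim is a multiple of 8, and otherwise returns CCV_NNC_EXEC_INVALID. The plain-C reference below covers one batch item with contiguous tensors and is useful when reading or checking the SIMD paths; the helper is illustrative and not part of ccv.

/* Scalar reference: b[j] = bias[j] + dot(w row j, a), with w laid out
 * bdim x adim row-major and all tensors contiguous. */
static void gemm_forw_ref(const float* a, const float* w, const float* bias,
  float* b, int adim, int bdim)
{
  int j, k;
  for (j = 0; j < bdim; j++)
  {
    float v = bias[j];
    for (k = 0; k < adim; k++)
      v += w[j * adim + k] * a[k];
    b[j] = v;
  }
}
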
 294 |        |
 295 |        | int _ccv_nnc_gemm_back_cpu_opt(const ccv_nnc_tensor_view_t* const g, const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, ccv_nnc_tensor_view_t* const dw, ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const h, const int flags)
 296 |      0 | {
 297 |      0 | #if defined(HAVE_SSE2) || defined(HAVE_NEON)
 298 |      0 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 299 |      0 |   const int adim = (a_nd == 1) ? a->info.dim[0] : a->info.dim[1];
 300 |      0 |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
 301 |      0 |   const int gdim = (g_nd == 1) ? g->info.dim[0] : g->info.dim[1];
 302 |      0 |   const int h_nd = h ? ccv_nnc_tensor_nd(h->info.dim) : 0;
 303 |      0 |   const int hdim = h ? ((h_nd == 1) ? h->info.dim[0] : h->info.dim[1]) : 0;
 304 |      0 | #endif
 305 |      0 | #if defined(HAVE_SSE2)
 306 |      0 |   if (gdim % 4 == 0 && adim % 4 == 0 && (!h || hdim % 4 == 0))
 307 |      0 |     return _ccv_nnc_gemm_back_sse2(g, a, w, dw, bias, h, flags);
 308 |      0 | #elif defined(HAVE_NEON)
 309 |        |   if (gdim % 4 == 0 && adim % 4 == 0 && (!h || hdim % 4 == 0))
 310 |        |     return _ccv_nnc_gemm_back_neon(g, a, w, dw, bias, h, flags);
 311 |        | #endif
 312 |      0 |   return CCV_NNC_EXEC_INVALID;
 313 |      0 | }
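
Both dispatchers decline with CCV_NNC_EXEC_INVALID unless every dimension the SIMD loops touch is a whole number of vectors: adim % 8 == 0 for the forward path (lines 286 and 289), and adim, gdim and, when h is present, hdim all divisible by 4 for the backward path (lines 306 and 309). A caller-side predicate mirroring those checks, for illustration only and not part of ccv:

/* Returns nonzero when the opt kernels above would accept the given widths.
 * Pass hdim = 0 when no input gradient h is requested. */
static int gemm_cpu_opt_applicable(int adim, int gdim, int hdim, int forward)
{
  if (forward)
    return adim % 8 == 0;               /* mirrors lines 286 / 289 */
  return gdim % 4 == 0 && adim % 4 == 0
    && (hdim == 0 || hdim % 4 == 0);    /* mirrors lines 306 / 309 */
}
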