Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_gemm_cpu_opt.c
Line | Count | Source
(Branch/region execution counts from the report appear inline in the source column as /*…*/ comments.)
   1 |        | #include "ccv.h"
   2 |        | #include "ccv_internal.h"
   3 |        | #include "nnc/ccv_nnc.h"
   4 |        | #include "nnc/ccv_nnc_easy.h"
   5 |        | #include "nnc/ccv_nnc_internal.h"
   6 |        |
   7 |        | #include "_ccv_nnc_gemm_cpu_opt.h"
   8 |        |
   9 |        | FIND_FILE(cpu_opt/_ccv_nnc_gemm_cpu_opt.c, cpu_sys/_ccv_nnc_gemm_cpu_sys.c)
  10 |        |
  11 |        | enum {
  12 |        |   CCV_NNC_CMD_OPT_GEMM_ALGO_DIRECT, // Direct multiplication
  13 |        |   CCV_NNC_CMD_OPT_GEMM_ALGO_SYSTEM, // Use system GEMM
  14 |        |   CCV_NNC_CMD_OPT_GEMM_ALGO_COUNT
  15 |        | };
  16 |        |
  17 |        | static int _ccv_nnc_gemm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
  18 |  2.36k | {
  19 |  2.36k |   assert(input_size >= 2);
  20 |  2.36k |   const ccv_nnc_tensor_view_t* w = (const ccv_nnc_tensor_view_t*)inputs[1];
  21 |  2.36k |   const ccv_nnc_tensor_view_t* bias = input_size > 2 ? (const ccv_nnc_tensor_view_t*)inputs[2] /*881*/ : 0 /*1.47k*/;
  22 |  2.36k |   const ccv_nnc_tensor_view_t* a = (const ccv_nnc_tensor_view_t*)inputs[0];
  23 |  2.36k |   ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
  24 |        |   // Cannot compute if w is not transposed and dimensions are batched.
  25 |        |   // Copy most of the parameters, but reshape the dimension of a to a vector.
  26 |  2.36k |   assert(output_size == 1);
  27 |  2.36k |   switch (cmd.algorithm)
  28 |  2.36k |   {
  29 |    289 |     case CCV_NNC_CMD_OPT_GEMM_ALGO_DIRECT:
  30 |        |       // Cannot handle this with DIRECT.
  31 |    289 |       if (ccv_nnc_tensor_nd(a->info.dim) > 2 || ccv_nnc_tensor_nd(b->info.dim) > 2 ||
  32 |    289 |         ccv_nnc_tensor_nd(w->info.dim) > 2 ||
  33 |    289 |         (bias && ccv_nnc_tensor_nd(bias->info.dim) > 1 /*130*/) ||
  34 |    289 |         cmd.info.blas.transpose_a[0] != cmd.info.blas.transpose_a[1] ||
  35 |    289 |         cmd.info.blas.transpose_b[0] == cmd.info.blas.transpose_b[1])
  36 |      3 |         break;
  37 |    286 |       return _ccv_nnc_gemm_forw_cpu_opt(a, w, bias, b);
  38 |  2.07k |     case CCV_NNC_CMD_OPT_GEMM_ALGO_SYSTEM:
  39 |  2.07k |       return _ccv_nnc_gemm_forw_cpu_sys(cmd.info.blas.transpose_a, cmd.info.blas.transpose_b, a, w, bias, b);
  40 |      0 |     case -1:
  41 |        |       // Pass-through
  42 |      0 |       break;
  43 |  2.36k |   }
  44 |      3 | #if (defined HAVE_CBLAS || defined HAVE_ACCELERATE_FRAMEWORK)
  45 |      3 |   return _ccv_nnc_gemm_forw_cpu_sys(cmd.info.blas.transpose_a, cmd.info.blas.transpose_b, a, w, bias, b);
  46 |      0 | #endif
  47 |      0 |   if (ccv_nnc_tensor_nd(a->info.dim) > 2 || ccv_nnc_tensor_nd(b->info.dim) > 2 ||
  48 |      0 |     ccv_nnc_tensor_nd(w->info.dim) > 2 ||
  49 |      0 |     (bias && ccv_nnc_tensor_nd(bias->info.dim) > 1) ||
  50 |      0 |     cmd.info.blas.transpose_a[0] != cmd.info.blas.transpose_a[1] ||
  51 |      0 |     cmd.info.blas.transpose_b[0] == cmd.info.blas.transpose_b[1])
  52 |      0 |     return CCV_NNC_EXEC_INVALID;
  53 |      0 |   assert(w->info.dim[2] == 0); // It is a 2-d array.
  54 |      0 |   assert(!bias || bias->info.dim[1] == 0); // It is a 1-d array.
  55 |      0 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
  56 |      0 |   assert(a_nd == 1 || a_nd == 2);
  57 |      0 |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
  58 |      0 |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
  59 |      0 |   assert(b_nd == 1 || b_nd == 2);
  60 |      0 |   const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
  61 |      0 |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
  62 |      0 |   assert(batch_size == ((b_nd == 1) ? 1 : ccv_max(1, b->info.dim[0]))); // Parenthesized so the comparison wraps the ternary, not the other way around.
  63 |      0 |   assert(!bias || bdim[0] == bias->info.dim[0]);
  64 |      0 |   assert(bdim[0] == w->info.dim[0]);
  65 |      0 |   assert(adim[0] == w->info.dim[1]);
  66 |      0 |   return _ccv_nnc_gemm_forw_cpu_opt(a, w, bias, b);
  67 |      0 | }
  68 |        |
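Aside: what the DIRECT path computes. The shape asserts in the fallback above (bdim[0] == w->info.dim[0], adim[0] == w->info.dim[1]) show that w is laid out as a [b_rows x a_cols] matrix, so the forward pass is b = a * w^T + bias per input row. A minimal sketch of that inner loop follows, assuming dense row-major float buffers; the function and its names are illustrative only, not the actual _ccv_nnc_gemm_forw_cpu_opt kernel, which also handles batching and tensor views:

/* Illustrative only: b[i] = bias[i] + sum_j a[j] * w[i][j] for one input row,
   with w stored row-major as [b_rows x a_cols], i.e. b = a * w^T + bias. */
static void gemm_forw_sketch(const float* a, const float* w, const float* bias,
  float* b, const int a_cols, const int b_rows)
{
  int i, j;
  for (i = 0; i < b_rows; i++)
  {
    float v = bias ? bias[i] : 0;
    for (j = 0; j < a_cols; j++)
      v += a[j] * w[i * a_cols + j];
    b[i] = v;
  }
}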
  69 |        | static int _ccv_nnc_gemm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
  70 |  5.80k | {
  71 |        |   // inputs: gradient, forw prop input, [w]
  72 |        |   // outputs: [output gradient], weight updates, bias updates
  73 |  5.80k |   assert(input_size >= 2 && output_size >= 2);
  74 |  5.80k |   const ccv_nnc_tensor_view_t* g = (const ccv_nnc_tensor_view_t*)inputs[0];
  75 |  5.80k |   const ccv_nnc_tensor_view_t* a = (const ccv_nnc_tensor_view_t*)inputs[1];
  76 |  5.80k |   ccv_nnc_tensor_view_t* dw = (ccv_nnc_tensor_view_t*)outputs[1];
  77 |  5.80k |   ccv_nnc_tensor_view_t* bias = output_size > 2 ? (ccv_nnc_tensor_view_t*)outputs[2] /*3.08k*/ : 0 /*2.72k*/;
  78 |  5.80k |   const ccv_nnc_tensor_view_t* w = (input_size > 2) ? (const ccv_nnc_tensor_view_t*)inputs[2] : 0 /*0*/;
  79 |  5.80k |   ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
  80 |        |   // Cannot compute if w is not transposed and dimensions are batched.
  81 |  5.80k |   switch (cmd.algorithm)
  82 |  5.80k |   {
  83 |    200 |     case CCV_NNC_CMD_OPT_GEMM_ALGO_DIRECT:
  84 |        |       // Cannot handle this with DIRECT.
  85 |    200 |       if ((!a || ccv_nnc_tensor_nd(a->info.dim) > 2 /*96*/) || ccv_nnc_tensor_nd(g->info.dim) > 2 /*96*/ ||
  86 |    200 |         (!dw /*96*/ || ccv_nnc_tensor_nd(dw->info.dim) > 2 /*96*/) ||
  87 |    200 |         (bias /*96*/ && ccv_nnc_tensor_nd(bias->info.dim) > 1 /*57*/) ||
  88 |    200 |         cmd.info.blas.transpose_a[0] != cmd.info.blas.transpose_a[1] /*96*/ ||
  89 |    200 |         cmd.info.blas.transpose_b[0] == cmd.info.blas.transpose_b[1] /*96*/)
  90 |    107 |         break;
  91 |     93 |       return _ccv_nnc_gemm_back_cpu_opt(g, a, w, dw, bias, h, flags);
  92 |  5.60k |     case CCV_NNC_CMD_OPT_GEMM_ALGO_SYSTEM:
  93 |  5.60k |       return _ccv_nnc_gemm_back_cpu_sys(cmd.info.blas.transpose_a, cmd.info.blas.transpose_b, g, a, w, dw, bias, h, flags);
  94 |      0 |     case -1:
  95 |        |       // Pass-through
  96 |      0 |       break;
  97 |  5.80k |   }
  98 |    107 | #if (defined HAVE_CBLAS || defined HAVE_ACCELERATE_FRAMEWORK)
  99 |    107 |   return _ccv_nnc_gemm_back_cpu_sys(cmd.info.blas.transpose_a, cmd.info.blas.transpose_b, g, a, w, dw, bias, h, flags);
 100 |        | #else
 101 |        |   if (ccv_nnc_tensor_nd(a->info.dim) > 2 || ccv_nnc_tensor_nd(g->info.dim) > 2 ||
 102 |        |     ccv_nnc_tensor_nd(dw->info.dim) > 2 ||
 103 |        |     (bias && ccv_nnc_tensor_nd(bias->info.dim) > 1) ||
 104 |        |     cmd.info.blas.transpose_a[0] != cmd.info.blas.transpose_a[1] ||
 105 |        |     cmd.info.blas.transpose_b[0] == cmd.info.blas.transpose_b[1])
 106 |        |     return CCV_NNC_EXEC_INVALID;
 107 |        |   assert(dw->info.dim[2] == 0); // It is a 2-d array.
 108 |        |   assert(!bias || bias->info.dim[1] == 0); // It is a 1-d array.
 109 |        |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
 110 |        |   assert(a_nd == 1 || a_nd == 2);
 111 |        |   const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
 112 |        |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
 113 |        |   assert(g_nd == 1 || g_nd == 2);
 114 |        |   const int* gdim = (g_nd == 1) ? g->info.dim : g->info.dim + 1;
 115 |        |   const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
 116 |        |   assert(batch_size == ((g_nd == 1) ? 1 : ccv_max(1, g->info.dim[0]))); // Parenthesized so the comparison wraps the ternary, not the other way around.
 117 |        |   assert(!bias || bias->info.dim[0] == gdim[0]);
 118 |        |   assert(gdim[0] == dw->info.dim[0]);
 119 |        |   assert(adim[0] == dw->info.dim[1]);
 120 |        |   if (h)
 121 |        |   {
 122 |        |     assert(h->info.dim[2] == 0); // It is a 2-d array.
 123 |        |     const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
 124 |        |     assert(h_nd == 1 || h_nd == 2);
 125 |        |     const int* hdim = (h_nd == 1) ? h->info.dim : h->info.dim + 1;
 126 |        |     assert(hdim[0] == adim[0]);
 127 |        |   }
 128 |        |   if (w)
 129 |        |   {
 130 |        |     assert(w->info.dim[2] == 0); // It is a 2-d array.
 131 |        |     assert(w->info.dim[0] == dw->info.dim[0]);
 132 |        |     assert(w->info.dim[1] == dw->info.dim[1]);
 133 |        |   }
 134 |        |   return _ccv_nnc_gemm_back_cpu_opt(g, a, w, dw, bias, h, flags);
 135 |        | #endif
 136 |  5.80k | }
 137 |        |
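Aside: the gradients the backward pass produces. Matching the asserts above (gdim[0] == dw->info.dim[0], adim[0] == dw->info.dim[1], hdim[0] == adim[0], bias->info.dim[0] == gdim[0]), the backward of b = a * w^T + bias is dbias = g, dw = g^T a (an outer product per input row), and h = g * w. A naive per-row sketch under the same row-major assumptions as the forward sketch; illustrative only, since the real kernels also handle batching, tensor views, and the flags argument that is threaded through to them:

/* Illustrative only: accumulate dbias and dw, write the input gradient h. */
static void gemm_back_sketch(const float* g, const float* a, const float* w,
  float* dw, float* dbias, float* h, const int a_cols, const int g_rows)
{
  int i, j;
  for (i = 0; i < g_rows; i++)
  {
    if (dbias)
      dbias[i] += g[i]; /* dbias = g */
    for (j = 0; j < a_cols; j++)
      dw[i * a_cols + j] += g[i] * a[j]; /* dw = g^T a */
  }
  if (h && w)
    for (j = 0; j < a_cols; j++)
    {
      float v = 0;
      for (i = 0; i < g_rows; i++)
        v += g[i] * w[i * a_cols + j];
      h[j] = v; /* h = g w */
    }
}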
 138 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_OPT)(ccv_nnc_cmd_backend_registry_t* const registry)
 139 |      1 | {
 140 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
 141 |      1 |   registry->tensor_datatypes = CCV_32F;
 142 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 143 |      1 |   registry->algorithms = CCV_NNC_CMD_OPT_GEMM_ALGO_COUNT;
 144 |      1 |   registry->exec = _ccv_nnc_gemm_forw;
 145 |      1 | }
 146 |        |
 147 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_OPT)(ccv_nnc_cmd_backend_registry_t* const registry)
 148 |      1 | {
 149 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
 150 |      1 |   registry->tensor_datatypes = CCV_32F;
 151 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
 152 |      1 |   registry->algorithms = CCV_NNC_CMD_OPT_GEMM_ALGO_COUNT;
 153 |      1 |   registry->exec = _ccv_nnc_gemm_back;
 154 |      1 | }
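Note on registration: registry->algorithms = CCV_NNC_CMD_OPT_GEMM_ALGO_COUNT advertises two selectable variants, so a caller (or an autotuner) can pin cmd.algorithm to a specific value, while -1 falls through the switch in each function to the system BLAS path when HAVE_CBLAS or HAVE_ACCELERATE_FRAMEWORK is defined. A hypothetical selection loop over the advertised algorithms is sketched below; run_and_time is a stand-in for "execute the command once and return elapsed milliseconds", not a ccv_nnc API:

#include "nnc/ccv_nnc.h"

/* Hypothetical autotune sketch: try each advertised algorithm, keep the fastest. */
static int pick_fastest_algo(ccv_nnc_cmd_t cmd, const int algo_count,
  double (*run_and_time)(ccv_nnc_cmd_t))
{
  int algo, best_algo = -1;
  double best_ms = 1e30;
  for (algo = 0; algo < algo_count; algo++)
  {
    cmd.algorithm = algo; /* cmd is passed by value, so mutating the copy is safe */
    const double ms = run_and_time(cmd);
    if (ms < best_ms)
    {
      best_ms = ms;
      best_algo = algo;
    }
  }
  return best_algo;
}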