Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/blas/ccv_nnc_blas.c
 Count  Source
        #include "ccv.h"
        #include "ccv_internal.h"
        #include "nnc/ccv_nnc.h"
        #include "nnc/ccv_nnc_internal.h"
        #include "nnc/ccv_nnc_easy.h"

        static int _ccv_nnc_same_pos_inplace(const ccv_nnc_cmd_param_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
   309  {
          // For cudnnOpTensor: "If the input tensor B is the same tensor as the destination tensor C, then the input tensor A also must be the same tensor as the destination tensor C."
   309    return input_idx == output_idx;
   309  }

        static int _ccv_nnc_gemm_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
 4.62k  {
 4.62k    if (input_size == 3 && (input_bitmasks[0] & 7u) == ((1u << 0) | (1u << 1) | (1u << 2)) && output_bitmasks[0] == 1u)
   487      return 1;
          // No bias is OK.
 4.13k    if (input_size == 2 && (input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1)) && output_bitmasks[0] == 1u)
 4.13k      return 1;
     0    return 0;
 4.13k  }

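The bitmask callbacks above answer whether a given combination of supplied inputs and requested outputs is one the command can execute: bit i of input_bitmasks[0] marks input tensor i as present, and bit i of output_bitmasks[0] marks output tensor i as requested. A minimal standalone sketch of the encoding, assuming that packing (the surrounding framework, not this file, defines it):

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
    // GEMM forward takes inputs (a, w, bias) and produces one output b.
    uint64_t in = (1u << 0) | (1u << 1) | (1u << 2); // a, w and bias present
    uint64_t out = 1u << 0;                          // b requested
    assert((in & 7u) == 7u && out == 1u);  // accepted by the first branch
    in = (1u << 0) | (1u << 1);            // drop bias
    assert((in & 3u) == 3u && out == 1u);  // accepted by the "no bias" branch
    return 0;
  }
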
        static int _ccv_nnc_gemm_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
 36.2k  {
          // Output the propagated error, gradient w.r.t. w and bias.
 36.2k    if ((input_bitmasks[0] & 7u) == ((1u << 0) | (1u << 1) | (1u << 2) | (0 << 3)) && output_bitmasks[0] == ((1u << 0) | (1u << 1) | (1u << 2)))
 2.73k      return 1;
          // No bias.
 33.4k    if ((input_bitmasks[0] & 7u) == ((1u << 0) | (1u << 1) | (1u << 2) | (0 << 3)) && output_bitmasks[0] == ((1u << 0) | (1u << 1) | (0 << 2)))
 4.08k      return 1;
          // Don't propagate error, only gradient w.r.t. w and bias.
 29.4k    if ((input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1) | (0 << 2) | (0 << 3)) && output_bitmasks[0] == ((0 << 0) | (1u << 1) | (1u << 2)))
 2.02k      return 1;
          // No bias.
 27.3k    if ((input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1) | (0 << 2) | (0 << 3)) && output_bitmasks[0] == ((0 << 0) | (1u << 1) | (0 << 2)))
 8.03k      return 1;
          // Bias, no weight.
 19.3k    if ((input_bitmasks[0] & 5u) == ((1u << 0) | (0 << 1) | (1u << 2) | (0 << 3)) && output_bitmasks[0] == ((1u << 0) | (0u << 1) | (1u << 2)))
     4      return 1;
          // No bias, no weight.
 19.3k    if ((input_bitmasks[0] & 5u) == ((1u << 0) | (0 << 1) | (1u << 2) | (0 << 3)) && output_bitmasks[0] == ((1u << 0) | (0u << 1) | (0 << 2)))
    30      return 1;
 19.3k    return 0;
 19.3k  }

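In the backward bitmask, input bit 0 is the incoming gradient g and bits 1 and 2 are the forward inputs a and w; output bits 0, 1 and 2 are the gradients w.r.t. a, w and bias. The (0 << 3) terms only document that bit 3 is masked off by the & 7u / & 5u / & 3u before comparison. A small check of the "no bias" case under that reading (the bit assignments are inferred from the branches, not stated in this file):

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
    const uint64_t in = (1u << 0) | (1u << 1) | (1u << 2); // g, a and w present
    const uint64_t out = (1u << 0) | (1u << 1);            // da and dw requested, no dbias
    assert((in & 7u) == 7u && out == ((1u << 0) | (1u << 1))); // matches the second branch
    return 0;
  }
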
        static void _ccv_nnc_gemm_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
 23.7k  {
 23.7k    assert(output_size == 1);
 23.7k    int a_batch_size, a_rows, a_cols, a_batch_inc, a_rows_inc, a_cols_inc;
 23.7k    int w_batch_size, w_rows, w_cols, w_batch_inc, w_rows_inc, w_cols_inc;
 23.7k    const int a_nd = ccv_nnc_tensor_nd(inputs[0].dim);
 23.7k    const int w_nd = ccv_nnc_tensor_nd(inputs[1].dim);
 23.7k    const int nd = ccv_max(a_nd, w_nd);
 23.7k    ccv_nnc_tensor_get_matrix_params(inputs[0], 0, inputs[0].dim, cmd.blas.transpose_a, &a_batch_size, &a_rows, &a_cols, &a_batch_inc, &a_rows_inc, &a_cols_inc);
 23.7k    ccv_nnc_tensor_get_matrix_params(inputs[1], 0, inputs[1].dim, cmd.blas.transpose_b, &w_batch_size, &w_rows, &w_cols, &w_batch_inc, &w_rows_inc, &w_cols_inc);
 23.7k    outputs[0].type = inputs[0].type;
 23.7k    outputs[0].format = inputs[0].format;
 23.7k    outputs[0].datatype = inputs[0].datatype;
 23.7k    int b_rows = a_rows, b_cols = w_cols;
 23.7k    if (nd == 1)
     1      outputs[0].dim[0] = b_cols;
 23.7k    else if (nd == 2) {
 23.3k      if (a_nd == 1) // If a is a vector, output is a vector too.
 17.0k        outputs[0].dim[0] = b_cols;
 6.27k      else {
 6.27k        outputs[0].dim[0] = b_rows;
 6.27k        outputs[0].dim[1] = b_cols;
 6.27k      }
 23.3k    } else {
   371      assert(nd >= 3);
   371      outputs[0].dim[nd - 3] = ccv_max(a_batch_size, w_batch_size);
   371      outputs[0].dim[nd - 2] = b_rows;
   371      outputs[0].dim[nd - 1] = b_cols;
   371      int i;
   379      for (i = 0; i < nd - 3; i++)
     8      {
     8        const int a_idx = a_nd - nd + i;
     8        const int w_idx = w_nd - nd + i;
     8        outputs[0].dim[i] = ccv_max(a_idx >= 0 ? inputs[0].dim[a_idx] : 1, w_idx >= 0 ? inputs[1].dim[w_idx] : 1);
     8      }
   371    }
 23.7k  }

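Worked example of the shape inference above, assuming no transposes: a of shape [6, 3, 10] gives batch 6, rows 3, cols 10; w of shape [10, 5] gives rows 10, cols 5; nd is 3, so b comes out as [max(6, 1), 3, 5] = [6, 3, 5]. For nd > 3 the leading dimensions broadcast elementwise, right-aligned. A standalone sketch of that loop (broadcast_leading_dims is a hypothetical helper mirroring the code above):

  #include <stdio.h>

  #define MAX(x, y) ((x) > (y) ? (x) : (y))

  static void broadcast_leading_dims(const int* a_dim, int a_nd, const int* w_dim, int w_nd, int* b_dim, int nd)
  {
    int i;
    for (i = 0; i < nd - 3; i++)
    {
      const int a_idx = a_nd - nd + i; // right-align a's dims
      const int w_idx = w_nd - nd + i; // right-align w's dims
      b_dim[i] = MAX(a_idx >= 0 ? a_dim[a_idx] : 1, w_idx >= 0 ? w_dim[w_idx] : 1);
    }
  }

  int main(void)
  {
    const int a_dim[] = { 8, 1, 6, 3, 10 }; // 5-d a: leading batch dims [8, 1]
    const int w_dim[] = { 4, 6, 10, 5 };    // 4-d w: leading batch dim [4]
    int b_dim[5] = { 0 };
    broadcast_leading_dims(a_dim, 5, w_dim, 4, b_dim, 5);
    printf("%d %d\n", b_dim[0], b_dim[1]); // prints "8 4"
    return 0;
  }
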
        REGISTER_COMMAND(CCV_NNC_GEMM_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_gemm_cpu_ref.c, ccv_nnc_gemm_cpu_opt.c, gpu/ccv_nnc_gemm_gpu_cublas.cu, mps/ccv_nnc_gemm_mps.m)
     1  {
     1    registry->bitmask = _ccv_nnc_gemm_forw_bitmask;
     1    registry->tensor_auto = _ccv_nnc_gemm_tensor_auto_forw;
     1  }

        REGISTER_COMMAND(CCV_NNC_GEMM_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_gemm_cpu_ref.c, ccv_nnc_gemm_cpu_opt.c, gpu/ccv_nnc_gemm_gpu_cublas.cu, mps/ccv_nnc_gemm_mps.m)
     1  {
     1    registry->bitmask = _ccv_nnc_gemm_back_bitmask;
     1    registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_inputs;
     1  }

        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_GEMM_FORWARD)
        #define CMD_GEMM_FORWARD(...) ccv_nnc_cmd(CCV_NNC_GEMM_FORWARD, 0, CMD_GEMM(__VA_ARGS__), 0)
        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_GEMM_BACKWARD)
        #define CMD_GEMM_BACKWARD(...) ccv_nnc_cmd(CCV_NNC_GEMM_BACKWARD, 0, CMD_GEMM(__VA_ARGS__), 0)

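A hedged usage sketch of the easy macros, modeled on ccv_nnc's test style; the CMD_GEMM transpose arguments and tensor helpers (TRANSPOSE, CPU_TENSOR_NHWC, TENSOR_LIST) are assumptions from elsewhere in the library, not defined in this file:

  // b = a * w^T + bias, with w stored as [5, 10] and marked transposed.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 10), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 5, 10), 0);
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 5), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 5), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
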
        static int _ccv_nnc_add_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
    31  {
    31    if ((input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1)) && output_bitmasks[0] == 1u)
    31      return 1;
     0    return 0;
    31  }

        static int _ccv_nnc_add_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
 15.6k  {
          // w.r.t. both x and y
 15.6k    if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == ((1u << 0) | (1u << 1)))
 2.33k      return 1;
          // w.r.t. x
 13.2k    if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == ((1u << 0) | (0u << 1)))
 8.82k      return 1;
          // w.r.t. y
 4.47k    if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == ((0u << 0) | (1u << 1)))
     0      return 1;
 4.47k    return 0;
 4.47k  }

        static void _ccv_nnc_broadcast_tensor_auto_forw(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
 12.6k  {
 12.6k    assert(input_size >= 2);
 12.6k    assert(output_size == 1);
 12.6k    const int a_nd = ccv_nnc_tensor_nd(inputs[0].dim);
 12.6k    const int b_nd = ccv_nnc_tensor_nd(inputs[1].dim);
 12.6k    outputs[0] = inputs[0];
 12.6k    const int c_nd = ccv_max(a_nd, b_nd);
 12.6k    int i;
 25.5k    for (i = a_nd - 1; i >= 0; i--)
 12.9k      outputs[0].dim[i + c_nd - a_nd] = inputs[0].dim[i];
 25.4k    for (i = b_nd - 1; i >= 0; i--)
 12.8k      outputs[0].dim[i + c_nd - b_nd] = ccv_max(outputs[0].dim[i + c_nd - b_nd], inputs[1].dim[i]);
 12.6k  }

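The broadcast rule above right-aligns the two shapes and takes the elementwise max, so a [4, 1, 5] input against a [3, 1] input yields a [4, 3, 5] output. A standalone sketch of the same two loops:

  #include <stdio.h>

  #define MAX(x, y) ((x) > (y) ? (x) : (y))

  int main(void)
  {
    const int a_dim[] = { 4, 1, 5 }, b_dim[] = { 3, 1 };
    const int a_nd = 3, b_nd = 2, c_nd = MAX(a_nd, b_nd);
    int c_dim[3], i;
    for (i = a_nd - 1; i >= 0; i--) // copy a's dims, right-aligned
      c_dim[i + c_nd - a_nd] = a_dim[i];
    for (i = b_nd - 1; i >= 0; i--) // take the max against b's dims
      c_dim[i + c_nd - b_nd] = MAX(c_dim[i + c_nd - b_nd], b_dim[i]);
    printf("%d %d %d\n", c_dim[0], c_dim[1], c_dim[2]); // prints "4 3 5"
    return 0;
  }
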
        REGISTER_COMMAND(CCV_NNC_ADD_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_add_cpu_ref.c, gpu/ccv_nnc_add_gpu_cudnn.cu, mps/ccv_nnc_add_mps.m)
     1  {
     1    registry->bitmask = _ccv_nnc_add_forw_bitmask;
     1    registry->tensor_auto = _ccv_nnc_broadcast_tensor_auto_forw;
     1    registry->allow_inplace = _ccv_nnc_same_pos_inplace;
     1  }

        REGISTER_COMMAND(CCV_NNC_ADD_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_add_cpu_ref.c, gpu/ccv_nnc_add_gpu_cudnn.cu, mps/ccv_nnc_add_mps.m)
     1  {
     1    registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES;
     1    registry->bitmask = _ccv_nnc_add_back_bitmask;
     1    registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_inputs;
     1  }

        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_ADD_FORWARD)
        #define CMD_ADD_FORWARD(_p, _q) ccv_nnc_cmd(CCV_NNC_ADD_FORWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={_p, _q}}}, 0)
        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_ADD_BACKWARD)
        #define CMD_ADD_BACKWARD(_p, _q) ccv_nnc_cmd(CCV_NNC_ADD_BACKWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={_p, _q}}}, 0)

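The _p and _q arguments land in cmd.blas.a as scale factors, so the forward command computes c = _p * a + _q * b with broadcasting (a reading inferred from the parameter packing, not spelled out here). A hedged sketch in the same test style as above:

  // c = 0.5 * a + 2 * b, with the 1-d b broadcast across a's rows.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 5), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 5), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 5), 0);
  ccv_nnc_cmd_exec(CMD_ADD_FORWARD(0.5, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
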
        static int _ccv_nnc_mul_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
    26  {
    26    if ((input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1)) && output_bitmasks[0] == 1u)
    26      return 1;
     0    return 0;
    26  }

        static int _ccv_nnc_mul_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
 30.1k  {
          // w.r.t. both x and y
 30.1k    if ((input_bitmasks[0] & 7u) == 7u && output_bitmasks[0] == ((1u << 0) | (1u << 1)))
 6.04k      return 1;
          // w.r.t. x
 24.1k    if ((input_bitmasks[0] & 5u) == 5u && output_bitmasks[0] == ((1u << 0) | (0u << 1)))
     0      return 1;
          // w.r.t. y
 24.1k    if ((input_bitmasks[0] & 3u) == 3u && output_bitmasks[0] == ((0u << 0) | (1u << 1)))
 6.01k      return 1;
 18.0k    return 0;
 24.1k  }

        REGISTER_COMMAND(CCV_NNC_MUL_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_mul_cpu_ref.c, gpu/ccv_nnc_mul_gpu_cudnn.cu, mps/ccv_nnc_mul_mps.m)
     1  {
     1    registry->bitmask = _ccv_nnc_mul_forw_bitmask;
     1    registry->tensor_auto = _ccv_nnc_broadcast_tensor_auto_forw;
     1    registry->allow_inplace = _ccv_nnc_same_pos_inplace;
     1  }

        REGISTER_COMMAND(CCV_NNC_MUL_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_mul_cpu_ref.c, gpu/ccv_nnc_mul_gpu_cudnn.cu, mps/ccv_nnc_mul_mps.m)
     1  {
     1    registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES;
     1    registry->bitmask = _ccv_nnc_mul_back_bitmask;
     1    registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_inputs;
     1  }

        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_MUL_FORWARD)
        #define CMD_MUL_FORWARD(_p) ccv_nnc_cmd(CCV_NNC_MUL_FORWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={_p,}}}, 0)
        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_MUL_BACKWARD)
        #define CMD_MUL_BACKWARD(_p) ccv_nnc_cmd(CCV_NNC_MUL_BACKWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={_p,}}}, 0)

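CMD_MUL_FORWARD packs a single scale _p into blas.a[0]; on the same reading as above, the forward op is the broadcasted elementwise product c = _p * a * b. A one-line hedged sketch, reusing the tensors from the previous example:

  // c = a * b elementwise (scale 1), b broadcast as before.
  ccv_nnc_cmd_exec(CMD_MUL_FORWARD(1), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
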
        static int _ccv_nnc_scalar_mul_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
     4  {
     4    if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u)
     4      return 1;
     0    return 0;
     4  }

        static int _ccv_nnc_scalar_mul_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
   173  {
          // w.r.t. x
   173    if ((input_bitmasks[0] & 1u) == 1u && output_bitmasks[0] == 1u)
   111      return 1;
    62    return 0;
   173  }

        REGISTER_COMMAND(CCV_NNC_SCALAR_MUL_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_mul_cpu_ref.c, gpu/ccv_nnc_mul_gpu_cudnn.cu, mps/ccv_nnc_mul_mps.m)
     1  {
     1    registry->bitmask = _ccv_nnc_scalar_mul_forw_bitmask;
     1    registry->tensor_auto = ccv_nnc_hint_tensor_auto_forward_from_inputs;
     1    registry->allow_inplace = _ccv_nnc_same_pos_inplace;
     1  }

        REGISTER_COMMAND(CCV_NNC_SCALAR_MUL_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_mul_cpu_ref.c, gpu/ccv_nnc_mul_gpu_cudnn.cu, mps/ccv_nnc_mul_mps.m)
     1  {
     1    registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES;
     1    registry->bitmask = _ccv_nnc_scalar_mul_back_bitmask;
     1    registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_inputs;
     1  }

        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_SCALAR_MUL_FORWARD)
        #define CMD_SCALAR_MUL_FORWARD(_a) ccv_nnc_cmd(CCV_NNC_SCALAR_MUL_FORWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={_a,}}}, 0)
        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_SCALAR_MUL_BACKWARD)
        #define CMD_SCALAR_MUL_BACKWARD(_a) ccv_nnc_cmd(CCV_NNC_SCALAR_MUL_BACKWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}},.blas={.a={_a,}}}, 0)

        static int _ccv_nnc_cmul_forw_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
     0  {
     0    if ((input_bitmasks[0] & 3u) == ((1u << 0) | (1u << 1)) && output_bitmasks[0] == 1u)
     0      return 1;
     0    return 0;
     0  }

        static int _ccv_nnc_cmul_back_bitmask(const ccv_nnc_cmd_param_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
     6  {
          // w.r.t. both x and y
     6    if ((input_bitmasks[0] & 7u) == 7u && output_bitmasks[0] == ((1u << 0) | (1u << 1)))
     1      return 1;
          // w.r.t. x
     5    if ((input_bitmasks[0] & 5u) == 5u && output_bitmasks[0] == ((1u << 0) | (0u << 1)))
     3      return 1;
          // w.r.t. y
     2    if ((input_bitmasks[0] & 3u) == 3u && output_bitmasks[0] == ((0u << 0) | (1u << 1)))
     0      return 1;
     2    return 0;
     2  }

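Note the zero counts on _ccv_nnc_cmul_forw_bitmask: the forward bitmask was never exercised in this run. CMUL appears to be elementwise complex multiplication over interleaved (real, imag) pairs (an assumption; this file does not define the kernel). A minimal standalone sketch of that arithmetic:

  #include <stdio.h>

  int main(void)
  {
    // Interleaved (real, imag) pairs: a = 1 + 2i, b = 3 + 4i.
    const float a[2] = { 1, 2 }, b[2] = { 3, 4 };
    float c[2];
    c[0] = a[0] * b[0] - a[1] * b[1]; // real part: -5
    c[1] = a[0] * b[1] + a[1] * b[0]; // imag part: 10
    printf("%g + %gi\n", c[0], c[1]);
    return 0;
  }
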
        REGISTER_COMMAND(CCV_NNC_CMUL_FORWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_cmul_cpu_ref.c, gpu/ccv_nnc_cmul_gpu_ref.cu, mps/ccv_nnc_cmul_mps.m)
     1  {
     1    registry->bitmask = _ccv_nnc_cmul_forw_bitmask;
     1    registry->tensor_auto = _ccv_nnc_broadcast_tensor_auto_forw;
     1    registry->allow_inplace = _ccv_nnc_same_pos_inplace;
     1  }

        REGISTER_COMMAND(CCV_NNC_CMUL_BACKWARD)(ccv_nnc_cmd_registry_t* const registry)
          FIND_BACKEND(ccv_nnc_cmul_cpu_ref.c, gpu/ccv_nnc_cmul_gpu_ref.cu, mps/ccv_nnc_cmul_mps.m)
     1  {
     1    registry->flags = CCV_NNC_CMD_ATTR_NULL_IS_ONES;
     1    registry->bitmask = _ccv_nnc_cmul_back_bitmask;
     1    registry->tensor_auto = ccv_nnc_hint_tensor_auto_backward_from_inputs;
     1  }

        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_CMUL_FORWARD)
        #define CMD_CMUL_FORWARD() ccv_nnc_cmd(CCV_NNC_CMUL_FORWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}, 0)
        //@REGISTER_EASY_COMMAND_MACRO(CCV_NNC_CMUL_BACKWARD)
        #define CMD_CMUL_BACKWARD() ccv_nnc_cmd(CCV_NNC_CMUL_BACKWARD, 0, (ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}, 0)