Coverage Report

Created: 2026-04-20 13:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsblas.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <nnc/mps/ccv_nnc_mps.h>
8
#include <3rdparty/dsfmt/dSFMT.h>
9
#include <math.h>
10
#include <stdlib.h>
11
12
/* Test-harness setup hook: brings up the NNC runtime via ccv_nnc_init().
 * NOTE(review): presumably invoked by the harness before the test cases run — confirm in case.h. */
TEST_SETUP()
{
  ccv_nnc_init();
}
16
17
/* Deterministic A-matrix test pattern: an integer in [1, 23] (for non-negative
 * row/k) derived from the coordinates, scaled by 1/512. */
static float _mps_forward_na_gemm_a_value(const int row, const int k)
{
  const int mixed = row * 17 + k * 13;
  const int numerator = (mixed % 23) + 1;
  return (float)numerator / 512.0f;
}
21
22
/* Deterministic B-matrix test pattern: an integer in [1, 29] (for non-negative
 * col/k) derived from the coordinates, scaled by 1/512. */
static float _mps_forward_na_gemm_b_value(const int col, const int k)
{
  const int mixed = col * 19 + k * 7;
  const int numerator = (mixed % 29) + 1;
  return (float)numerator / 512.0f;
}
26
27
/* Signed A-matrix test pattern: (hash % 257) - 128, scaled by 1/128, so
 * non-negative coordinates yield values in [-1, 1]. */
static float _mps_forward_na_gemm_signed_a_value(const int row, const int k)
{
  const int mixed = row * 31 + k * 17;
  const int centered = (mixed % 257) - 128;
  return (float)centered / 128.0f;
}
31
32
/* Signed B-matrix test pattern: (hash % 251) - 125, scaled by 1/128. */
static float _mps_forward_na_gemm_signed_b_value(const int col, const int k)
{
  const int mixed = col * 13 + k * 29;
  const int centered = (mixed % 251) - 125;
  return (float)centered / 128.0f;
}
36
37
/* Deterministic bias test pattern: ((col * 5) % 17) - 8, scaled by 1/256. */
static float _mps_forward_na_gemm_bias_value(const int col)
{
  const int centered = ((col * 5) % 17) - 8;
  return (float)centered / 256.0f;
}
41
42
/* Fills a rows x cols half-precision matrix with the deterministic A pattern
 * (for_a != 0) or B pattern (for_a == 0). Rows are generated in float in a
 * one-row scratch buffer and converted to half precision one row at a time. */
static void _mps_forward_na_gemm_fill_half(ccv_float16_t* const data, const int rows, const int cols, const int for_a)
{
  float* const scratch = (float*)ccmalloc(sizeof(float) * cols);
  uint16_t* const half_out = (uint16_t*)data;
  int r, c;
  for (r = 0; r < rows; r++)
  {
    for (c = 0; c < cols; c++)
    {
      if (for_a)
        scratch[c] = _mps_forward_na_gemm_a_value(r, c);
      else
        scratch[c] = _mps_forward_na_gemm_b_value(r, c);
    }
    /* Row offset is widened to size_t before multiplying. */
    ccv_float_to_half_precision(scratch, half_out + (size_t)r * cols, cols);
  }
  ccfree(scratch);
}
54
55
/* Forward declaration: converts `count` elements of `datatype` stored at `data` into floats in `values`; defined later in this file and used by the validators below. */
static void _mps_forward_scaled_gemm_to_float(const int datatype, const void* const data, const int count, float* const values);
56
57
static float _mps_forward_na_gemm_round_value(const int datatype, const float value)
58
0
{
59
0
  if (datatype == CCV_16F)
60
0
  {
61
0
    uint16_t h;
62
0
    float f;
63
0
    ccv_float_to_half_precision(&value, &h, 1);
64
0
    ccv_half_precision_to_float(&h, &f, 1);
65
0
    return f;
66
0
  } else if (datatype == CCV_16BF) {
67
0
    uint16_t h;
68
0
    float f;
69
0
    ccv_float_to_bfloat(&value, &h, 1);
70
0
    ccv_bfloat_to_float(&h, &f, 1);
71
0
    return f;
72
0
  }
73
0
  return value;
74
0
}
75
76
static void _mps_forward_na_gemm_fill(const int datatype, void* const data, const int rows, const int cols, const int for_a)
77
0
{
78
0
  float* const values = (float*)ccmalloc(sizeof(float) * rows * cols);
79
0
  int i, j;
80
0
  for (i = 0; i < rows; i++)
81
0
    for (j = 0; j < cols; j++)
82
0
      values[i * cols + j] = for_a ? _mps_forward_na_gemm_a_value(i, j) : _mps_forward_na_gemm_b_value(i, j);
83
0
  if (datatype == CCV_16F)
84
0
    ccv_float_to_half_precision(values, (uint16_t*)data, rows * cols);
85
0
  else if (datatype == CCV_16BF)
86
0
    ccv_float_to_bfloat(values, (uint16_t*)data, rows * cols);
87
0
  else
88
0
    memcpy(data, values, sizeof(float) * rows * cols);
89
0
  ccfree(values);
90
0
}
91
92
static void _mps_forward_na_gemm_fill_signed(const int datatype, void* const data, const int rows, const int cols, const int for_a)
93
0
{
94
0
  float* const values = (float*)ccmalloc(sizeof(float) * rows * cols);
95
0
  int i, j;
96
0
  for (i = 0; i < rows; i++)
97
0
    for (j = 0; j < cols; j++)
98
0
      values[i * cols + j] = for_a ? _mps_forward_na_gemm_signed_a_value(i, j) : _mps_forward_na_gemm_signed_b_value(i, j);
99
0
  if (datatype == CCV_16F)
100
0
    ccv_float_to_half_precision(values, (uint16_t*)data, rows * cols);
101
0
  else if (datatype == CCV_16BF)
102
0
    ccv_float_to_bfloat(values, (uint16_t*)data, rows * cols);
103
0
  else
104
0
    memcpy(data, values, sizeof(float) * rows * cols);
105
0
  ccfree(values);
106
0
}
107
108
static void _mps_forward_na_gemm_fill_bias(const int datatype, void* const data, const int cols)
109
0
{
110
0
  float* const values = (float*)ccmalloc(sizeof(float) * cols);
111
0
  int j;
112
0
  for (j = 0; j < cols; j++)
113
0
    values[j] = _mps_forward_na_gemm_bias_value(j);
114
0
  if (datatype == CCV_16F)
115
0
    ccv_float_to_half_precision(values, (uint16_t*)data, cols);
116
0
  else if (datatype == CCV_16BF)
117
0
    ccv_float_to_bfloat(values, (uint16_t*)data, cols);
118
0
  else
119
0
    memcpy(data, values, sizeof(float) * cols);
120
0
  ccfree(values);
121
0
}
122
123
/* CPU reference for one output element of the unsigned-pattern GEMM:
 * sum over k of round(A[row, k]) * round(B[col, k]), plus round(bias[col])
 * when use_bias is set. Inputs are rounded through `datatype` first;
 * accumulation is in float. */
static float _mps_forward_na_gemm_expected(const int datatype, const int row, const int col, const int k_dim, const int use_bias)
{
  float acc = 0;
  int k = 0;
  while (k < k_dim)
  {
    const float lhs = _mps_forward_na_gemm_round_value(datatype, _mps_forward_na_gemm_a_value(row, k));
    const float rhs = _mps_forward_na_gemm_round_value(datatype, _mps_forward_na_gemm_b_value(col, k));
    acc += lhs * rhs;
    k++;
  }
  if (!use_bias)
    return acc;
  return acc + _mps_forward_na_gemm_round_value(datatype, _mps_forward_na_gemm_bias_value(col));
}
134
135
/* CPU reference for one output element of the signed-pattern GEMM:
 * sum over k of round(A_signed[row, k]) * round(B_signed[col, k]), plus
 * round(bias[col]) when use_bias is set. Inputs are rounded through
 * `datatype` first; accumulation is in float. */
static float _mps_forward_na_gemm_expected_signed(const int datatype, const int row, const int col, const int k_dim, const int use_bias)
{
  float acc = 0;
  int k = 0;
  while (k < k_dim)
  {
    const float lhs = _mps_forward_na_gemm_round_value(datatype, _mps_forward_na_gemm_signed_a_value(row, k));
    const float rhs = _mps_forward_na_gemm_round_value(datatype, _mps_forward_na_gemm_signed_b_value(col, k));
    acc += lhs * rhs;
    k++;
  }
  if (!use_bias)
    return acc;
  return acc + _mps_forward_na_gemm_round_value(datatype, _mps_forward_na_gemm_bias_value(col));
}
146
147
/* Picks up to 8 distinct indices in [0, dim) to probe along one axis.
 *
 * Candidates, in order: 0, 1, boundary - 1, boundary, then 32767 and 32768
 * (only when include_large_m_boundary is set), dim / 2 and dim - 1.
 * Out-of-range candidates and duplicates are dropped. The kept indices are
 * written to `indices` in candidate order and their count is returned. */
static int _mps_forward_na_gemm_sample_indices(const int dim, const int boundary, const int include_large_m_boundary, int indices[8])
{
  const int large_lo = include_large_m_boundary ? 32767 : -1;
  const int large_hi = include_large_m_boundary ? 32768 : -1;
  const int probes[8] = { 0, 1, boundary - 1, boundary, large_lo, large_hi, dim / 2, dim - 1 };
  int kept = 0;
  int p;
  for (p = 0; p < 8; p++)
  {
    const int value = probes[p];
    int duplicate = 0;
    int q;
    if (value < 0 || value >= dim)
      continue;
    for (q = 0; q < kept && !duplicate; q++)
      duplicate = (indices[q] == value);
    if (!duplicate)
      indices[kept++] = value;
  }
  return kept;
}
170
171
/* Diagnostic record filled in by the GEMM validators in this file. */
typedef struct {
  int row; /* Row of the recorded output element. */
  int col; /* Column of the recorded output element. */
  float actual; /* Value read back from the computed output. */
  float expected; /* CPU reference value for the same element. */
  float max_abs; /* Running maximum absolute difference; validators only ever raise it. */
  float max_rel; /* Running maximum relative difference; validators only ever raise it. */
} _mps_forward_na_gemm_mismatch_t;
179
180
static float _mps_forward_na_gemm_abs_tolerance(const int datatype)
181
0
{
182
0
  return datatype == CCV_16BF ? 2e-1f : 5e-2f;
183
0
}
184
185
static float _mps_forward_na_gemm_rel_tolerance(const int datatype)
186
0
{
187
0
  return datatype == CCV_16BF ? 5e-3f : 2e-3f;
188
0
}
189
190
/* Runs B = A * W^T (+ bias) on the GPU for an m_dim x k_dim A and n_dim x k_dim W
 * of `datatype`, then spot-checks sampled output elements against the CPU reference.
 *
 * Sampled rows/cols come from _mps_forward_na_gemm_sample_indices (row boundary 128
 * with the 32767/32768 probes enabled, column boundary 64). Each sampled element is
 * read back through a 1x1 tensor view of the GPU output.
 *
 * `mismatch` receives the coordinates/values of the last element examined, and its
 * max_abs / max_rel are raised (never reset) as larger differences are seen.
 * An element fails only when BOTH the absolute and the relative tolerance are
 * exceeded; the scan stops at the first failure.
 *
 * Returns 1 when every sampled element passes, 0 otherwise. */
static int _mps_forward_na_gemm_validate_shape_for_datatype(const int datatype, const int use_bias, const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  /* Device-side descriptors. */
  const ccv_nnc_tensor_param_t ga_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { m_dim, k_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gw_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { n_dim, k_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gbias_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { n_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gb_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { m_dim, n_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gsample_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { 1, 1, 0 },
  };
  /* Host-side descriptors: same layout, only the memory type differs. */
  ccv_nnc_tensor_param_t a_params = ga_params;
  ccv_nnc_tensor_param_t w_params = gw_params;
  ccv_nnc_tensor_param_t bias_params = gbias_params;
  ccv_nnc_tensor_param_t sample_params = gsample_params;
  a_params.type = CCV_TENSOR_CPU_MEMORY;
  w_params.type = CCV_TENSOR_CPU_MEMORY;
  bias_params.type = CCV_TENSOR_CPU_MEMORY;
  sample_params.type = CCV_TENSOR_CPU_MEMORY;

  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, gw_params, 0);
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, w_params, 0);
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;

  /* Stage inputs on the host, upload them, then drop the host copies. */
  _mps_forward_na_gemm_fill(datatype, ha->data.u8, m_dim, k_dim, 1);
  _mps_forward_na_gemm_fill(datatype, hw->data.u8, n_dim, k_dim, 0);
  if (use_bias)
  {
    _mps_forward_na_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  } else {
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  }
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  if (hbias)
    ccv_nnc_tensor_free(hbias);

  if (use_bias)
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  else
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);

  int row_picks[8];
  int col_picks[8];
  const int row_pick_count = _mps_forward_na_gemm_sample_indices(m_dim, 128, 1, row_picks);
  const int col_pick_count = _mps_forward_na_gemm_sample_indices(n_dim, 64, 0, col_picks);
  ccv_nnc_tensor_t* const sample_h = ccv_nnc_tensor_new(0, sample_params, 0);
  const float abs_tolerance = _mps_forward_na_gemm_abs_tolerance(datatype);
  const float rel_tolerance = _mps_forward_na_gemm_rel_tolerance(datatype);
  int passed = 1;
  int r, c;
  /* `passed` in both loop conditions stops the scan right after the first failing element. */
  for (r = 0; passed && r < row_pick_count; r++)
    for (c = 0; passed && c < col_pick_count; c++)
    {
      const int row = row_picks[r];
      const int col = col_picks[c];
      /* 1x1 window into the GPU output at (row, col); strides are those of the full m_dim x n_dim tensor. */
      ccv_nnc_tensor_view_t* const window = ccv_nnc_tensor_view_new(b, gsample_params, DIM_ALLOC(row, col), DIM_ALLOC(n_dim, 1));
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)window), TENSOR_LIST(sample_h), 0);
      mismatch->row = row;
      mismatch->col = col;
      _mps_forward_scaled_gemm_to_float(datatype, sample_h->data.u8, 1, &mismatch->actual);
      mismatch->expected = _mps_forward_na_gemm_expected(datatype, row, col, k_dim, use_bias);
      ccv_nnc_tensor_view_free(window);
      const float abs_diff = fabsf(mismatch->actual - mismatch->expected);
      const float denom = fmaxf(fmaxf(fabsf(mismatch->actual), fabsf(mismatch->expected)), 1.0f);
      const float rel_diff = abs_diff / denom;
      if (abs_diff > mismatch->max_abs)
        mismatch->max_abs = abs_diff;
      if (rel_diff > mismatch->max_rel)
        mismatch->max_rel = rel_diff;
      if (abs_diff > abs_tolerance && rel_diff > rel_tolerance)
        passed = 0;
    }

  ccv_nnc_tensor_free(sample_h);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  if (bias)
    ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b);
  return passed;
}
311
312
/* Runs B = A * W^T (+ bias) on the GPU for an m_dim x k_dim A and n_dim x k_dim W
 * of `datatype`, copies the whole output back, and checks EVERY element against
 * the CPU reference. `signed_values` selects the signed input patterns (and the
 * matching reference) instead of the unsigned ones.
 *
 * `mismatch` is updated with the element that has the largest absolute difference
 * seen so far (relative to the max_abs it held on entry), and max_rel is raised
 * as larger relative differences are seen. An element fails only when BOTH the
 * absolute and the relative tolerance are exceeded; the scan always covers the
 * full matrix so the maxima are complete.
 *
 * Returns 1 when all elements pass, 0 otherwise. */
static int _mps_forward_na_gemm_validate_full_shape_for_datatype(const int datatype, const int use_bias, const int signed_values, const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  /* Device-side descriptors. */
  const ccv_nnc_tensor_param_t ga_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { m_dim, k_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gw_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { n_dim, k_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gbias_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { n_dim, 0 },
  };
  const ccv_nnc_tensor_param_t gb_params = {
    .type = CCV_TENSOR_GPU_MEMORY | 000,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = datatype,
    .dim = { m_dim, n_dim, 0 },
  };
  /* Host-side descriptors: same layout, only the memory type differs. */
  ccv_nnc_tensor_param_t a_params = ga_params;
  ccv_nnc_tensor_param_t w_params = gw_params;
  ccv_nnc_tensor_param_t bias_params = gbias_params;
  ccv_nnc_tensor_param_t b_params = gb_params;
  a_params.type = CCV_TENSOR_CPU_MEMORY;
  w_params.type = CCV_TENSOR_CPU_MEMORY;
  bias_params.type = CCV_TENSOR_CPU_MEMORY;
  b_params.type = CCV_TENSOR_CPU_MEMORY;

  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, gw_params, 0);
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, w_params, 0);
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);

  /* Pick the input generator once; both share the same signature. */
  void (*const fill_inputs)(const int, void* const, const int, const int, const int) =
    signed_values ? _mps_forward_na_gemm_fill_signed : _mps_forward_na_gemm_fill;
  fill_inputs(datatype, ha->data.u8, m_dim, k_dim, 1);
  fill_inputs(datatype, hw->data.u8, n_dim, k_dim, 0);
  if (use_bias)
  {
    _mps_forward_na_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  } else {
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  }
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);

  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, m_dim * n_dim, actual);
  const float abs_tolerance = _mps_forward_na_gemm_abs_tolerance(datatype);
  const float rel_tolerance = _mps_forward_na_gemm_rel_tolerance(datatype);
  int passed = 1;
  int row, col;
  for (row = 0; row < m_dim; row++)
    for (col = 0; col < n_dim; col++)
    {
      const float got = actual[row * n_dim + col];
      float want;
      if (signed_values)
        want = _mps_forward_na_gemm_expected_signed(datatype, row, col, k_dim, use_bias);
      else
        want = _mps_forward_na_gemm_expected(datatype, row, col, k_dim, use_bias);
      const float abs_diff = fabsf(got - want);
      const float denom = fmaxf(fmaxf(fabsf(got), fabsf(want)), 1.0f);
      const float rel_diff = abs_diff / denom;
      /* Remember the worst element by absolute difference for diagnostics. */
      if (abs_diff > mismatch->max_abs)
      {
        mismatch->row = row;
        mismatch->col = col;
        mismatch->actual = got;
        mismatch->expected = want;
        mismatch->max_abs = abs_diff;
      }
      if (rel_diff > mismatch->max_rel)
        mismatch->max_rel = rel_diff;
      /* No early exit: keep scanning so the reported maxima cover the whole output. */
      if (abs_diff > abs_tolerance && rel_diff > rel_tolerance)
        passed = 0;
    }

  ccfree(actual);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  if (hbias)
    ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  if (bias)
    ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b);
  return passed;
}
427
428
/* Sampled half-precision (CCV_16F) GEMM validation without bias; see
 * _mps_forward_na_gemm_validate_shape_for_datatype for the return value and `mismatch` contract. */
static int _mps_forward_na_gemm_validate_shape(const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  const int use_bias = 0;
  return _mps_forward_na_gemm_validate_shape_for_datatype(CCV_16F, use_bias, m_dim, n_dim, k_dim, mismatch);
}
432
433
/* Sampled half-precision (CCV_16F) GEMM validation with a bias vector; see
 * _mps_forward_na_gemm_validate_shape_for_datatype for the return value and `mismatch` contract. */
static int _mps_forward_na_gemm_validate_shape_with_bias(const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  const int use_bias = 1;
  return _mps_forward_na_gemm_validate_shape_for_datatype(CCV_16F, use_bias, m_dim, n_dim, k_dim, mismatch);
}
437
438
/* Left-hand-side test pattern for the stream test: (hash % 97) - 48, scaled by 1/64.
 * `variant` shifts the hash so different variants give different data. */
static float _mps_forward_ane_stream_lhs_value(const int row, const int k, const int variant)
{
  const int mixed = row * 31 + k * 17 + variant * 19;
  const int centered = (mixed % 97) - 48;
  return (float)centered / 64.0f;
}
442
443
/* Right-hand-side test pattern for the stream test: (hash % 89) - 44, scaled by 1/64.
 * `variant` shifts the hash so different variants give different data. */
static float _mps_forward_ane_stream_rhs_value(const int row, const int k, const int variant)
{
  const int mixed = row * 13 + k * 29 + variant * 23;
  const int centered = (mixed % 89) - 44;
  return (float)centered / 64.0f;
}
447
448
/* Fills a rows x cols half-precision matrix with the stream-test LHS pattern
 * (for_lhs != 0) or RHS pattern (for_lhs == 0) for the given `variant`.
 * Rows are generated in float in a one-row scratch buffer and converted one row at a time. */
static void _mps_forward_ane_stream_fill_half(ccv_float16_t* const data, const int rows, const int cols, const int variant, const int for_lhs)
{
  float* const scratch = (float*)ccmalloc(sizeof(float) * cols);
  uint16_t* const half_out = (uint16_t*)data;
  int r, c;
  for (r = 0; r < rows; r++)
  {
    for (c = 0; c < cols; c++)
    {
      if (for_lhs)
        scratch[c] = _mps_forward_ane_stream_lhs_value(r, c, variant);
      else
        scratch[c] = _mps_forward_ane_stream_rhs_value(r, c, variant);
    }
    /* Row offset is widened to size_t before multiplying. */
    ccv_float_to_half_precision(scratch, half_out + (size_t)r * cols, cols);
  }
  ccfree(scratch);
}
460
461
static int _mps_forward_ane_rowwise_gemm_stream_sync_validate(double* const max_abs_ref, double* const max_rel_ref)
462
0
{
463
0
  const int m_dim = 512;
464
0
  const int n_dim = 768;
465
0
  const int k_dim = 1024;
466
0
  const int writer_k = 4096;
467
0
  ccv_nnc_tensor_t* const hlhs_old = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, writer_k), 0);
468
0
  ccv_nnc_tensor_t* const hrhs_old = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, k_dim, writer_k), 0);
469
0
  ccv_nnc_tensor_t* const hlhs_new = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, writer_k), 0);
470
0
  ccv_nnc_tensor_t* const hrhs_new = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, k_dim, writer_k), 0);
471
0
  ccv_nnc_tensor_t* const hw_dense = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim, k_dim), 0);
472
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16F, n_dim, k_dim)), 0);
473
0
  ccv_nnc_tensor_t* const lhs = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, writer_k), 0);
474
0
  ccv_nnc_tensor_t* const rhs = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, k_dim, writer_k), 0);
475
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, k_dim), 0);
476
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16F, n_dim, k_dim)), 0);
477
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
478
0
  ccv_nnc_tensor_t* const bref = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
479
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, n_dim), 0);
480
0
  ccv_nnc_tensor_t* const hbref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, n_dim), 0);
481
0
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
482
0
  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
483
0
  float* const expected = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
484
0
  _mps_forward_ane_stream_fill_half(hlhs_old->data.f16, m_dim, writer_k, 0, 1);
485
0
  _mps_forward_ane_stream_fill_half(hrhs_old->data.f16, k_dim, writer_k, 0, 0);
486
0
  _mps_forward_ane_stream_fill_half(hlhs_new->data.f16, m_dim, writer_k, 1, 1);
487
0
  _mps_forward_ane_stream_fill_half(hrhs_new->data.f16, k_dim, writer_k, 1, 0);
488
0
  _mps_forward_na_gemm_fill_half(hw_dense->data.f16, n_dim, k_dim, 0);
489
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hw_dense->data.f16, CCV_16F, CCV_TENSOR_CPU_MEMORY, (size_t)n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
490
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
491
0
  {
492
0
    ccv_nnc_stream_context_free(stream_context);
493
0
    ccfree(expected);
494
0
    ccfree(actual);
495
0
    ccv_nnc_tensor_free(hbref);
496
0
    ccv_nnc_tensor_free(hb);
497
0
    ccv_nnc_tensor_free(bref);
498
0
    ccv_nnc_tensor_free(b);
499
0
    ccv_nnc_tensor_free(w);
500
0
    ccv_nnc_tensor_free(a);
501
0
    ccv_nnc_tensor_free(rhs);
502
0
    ccv_nnc_tensor_free(lhs);
503
0
    ccv_nnc_tensor_free(hwq);
504
0
    ccv_nnc_tensor_free(hw_dense);
505
0
    ccv_nnc_tensor_free(hrhs_new);
506
0
    ccv_nnc_tensor_free(hlhs_new);
507
0
    ccv_nnc_tensor_free(hrhs_old);
508
0
    ccv_nnc_tensor_free(hlhs_old);
509
0
    return -1;
510
0
  }
511
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hlhs_old, hrhs_old, hwq), TENSOR_LIST(lhs, rhs, w), stream_context);
512
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(a), stream_context);
513
0
  ccv_nnc_synchronize_stream_context(stream_context);
514
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), stream_context);
515
0
  ccv_nnc_synchronize_stream_context(stream_context);
516
517
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hlhs_new, hrhs_new), TENSOR_LIST(lhs, rhs), stream_context);
518
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(a), stream_context);
519
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), stream_context);
520
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), stream_context);
521
0
  ccv_nnc_synchronize_stream_context(stream_context);
522
523
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hlhs_new, hrhs_new), TENSOR_LIST(lhs, rhs), stream_context);
524
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(a), stream_context);
525
0
  ccv_nnc_synchronize_stream_context(stream_context);
526
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(bref), stream_context);
527
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bref), TENSOR_LIST(hbref), stream_context);
528
0
  ccv_nnc_synchronize_stream_context(stream_context);
529
530
0
  _mps_forward_scaled_gemm_to_float(CCV_16F, hb->data.f16, m_dim * n_dim, actual);
531
0
  _mps_forward_scaled_gemm_to_float(CCV_16F, hbref->data.f16, m_dim * n_dim, expected);
532
0
  double max_abs = 0;
533
0
  double max_rel = 0;
534
0
  int i;
535
0
  for (i = 0; i < m_dim * n_dim; i++)
536
0
  {
537
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
538
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
539
0
    max_abs = ccv_max(max_abs, diff);
540
0
    max_rel = ccv_max(max_rel, diff / denom);
541
0
  }
542
0
  if (max_abs_ref)
543
0
    *max_abs_ref = max_abs;
544
0
  if (max_rel_ref)
545
0
    *max_rel_ref = max_rel;
546
0
  ccfree(expected);
547
0
  ccfree(actual);
548
0
  ccv_nnc_stream_context_free(stream_context);
549
0
  ccv_nnc_tensor_free(hbref);
550
0
  ccv_nnc_tensor_free(hb);
551
0
  ccv_nnc_tensor_free(bref);
552
0
  ccv_nnc_tensor_free(b);
553
0
  ccv_nnc_tensor_free(w);
554
0
  ccv_nnc_tensor_free(a);
555
0
  ccv_nnc_tensor_free(rhs);
556
0
  ccv_nnc_tensor_free(lhs);
557
0
  ccv_nnc_tensor_free(hwq);
558
0
  ccv_nnc_tensor_free(hw_dense);
559
0
  ccv_nnc_tensor_free(hrhs_new);
560
0
  ccv_nnc_tensor_free(hlhs_new);
561
0
  ccv_nnc_tensor_free(hrhs_old);
562
0
  ccv_nnc_tensor_free(hlhs_old);
563
0
  return 0;
564
0
}
565
566
/* Fills a rows x cols matrix of `datatype` at `data` with the deterministic
 * A pattern (for_a != 0) or B pattern (for_a == 0).
 * This is the same fill used by the NA GEMM tests, so it delegates to
 * _mps_forward_na_gemm_fill instead of keeping a second copy of that body. */
static void _mps_forward_scaled_gemm_fill_matrix(const int datatype, void* const data, const int rows, const int cols, const int for_a)
{
  _mps_forward_na_gemm_fill(datatype, data, rows, cols, for_a);
}
581
582
/* Writes `cols` deterministic bias values into `data`, stored as `datatype`.
 * This is the same fill used by the NA GEMM tests, so it delegates to
 * _mps_forward_na_gemm_fill_bias instead of keeping a second copy of that body. */
static void _mps_forward_scaled_gemm_fill_bias(const int datatype, void* const data, const int cols)
{
  _mps_forward_na_gemm_fill_bias(datatype, data, cols);
}
596
597
static void _mps_forward_scaled_gemm_to_float(const int datatype, const void* const data, const int count, float* const values)
598
0
{
599
0
  if (datatype == CCV_16F)
600
0
    ccv_half_precision_to_float((const uint16_t*)data, values, count);
601
0
  else if (datatype == CCV_16BF)
602
0
    ccv_bfloat_to_float((const uint16_t*)data, values, count);
603
0
  else
604
0
    memcpy(values, data, sizeof(float) * count);
605
0
}
606
607
/* Compares two densely packed rows x cols matrices of `datatype` element by element.
 * Each row is converted to float through scratch buffers, then the maximum absolute
 * difference and the maximum relative difference (difference divided by
 * max(1, |actual|, |expected|)) are tracked in double precision.
 * Results are stored through max_abs_ref / max_rel_ref; either may be NULL. */
static void _mps_forward_scaled_gemm_compare_rows(const int datatype, const void* const actual_data, const void* const expected_data, const int rows, const int cols, double* const max_abs_ref, double* const max_rel_ref)
{
  float* const got_row = (float*)ccmalloc(sizeof(float) * cols);
  float* const want_row = (float*)ccmalloc(sizeof(float) * cols);
  const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype);
  const uint8_t* const got_base = (const uint8_t*)actual_data;
  const uint8_t* const want_base = (const uint8_t*)expected_data;
  double worst_abs = 0;
  double worst_rel = 0;
  int r, c;
  for (r = 0; r < rows; r++)
  {
    /* Byte offset of row r in a dense layout. */
    const size_t row_offset = (size_t)r * cols * element_size;
    _mps_forward_scaled_gemm_to_float(datatype, got_base + row_offset, cols, got_row);
    _mps_forward_scaled_gemm_to_float(datatype, want_base + row_offset, cols, want_row);
    for (c = 0; c < cols; c++)
    {
      const double got = (double)got_row[c];
      const double want = (double)want_row[c];
      const double diff = fabs(got - want);
      const double denom = ccv_max(1.0, ccv_max(fabs(got), fabs(want)));
      worst_abs = ccv_max(worst_abs, diff);
      worst_rel = ccv_max(worst_rel, diff / denom);
    }
  }
  ccfree(want_row);
  ccfree(got_row);
  if (max_abs_ref)
    *max_abs_ref = worst_abs;
  if (max_rel_ref)
    *max_rel_ref = worst_rel;
}
636
637
static void _mps_forward_scaled_gemm_quantized_reference(const int datatype, const void* const data, const int rows, const int cols, float* const values)
638
0
{
639
0
  ccv_nnc_tensor_param_t params = {
640
0
    .type = CCV_TENSOR_CPU_MEMORY,
641
0
    .format = CCV_TENSOR_FORMAT_NHWC,
642
0
    .datatype = datatype,
643
0
    .dim = { rows, cols, 0 },
644
0
  };
645
0
  const ccv_nnc_tensor_param_t qparams = ccv_nnc_tensor_8i_rowwise(params);
646
0
  const size_t qsize = ccv_nnc_tensor_data_size_without_padding(qparams);
647
0
  uint8_t* const qdata = (uint8_t*)ccmalloc(qsize);
648
0
  const size_t encoded = ccv_nnc_quantize_8i_rowwise(data, datatype, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, qdata, qsize);
649
0
  void* dequantized = 0;
650
0
  if (datatype == CCV_16F || datatype == CCV_16BF)
651
0
    dequantized = ccmalloc(sizeof(uint16_t) * rows * cols);
652
0
  else
653
0
    dequantized = ccmalloc(sizeof(float) * rows * cols);
654
0
  ccv_nnc_dequantize_8i_rowwise(qdata, datatype, CCV_TENSOR_CPU_MEMORY, encoded, cols, dequantized, rows * cols);
655
0
  _mps_forward_scaled_gemm_to_float(datatype, dequantized, rows * cols, values);
656
0
  ccfree(dequantized);
657
0
  ccfree(qdata);
658
0
}
659
660
/* Plain float reference GEMM: out = a * w^T (+ bias).
 * `a` is m_dim x k_dim, `w` is n_dim x k_dim, `out` is m_dim x n_dim, all row-major.
 * `bias` (n_dim values) seeds each accumulator when non-NULL; otherwise it starts at 0. */
static void _mps_forward_scaled_gemm_reference(const float* const a, const float* const w, const float* const bias, const int m_dim, const int n_dim, const int k_dim, float* const out)
{
  int row, col, k;
  for (row = 0; row < m_dim; row++)
  {
    const float* const a_row = a + row * k_dim;
    float* const out_row = out + row * n_dim;
    for (col = 0; col < n_dim; col++)
    {
      const float* const w_row = w + col * k_dim;
      float acc;
      if (bias)
        acc = bias[col];
      else
        acc = 0;
      for (k = 0; k < k_dim; k++)
        acc += a_row[k] * w_row[k];
      out_row[col] = acc;
    }
  }
}
672
673
/* Batched A-matrix test pattern: (hash % 41) - 20, scaled by 1/256. */
static float _mps_forward_scaled_gemm_a_batched_value(const int batch, const int row, const int k)
{
  const int mixed = batch * 11 + row * 17 + k * 13;
  const int centered = (mixed % 41) - 20;
  return (float)centered / 256.0f;
}
677
678
/* Batched W-matrix test pattern: (hash % 43) - 21, scaled by 1/256. */
static float _mps_forward_scaled_gemm_w_batched_value(const int batch, const int col, const int k)
{
  const int mixed = batch * 7 + col * 19 + k * 5;
  const int centered = (mixed % 43) - 21;
  return (float)centered / 256.0f;
}
682
683
/* Batched bias test pattern: (hash % 23) - 11, scaled by 1/256. */
static float _mps_forward_scaled_gemm_bias_batched_value(const int batch, const int col)
{
  const int mixed = batch * 3 + col * 5;
  const int centered = (mixed % 23) - 11;
  return (float)centered / 256.0f;
}
687
688
static void _mps_forward_scaled_gemm_fill_matrix_batched(const int datatype, void* const data, const int batch_dim, const int rows, const int cols, const int for_a)
689
0
{
690
0
  float* const values = (float*)ccmalloc(sizeof(float) * batch_dim * rows * cols);
691
0
  int b, i, j;
692
0
  for (b = 0; b < batch_dim; b++)
693
0
    for (i = 0; i < rows; i++)
694
0
      for (j = 0; j < cols; j++)
695
0
        values[((b * rows) + i) * cols + j] = for_a ? _mps_forward_scaled_gemm_a_batched_value(b, i, j) : _mps_forward_scaled_gemm_w_batched_value(b, i, j);
696
0
  if (datatype == CCV_16F)
697
0
    ccv_float_to_half_precision(values, (uint16_t*)data, batch_dim * rows * cols);
698
0
  else if (datatype == CCV_16BF)
699
0
    ccv_float_to_bfloat(values, (uint16_t*)data, batch_dim * rows * cols);
700
0
  else
701
0
    memcpy(data, values, sizeof(float) * batch_dim * rows * cols);
702
0
  ccfree(values);
703
0
}
704
705
static void _mps_forward_scaled_gemm_fill_bias_batched(const int datatype, void* const data, const int batch_dim, const int cols)
706
0
{
707
0
  float* const values = (float*)ccmalloc(sizeof(float) * batch_dim * cols);
708
0
  int b, j;
709
0
  for (b = 0; b < batch_dim; b++)
710
0
    for (j = 0; j < cols; j++)
711
0
      values[b * cols + j] = _mps_forward_scaled_gemm_bias_batched_value(b, j);
712
0
  if (datatype == CCV_16F)
713
0
    ccv_float_to_half_precision(values, (uint16_t*)data, batch_dim * cols);
714
0
  else if (datatype == CCV_16BF)
715
0
    ccv_float_to_bfloat(values, (uint16_t*)data, batch_dim * cols);
716
0
  else
717
0
    memcpy(data, values, sizeof(float) * batch_dim * cols);
718
0
  ccfree(values);
719
0
}
720
721
// Batched CPU reference for out[b] = a[b] * w[b]^T (+ bias[b]) on row-major
// float buffers. a is batch_dim x m_dim x k_dim and out is
// batch_dim x m_dim x n_dim. w holds n_dim x k_dim matrices and bias holds
// n_dim vectors; either is indexed per batch only when its batch dimension is
// greater than 1, otherwise slot 0 is shared by every batch. bias may be NULL.
static void _mps_forward_scaled_gemm_reference_batched(const float* const a, const float* const w, const float* const bias, const int batch_dim, const int w_batch_dim, const int bias_batch_dim, const int m_dim, const int n_dim, const int k_dim, float* const out)
{
  int batch, row, col, depth;
  for (batch = 0; batch < batch_dim; batch++)
  {
    // The operand slots do not depend on row/col, so resolve them once per batch.
    const int w_batch = (w_batch_dim > 1) ? batch : 0;
    const int bias_batch = (bias_batch_dim > 1) ? batch : 0;
    for (row = 0; row < m_dim; row++)
    {
      const float* const a_row = a + ((batch * m_dim) + row) * k_dim;
      float* const out_row = out + ((batch * m_dim) + row) * n_dim;
      for (col = 0; col < n_dim; col++)
      {
        const float* const w_row = w + ((w_batch * n_dim) + col) * k_dim;
        // Start from the bias so the summation order is bias first, then k ascending.
        float acc = 0;
        if (bias)
          acc = bias[bias_batch * n_dim + col];
        for (depth = 0; depth < k_dim; depth++)
          acc += a_row[depth] * w_row[depth];
        out_row[col] = acc;
      }
    }
  }
}
736
737
static int _mps_forward_scaled_gemm_validate_shape(const int datatype, const int use_bias, const int m_dim, const int n_dim, const int k_dim, double* const max_abs_ref, double* const max_rel_ref)
738
0
{
739
0
  ccv_nnc_tensor_param_t ga_params = {
740
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
741
0
    .format = CCV_TENSOR_FORMAT_NHWC,
742
0
    .datatype = datatype,
743
0
    .dim = { m_dim, k_dim, 0 },
744
0
  };
745
0
  ccv_nnc_tensor_param_t gw_params = {
746
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
747
0
    .format = CCV_TENSOR_FORMAT_NHWC,
748
0
    .datatype = datatype,
749
0
    .dim = { n_dim, k_dim, 0 },
750
0
  };
751
0
  ccv_nnc_tensor_param_t gb_params = {
752
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
753
0
    .format = CCV_TENSOR_FORMAT_NHWC,
754
0
    .datatype = datatype,
755
0
    .dim = { m_dim, n_dim, 0 },
756
0
  };
757
0
  ccv_nnc_tensor_param_t gbias_params = {
758
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
759
0
    .format = CCV_TENSOR_FORMAT_NHWC,
760
0
    .datatype = datatype,
761
0
    .dim = { n_dim, 0 },
762
0
  };
763
0
  ccv_nnc_tensor_param_t a_params = {
764
0
    .type = CCV_TENSOR_CPU_MEMORY,
765
0
    .format = CCV_TENSOR_FORMAT_NHWC,
766
0
    .datatype = datatype,
767
0
    .dim = { m_dim, k_dim, 0 },
768
0
  };
769
0
  ccv_nnc_tensor_param_t w_params = {
770
0
    .type = CCV_TENSOR_CPU_MEMORY,
771
0
    .format = CCV_TENSOR_FORMAT_NHWC,
772
0
    .datatype = datatype,
773
0
    .dim = { n_dim, k_dim, 0 },
774
0
  };
775
0
  ccv_nnc_tensor_param_t b_params = {
776
0
    .type = CCV_TENSOR_CPU_MEMORY,
777
0
    .format = CCV_TENSOR_FORMAT_NHWC,
778
0
    .datatype = datatype,
779
0
    .dim = { m_dim, n_dim, 0 },
780
0
  };
781
0
  ccv_nnc_tensor_param_t bias_params = {
782
0
    .type = CCV_TENSOR_CPU_MEMORY,
783
0
    .format = CCV_TENSOR_FORMAT_NHWC,
784
0
    .datatype = datatype,
785
0
    .dim = { n_dim, 0 },
786
0
  };
787
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
788
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(w_params), 0);
789
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
790
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
791
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
792
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
793
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
794
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);
795
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, ha->data.u8, m_dim, k_dim, 1);
796
0
  if (use_bias)
797
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
798
0
  void* const w_dense = ccmalloc(CCV_GET_DATA_TYPE_SIZE(datatype) * n_dim * k_dim);
799
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, w_dense, n_dim, k_dim, 0);
800
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w_dense, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
801
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
802
0
  {
803
0
    ccfree(w_dense);
804
0
    ccv_nnc_tensor_free(ha);
805
0
    ccv_nnc_tensor_free(hwq);
806
0
    if (hbias)
807
0
      ccv_nnc_tensor_free(hbias);
808
0
    ccv_nnc_tensor_free(a);
809
0
    ccv_nnc_tensor_free(w);
810
0
    if (bias)
811
0
      ccv_nnc_tensor_free(bias);
812
0
    ccv_nnc_tensor_free(b);
813
0
    ccv_nnc_tensor_free(hb);
814
0
    return -1;
815
0
  }
816
0
  if (use_bias)
817
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, w, bias), 0);
818
0
  else
819
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, w), 0);
820
0
  if (use_bias)
821
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
822
0
  else
823
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
824
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
825
826
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * m_dim * k_dim);
827
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * n_dim * k_dim);
828
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * n_dim) : 0;
829
0
  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
830
0
  float* const expected = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
831
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, m_dim, k_dim, a_ref);
832
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, w_dense, n_dim, k_dim, w_ref);
833
0
  if (use_bias)
834
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, n_dim, bias_ref);
835
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, m_dim * n_dim, actual);
836
0
  _mps_forward_scaled_gemm_reference(a_ref, w_ref, bias_ref, m_dim, n_dim, k_dim, expected);
837
0
  double max_abs = 0;
838
0
  double max_rel = 0;
839
0
  int i;
840
0
  for (i = 0; i < m_dim * n_dim; i++)
841
0
  {
842
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
843
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
844
0
    max_abs = ccv_max(max_abs, diff);
845
0
    max_rel = ccv_max(max_rel, diff / denom);
846
0
  }
847
0
  if (max_abs_ref)
848
0
    *max_abs_ref = max_abs;
849
0
  if (max_rel_ref)
850
0
    *max_rel_ref = max_rel;
851
852
0
  ccfree(expected);
853
0
  ccfree(actual);
854
0
  if (bias_ref)
855
0
    ccfree(bias_ref);
856
0
  ccfree(w_ref);
857
0
  ccfree(a_ref);
858
0
  ccfree(w_dense);
859
0
  ccv_nnc_tensor_free(ha);
860
0
  ccv_nnc_tensor_free(hwq);
861
0
  if (hbias)
862
0
    ccv_nnc_tensor_free(hbias);
863
0
  ccv_nnc_tensor_free(a);
864
0
  ccv_nnc_tensor_free(w);
865
0
  if (bias)
866
0
    ccv_nnc_tensor_free(bias);
867
0
  ccv_nnc_tensor_free(b);
868
0
  ccv_nnc_tensor_free(hb);
869
0
  return 0;
870
0
}
871
872
// Validates the quantized GEMM at the fixed shape m = 257, n = 384, k = 128.
// Return value and out-parameters are those of
// _mps_forward_scaled_gemm_validate_shape().
static int _mps_forward_scaled_gemm_validate(const int datatype, const int use_bias, double* const max_abs_ref, double* const max_rel_ref)
{
  const int m_dim = 257;
  const int n_dim = 384;
  const int k_dim = 128;
  return _mps_forward_scaled_gemm_validate_shape(datatype, use_bias, m_dim, n_dim, k_dim, max_abs_ref, max_rel_ref);
}
876
877
// Validates the quantized GEMM at the fixed shape m = 384, n = 384, k = 128
// (same n and k as _mps_forward_scaled_gemm_validate(), different m).
// Return value and out-parameters are those of
// _mps_forward_scaled_gemm_validate_shape().
static int _mps_forward_scaled_gemm_validate_aligned_m(const int datatype, const int use_bias, double* const max_abs_ref, double* const max_rel_ref)
{
  const int m_dim = 384;
  const int n_dim = 384;
  const int k_dim = 128;
  return _mps_forward_scaled_gemm_validate_shape(datatype, use_bias, m_dim, n_dim, k_dim, max_abs_ref, max_rel_ref);
}
881
882
static int _mps_forward_scaled_gemm_validate_batched(const int datatype, const int use_bias, const int weight_batched, const int bias_batched, double* const max_abs_ref, double* const max_rel_ref)
883
0
{
884
0
  const int batch_dim = 2;
885
0
  const int m_dim = 129;
886
0
  const int n_dim = 384;
887
0
  const int k_dim = 128;
888
0
  ccv_nnc_tensor_param_t ga_params = {
889
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
890
0
    .format = CCV_TENSOR_FORMAT_NHWC,
891
0
    .datatype = datatype,
892
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
893
0
  };
894
0
  ccv_nnc_tensor_param_t gw_params = {
895
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
896
0
    .format = CCV_TENSOR_FORMAT_NHWC,
897
0
    .datatype = datatype,
898
0
    .dim = { weight_batched ? batch_dim : n_dim, weight_batched ? n_dim : k_dim, weight_batched ? k_dim : 0, 0 },
899
0
  };
900
0
  ccv_nnc_tensor_param_t gb_params = {
901
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
902
0
    .format = CCV_TENSOR_FORMAT_NHWC,
903
0
    .datatype = datatype,
904
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
905
0
  };
906
0
  ccv_nnc_tensor_param_t gbias_params = {
907
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
908
0
    .format = CCV_TENSOR_FORMAT_NHWC,
909
0
    .datatype = datatype,
910
0
    .dim = { bias_batched ? batch_dim : n_dim, bias_batched ? n_dim : 0, 0, 0 },
911
0
  };
912
0
  ccv_nnc_tensor_param_t a_params = {
913
0
    .type = CCV_TENSOR_CPU_MEMORY,
914
0
    .format = CCV_TENSOR_FORMAT_NHWC,
915
0
    .datatype = datatype,
916
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
917
0
  };
918
0
  ccv_nnc_tensor_param_t w_params = {
919
0
    .type = CCV_TENSOR_CPU_MEMORY,
920
0
    .format = CCV_TENSOR_FORMAT_NHWC,
921
0
    .datatype = datatype,
922
0
    .dim = { weight_batched ? batch_dim : n_dim, weight_batched ? n_dim : k_dim, weight_batched ? k_dim : 0, 0 },
923
0
  };
924
0
  ccv_nnc_tensor_param_t b_params = {
925
0
    .type = CCV_TENSOR_CPU_MEMORY,
926
0
    .format = CCV_TENSOR_FORMAT_NHWC,
927
0
    .datatype = datatype,
928
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
929
0
  };
930
0
  ccv_nnc_tensor_param_t bias_params = {
931
0
    .type = CCV_TENSOR_CPU_MEMORY,
932
0
    .format = CCV_TENSOR_FORMAT_NHWC,
933
0
    .datatype = datatype,
934
0
    .dim = { bias_batched ? batch_dim : n_dim, bias_batched ? n_dim : 0, 0, 0 },
935
0
  };
936
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
937
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(w_params), 0);
938
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
939
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
940
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
941
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
942
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
943
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);
944
0
  _mps_forward_scaled_gemm_fill_matrix_batched(datatype, ha->data.u8, batch_dim, m_dim, k_dim, 1);
945
0
  if (use_bias)
946
0
  {
947
0
    if (bias_batched)
948
0
      _mps_forward_scaled_gemm_fill_bias_batched(datatype, hbias->data.u8, batch_dim, n_dim);
949
0
    else
950
0
      _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
951
0
  }
952
0
  const int w_batch_dim = weight_batched ? batch_dim : 1;
953
0
  void* const w_dense = ccmalloc(CCV_GET_DATA_TYPE_SIZE(datatype) * w_batch_dim * n_dim * k_dim);
954
0
  if (weight_batched)
955
0
    _mps_forward_scaled_gemm_fill_matrix_batched(datatype, w_dense, batch_dim, n_dim, k_dim, 0);
956
0
  else
957
0
    _mps_forward_scaled_gemm_fill_matrix(datatype, w_dense, n_dim, k_dim, 0);
958
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w_dense, datatype, CCV_TENSOR_CPU_MEMORY, w_batch_dim * n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
959
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
960
0
  {
961
0
    ccfree(w_dense);
962
0
    ccv_nnc_tensor_free(ha);
963
0
    ccv_nnc_tensor_free(hwq);
964
0
    if (hbias)
965
0
      ccv_nnc_tensor_free(hbias);
966
0
    ccv_nnc_tensor_free(a);
967
0
    ccv_nnc_tensor_free(w);
968
0
    if (bias)
969
0
      ccv_nnc_tensor_free(bias);
970
0
    ccv_nnc_tensor_free(b);
971
0
    ccv_nnc_tensor_free(hb);
972
0
    return -1;
973
0
  }
974
0
  if (use_bias)
975
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, w, bias), 0);
976
0
  else
977
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, w), 0);
978
0
  if (weight_batched)
979
0
  {
980
0
    if (use_bias)
981
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
982
0
    else
983
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
984
0
  } else {
985
0
    if (use_bias)
986
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
987
0
    else
988
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
989
0
  }
990
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
991
992
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * k_dim);
993
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * w_batch_dim * n_dim * k_dim);
994
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * (bias_batched ? batch_dim : 1) * n_dim) : 0;
995
0
  float* const actual = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
996
0
  float* const expected = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
997
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, batch_dim * m_dim, k_dim, a_ref);
998
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, w_dense, w_batch_dim * n_dim, k_dim, w_ref);
999
0
  if (use_bias)
1000
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, (bias_batched ? batch_dim : 1) * n_dim, bias_ref);
1001
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, batch_dim * m_dim * n_dim, actual);
1002
0
  _mps_forward_scaled_gemm_reference_batched(a_ref, w_ref, bias_ref, batch_dim, w_batch_dim, bias_batched ? batch_dim : 1, m_dim, n_dim, k_dim, expected);
1003
0
  double max_abs = 0;
1004
0
  double max_rel = 0;
1005
0
  int i;
1006
0
  for (i = 0; i < batch_dim * m_dim * n_dim; i++)
1007
0
  {
1008
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
1009
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
1010
0
    max_abs = ccv_max(max_abs, diff);
1011
0
    max_rel = ccv_max(max_rel, diff / denom);
1012
0
  }
1013
0
  if (max_abs_ref)
1014
0
    *max_abs_ref = max_abs;
1015
0
  if (max_rel_ref)
1016
0
    *max_rel_ref = max_rel;
1017
1018
0
  ccfree(expected);
1019
0
  ccfree(actual);
1020
0
  if (bias_ref)
1021
0
    ccfree(bias_ref);
1022
0
  ccfree(w_ref);
1023
0
  ccfree(a_ref);
1024
0
  ccfree(w_dense);
1025
0
  ccv_nnc_tensor_free(ha);
1026
0
  ccv_nnc_tensor_free(hwq);
1027
0
  if (hbias)
1028
0
    ccv_nnc_tensor_free(hbias);
1029
0
  ccv_nnc_tensor_free(a);
1030
0
  ccv_nnc_tensor_free(w);
1031
0
  if (bias)
1032
0
    ccv_nnc_tensor_free(bias);
1033
0
  ccv_nnc_tensor_free(b);
1034
0
  ccv_nnc_tensor_free(hb);
1035
0
  return 0;
1036
0
}
1037
1038
static int _mps_forward_scaled_gemm_compare_dense(const int datatype, const int use_bias, const int m_dim, const int n_dim, const int k_dim, double* const max_abs_ref, double* const max_rel_ref)
1039
0
{
1040
0
  ccv_nnc_tensor_param_t ga_params = {
1041
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1042
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1043
0
    .datatype = datatype,
1044
0
    .dim = { m_dim, k_dim, 0 },
1045
0
  };
1046
0
  ccv_nnc_tensor_param_t gwq_params = {
1047
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1048
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1049
0
    .datatype = ((datatype >> 12) & 0xff) | CCV_QX | CCV_NNC_QX_8I_ROWWISE,
1050
0
    .dim = { n_dim, k_dim, 0 },
1051
0
  };
1052
0
  ccv_nnc_tensor_param_t gwd_params = {
1053
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1054
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1055
0
    .datatype = datatype,
1056
0
    .dim = { n_dim, k_dim, 0 },
1057
0
  };
1058
0
  ccv_nnc_tensor_param_t gb_params = {
1059
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1060
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1061
0
    .datatype = datatype,
1062
0
    .dim = { m_dim, n_dim, 0 },
1063
0
  };
1064
0
  ccv_nnc_tensor_param_t gbias_params = {
1065
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1066
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1067
0
    .datatype = datatype,
1068
0
    .dim = { n_dim, 0 },
1069
0
  };
1070
0
  ccv_nnc_tensor_param_t a_params = {
1071
0
    .type = CCV_TENSOR_CPU_MEMORY,
1072
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1073
0
    .datatype = datatype,
1074
0
    .dim = { m_dim, k_dim, 0 },
1075
0
  };
1076
0
  ccv_nnc_tensor_param_t wd_params = {
1077
0
    .type = CCV_TENSOR_CPU_MEMORY,
1078
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1079
0
    .datatype = datatype,
1080
0
    .dim = { n_dim, k_dim, 0 },
1081
0
  };
1082
0
  ccv_nnc_tensor_param_t b_params = {
1083
0
    .type = CCV_TENSOR_CPU_MEMORY,
1084
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1085
0
    .datatype = datatype,
1086
0
    .dim = { m_dim, n_dim, 0 },
1087
0
  };
1088
0
  ccv_nnc_tensor_param_t bias_params = {
1089
0
    .type = CCV_TENSOR_CPU_MEMORY,
1090
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1091
0
    .datatype = datatype,
1092
0
    .dim = { n_dim, 0 },
1093
0
  };
1094
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
1095
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, wd_params, 0);
1096
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(wd_params), 0);
1097
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
1098
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
1099
0
  ccv_nnc_tensor_t* const wq = ccv_nnc_tensor_new(0, gwq_params, 0);
1100
0
  ccv_nnc_tensor_t* const wd = ccv_nnc_tensor_new(0, gwd_params, 0);
1101
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
1102
0
  ccv_nnc_tensor_t* const bq = ccv_nnc_tensor_new(0, gb_params, 0);
1103
0
  ccv_nnc_tensor_t* const bd = ccv_nnc_tensor_new(0, gb_params, 0);
1104
0
  ccv_nnc_tensor_t* const hbq = ccv_nnc_tensor_new(0, b_params, 0);
1105
0
  ccv_nnc_tensor_t* const hbd = ccv_nnc_tensor_new(0, b_params, 0);
1106
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, ha->data.u8, m_dim, k_dim, 1);
1107
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, hwd->data.u8, n_dim, k_dim, 0);
1108
0
  if (use_bias)
1109
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
1110
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
1111
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
1112
0
    return -1;
1113
0
  if (use_bias)
1114
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, wq, bias), 0);
1115
0
  else
1116
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, wq), 0);
1117
0
  ccv_nnc_dequantize_8i_rowwise(wq->data.u8, datatype, CCV_TENSOR_GPU_MEMORY, qsize, k_dim, wd->data.u8, n_dim * k_dim);
1118
0
  if (use_bias) {
1119
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq, bias), TENSOR_LIST(bq), 0);
1120
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wd, bias), TENSOR_LIST(bd), 0);
1121
0
  } else {
1122
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq), TENSOR_LIST(bq), 0);
1123
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wd), TENSOR_LIST(bd), 0);
1124
0
  }
1125
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bq, bd), TENSOR_LIST(hbq, hbd), 0);
1126
0
  _mps_forward_scaled_gemm_compare_rows(datatype, hbq->data.u8, hbd->data.u8, m_dim, n_dim, max_abs_ref, max_rel_ref);
1127
0
  ccv_nnc_tensor_free(hbq);
1128
0
  ccv_nnc_tensor_free(hbd);
1129
0
  ccv_nnc_tensor_free(bq);
1130
0
  ccv_nnc_tensor_free(bd);
1131
0
  ccv_nnc_tensor_free(a);
1132
0
  ccv_nnc_tensor_free(wq);
1133
0
  ccv_nnc_tensor_free(wd);
1134
0
  ccv_nnc_tensor_free(ha);
1135
0
  ccv_nnc_tensor_free(hwd);
1136
0
  ccv_nnc_tensor_free(hwq);
1137
0
  if (hbias)
1138
0
    ccv_nnc_tensor_free(hbias);
1139
0
  if (bias)
1140
0
    ccv_nnc_tensor_free(bias);
1141
0
  return 0;
1142
0
}
1143
1144
static int _mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(const int datatype, const int use_bias, const int batch_dim, const int m_dim, const int n_dim, const int k_dim, const int padded_m_dim, double* const max_abs_ref, double* const max_rel_ref)
1145
0
{
1146
0
  ccv_nnc_tensor_param_t ga_storage_params = {
1147
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1148
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1149
0
    .datatype = datatype,
1150
0
    .dim = { batch_dim, padded_m_dim, k_dim, 0 },
1151
0
  };
1152
0
  ccv_nnc_tensor_param_t ga_view_params = {
1153
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1154
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1155
0
    .datatype = datatype,
1156
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
1157
0
  };
1158
0
  ccv_nnc_tensor_param_t gwq_params = {
1159
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1160
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1161
0
    .datatype = ((datatype >> 12) & 0xff) | CCV_QX | CCV_NNC_QX_8I_ROWWISE,
1162
0
    .dim = { n_dim, k_dim, 0 },
1163
0
  };
1164
0
  ccv_nnc_tensor_param_t gb_params = {
1165
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1166
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1167
0
    .datatype = datatype,
1168
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
1169
0
  };
1170
0
  ccv_nnc_tensor_param_t gbias_params = {
1171
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1172
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1173
0
    .datatype = datatype,
1174
0
    .dim = { n_dim, 0 },
1175
0
  };
1176
0
  ccv_nnc_tensor_param_t ha_storage_params = {
1177
0
    .type = CCV_TENSOR_CPU_MEMORY,
1178
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1179
0
    .datatype = datatype,
1180
0
    .dim = { batch_dim, padded_m_dim, k_dim, 0 },
1181
0
  };
1182
0
  ccv_nnc_tensor_param_t ha_view_params = {
1183
0
    .type = CCV_TENSOR_CPU_MEMORY,
1184
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1185
0
    .datatype = datatype,
1186
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
1187
0
  };
1188
0
  ccv_nnc_tensor_param_t wd_params = {
1189
0
    .type = CCV_TENSOR_CPU_MEMORY,
1190
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1191
0
    .datatype = datatype,
1192
0
    .dim = { n_dim, k_dim, 0 },
1193
0
  };
1194
0
  ccv_nnc_tensor_param_t b_params = {
1195
0
    .type = CCV_TENSOR_CPU_MEMORY,
1196
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1197
0
    .datatype = datatype,
1198
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
1199
0
  };
1200
0
  ccv_nnc_tensor_param_t bias_params = {
1201
0
    .type = CCV_TENSOR_CPU_MEMORY,
1202
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1203
0
    .datatype = datatype,
1204
0
    .dim = { n_dim, 0 },
1205
0
  };
1206
0
  ccv_nnc_tensor_t* const ha_storage = ccv_nnc_tensor_new(0, ha_storage_params, 0);
1207
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, wd_params, 0);
1208
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(wd_params), 0);
1209
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
1210
0
  ccv_nnc_tensor_t* const a_storage = ccv_nnc_tensor_new(0, ga_storage_params, 0);
1211
0
  ccv_nnc_tensor_t* const wq = ccv_nnc_tensor_new(0, gwq_params, 0);
1212
0
  ccv_nnc_tensor_t* const bq = ccv_nnc_tensor_new(0, gb_params, 0);
1213
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
1214
0
  ccv_nnc_tensor_t* const hbq = ccv_nnc_tensor_new(0, b_params, 0);
1215
0
  ccv_nnc_tensor_view_t* const ha = ccv_nnc_tensor_view_new(ha_storage, ha_view_params, ccv_nnc_no_ofs, DIM_ALLOC(padded_m_dim * k_dim, k_dim, 1));
1216
0
  ccv_nnc_tensor_view_t* const a = ccv_nnc_tensor_view_new(a_storage, ga_view_params, ccv_nnc_no_ofs, DIM_ALLOC(padded_m_dim * k_dim, k_dim, 1));
1217
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * k_dim);
1218
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * n_dim * k_dim);
1219
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * n_dim) : 0;
1220
0
  float* const out_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
1221
0
  int bch, i, j;
1222
0
  for (bch = 0; bch < batch_dim; bch++)
1223
0
    for (i = 0; i < padded_m_dim; i++)
1224
0
      for (j = 0; j < k_dim; j++)
1225
0
      {
1226
0
        const int dst = ((bch * padded_m_dim) + i) * k_dim + j;
1227
0
        float value = 0;
1228
0
        if (i < m_dim)
1229
0
        {
1230
0
          value = _mps_forward_scaled_gemm_a_batched_value(bch, i, j);
1231
0
          a_ref[((bch * m_dim) + i) * k_dim + j] = value;
1232
0
        }
1233
0
        if (datatype == CCV_16F)
1234
0
          ccv_float_to_half_precision(&value, ((uint16_t*)ha_storage->data.u8) + dst, 1);
1235
0
        else if (datatype == CCV_16BF)
1236
0
          ccv_float_to_bfloat(&value, ((uint16_t*)ha_storage->data.u8) + dst, 1);
1237
0
        else
1238
0
          ((float*)ha_storage->data.f32)[dst] = value;
1239
0
      }
1240
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, hwd->data.u8, n_dim, k_dim, 0);
1241
0
  _mps_forward_scaled_gemm_to_float(datatype, hwd->data.u8, n_dim * k_dim, w_ref);
1242
0
  if (use_bias)
1243
0
  {
1244
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
1245
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, n_dim, bias_ref);
1246
0
  }
1247
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
1248
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
1249
0
    return -1;
1250
0
  if (use_bias)
1251
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_storage, hwq, hbias), TENSOR_LIST(a_storage, wq, bias), 0);
1252
0
  else
1253
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_storage, hwq), TENSOR_LIST(a_storage, wq), 0);
1254
0
  if (use_bias)
1255
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)a, wq, bias), TENSOR_LIST(bq), 0);
1256
0
  else
1257
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)a, wq), TENSOR_LIST(bq), 0);
1258
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bq), TENSOR_LIST(hbq), 0);
1259
0
  _mps_forward_scaled_gemm_reference_batched(a_ref, w_ref, bias_ref, batch_dim, 1, use_bias ? 1 : 0, m_dim, n_dim, k_dim, out_ref);
1260
0
  if (datatype == CCV_16F)
1261
0
    ccv_float_to_half_precision(out_ref, (uint16_t*)ha_storage->data.u8, batch_dim * m_dim * n_dim);
1262
0
  else if (datatype == CCV_16BF)
1263
0
    ccv_float_to_bfloat(out_ref, (uint16_t*)ha_storage->data.u8, batch_dim * m_dim * n_dim);
1264
0
  else
1265
0
    memcpy(ha_storage->data.f32, out_ref, sizeof(float) * batch_dim * m_dim * n_dim);
1266
0
  _mps_forward_scaled_gemm_compare_rows(datatype, hbq->data.u8, ha_storage->data.u8, batch_dim * m_dim, n_dim, max_abs_ref, max_rel_ref);
1267
0
  ccfree(out_ref);
1268
0
  if (bias_ref)
1269
0
    ccfree(bias_ref);
1270
0
  ccfree(w_ref);
1271
0
  ccfree(a_ref);
1272
0
  ccv_nnc_tensor_view_free(ha);
1273
0
  ccv_nnc_tensor_view_free(a);
1274
0
  ccv_nnc_tensor_free(ha_storage);
1275
0
  ccv_nnc_tensor_free(hwd);
1276
0
  ccv_nnc_tensor_free(hwq);
1277
0
  if (hbias)
1278
0
    ccv_nnc_tensor_free(hbias);
1279
0
  ccv_nnc_tensor_free(a_storage);
1280
0
  ccv_nnc_tensor_free(wq);
1281
0
  ccv_nnc_tensor_free(bq);
1282
0
  if (bias)
1283
0
    ccv_nnc_tensor_free(bias);
1284
0
  ccv_nnc_tensor_free(hbq);
1285
0
  return 0;
1286
0
}
1287
1288
// Deterministic test pattern for element (row, k) of the segmented-GEMM input:
// an integer in [-30, 30] (for non-negative indices) scaled by 1/128.
static float _mps_segmented_scaled_gemm_a_value(const int row, const int k)
{
  const int mixed = row * 17 + k * 13;
  const int centered = (mixed % 61) - 30;
  return (float)centered / 128.0f;
}
1292
1293
// Deterministic test pattern for element (segment, col, k) of the segmented-GEMM
// weight: an integer in [-33, 33] (for non-negative indices) scaled by 1/256.
static float _mps_segmented_scaled_gemm_w_value(const int segment, const int col, const int k)
{
  const int mixed = segment * 23 + col * 11 + k * 7;
  const int centered = (mixed % 67) - 33;
  return (float)centered / 256.0f;
}
1297
1298
// Deterministic test pattern for element (segment, col) of the segmented-GEMM
// bias: an integer in [-14, 14] (for non-negative indices) scaled by 1/256.
static float _mps_segmented_scaled_gemm_bias_value(const int segment, const int col)
{
  const int mixed = segment * 5 + col * 3;
  const int centered = (mixed % 29) - 14;
  return (float)centered / 256.0f;
}
1302
1303
static int _mps_segmented_scaled_gemm_validate(const int datatype, const int use_bias, const int force_fallback, double* const max_abs_ref, double* const max_rel_ref)
1304
0
{
1305
0
  const int total_m = 384;
1306
0
  const int n_dim = 128;
1307
0
  const int k_dim = 256;
1308
0
  const int segments = 3;
1309
0
  const int counts_data[] = {129, 131, 124};
1310
0
  const int indices_data[] = {1, 0, 2};
1311
0
  const ccv_nnc_tensor_param_t ha_params = {
1312
0
    .type = CCV_TENSOR_CPU_MEMORY,
1313
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1314
0
    .datatype = datatype,
1315
0
    .dim = { total_m, k_dim, 0 },
1316
0
  };
1317
0
  const ccv_nnc_tensor_param_t hwd_params = {
1318
0
    .type = CCV_TENSOR_CPU_MEMORY,
1319
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1320
0
    .datatype = datatype,
1321
0
    .dim = { segments, n_dim, k_dim, 0 },
1322
0
  };
1323
0
  const ccv_nnc_tensor_param_t hbias_params = {
1324
0
    .type = CCV_TENSOR_CPU_MEMORY,
1325
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1326
0
    .datatype = datatype,
1327
0
    .dim = { segments, n_dim, 0 },
1328
0
  };
1329
0
  const ccv_nnc_tensor_param_t ga_params = {
1330
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1331
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1332
0
    .datatype = datatype,
1333
0
    .dim = { total_m, k_dim, 0 },
1334
0
  };
1335
0
  const ccv_nnc_tensor_param_t gw_params = {
1336
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1337
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1338
0
    .datatype = datatype,
1339
0
    .dim = { segments, n_dim, k_dim, 0 },
1340
0
  };
1341
0
  const ccv_nnc_tensor_param_t gbias_params = {
1342
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1343
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1344
0
    .datatype = datatype,
1345
0
    .dim = { segments, n_dim, 0 },
1346
0
  };
1347
0
  const ccv_nnc_tensor_param_t gb_params = {
1348
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1349
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1350
0
    .datatype = datatype,
1351
0
    .dim = { total_m, n_dim, 0 },
1352
0
  };
1353
0
  const ccv_nnc_tensor_param_t hb_params = {
1354
0
    .type = CCV_TENSOR_CPU_MEMORY,
1355
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1356
0
    .datatype = datatype,
1357
0
    .dim = { total_m, n_dim, 0 },
1358
0
  };
1359
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, ha_params, 0);
1360
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, segments), 0);
1361
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, segments), 0);
1362
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, hwd_params, 0);
1363
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(hwd_params), 0);
1364
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, hbias_params, 0) : 0;
1365
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
1366
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, segments), 0);
1367
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, segments), 0);
1368
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
1369
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
1370
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
1371
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, hb_params, 0);
1372
0
  float* const a_values = (float*)ccmalloc(sizeof(float) * total_m * k_dim);
1373
0
  float* const w_values = (float*)ccmalloc(sizeof(float) * segments * n_dim * k_dim);
1374
0
  float* const bias_values = use_bias ? (float*)ccmalloc(sizeof(float) * segments * n_dim) : 0;
1375
0
  int i, j, k;
1376
0
  for (i = 0; i < total_m; i++)
1377
0
    for (k = 0; k < k_dim; k++)
1378
0
      a_values[i * k_dim + k] = _mps_segmented_scaled_gemm_a_value(i, k);
1379
0
  for (i = 0; i < segments; i++)
1380
0
    for (j = 0; j < n_dim; j++)
1381
0
      for (k = 0; k < k_dim; k++)
1382
0
        w_values[((i * n_dim) + j) * k_dim + k] = _mps_segmented_scaled_gemm_w_value(i, j, k);
1383
0
  if (use_bias)
1384
0
    for (i = 0; i < segments; i++)
1385
0
      for (j = 0; j < n_dim; j++)
1386
0
        bias_values[i * n_dim + j] = _mps_segmented_scaled_gemm_bias_value(i, j);
1387
0
  if (datatype == CCV_16F)
1388
0
  {
1389
0
    ccv_float_to_half_precision(a_values, (uint16_t*)ha->data.u8, total_m * k_dim);
1390
0
    ccv_float_to_half_precision(w_values, (uint16_t*)hwd->data.u8, segments * n_dim * k_dim);
1391
0
    if (use_bias)
1392
0
      ccv_float_to_half_precision(bias_values, (uint16_t*)hbias->data.u8, segments * n_dim);
1393
0
  } else if (datatype == CCV_16BF) {
1394
0
    ccv_float_to_bfloat(a_values, (uint16_t*)ha->data.u8, total_m * k_dim);
1395
0
    ccv_float_to_bfloat(w_values, (uint16_t*)hwd->data.u8, segments * n_dim * k_dim);
1396
0
    if (use_bias)
1397
0
      ccv_float_to_bfloat(bias_values, (uint16_t*)hbias->data.u8, segments * n_dim);
1398
0
  } else {
1399
0
    memcpy(ha->data.f32, a_values, sizeof(float) * total_m * k_dim);
1400
0
    memcpy(hwd->data.f32, w_values, sizeof(float) * segments * n_dim * k_dim);
1401
0
    if (use_bias)
1402
0
      memcpy(hbias->data.f32, bias_values, sizeof(float) * segments * n_dim);
1403
0
  }
1404
0
  memcpy(hindices->data.i32, indices_data, sizeof(indices_data));
1405
0
  memcpy(hcounts->data.i32, counts_data, sizeof(counts_data));
1406
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, (size_t)segments * n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
1407
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
1408
0
    return -1;
1409
0
  if (use_bias)
1410
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hwq, hbias), TENSOR_LIST(a, indices, counts, w, bias), 0);
1411
0
  else
1412
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hwq), TENSOR_LIST(a, indices, counts, w), 0);
1413
0
  const uint64_t old_flags = ccv_nnc_flags();
1414
0
  if (force_fallback)
1415
0
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
1416
0
  if (use_bias)
1417
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
1418
0
  else
1419
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
1420
0
  if (force_fallback && !(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
1421
0
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
1422
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1423
1424
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * total_m * k_dim);
1425
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * segments * n_dim * k_dim);
1426
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * segments * n_dim) : 0;
1427
0
  float* const actual = (float*)ccmalloc(sizeof(float) * total_m * n_dim);
1428
0
  ccv_nnc_tensor_t* const ha_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, total_m, k_dim), 0);
1429
0
  ccv_nnc_tensor_t* const hw_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, segments, n_dim, k_dim), 0);
1430
0
  ccv_nnc_tensor_t* const hbias_ref = use_bias ? ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, segments, n_dim), 0) : 0;
1431
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, total_m, n_dim), 0);
1432
0
  if (force_fallback)
1433
0
    _mps_forward_scaled_gemm_to_float(datatype, ha->data.u8, total_m * k_dim, a_ref);
1434
0
  else
1435
0
    _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, total_m, k_dim, a_ref);
1436
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, hwd->data.u8, segments * n_dim, k_dim, w_ref);
1437
0
  if (use_bias)
1438
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, segments * n_dim, bias_ref);
1439
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, total_m * n_dim, actual);
1440
0
  memcpy(ha_ref->data.f32, a_ref, sizeof(float) * total_m * k_dim);
1441
0
  memcpy(hw_ref->data.f32, w_ref, sizeof(float) * segments * n_dim * k_dim);
1442
0
  if (use_bias)
1443
0
    memcpy(hbias_ref->data.f32, bias_ref, sizeof(float) * segments * n_dim);
1444
0
  if (use_bias)
1445
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_ref, hindices, hcounts, hw_ref, hbias_ref), TENSOR_LIST(bt), 0);
1446
0
  else
1447
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_ref, hindices, hcounts, hw_ref), TENSOR_LIST(bt), 0);
1448
0
  double max_abs = 0;
1449
0
  double max_rel = 0;
1450
0
  for (i = 0; i < total_m * n_dim; i++)
1451
0
  {
1452
0
    const double diff = fabs((double)actual[i] - (double)bt->data.f32[i]);
1453
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)bt->data.f32[i])));
1454
0
    max_abs = ccv_max(max_abs, diff);
1455
0
    max_rel = ccv_max(max_rel, diff / denom);
1456
0
  }
1457
0
  if (max_abs_ref)
1458
0
    *max_abs_ref = max_abs;
1459
0
  if (max_rel_ref)
1460
0
    *max_rel_ref = max_rel;
1461
0
  ccv_nnc_tensor_free(bt);
1462
0
  if (hbias_ref)
1463
0
    ccv_nnc_tensor_free(hbias_ref);
1464
0
  ccv_nnc_tensor_free(hw_ref);
1465
0
  ccv_nnc_tensor_free(ha_ref);
1466
0
  ccfree(actual);
1467
0
  if (bias_ref)
1468
0
    ccfree(bias_ref);
1469
0
  ccfree(w_ref);
1470
0
  ccfree(a_ref);
1471
0
  ccfree(a_values);
1472
0
  ccfree(w_values);
1473
0
  if (bias_values)
1474
0
    ccfree(bias_values);
1475
0
  ccv_nnc_tensor_free(hb);
1476
0
  ccv_nnc_tensor_free(b);
1477
0
  if (bias)
1478
0
    ccv_nnc_tensor_free(bias);
1479
0
  ccv_nnc_tensor_free(w);
1480
0
  ccv_nnc_tensor_free(counts);
1481
0
  ccv_nnc_tensor_free(indices);
1482
0
  ccv_nnc_tensor_free(a);
1483
0
  if (hbias)
1484
0
    ccv_nnc_tensor_free(hbias);
1485
0
  ccv_nnc_tensor_free(hwq);
1486
0
  ccv_nnc_tensor_free(hwd);
1487
0
  ccv_nnc_tensor_free(hcounts);
1488
0
  ccv_nnc_tensor_free(hindices);
1489
0
  ccv_nnc_tensor_free(ha);
1490
0
  return 0;
1491
0
}
1492
1493
// Row-wise 8i weight GEMM without bias, checked for fp16, fp32 and bf16 in turn.
TEST_CASE("mps forward gemm with row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16F, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_32F, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16BF, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
    REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul should match row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1509
1510
// Row-wise 8i weight GEMM with bias, checked for fp16, fp32 and bf16 in turn.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16F, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_32F, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16BF, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
    REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul with bias should match row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1526
1527
// Aligned-M variant of the row-wise 8i weight GEMM check, without bias.
TEST_CASE("mps forward gemm with row-wise 8i weight NA aligned M")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16F, 0, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match aligned-M row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_32F, 0, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match aligned-M row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16BF, 0, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation should run");
    REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul should match aligned-M row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1543
1544
// Aligned-M variant of the row-wise 8i weight GEMM check, with bias.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias NA aligned M")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16F, 1, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation with bias should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match aligned-M row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_32F, 1, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation with bias should run");
    REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match aligned-M row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16BF, 1, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation with bias should run");
    REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul with bias should match aligned-M row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1560
1561
// Single stream-ordering check; the helper reports the worst absolute and relative error.
TEST_CASE("mps forward gemm with row-wise 8i weight ANE stream ordering")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  REQUIRE_EQ(_mps_forward_ane_rowwise_gemm_stream_sync_validate(&max_abs, &max_rel), 0,
    "ANE row-wise 8i stream-ordering validation should run");
  REQUIRE(max_rel < 2e-3,
    "ANE row-wise 8i GEMM should respect queued Metal writer work before quant/evaluate, max_abs=%g max_rel=%g",
    max_abs, max_rel);
}
1569
1570
// Segmented GEMM with row-wise 8i weight: no bias, no forced fallback.
TEST_CASE("mps segmented gemm with row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16F, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
    REQUIRE(max_rel < 3e-3, "segmented row-wise 8i NA fp16 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_32F, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
    REQUIRE(max_rel < 3e-3, "segmented row-wise 8i NA fp32 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16BF, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
    REQUIRE(max_rel < 6e-3, "segmented row-wise 8i NA bf16 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1586
1587
// Segmented GEMM with row-wise 8i weight: bias enabled and fallback forced (both flags are 1).
TEST_CASE("mps segmented gemm with row-wise 8i weight and bias fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16F, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
    REQUIRE(max_rel < 3e-3, "segmented fallback row-wise 8i fp16 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_32F, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
    REQUIRE(max_rel < 3e-3, "segmented fallback row-wise 8i fp32 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16BF, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
    REQUIRE(max_rel < 6e-3, "segmented fallback row-wise 8i bf16 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1603
1604
// Fallback (accelerators disabled) row-wise 8i GEMM without bias against the dense GPU result.
// For every datatype the flag is enabled only around the comparison call and put back to its
// previous state before any assertion runs.
TEST_CASE("mps forward gemm with row-wise 8i weight fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  { // fp16
    double max_abs = 0, max_rel = 0;
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    const int status16f = _mps_forward_scaled_gemm_compare_dense(CCV_16F, 0, 257, 384, 128, &max_abs, &max_rel);
    if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
      ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    REQUIRE_EQ(status16f, 0, "fallback row-wise 8i GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM should match dense GPU fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    const int status32f = _mps_forward_scaled_gemm_compare_dense(CCV_32F, 0, 257, 384, 128, &max_abs, &max_rel);
    if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
      ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    REQUIRE_EQ(status32f, 0, "fallback row-wise 8i GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM should match dense GPU fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    const int status16bf = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 0, 257, 384, 128, &max_abs, &max_rel);
    if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
      ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    REQUIRE_EQ(status16bf, 0, "fallback row-wise 8i GEMM validation should run");
    REQUIRE(max_rel < 5e-3, "fallback row-wise 8i GEMM should match dense GPU bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1638
1639
// Fallback (accelerators disabled) row-wise 8i GEMM with bias against the dense GPU result,
// for fp16, fp32 and bf16.
//
// CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS is enabled only around each comparison call. The call's
// status is stored and the flag is put back to its previous state BEFORE any assertion runs, so a
// failing assertion cannot skip the restore and leave the flag set for later tests.
// NOTE(review): this assumes a failing REQUIRE_EQ / REQUIRE leaves the test body early -- confirm
// against the test harness.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  double max_abs = 0;
  double max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16f = _mps_forward_scaled_gemm_compare_dense(CCV_16F, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  }
  REQUIRE_EQ(status16f, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM with bias should match dense GPU fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  max_abs = 0;
  max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status32f = _mps_forward_scaled_gemm_compare_dense(CCV_32F, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  }
  REQUIRE_EQ(status32f, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM with bias should match dense GPU fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  max_abs = 0;
  max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16bf = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  }
  REQUIRE_EQ(status16bf, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 5e-3, "fallback row-wise 8i GEMM with bias should match dense GPU bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1668
1669
// Fallback (accelerators disabled) row-wise 8i GEMM with bias in bf16 for three large shapes.
// Each row of `shapes` is passed positionally to _mps_forward_scaled_gemm_compare_dense.
//
// CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS is enabled only around each comparison call and put
// back to its previous state BEFORE the assertions for that shape run, so a failing assertion
// cannot skip the restore and leave the flag set for later tests.
// NOTE(review): this assumes a failing REQUIRE_EQ / REQUIRE leaves the test body early -- confirm
// against the test harness.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias fallback dequantize large shapes")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  static const int shapes[][3] = {
    {32, 3840, 3840},
    {32, 10240, 3840},
    {32, 3840, 10240},
  };
  int i;
  for (i = 0; i < (int)(sizeof(shapes) / sizeof(shapes[0])); i++)
  {
    double max_abs = 0;
    double max_rel = 0;
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    const int status = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 1, shapes[i][0], shapes[i][1], shapes[i][2], &max_abs, &max_rel);
    if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
      ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    }
    REQUIRE_EQ(status, 0, "large fallback row-wise 8i GEMM with bias validation should run");
    // Either bound is accepted: a small absolute error or a small relative error.
    REQUIRE(max_abs < 2e-2 || max_rel < 5e-3, "large fallback row-wise 8i GEMM with bias should match dense GPU bf16 reference for shape %d x %d x %d, max_abs=%g max_rel=%g", shapes[i][0], shapes[i][1], shapes[i][2], max_abs, max_rel);
  }
}
1691
1692
// Batched GEMM with a broadcast row-wise 8i weight; the three option flags are all 0.
TEST_CASE("mps forward batched gemm with broadcast row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16F, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match broadcast-weight fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_32F, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match broadcast-weight fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16BF, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
    REQUIRE(max_rel < 5e-3, "batched quantized NAInt8MatMul should match broadcast-weight bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1708
1709
// Batched GEMM with a batched row-wise 8i weight and bias; the three option flags are all 1.
TEST_CASE("mps forward batched gemm with batched row-wise 8i weight and bias NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // fp16
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16F, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
    REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match batched-weight fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // fp32
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_32F, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
    REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match batched-weight fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // bf16, with a looser bound
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16BF, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
    REQUIRE(max_rel < 5e-3, "batched quantized NAInt8MatMul should match batched-weight bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1725
1726
// Padded-A-view scaled GEMM in fp16 for three configurations; only the second and third
// arguments of the helper call differ between them.
TEST_CASE("mps forward batched gemm with padded A view and broadcast row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  { // single batch, no bias
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(CCV_16F, 0, 1, 512, 3072, 3072, 513, &max_abs, &max_rel), 0, "single-batch padded-A scaled GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "single-batch padded-A scaled GEMM without bias should match dense reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // two batches, no bias
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(CCV_16F, 0, 2, 512, 3072, 3072, 513, &max_abs, &max_rel), 0, "batched padded-A scaled GEMM validation should run");
    REQUIRE(max_rel < 2e-3, "batched padded-A scaled GEMM without bias should match dense reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
  { // two batches, with bias
    double max_abs = 0, max_rel = 0;
    REQUIRE_EQ(_mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(CCV_16F, 1, 2, 512, 3072, 3072, 513, &max_abs, &max_rel), 0, "batched padded-A scaled GEMM with bias validation should run");
    REQUIRE(max_rel < 2e-3, "batched padded-A scaled GEMM with bias should match dense reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  }
}
1742
1743
// Two-level stringification: STRINGIFY expands its argument first, then _STRINGIFY quotes it.
#define _STRINGIFY(token) #token
#define STRINGIFY(token) _STRINGIFY(token)
1745
// Declares a test case, named after the M x N x K shape, that runs the sampled no-bias GEMM
// validation and reports the mismatch record filled in by the helper on failure.
#define NA_GEMM_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm no bias NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {}; \
    REQUIRE(_mps_forward_na_gemm_validate_shape(M, N, K, &mismatch), \
      "sampled GEMM result should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g, max_abs=%g max_rel=%g", \
      M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected, mismatch.max_abs, mismatch.max_rel); \
  }
1752
1753
// Declares a test case, named after the M x N x K shape, that runs the sampled GEMM-with-bias
// validation and reports the mismatch record filled in by the helper on failure.
#define NA_GEMM_BIAS_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm with bias NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {}; \
    REQUIRE(_mps_forward_na_gemm_validate_shape_with_bias(M, N, K, &mismatch), \
      "sampled GEMM result with bias should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g, max_abs=%g max_rel=%g", \
      M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected, mismatch.max_abs, mismatch.max_rel); \
  }
1760
1761
// Declares a test case, named after the M x N x K shape, that runs the sampled GEMM validation
// with datatype CCV_16BF and the second helper argument set to 0 (the no-bias variant per the
// test name).
#define NA_GEMM_BFLOAT_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm no bias bfloat NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {}; \
    REQUIRE(_mps_forward_na_gemm_validate_shape_for_datatype(CCV_16BF, 0, M, N, K, &mismatch), \
      "sampled bfloat GEMM result should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g, max_abs=%g max_rel=%g", \
      M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected, mismatch.max_abs, mismatch.max_rel); \
  }
1768
1769
// Stamps out a TEST_CASE named
// "mps forward gemm with bias bfloat NA shape MxNxK".
// The check is delegated to
// _mps_forward_na_gemm_validate_shape_for_datatype(CCV_16BF, 1, M, N, K, ...).
// NOTE(review): the literal 1 is presumably the "with bias" flag (the sibling
// no-bias macro passes 0) — confirm against the validator's signature.
#define NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm with bias bfloat NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {}; \
    REQUIRE( \
      _mps_forward_na_gemm_validate_shape_for_datatype(CCV_16BF, 1, M, N, K, &mismatch), \
      "sampled bfloat GEMM result with bias should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g, max_abs=%g max_rel=%g", \
      M, N, K, \
      mismatch.row, mismatch.col, \
      mismatch.actual, mismatch.expected, \
      mismatch.max_abs, mismatch.max_rel); \
  }
1776
1777
// Runs the full-shape validator for a CCV_16F GEMM of shape (6, 1024, 3072)
// and reports the first recorded mismatch on failure.
// NOTE(review): the two 0 flags presumably select "no bias" and "unsigned
// inputs" (the signed variant of this test passes 0, 1) — confirm against
// _mps_forward_na_gemm_validate_full_shape_for_datatype.
TEST_CASE("mps forward gemm no bias NA full shape 6x1024x3072")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  _mps_forward_na_gemm_mismatch_t mismatch = {};
  REQUIRE(
    _mps_forward_na_gemm_validate_full_shape_for_datatype(CCV_16F, 0, 0, 6, 1024, 3072, &mismatch),
    "full GEMM result should match reference for shape (6, 1024, 3072) at (%d, %d): %g vs %g, max_abs=%g max_rel=%g",
    mismatch.row, mismatch.col,
    mismatch.actual, mismatch.expected,
    mismatch.max_abs, mismatch.max_rel);
}
1783
1784
// Same full-shape check as the unsigned variant, for CCV_16F at shape
// (6, 1024, 3072), but with the third validator argument set to 1.
// NOTE(review): that flag presumably switches the inputs to the signed value
// generators — confirm against
// _mps_forward_na_gemm_validate_full_shape_for_datatype.
TEST_CASE("mps forward gemm no bias NA full signed shape 6x1024x3072")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  _mps_forward_na_gemm_mismatch_t mismatch = {};
  REQUIRE(
    _mps_forward_na_gemm_validate_full_shape_for_datatype(CCV_16F, 0, 1, 6, 1024, 3072, &mismatch),
    "full signed GEMM result should match reference for shape (6, 1024, 3072) at (%d, %d): %g vs %g, max_abs=%g max_rel=%g",
    mismatch.row, mismatch.col,
    mismatch.actual, mismatch.expected,
    mismatch.max_abs, mismatch.max_rel);
}
1790
1791
// C = A * B on the MPS backend with A (4x2) and B (2x3) stored as written,
// compared against a hand-expanded 4x3 product.
TEST_CASE("gemm no transpose")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // Operand data: A is 4 rows of 2, B is 2 rows of 3.
  float a_values[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  float b_values[] = { 7, 8, 9, 10, 11, 12 };
  // Each expected entry is row(A) . column(B), written out term by term.
  float expected_values[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  // Host tensors wrap the arrays above; c receives the device result.
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  // Device-side mirrors.
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
1828
1829
// Same product as "gemm no transpose", but A is supplied as a 2x4 array and
// the command is asked to transpose axes (0, 1) of its first operand.
TEST_CASE("gemm transpose a")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // A is stored as 2 rows of 4; B is 2 rows of 3.
  float a_values[] = {
    1, 3, 5, 7,
    2, 4, 6, 8,
  };
  float b_values[] = { 7, 8, 9, 10, 11, 12 };
  // Expected 4x3 result, written out term by term.
  float expected_values[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply with A transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
1864
1865
// Same product as "gemm no transpose", but B is supplied as a 3x2 array and
// the command is asked to transpose axes (0, 1) of its second operand.
TEST_CASE("gemm transpose b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // A is 4 rows of 2; B is stored as 3 rows of 2.
  float a_values[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  float b_values[] = { 7, 10, 8, 11, 9, 12 };
  // Expected 4x3 result, written out term by term.
  float expected_values[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply with B transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
1903
1904
// Same product as "gemm no transpose", with both operands supplied in
// transposed storage (A as 2x4, B as 3x2) and both flagged TRANSPOSE(0, 1).
TEST_CASE("gemm transpose a and b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // A is stored as 2 rows of 4; B is stored as 3 rows of 2.
  float a_values[] = {
    1, 3, 5, 7,
    2, 4, 6, 8,
  };
  float b_values[] = { 7, 10, 8, 11, 9, 12 };
  // Expected 4x3 result, written out term by term.
  float expected_values[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply with both operands transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
1940
1941
// C = A * B + D on the MPS backend, where the third input D has the same
// 4x3 shape as the output (every row of D is 1, -1, 1).
TEST_CASE("gemm no transpose with bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // A is 4 rows of 2, B is 2 rows of 3, the bias is 4 rows of 3.
  float a_values[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  float b_values[] = { 7, 8, 9, 10, 11, 12 };
  float bias_values[] = {
    1, -1, 1,
    1, -1, 1,
    1, -1, 1,
    1, -1, 1,
  };
  // Expected 4x3 result: the plain product plus the matching bias entry.
  float expected_values[] = {
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const host_bias = ccv_nnc_tensor_new(bias_values, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload all three inputs, multiply-and-add, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b, host_bias), TENSOR_LIST(dev_a, dev_b, dev_bias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b, dev_bias), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(host_bias);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
  ccv_nnc_tensor_free(dev_bias);
}
1988
1989
// Batched A (2 x 4x2) multiplied by a single un-batched B (2x3): the expected
// output applies the same B to both batches of A.
TEST_CASE("gemm no transpose batch 2, no batch b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_values[] = {
    // batch 0
    1, 2, 3, 4, 5, 6, 7, 8,
    // batch 1
    2, 3, 4, 5, 6, 7, 8, 9
  };
  float b_values[] = { 7, 8, 9, 10, 11, 12 };
  // Expected 2 x 4x3 result, written out term by term.
  float expected_values[] = {
    // batch 0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
    // batch 1
    2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
    4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
    6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
    8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  // Upload, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
2034
2035
// Batched A (2 x 4x2) multiplied by batched B (2 x 2x3): each batch of A is
// paired with the B of the same batch index in the expected output.
TEST_CASE("gemm no transpose batch 2")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_values[] = {
    // batch 0
    1, 2, 3, 4, 5, 6, 7, 8,
    // batch 1
    2, 3, 4, 5, 6, 7, 8, 9
  };
  float b_values[] = {
    // batch 0
    7, 8, 9, 10, 11, 12,
    // batch 1
    8, 9, 10, 11, 12, 13,
  };
  // Expected 2 x 4x3 result, written out term by term.
  float expected_values[] = {
    // batch 0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
    // batch 1
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  // Upload, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
2082
2083
// Batched A supplied as 2 x 2x4 and transposed over axes (1, 2), a single
// un-batched B (2x3), and a 3-element bias that the expected values add to
// every output row.
TEST_CASE("gemm transpose a batch 2, no batch b, with bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_values[] = {
    // batch 0, stored as 2 rows of 4
    1, 3, 5, 7,
    2, 4, 6, 8,
    // batch 1, stored as 2 rows of 4
    2, 4, 6, 8,
    3, 5, 7, 9,
  };
  float b_values[] = { 7, 8, 9, 10, 11, 12 };
  float bias_values[] = { -1, 0, 1 };
  // Expected 2 x 4x3 result; the middle column's bias is 0, so it has no
  // explicit bias term.
  float expected_values[] = {
    // batch 0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
    // batch 1
    2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
    4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
    6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
    8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const host_bias = ccv_nnc_tensor_new(bias_values, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  // Upload all three inputs, multiply with A transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b, host_bias), TENSOR_LIST(dev_a, dev_b, dev_bias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b, dev_bias), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(host_bias);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
  ccv_nnc_tensor_free(dev_bias);
}
2131
2132
// Batched A supplied as 2 x 2x4 and transposed over axes (1, 2), multiplied
// by batched B (2 x 2x3).
// NOTE(review): despite "with bias" in the test name, no bias tensor is
// created or passed to the GEMM here and the expected values contain no bias
// term — confirm whether a bias input was intended.
TEST_CASE("gemm transpose a batch 2, with bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_values[] = {
    // batch 0, stored as 2 rows of 4
    1, 3, 5, 7,
    2, 4, 6, 8,
    // batch 1, stored as 2 rows of 4
    2, 4, 6, 8,
    3, 5, 7, 9,
  };
  float b_values[] = {
    // batch 0
    7, 8, 9, 10, 11, 12,
    // batch 1
    8, 9, 10, 11, 12, 13,
  };
  // Expected 2 x 4x3 result, written out term by term.
  float expected_values[] = {
    // batch 0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
    // batch 1
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  // Upload, multiply with A transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
2175
2176
// Batched A (2 x 4x2), batched B supplied as 2 x 3x2 and transposed over axes
// (1, 2), plus a per-batch bias of shape 2 x 1x3 that the expected values add
// to every row of the matching batch.
TEST_CASE("gemm transpose b batch 2, with bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_values[] = {
    // batch 0
    1, 2, 3, 4, 5, 6, 7, 8,
    // batch 1
    2, 3, 4, 5, 6, 7, 8, 9
  };
  float b_values[] = {
    // batch 0, stored as 3 rows of 2
    7, 10, 8, 11, 9, 12,
    // batch 1, stored as 3 rows of 2
    80, 110, 90, 120, 10, 13,
  };
  float bias_values[] = {
    // batch 0
    -1, 0, 1,
    // batch 1
    2, 3, -4,
  };
  // Expected 2 x 4x3 result; batch 0's middle column has a bias of 0, so it
  // has no explicit bias term.
  float expected_values[] = {
    // batch 0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
    // batch 1
    2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
    4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
    6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
    8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const host_bias = ccv_nnc_tensor_new(bias_values, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
  // Upload all three inputs, multiply with B transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b, host_bias), TENSOR_LIST(dev_a, dev_b, dev_bias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b, dev_bias), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(host_bias);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
  ccv_nnc_tensor_free(dev_bias);
}
2233
2234
// Batched A (2 x 4x2) multiplied by batched B supplied as 2 x 3x2 and
// transposed over axes (1, 2); no bias input.
TEST_CASE("gemm transpose b batch 2")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_values[] = {
    // batch 0
    1, 2, 3, 4, 5, 6, 7, 8,
    // batch 1
    2, 3, 4, 5, 6, 7, 8, 9
  };
  float b_values[] = {
    // batch 0, stored as 3 rows of 2
    7, 10, 8, 11, 9, 12,
    // batch 1, stored as 3 rows of 2
    80, 110, 90, 120, 10, 13,
  };
  // Expected 2 x 4x3 result, written out term by term.
  float expected_values[] = {
    // batch 0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
    // batch 1
    2 * 80 + 3 * 110, 2 * 90 + 3 * 120, 2 * 10 + 3 * 13,
    4 * 80 + 5 * 110, 4 * 90 + 5 * 120, 4 * 10 + 5 * 13,
    6 * 80 + 7 * 110, 6 * 90 + 7 * 120, 6 * 10 + 7 * 13,
    8 * 80 + 9 * 110, 8 * 90 + 9 * 120, 8 * 10 + 9 * 13,
  };
  ccv_nnc_tensor_t* const host_a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const host_b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const dev_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const dev_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const dev_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  // Upload, multiply with B transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_a, host_b), TENSOR_LIST(dev_a, dev_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_a, dev_b), TENSOR_LIST(dev_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(host_a);
  ccv_nnc_tensor_free(host_b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(dev_a);
  ccv_nnc_tensor_free(dev_b);
  ccv_nnc_tensor_free(dev_c);
}
2283
2284
// Random-data cross-check in 32-bit float: the device runs a 10x128 input
// against a 64x128 weight (second operand transposed) plus a 64-element bias,
// while the host reference runs only the first input row. The first 64
// outputs (row 0) of the device result are compared against that reference
// with a 5e-6 tolerance.
TEST_CASE("mps forward gemm")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Device-side operands and output.
  ccv_nnc_tensor_t* const gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* const gpu_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* const gpu_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* const gpu_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);

  // Host-side single-row reference operands and output.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int i;
  // Random draws happen in this order: weights, bias, then the full input.
  float* const weight_values = hw->data.f32;
  for (i = 0; i < 64 * 128; i++)
    weight_values[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  float* const bias_values = hbias->data.f32;
  for (i = 0; i < 64; i++)
    bias_values[i] = dsfmt_genrand_open_close(&dsfmt);
  ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  float* const input_values = ha1->data.f32;
  for (i = 0; i < 10 * 128; i++)
    input_values[i] = dsfmt_genrand_open_close(&dsfmt);
  // The host reference only sees the first row of the 10-row input.
  float* const first_row = ha->data.f32;
  for (i = 0; i < 128; i++)
    first_row[i] = input_values[i];
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(gpu_a, gpu_w, gpu_bias), 0);
  // Reference GEMM on host tensors, then the GEMM under test on device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_w, gpu_bias), TENSOR_LIST(gpu_out), 0);
  ccv_nnc_tensor_t* const tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_out), TENSOR_LIST(tb), 0);
  // Keep only row 0 of the downloaded result for the comparison.
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  for (i = 0; i < 64; i++)
    tb1->data.f32[i] = tb->data.f32[i];
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-6, "GPU computed output should be numerically close to CPU computed ones");
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_w);
  ccv_nnc_tensor_free(gpu_bias);
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(gpu_out);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(ha1);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hb);
}
2329
2330
// Random-data cross-check in half precision: 32-bit float host data is
// converted to 16F, uploaded and multiplied on the device, then the result is
// downloaded and converted back to 32F. Its first 64 outputs (row 0) are
// compared against a 32-bit float host reference of the first input row with
// a 5e-3 tolerance.
TEST_CASE("mps forward gemm in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Device-side 16F operands and output.
  ccv_nnc_tensor_t* const gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
  ccv_nnc_tensor_t* const gpu_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const gpu_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
  ccv_nnc_tensor_t* const gpu_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);

  // Host-side 32F single-row reference operands and output.
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  int i;
  // Random draws happen in this order: weights, bias, then the full input.
  float* const weight_values = hw->data.f32;
  for (i = 0; i < 64 * 128; i++)
    weight_values[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  float* const bias_values = hbias->data.f32;
  for (i = 0; i < 64; i++)
    bias_values[i] = dsfmt_genrand_open_close(&dsfmt);
  ccv_nnc_tensor_t* const ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  float* const input_values = ha1->data.f32;
  for (i = 0; i < 10 * 128; i++)
    input_values[i] = dsfmt_genrand_open_close(&dsfmt);
  // The host reference only sees the first row of the 10-row input.
  float* const first_row = ha->data.f32;
  for (i = 0; i < 128; i++)
    first_row[i] = input_values[i];
  // 16F host staging copies of the 32F data, which are what gets uploaded.
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* const hw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* const hbias16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha16, hw16, hbias16), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hw16, hbias16), TENSOR_LIST(gpu_a, gpu_w, gpu_bias), 0);
  // Reference GEMM on host tensors, then the GEMM under test on device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_w, gpu_bias), TENSOR_LIST(gpu_out), 0);
  // Download as 16F, then widen to 32F for the comparison.
  ccv_nnc_tensor_t* const tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_out), TENSOR_LIST(tb), 0);
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_w);
  ccv_nnc_tensor_free(gpu_bias);
  ccv_nnc_tensor_free(gpu_out);
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(ha1);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha16);
  ccv_nnc_tensor_free(hw16);
  ccv_nnc_tensor_free(hbias16);
}
2381
2382
TEST_CASE("mps forward gemm in bfloat precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // fp32 CPU tensors: a one-row reference problem (ref_*, hb) plus the full 10-row input.
  ccv_nnc_tensor_t* const ref_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_tensor_t* const batch_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  // Seeded generator; values are drawn in the order weight, bias, input.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int n;
  for (n = 0; n < 64 * 128; n++)
    ref_weight->data.f32[n] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (n = 0; n < 64; n++)
    ref_bias->data.f32[n] = dsfmt_genrand_open_close(&rng);
  for (n = 0; n < 10 * 128; n++)
  {
    const float drawn = dsfmt_genrand_open_close(&rng);
    batch_in->data.f32[n] = drawn;
    if (n < 128) // the first input row is mirrored into the reference input
      ref_in->data.f32[n] = drawn;
  }
  // bfloat16 copies of the CPU data, then transfer into the GPU tensors.
  ccv_nnc_tensor_t* const bf_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
  ccv_nnc_tensor_t* const bf_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const bf_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
  ccv_nnc_tensor_t* const dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_weight = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(batch_in, ref_weight, ref_bias), TENSOR_LIST(bf_in, bf_weight, bf_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bf_in, bf_weight, bf_bias), TENSOR_LIST(dev_in, dev_weight, dev_bias), 0);
  // The same GEMM_FORWARD command runs on the fp32 CPU tensors first, then on the GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_in, ref_weight, ref_bias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in, dev_weight, dev_bias), TENSOR_LIST(dev_out), 0);
  // Copy the GPU result back and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const bf_out = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(bf_out), 0);
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bf_out), TENSOR_LIST(tb1), 0);
  // Only the first 64 floats (one output row) are checked against the single-row reference.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 8e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_weight);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(bf_in);
  ccv_nnc_tensor_free(bf_weight);
  ccv_nnc_tensor_free(bf_bias);
  ccv_nnc_tensor_free(bf_out);
  ccv_nnc_tensor_free(batch_in);
  ccv_nnc_tensor_free(ref_in);
  ccv_nnc_tensor_free(ref_weight);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hb);
}
2433
2434
TEST_CASE("mps forward gemv in half precision, variant 1")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // fp32 CPU tensors for the reference run; vec_src holds the same 128 values as ref_vec.
  ccv_nnc_tensor_t* const ref_vec = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_tensor_t* const vec_src = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  // Seeded generator; values are drawn in the order weight, bias, input vector.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int n;
  for (n = 0; n < 64 * 128; n++)
    ref_weight->data.f32[n] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (n = 0; n < 64; n++)
    ref_bias->data.f32[n] = dsfmt_genrand_open_close(&rng);
  for (n = 0; n < 128; n++)
  {
    vec_src->data.f32[n] = dsfmt_genrand_open_close(&rng);
    ref_vec->data.f32[n] = vec_src->data.f32[n];
  }
  // Half-precision copies of the CPU data, then transfer into the GPU tensors.
  ccv_nnc_tensor_t* const half_vec = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
  ccv_nnc_tensor_t* const half_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* const half_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
  ccv_nnc_tensor_t* const dev_vec = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
  ccv_nnc_tensor_t* const dev_weight = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(vec_src, ref_weight, ref_bias), TENSOR_LIST(half_vec, half_weight, half_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_vec, half_weight, half_bias), TENSOR_LIST(dev_vec, dev_weight, dev_bias), 0);
  // The same GEMM_FORWARD command runs on the fp32 CPU tensors first, then on the GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_vec, ref_weight, ref_bias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_vec, dev_weight, dev_bias), TENSOR_LIST(dev_out), 0);
  // Copy the GPU result back and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const half_out = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(half_out), 0);
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_out), TENSOR_LIST(tb1), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_vec);
  ccv_nnc_tensor_free(dev_weight);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(half_vec);
  ccv_nnc_tensor_free(half_weight);
  ccv_nnc_tensor_free(half_bias);
  ccv_nnc_tensor_free(half_out);
  ccv_nnc_tensor_free(vec_src);
  ccv_nnc_tensor_free(ref_vec);
  ccv_nnc_tensor_free(ref_weight);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hb);
}
2485
2486
TEST_CASE("mps forward gemv in bfloat precision, variant 1")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // fp32 CPU tensors for the reference run; vec_src holds the same 128 values as ref_vec.
  ccv_nnc_tensor_t* const ref_vec = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const ref_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_tensor_t* const vec_src = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  // Seeded generator; values are drawn in the order weight, bias, input vector.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int n;
  for (n = 0; n < 64 * 128; n++)
    ref_weight->data.f32[n] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (n = 0; n < 64; n++)
    ref_bias->data.f32[n] = dsfmt_genrand_open_close(&rng);
  for (n = 0; n < 128; n++)
  {
    vec_src->data.f32[n] = dsfmt_genrand_open_close(&rng);
    ref_vec->data.f32[n] = vec_src->data.f32[n];
  }
  // bfloat16 copies of the CPU data, then transfer into the GPU tensors.
  ccv_nnc_tensor_t* const bf_vec = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 128), 0);
  ccv_nnc_tensor_t* const bf_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const bf_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
  ccv_nnc_tensor_t* const dev_vec = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 128), 0);
  ccv_nnc_tensor_t* const dev_weight = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(vec_src, ref_weight, ref_bias), TENSOR_LIST(bf_vec, bf_weight, bf_bias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bf_vec, bf_weight, bf_bias), TENSOR_LIST(dev_vec, dev_weight, dev_bias), 0);
  // The same GEMM_FORWARD command runs on the fp32 CPU tensors first, then on the GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_vec, ref_weight, ref_bias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_vec, dev_weight, dev_bias), TENSOR_LIST(dev_out), 0);
  // Copy the GPU result back and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const bf_out = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(bf_out), 0);
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bf_out), TENSOR_LIST(tb1), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 8e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_vec);
  ccv_nnc_tensor_free(dev_weight);
  ccv_nnc_tensor_free(dev_bias);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(bf_vec);
  ccv_nnc_tensor_free(bf_weight);
  ccv_nnc_tensor_free(bf_bias);
  ccv_nnc_tensor_free(bf_out);
  ccv_nnc_tensor_free(vec_src);
  ccv_nnc_tensor_free(ref_vec);
  ccv_nnc_tensor_free(ref_weight);
  ccv_nnc_tensor_free(ref_bias);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hb);
}
2537
2538
TEST_CASE("mps depalettize 5-bit half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  // Only 30 of the 32 palette entries are spelled out; C zero-initializes the remaining two.
  float palette_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
  uint16_t palette[32];
  ccv_float_to_half_precision(palette_f32, palette, 32);
  // 3072 half-precision values that cycle through the 32 palette entries.
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 3072);
  int base, slot;
  for (base = 0; base < 3072; base += 32)
    for (slot = 0; slot < 32; slot++)
      values[base + slot] = palette[slot];
  // The packed bytes live in an fp32 tensor sized to hold 2112 bytes.
  ccv_nnc_tensor_t* const packed = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2112 + 3) / 4), 0);
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 3072, 5, 1024, packed->data.u8, 2112);
  REQUIRE_EQ(output_size, 2112, "output size should match");
  // Move the packed bytes to the GPU, depalettize there, and copy the result back.
  ccv_nnc_tensor_t* const dev_packed = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2112 + 3) / 4), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(packed), TENSOR_LIST(dev_packed), 0);
  ccv_nnc_tensor_t* const dev_values = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 3072), 0);
  ccv_nnc_depalettize(dev_packed->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 1024, dev_values->data.u8, 3072);
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 3072), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_values), TENSOR_LIST(v_tensor), 0);
  // Raw 16-bit patterns must round-trip exactly.
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 3072, "GPU computed output should match CPU depalettize");
  ccv_nnc_tensor_free(v_tensor);
  ccv_nnc_tensor_free(dev_values);
  ccv_nnc_tensor_free(dev_packed);
  ccv_nnc_tensor_free(packed);
  ccfree(values);
}
2565
2566
TEST_CASE("mps depalettize 6-bit float precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  // Palette of the 64 integers 0..63 stored as floats.
  float palette[64];
  int slot;
  for (slot = 0; slot < 64; slot++)
    palette[slot] = (float)slot;
  // 8192 values that cycle through the palette.
  float* const values = ccmalloc(sizeof(float) * 8192);
  int base;
  for (base = 0; base < 8192; base += 64)
    for (slot = 0; slot < 64; slot++)
      values[base + slot] = palette[slot];
  // The packed bytes live in an fp32 tensor sized to hold 6144 + 2 * 64 * 4 bytes.
  ccv_nnc_tensor_t* const packed = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 4 + 3) / 4), 0);
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, packed->data.u8, 6144 + 2 * 64 * 4);
  REQUIRE_EQ(output_size, 6144 + 2 * 64 * 4, "output size should match");
  // Move the packed bytes to the GPU, depalettize there, and copy the result back.
  ccv_nnc_tensor_t* const dev_packed = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 4 + 3) / 4), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(packed), TENSOR_LIST(dev_packed), 0);
  ccv_nnc_tensor_t* const dev_values = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0);
  ccv_nnc_depalettize(dev_packed->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, dev_values->data.u8, 8192);
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_values), TENSOR_LIST(v_tensor), 0);
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "GPU computed output should match CPU depalettize");
  ccv_nnc_tensor_free(v_tensor);
  ccv_nnc_tensor_free(dev_values);
  ccv_nnc_tensor_free(dev_packed);
  ccv_nnc_tensor_free(packed);
  ccfree(values);
}
2593
2594
TEST_CASE("mps depalettize 8-bit float precision with partial block")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  // Palette of the 256 integers 0..255 stored as floats.
  float palette[256];
  int slot;
  for (slot = 0; slot < 256; slot++)
    palette[slot] = (float)slot;
  // 3072 values that cycle through the palette; 3072 is not a multiple of the 2048 passed to palettize below.
  float* const values = ccmalloc(sizeof(float) * 3072);
  int base;
  for (base = 0; base < 3072; base += 256)
    for (slot = 0; slot < 256; slot++)
      values[base + slot] = palette[slot];
  // The packed bytes live in an fp32 tensor sized to hold 6144 bytes.
  ccv_nnc_tensor_t* const packed = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 3) / 4), 0);
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 3072, 8, 2048, packed->data.u8, 6144);
  // Unlike the sibling tests, only an upper bound on the packed size is asserted here.
  REQUIRE(output_size <= 6144, "output size should fit the allocated buffer");
  // Move the packed bytes to the GPU, depalettize there, and copy the result back.
  ccv_nnc_tensor_t* const dev_packed = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 3) / 4), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(packed), TENSOR_LIST(dev_packed), 0);
  ccv_nnc_tensor_t* const dev_values = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 3072), 0);
  ccv_nnc_depalettize(dev_packed->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 2048, dev_values->data.u8, 3072);
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 3072), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_values), TENSOR_LIST(v_tensor), 0);
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 3072, "GPU computed output should match CPU depalettize");
  ccv_nnc_tensor_free(v_tensor);
  ccv_nnc_tensor_free(dev_values);
  ccv_nnc_tensor_free(dev_packed);
  ccv_nnc_tensor_free(packed);
  ccfree(values);
}
2621
2622
TEST_CASE("mps dequantize row-wise 8i half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  const int rows = 17;
  const int cols = 64;
  // Deterministic fp32 pattern, narrowed to half precision as the quantizer input.
  float* const pattern = ccmalloc(sizeof(float) * rows * cols);
  int idx = 0;
  while (idx < rows * cols)
  {
    pattern[idx] = ((idx * 13) % 41 - 20) / 32.0f;
    idx++;
  }
  ccv_nnc_tensor_t* const input = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
  ccv_float_to_half_precision(pattern, (uint16_t*)input->data.f16, rows * cols);
  // Quantize on the CPU and check the byte count against the tensor's unpadded data size.
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16F, rows, cols)), 0);
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(input->data.f16, CCV_16F, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
  // CPU dequantize produces the expected values.
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16F, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, rows * cols);
  // GPU dequantize of the same quantized bytes, copied back into `actual`.
  ccv_nnc_tensor_t* const dev_q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16F, rows, cols)), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, rows, cols), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(dev_q), 0);
  ccv_nnc_dequantize_8i_rowwise(dev_q->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, qsize, cols, dev_out->data.u8, rows * cols);
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(actual), 0);
  // Widen both results to fp32 so they can be compared with a tolerance.
  float* const expected_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
  float* const actual_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
  ccv_half_precision_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
  ccv_half_precision_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 1e-3, "GPU row-wise 8i dequantize should match CPU dequantize");
  ccfree(expected_f32);
  ccfree(actual_f32);
  ccfree(pattern);
  ccv_nnc_tensor_free(input);
  ccv_nnc_tensor_free(q);
  ccv_nnc_tensor_free(expected);
  ccv_nnc_tensor_free(dev_q);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(actual);
}
2659
2660
TEST_CASE("mps dequantize row-wise 8i float precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  const int rows = 11;
  const int cols = 128;
  // Deterministic fp32 pattern written straight into the quantizer input tensor.
  ccv_nnc_tensor_t* const input = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
  float* const cells = input->data.f32;
  int idx = 0;
  while (idx < rows * cols)
  {
    cells[idx] = ((idx * 17) % 53 - 26) / 64.0f;
    idx++;
  }
  // Quantize on the CPU and check the byte count against the tensor's unpadded data size.
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, rows, cols)), 0);
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(input->data.f32, CCV_32F, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
  // CPU dequantize produces the expected values.
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f32, rows * cols);
  // GPU dequantize of the same quantized bytes, copied back into `actual`.
  ccv_nnc_tensor_t* const dev_q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, rows, cols)), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, rows, cols), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(dev_q), 0);
  ccv_nnc_dequantize_8i_rowwise(dev_q->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, qsize, cols, dev_out->data.u8, rows * cols);
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(actual), 0);
  // fp32 path is compared exactly, with no tolerance.
  REQUIRE_ARRAY_EQ(float, expected->data.f32, actual->data.f32, rows * cols, "GPU row-wise 8i dequantize should match CPU dequantize");
  ccv_nnc_tensor_free(input);
  ccv_nnc_tensor_free(q);
  ccv_nnc_tensor_free(expected);
  ccv_nnc_tensor_free(dev_q);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(actual);
}
2688
2689
TEST_CASE("mps dequantize row-wise 8i bfloat precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  const int rows = 257;
  const int cols = 130;
  // Deterministic fp32 pattern, narrowed to bfloat16 as the quantizer input.
  float* const pattern = ccmalloc(sizeof(float) * rows * cols);
  int idx = 0;
  while (idx < rows * cols)
  {
    pattern[idx] = ((idx * 29) % 97 - 48) / 64.0f;
    idx++;
  }
  ccv_nnc_tensor_t* const input = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
  ccv_float_to_bfloat(pattern, (uint16_t*)input->data.f16, rows * cols);
  // Quantize on the CPU and check the byte count against the tensor's unpadded data size.
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, rows, cols)), 0);
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(input->data.f16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
  // CPU dequantize produces the expected values.
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, rows * cols);
  // GPU dequantize of the same quantized bytes, copied back into `actual`.
  ccv_nnc_tensor_t* const dev_q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16BF, rows, cols)), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, rows, cols), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(dev_q), 0);
  ccv_nnc_dequantize_8i_rowwise(dev_q->data.u8, CCV_16BF, CCV_TENSOR_GPU_MEMORY, qsize, cols, dev_out->data.u8, rows * cols);
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(actual), 0);
  // Widen both results to fp32 so they can be compared with a tolerance.
  float* const expected_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
  float* const actual_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
  ccv_bfloat_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
  ccv_bfloat_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 5e-3, "GPU row-wise 8i bf16 dequantize should match CPU dequantize");
  ccfree(expected_f32);
  ccfree(actual_f32);
  ccfree(pattern);
  ccv_nnc_tensor_free(input);
  ccv_nnc_tensor_free(q);
  ccv_nnc_tensor_free(expected);
  ccv_nnc_tensor_free(dev_q);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(actual);
}
2726
2727
TEST_CASE("mps dequantize row-wise 8i bfloat precision large shapes")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
  // {rows, cols} pairs; each one runs the full quantize / dequantize comparison below.
  static const int shapes[][2] = {
    {3840, 3840},
    {10240, 3840},
    {3840, 10240},
  };
  const int shape_count = (int)(sizeof(shapes) / sizeof(shapes[0]));
  int which;
  for (which = 0; which < shape_count; which++)
  {
    const int rows = shapes[which][0];
    const int cols = shapes[which][1];
    // Deterministic fp32 pattern, narrowed to bfloat16 as the quantizer input.
    float* const pattern = ccmalloc(sizeof(float) * (size_t)rows * cols);
    int idx = 0;
    while (idx < rows * cols)
    {
      pattern[idx] = ((idx * 29) % 97 - 48) / 64.0f;
      idx++;
    }
    ccv_nnc_tensor_t* const input = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
    ccv_float_to_bfloat(pattern, (uint16_t*)input->data.f16, rows * cols);
    // Quantize on the CPU and check the byte count against the tensor's unpadded data size.
    ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, rows, cols)), 0);
    const size_t qsize = ccv_nnc_quantize_8i_rowwise(input->data.f16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, (size_t)rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
    REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
    // CPU dequantize produces the expected values.
    ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
    ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, (size_t)rows * cols);
    // GPU dequantize of the same quantized bytes, copied back into `actual`.
    ccv_nnc_tensor_t* const dev_q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16BF, rows, cols)), 0);
    ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, rows, cols), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(dev_q), 0);
    ccv_nnc_dequantize_8i_rowwise(dev_q->data.u8, CCV_16BF, CCV_TENSOR_GPU_MEMORY, qsize, cols, dev_out->data.u8, (size_t)rows * cols);
    ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(actual), 0);
    // Widen both results to fp32 so they can be compared with a tolerance.
    float* const expected_f32 = (float*)ccmalloc(sizeof(float) * (size_t)rows * cols);
    float* const actual_f32 = (float*)ccmalloc(sizeof(float) * (size_t)rows * cols);
    ccv_bfloat_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
    ccv_bfloat_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 5e-3, "GPU row-wise 8i bf16 dequantize should match CPU dequantize on large shape");
    // Release everything before moving on to the next shape.
    ccfree(expected_f32);
    ccfree(actual_f32);
    ccfree(pattern);
    ccv_nnc_tensor_free(input);
    ccv_nnc_tensor_free(q);
    ccv_nnc_tensor_free(expected);
    ccv_nnc_tensor_free(dev_q);
    ccv_nnc_tensor_free(dev_out);
    ccv_nnc_tensor_free(actual);
  }
}
2773
2774
TEST_CASE("mps forward gemm no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // fp32 CPU tensors: a one-row reference problem (ref_*, hb) plus the full 10-row input.
  ccv_nnc_tensor_t* const ref_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_tensor_t* const batch_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  // Seeded generator; the weight is drawn first, then the input.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int n;
  for (n = 0; n < 64 * 128; n++)
    ref_weight->data.f32[n] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (n = 0; n < 10 * 128; n++)
  {
    const float drawn = dsfmt_genrand_open_close(&rng);
    batch_in->data.f32[n] = drawn;
    if (n < 128) // the first input row is mirrored into the reference input
      ref_in->data.f32[n] = drawn;
  }
  // Everything stays in fp32 here, so the CPU data is transferred to the GPU tensors directly.
  ccv_nnc_tensor_t* const dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_weight = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(batch_in, ref_weight), TENSOR_LIST(dev_in, dev_weight), 0);
  // The same GEMM_FORWARD command (two inputs, no bias tensor) runs on the CPU tensors, then on the GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_in, ref_weight), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in, dev_weight), TENSOR_LIST(dev_out), 0);
  // Copy the GPU result back, then keep just its first 64 floats (one output row) for the comparison.
  ccv_nnc_tensor_t* const fetched = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(fetched), 0);
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  for (n = 0; n < 64; n++)
    tb1->data.f32[n] = fetched->data.f32[n];
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-6, "GPU computed output should be numerically close to CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_weight);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(fetched);
  ccv_nnc_tensor_free(batch_in);
  ccv_nnc_tensor_free(ref_in);
  ccv_nnc_tensor_free(ref_weight);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hb);
}
2813
2814
TEST_CASE("mps forward gemm no bias in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // fp32 CPU tensors: a one-row reference problem (ref_*, hb) plus the full 10-row input.
  ccv_nnc_tensor_t* const ref_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
  ccv_nnc_tensor_t* const ref_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
  ccv_nnc_tensor_t* const batch_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  // Seeded generator; the weight is drawn first, then the input.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int n;
  for (n = 0; n < 64 * 128; n++)
    ref_weight->data.f32[n] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (n = 0; n < 10 * 128; n++)
  {
    const float drawn = dsfmt_genrand_open_close(&rng);
    batch_in->data.f32[n] = drawn;
    if (n < 128) // the first input row is mirrored into the reference input
      ref_in->data.f32[n] = drawn;
  }
  // Half-precision copies of the CPU data, then transfer into the GPU tensors.
  ccv_nnc_tensor_t* const half_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
  ccv_nnc_tensor_t* const half_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
  ccv_nnc_tensor_t* const dev_weight = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
  ccv_nnc_tensor_t* const dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(batch_in, ref_weight), TENSOR_LIST(half_in, half_weight), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_in, half_weight), TENSOR_LIST(dev_in, dev_weight), 0);
  // The same GEMM_FORWARD command (two inputs, no bias tensor) runs on the CPU tensors, then on the GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ref_in, ref_weight), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in, dev_weight), TENSOR_LIST(dev_out), 0);
  // Copy the GPU result back and widen it to fp32 for the comparison.
  ccv_nnc_tensor_t* const half_out = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(half_out), 0);
  ccv_nnc_tensor_t* const tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(half_out), TENSOR_LIST(tb1), 0);
  // Only the first 64 floats (one output row) are checked against the single-row reference.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_weight);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(half_in);
  ccv_nnc_tensor_free(half_weight);
  ccv_nnc_tensor_free(half_out);
  ccv_nnc_tensor_free(batch_in);
  ccv_nnc_tensor_free(ref_in);
  ccv_nnc_tensor_free(ref_weight);
  ccv_nnc_tensor_free(tb1);
  ccv_nnc_tensor_free(hb);
}
2857
2858
// Compares a bias-free GEMM forward on bfloat16 (16BF) GPU tensors with the same
// GEMM on 32F CPU tensors. The GPU side multiplies a 10x128 input by a 64x128
// weight (second operand transposed); the CPU reference `ha` holds only the first
// 128 input values, so only the first 64 outputs are compared.
TEST_CASE("mps forward gemm no bias in bfloat precision")
2859
1
{
2860
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2861
0
  dsfmt_t dsfmt;
2862
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
2863
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
2864
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
2865
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);
2866
2867
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2868
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2869
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2870
0
  int i;
2871
0
  // Weights are random values scaled down by 64 * 128.
  for (i = 0; i < 64 * 128; i++)
2872
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2873
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2874
0
  for (i = 0; i < 10 * 128; i++)
2875
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2876
0
  // The CPU reference input is the first 128 values of the full 10x128 input.
  for (i = 0; i < 128; i++)
2877
0
    ha->data.f32[i] = ha1->data.f32[i];
2878
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
2879
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
2880
0
  // Convert the 32F inputs into 16BF CPU tensors, then copy those into the GPU tensors.
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2881
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2882
0
  // Same GEMM command twice: first on the 32F CPU tensors, then on the 16BF GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2883
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2884
0
  // Bring the GPU result back as 16BF on the CPU, then widen it to 32F for the comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
2885
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2886
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2887
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2888
0
  // Only the first 64 values of tb1 are checked; hb holds exactly 64 values.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2889
0
  ccv_nnc_tensor_free(a);
2890
0
  ccv_nnc_tensor_free(w);
2891
0
  ccv_nnc_tensor_free(b);
2892
0
  ccv_nnc_tensor_free(tb);
2893
0
  ccv_nnc_tensor_free(ha);
2894
0
  ccv_nnc_tensor_free(ha1);
2895
0
  ccv_nnc_tensor_free(tb1);
2896
0
  ccv_nnc_tensor_free(hw);
2897
0
  ccv_nnc_tensor_free(hb);
2898
0
  ccv_nnc_tensor_free(ha2);
2899
0
  ccv_nnc_tensor_free(hw2);
2900
0
}
2901
2902
// Compares a bias-free matrix-vector product in half precision (16F) on GPU
// tensors with the same product on 32F CPU tensors. Variant 1 puts the vector
// first: a 1x128 input times a 64x128 weight (second operand transposed),
// giving a 1x64 output.
TEST_CASE("mps forward gemv in half precision no bias, variant 1")
2903
1
{
2904
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2905
0
  dsfmt_t dsfmt;
2906
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
2907
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
2908
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2909
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
2910
2911
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2912
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2913
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2914
0
  int i;
2915
0
  // Weights are random values scaled down by 64 * 128.
  for (i = 0; i < 64 * 128; i++)
2916
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2917
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2918
0
  for (i = 0; i < 128; i++)
2919
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2920
0
  // ha (CPU reference input) is an element-wise copy of ha1.
  for (i = 0; i < 128; i++)
2921
0
    ha->data.f32[i] = ha1->data.f32[i];
2922
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
2923
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2924
0
  // Convert the 32F inputs into 16F CPU tensors, then copy those into the GPU tensors.
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2925
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2926
0
  // Same GEMM command twice: first on the 32F CPU tensors, then on the 16F GPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2927
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2928
0
  // Bring the GPU result back as 16F on the CPU, then widen it to 32F for the comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
2929
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2930
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2931
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2932
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2933
0
  ccv_nnc_tensor_free(a);
2934
0
  ccv_nnc_tensor_free(w);
2935
0
  ccv_nnc_tensor_free(b);
2936
0
  ccv_nnc_tensor_free(tb);
2937
0
  ccv_nnc_tensor_free(ha);
2938
0
  ccv_nnc_tensor_free(ha1);
2939
0
  ccv_nnc_tensor_free(tb1);
2940
0
  ccv_nnc_tensor_free(hw);
2941
0
  ccv_nnc_tensor_free(hb);
2942
0
  ccv_nnc_tensor_free(ha2);
2943
0
  ccv_nnc_tensor_free(hw2);
2944
0
}
2945
2946
// Compares a bias-free matrix-vector product in half precision (16F) on GPU
// tensors with the same product on 32F CPU tensors. Variant 2 puts the matrix
// first: a 64x128 weight times a 128x1 input with no transpose on either
// operand, giving a 64x1 output.
TEST_CASE("mps forward gemv in half precision no bias, variant 2")
2947
1
{
2948
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2949
0
  dsfmt_t dsfmt;
2950
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
2951
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2952
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 128, 1), 0);
2953
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 1), 0);
2954
2955
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2956
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
2957
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
2958
0
  int i;
2959
0
  // Weights are random values scaled down by 64 * 128.
  for (i = 0; i < 64 * 128; i++)
2960
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2961
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
2962
0
  for (i = 0; i < 128; i++)
2963
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2964
0
  // ha (CPU reference input) is an element-wise copy of ha1.
  for (i = 0; i < 128; i++)
2965
0
    ha->data.f32[i] = ha1->data.f32[i];
2966
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2967
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128, 1), 0);
2968
0
  // Convert the 32F inputs into 16F CPU tensors, then copy those into the GPU tensors.
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2969
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2970
0
  // The weight is the first operand here and the vector the second.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(hw, ha), TENSOR_LIST(hb), 0);
2971
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(w, a), TENSOR_LIST(b), 0);
2972
0
  // Bring the GPU result back as 16F on the CPU, then widen it to 32F for the comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 1), 0);
2973
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2974
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
2975
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2976
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2977
0
  ccv_nnc_tensor_free(a);
2978
0
  ccv_nnc_tensor_free(w);
2979
0
  ccv_nnc_tensor_free(b);
2980
0
  ccv_nnc_tensor_free(tb);
2981
0
  ccv_nnc_tensor_free(ha);
2982
0
  ccv_nnc_tensor_free(ha1);
2983
0
  ccv_nnc_tensor_free(tb1);
2984
0
  ccv_nnc_tensor_free(hw);
2985
0
  ccv_nnc_tensor_free(hb);
2986
0
  ccv_nnc_tensor_free(ha2);
2987
0
  ccv_nnc_tensor_free(hw2);
2988
0
}
2989
2990
// Checks that a batched GEMM on GPU tensors gives the same result whether its
// inputs are materialized transposes (at, wt, produced by CMD_TRANSPOSE_FORWARD)
// or tensor views (av, wv) that describe the same (2, rows, 128) layout over the
// original (rows, 2, 128) storage. Both results are copied to the CPU and
// compared with a 1e-5 tolerance.
TEST_CASE("mps handle permute")
2991
1
{
2992
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2993
0
  dsfmt_t dsfmt;
2994
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
2995
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 2, 128), 0);
2996
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 2, 128), 0);
2997
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 2, 128), 0);
2998
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 2, 128), 0);
2999
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
3000
3001
0
  // at / wt / bt: GPU tensors for the explicitly transposed inputs and their GEMM output.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), 0);
3002
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), 0);
3003
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
3004
0
  int i;
3005
0
  for (i = 0; i < 2 * 64 * 128; i++)
3006
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3007
0
  for (i = 0; i < 2 * 10 * 128; i++)
3008
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3009
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
3010
0
  // Path 1: transpose axes 0 and 1 into separate tensors, then run the batched GEMM on them.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(at), 0);
3011
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wt), 0);
3012
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
3013
0
  // Path 2: views over a and w with the first two dimensions swapped.
  // NOTE(review): DIM_ALLOC(128, 2 * 128, 1) reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
3014
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
3015
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
3016
0
  // Copy both GPU outputs to CPU tensors for the comparison.
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
3017
0
  ccv_nnc_tensor_t* hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
3018
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, bt), TENSOR_LIST(hb, hbt), 0);
3019
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, hbt->data.f32, 2 * 10 * 64, 1e-5, "permute computed output should be numerically close to non-permute computed ones");
3020
0
  ccv_nnc_tensor_free(ha);
3021
0
  ccv_nnc_tensor_free(hw);
3022
0
  ccv_nnc_tensor_free(a);
3023
0
  ccv_nnc_tensor_free(w);
3024
0
  ccv_nnc_tensor_free(b);
3025
0
  ccv_nnc_tensor_view_free(av);
3026
0
  ccv_nnc_tensor_view_free(wv);
3027
0
  ccv_nnc_tensor_free(at);
3028
0
  ccv_nnc_tensor_free(wt);
3029
0
  ccv_nnc_tensor_free(bt);
3030
0
  ccv_nnc_tensor_free(hb);
3031
0
  ccv_nnc_tensor_free(hbt);
3032
0
}
3033
3034
// Runs a GEMM with two leading batch dimensions (2, 4) on GPU tensors through
// permuted views (av, wv) and compares the result with the same GEMM on CPU
// tensors (at, wt) that were explicitly transposed with CMD_TRANSPOSE_FORWARD(1, 2).
TEST_CASE("generalized batched gemm with batch (2, 4) compare mps")
3035
1
{
3036
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
3037
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
3038
0
  dsfmt_t dsfmt;
3039
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
3040
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3041
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
3042
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3043
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3044
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
3045
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
3046
3047
0
  // at / wt / bt: CPU tensors for the explicitly transposed inputs and the reference output.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3048
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
3049
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3050
0
  int i;
3051
0
  for (i = 0; i < 8 * 64 * 128; i++)
3052
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3053
0
  for (i = 0; i < 8 * 10 * 128; i++)
3054
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3055
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
3056
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
3057
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
3058
0
  // Views over the untransposed GPU tensors with dimensions 1 and 2 swapped.
  // NOTE(review): the DIM_ALLOC(...) argument reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3059
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
3060
0
  // Same GEMM command on the GPU views and on the transposed CPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
3061
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
3062
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
3063
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
3064
0
  ccv_nnc_tensor_free(ha);
3065
0
  ccv_nnc_tensor_free(hw);
3066
0
  ccv_nnc_tensor_free(hb);
3067
0
  ccv_nnc_tensor_free(a);
3068
0
  ccv_nnc_tensor_free(w);
3069
0
  ccv_nnc_tensor_free(b);
3070
0
  ccv_nnc_tensor_view_free(av);
3071
0
  ccv_nnc_tensor_view_free(wv);
3072
0
  ccv_nnc_tensor_free(at);
3073
0
  ccv_nnc_tensor_free(wt);
3074
0
  ccv_nnc_tensor_free(bt);
3075
0
}
3076
3077
// Runs a GEMM whose first operand has two leading batch dimensions (2, 4) and
// whose weight is a plain 64x128 matrix shared by every batch. The GPU side
// reads the input through a permuted view (av); the CPU reference uses an
// explicitly transposed copy (at). The outputs are then compared.
TEST_CASE("generalized batched gemm with batch (2, 4) and broadcast compare mps")
3078
1
{
3079
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
3080
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
3081
0
  dsfmt_t dsfmt;
3082
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
3083
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3084
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
3085
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3086
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3087
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
3088
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
3089
3090
0
  // at / bt: CPU tensors for the explicitly transposed input and the reference output.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3091
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3092
0
  int i;
3093
0
  for (i = 0; i < 64 * 128; i++)
3094
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3095
0
  for (i = 0; i < 8 * 10 * 128; i++)
3096
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3097
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
3098
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
3099
0
  // View over the untransposed GPU input with dimensions 1 and 2 swapped.
  // NOTE(review): the DIM_ALLOC(...) argument reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3100
0
  // The weight has no batch dimensions, so its transpose is over axes (0, 1).
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w), TENSOR_LIST(b), 0);
3101
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw), TENSOR_LIST(bt), 0);
3102
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
3103
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
3104
0
  ccv_nnc_tensor_free(ha);
3105
0
  ccv_nnc_tensor_free(hw);
3106
0
  ccv_nnc_tensor_free(hb);
3107
0
  ccv_nnc_tensor_free(a);
3108
0
  ccv_nnc_tensor_free(w);
3109
0
  ccv_nnc_tensor_free(b);
3110
0
  ccv_nnc_tensor_view_free(av);
3111
0
  ccv_nnc_tensor_free(at);
3112
0
  ccv_nnc_tensor_free(bt);
3113
0
}
3114
3115
// Runs a GEMM with two leading batch dimensions (2, 4) and a 64-element bias
// (passed as the third input) on GPU tensors through permuted views (av, wv),
// and compares the result with the same GEMM on CPU tensors (at, wt) that were
// explicitly transposed with CMD_TRANSPOSE_FORWARD(1, 2).
TEST_CASE("generalized batched gemm with batch (2, 4) with bias compare mps")
3116
1
{
3117
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
3118
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
3119
0
  dsfmt_t dsfmt;
3120
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
3121
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3122
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
3123
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
3124
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3125
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3126
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
3127
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
3128
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
3129
3130
0
  // at / wt / bt: CPU tensors for the explicitly transposed inputs and the reference output.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3131
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
3132
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3133
0
  int i;
3134
0
  for (i = 0; i < 8 * 64 * 128; i++)
3135
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3136
0
  // Bias values are random values scaled down by 64.
  for (i = 0; i < 64; i++)
3137
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
3138
0
  for (i = 0; i < 8 * 10 * 128; i++)
3139
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3140
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
3141
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
3142
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
3143
0
  // Views over the untransposed GPU tensors with dimensions 1 and 2 swapped.
  // NOTE(review): the DIM_ALLOC(...) argument reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3144
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
3145
0
  // Same GEMM command (with bias) on the GPU views and on the transposed CPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv, bias), TENSOR_LIST(b), 0);
3146
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt, hbias), TENSOR_LIST(bt), 0);
3147
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
3148
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
3149
0
  ccv_nnc_tensor_free(ha);
3150
0
  ccv_nnc_tensor_free(hw);
3151
0
  ccv_nnc_tensor_free(hbias);
3152
0
  ccv_nnc_tensor_free(hb);
3153
0
  ccv_nnc_tensor_free(a);
3154
0
  ccv_nnc_tensor_free(w);
3155
0
  ccv_nnc_tensor_free(bias);
3156
0
  ccv_nnc_tensor_free(b);
3157
0
  ccv_nnc_tensor_view_free(av);
3158
0
  ccv_nnc_tensor_view_free(wv);
3159
0
  ccv_nnc_tensor_free(at);
3160
0
  ccv_nnc_tensor_free(wt);
3161
0
  ccv_nnc_tensor_free(bt);
3162
0
}
3163
3164
// Runs a GEMM whose first operand has two leading batch dimensions (2, 4),
// whose weight is a plain 64x128 matrix shared by every batch, and which takes
// a 64-element bias as the third input. The GPU side reads the input through a
// permuted view (av); the CPU reference uses an explicitly transposed copy (at).
TEST_CASE("generalized batched gemm with batch (2, 4) with bias and broadcast compare mps")
3165
1
{
3166
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM forward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
3167
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
3168
0
  dsfmt_t dsfmt;
3169
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
3170
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3171
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
3172
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
3173
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3174
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3175
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
3176
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
3177
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
3178
3179
0
  // at / bt: CPU tensors for the explicitly transposed input and the reference output.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3180
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3181
0
  int i;
3182
0
  for (i = 0; i < 64 * 128; i++)
3183
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3184
0
  // Bias values are random values scaled down by 64.
  for (i = 0; i < 64; i++)
3185
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
3186
0
  for (i = 0; i < 8 * 10 * 128; i++)
3187
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3188
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
3189
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
3190
0
  // View over the untransposed GPU input with dimensions 1 and 2 swapped.
  // NOTE(review): the DIM_ALLOC(...) argument reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3191
0
  // The weight has no batch dimensions, so its transpose is over axes (0, 1).
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w, bias), TENSOR_LIST(b), 0);
3192
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw, hbias), TENSOR_LIST(bt), 0);
3193
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
3194
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
3195
0
  ccv_nnc_tensor_free(ha);
3196
0
  ccv_nnc_tensor_free(hw);
3197
0
  ccv_nnc_tensor_free(hbias);
3198
0
  ccv_nnc_tensor_free(hb);
3199
0
  ccv_nnc_tensor_free(a);
3200
0
  ccv_nnc_tensor_free(w);
3201
0
  ccv_nnc_tensor_free(bias);
3202
0
  ccv_nnc_tensor_free(b);
3203
0
  ccv_nnc_tensor_view_free(av);
3204
0
  ccv_nnc_tensor_free(at);
3205
0
  ccv_nnc_tensor_free(bt);
3206
0
}
3207
3208
// Runs GEMM backward with two leading batch dimensions (2, 4) on GPU tensors,
// reading inputs and writing both outputs through permuted views (av, wv, dav,
// dwv), and compares with the same command on explicitly transposed CPU tensors.
// The CPU outputs (dat, dwt) are transposed back with CMD_TRANSPOSE_FORWARD(1, 2)
// into tda / tdw so they share the layout of the GPU tensors da / dw.
TEST_CASE("generalized batched backward gemm with batch (2, 4) compare mps")
3209
1
{
3210
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM backward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
3211
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
3212
0
  dsfmt_t dsfmt;
3213
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
3214
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3215
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
3216
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3217
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
3218
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3219
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3220
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
3221
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3222
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
3223
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
3224
3225
0
  // CPU tensors: transposed inputs (at, wt), backward outputs in the transposed
  // layout (dat, dwt), and those outputs transposed back (tda, tdw).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3226
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
3227
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3228
0
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
3229
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3230
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
3231
0
  int i;
3232
0
  for (i = 0; i < 8 * 64 * 128; i++)
3233
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3234
0
  for (i = 0; i < 8 * 10 * 128; i++)
3235
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3236
0
  // hb is filled with random values and passed as the first backward input
  // (presumably the gradient with respect to the GEMM output — confirm against CMD_GEMM_BACKWARD).
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
3237
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3238
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
3239
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
3240
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
3241
0
  // Views over the GPU inputs and outputs with dimensions 1 and 2 swapped.
  // NOTE(review): the DIM_ALLOC(...) argument reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3242
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
3243
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3244
0
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
3245
0
  // Same backward command on the GPU views and on the transposed CPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv), 0);
3246
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt), 0);
3247
0
  // Transpose the CPU outputs back so they can be compared with da / dw.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
3248
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
3249
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
3250
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
3251
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
3252
0
  ccv_nnc_tensor_free(ha);
3253
0
  ccv_nnc_tensor_free(hw);
3254
0
  ccv_nnc_tensor_free(hda);
3255
0
  ccv_nnc_tensor_free(hdw);
3256
0
  ccv_nnc_tensor_free(hb);
3257
0
  ccv_nnc_tensor_free(a);
3258
0
  ccv_nnc_tensor_free(w);
3259
0
  ccv_nnc_tensor_free(da);
3260
0
  ccv_nnc_tensor_free(dw);
3261
0
  ccv_nnc_tensor_free(b);
3262
0
  ccv_nnc_tensor_view_free(av);
3263
0
  ccv_nnc_tensor_view_free(wv);
3264
0
  ccv_nnc_tensor_view_free(dav);
3265
0
  ccv_nnc_tensor_view_free(dwv);
3266
0
  ccv_nnc_tensor_free(at);
3267
0
  ccv_nnc_tensor_free(wt);
3268
0
  ccv_nnc_tensor_free(dat);
3269
0
  ccv_nnc_tensor_free(tda);
3270
0
  ccv_nnc_tensor_free(dwt);
3271
0
  ccv_nnc_tensor_free(tdw);
3272
0
}
3273
3274
// Runs GEMM backward where the first operand has two leading batch dimensions
// (2, 4) and the weight is a plain 64x128 matrix shared by every batch. The GPU
// side reads the input and writes its gradient through permuted views (av, dav)
// and writes the weight gradient into dw. The CPU reference works on an
// explicitly transposed input (at), writes the weight gradient straight into
// tdw, and transposes the input gradient back into tda before the comparison.
TEST_CASE("generalized batched backward gemm with batch (2, 4) and broadcast compare mps")
3275
1
{
3276
1
  // Only continue when ccv_nnc_cmd_ok reports GEMM backward for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
3277
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
3278
0
  dsfmt_t dsfmt;
3279
0
  // Fixed seed (0) so the generated inputs are the same on every run.
  dsfmt_init_gen_rand(&dsfmt, 0);
3280
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3281
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
3282
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3283
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
3284
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
3285
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3286
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
3287
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
3288
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
3289
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
3290
3291
0
  // CPU tensors: transposed input (at), its gradient in the transposed layout
  // (dat), that gradient transposed back (tda), and the reference weight gradient (tdw).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3292
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
3293
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
3294
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
3295
0
  int i;
3296
0
  for (i = 0; i < 64 * 128; i++)
3297
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
3298
0
  for (i = 0; i < 8 * 10 * 128; i++)
3299
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3300
0
  // hb is filled with random values and passed as the first backward input
  // (presumably the gradient with respect to the GEMM output — confirm against CMD_GEMM_BACKWARD).
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
3301
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
3302
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
3303
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
3304
0
  // Views over the GPU input and its gradient with dimensions 1 and 2 swapped.
  // NOTE(review): the DIM_ALLOC(...) argument reads as the per-axis strides of the view — confirm against ccv_nnc_tensor_view_new.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3305
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
3306
0
  // Same backward command on the GPU tensors/views and on the CPU tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw), 0);
3307
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw), TENSOR_LIST(dat, tdw), 0);
3308
0
  // Transpose the CPU input gradient back so it can be compared with da.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
3309
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
3310
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
3311
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
3312
0
  ccv_nnc_tensor_free(ha);
3313
0
  ccv_nnc_tensor_free(hw);
3314
0
  ccv_nnc_tensor_free(hda);
3315
0
  ccv_nnc_tensor_free(hdw);
3316
0
  ccv_nnc_tensor_free(hb);
3317
0
  ccv_nnc_tensor_free(a);
3318
0
  ccv_nnc_tensor_free(w);
3319
0
  ccv_nnc_tensor_free(da);
3320
0
  ccv_nnc_tensor_free(dw);
3321
0
  ccv_nnc_tensor_free(b);
3322
0
  ccv_nnc_tensor_view_free(av);
3323
0
  ccv_nnc_tensor_view_free(dav);
3324
0
  ccv_nnc_tensor_free(at);
3325
0
  ccv_nnc_tensor_free(dat);
3326
0
  ccv_nnc_tensor_free(tda);
3327
0
  ccv_nnc_tensor_free(tdw);
3328
0
}
3329
3330
/*
 * Batched GEMM backward that also produces a bias gradient. The run on the
 * GPU_TENSOR_NHWC tensors goes through permuted tensor views of a, w, da and dw;
 * the run on the CPU_TENSOR_NHWC tensors uses explicitly transposed copies.
 * Both sets of gradients (da, dw, dbias) are required to match.
 */
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // This is a particular batched gemm which treats every dimension other than the last two as batching.
  const int w_count = 8 * 64 * 128;
  const int a_count = 8 * 10 * 128;
  const int b_count = 2 * 4 * 10 * 64;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Host inputs and the host buffers that receive the device results.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);

  // Device tensors, stored in the un-permuted layout.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);

  // Host reference tensors: transposed inputs (at, wt), gradients in the transposed
  // layout (dat, dwt) and gradients transposed back for comparison (tda, tdw, tdbias).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);

  // Fill order (hw, ha, hb) determines which random draws land in which tensor.
  float* const w_values = hw->data.f32;
  for (int i = 0; i < w_count; i++)
    w_values[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  float* const a_values = ha->data.f32;
  for (int i = 0; i < a_count; i++)
    a_values[i] = dsfmt_genrand_open_close(&dsfmt);
  float* const b_values = hb->data.f32;
  for (int i = 0; i < b_count; i++)
    b_values[i] = dsfmt_genrand_open_close(&dsfmt);

  // Build the transposed host inputs, then upload the un-permuted inputs.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);

  // Strided views that present the device tensors with dimensions 1 and 2 swapped.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));

  // Same command twice: once over the device views, once over the host transposed copies.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt, tdbias), 0);

  // Download the device gradients and bring the host gradients back to the un-permuted layout.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");

  // Release views, then device tensors, then host tensors.
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_view_free(dwv);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(wt);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(dwt);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
3402
3403
/*
 * Batched GEMM backward where the weight (64 x 128) carries no batch dimensions
 * while a and b do. The run on the GPU_TENSOR_NHWC tensors reads a / writes da
 * through permuted tensor views; the run on the CPU_TENSOR_NHWC tensors uses an
 * explicitly transposed copy of a. The resulting da, dw and dbias must match.
 */
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias and broadcast compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // This is a particular batched gemm which treats every dimension other than the last two as batching.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host inputs (ha, hw, hb) and host buffers that receive the device results (hda, hdw, hdbias).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device tensors, stored in the un-permuted layout.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // Host reference tensors: transposed input (at), its gradient (dat), and the
  // gradients in the un-permuted layout used for comparison (tda, tdw, tdbias).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  int i;
  // Fill order (hw, ha, hb) determines which random draws land in which tensor.
  for (i = 0; i < 64 * 128; i++)
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (i = 0; i < 8 * 10 * 128; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Strided views that present a and da with dimensions 1 and 2 swapped.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // NOTE(review): dbias / hdbias are also passed in the bias input slot without being
  // initialized; presumably the backward pass never reads that slot - confirm.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w, dbias), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw, hdbias), TENSOR_LIST(dat, tdw, tdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(dat);
  // tda was previously never released, leaking the tensor on every run of this test.
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
3464
3465
/*
 * Runs EWDIV forward with a null first input on a 10 x 100 32F tensor, once on
 * the device tensors and once on the host tensors, and compares the results.
 * Per the test name, the null numerator requests the element-wise reciprocal.
 */
TEST_CASE("ewdiv forward with reciprocal")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3489
3490
/*
 * Runs EWDIV forward with two 10 x 100 32F operands, once on the device tensors
 * and once on the host tensors, and requires both results to be equal.
 */
TEST_CASE("ewdiv forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: operands (ha, hb), downloaded device result (hc), host-computed reference (ct).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: operands (a, b) and output (c).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  // ha is filled before hb so each keeps the same random draws.
  float* const numerator = ha->data.f32;
  for (int i = 0; i < count; i++)
    numerator[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  float* const denominator = hb->data.f32;
  for (int i = 0; i < count; i++)
    denominator[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(ct);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3520
3521
/*
 * Runs EWEXP forward on a 10 x 100 32F tensor, once on the device tensors and
 * once on the host tensors, and requires both results to be equal.
 */
TEST_CASE("exp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3545
3546
/*
 * Runs EWPOW forward (command parameter 3) on a 10 x 100 32F tensor, once on the
 * device tensors and once on the host tensors, and requires equal results.
 */
TEST_CASE("ewpow forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWPOW_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hc), host-computed reference (ct).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (c).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 2 + 0.1;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(ct);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(a);
}
3570
3571
/*
 * Runs EWSIN forward on a 10 x 100 32F tensor, once on the device tensors and
 * once on the host tensors, and compares the raw float arrays with a 1e-3 tolerance.
 */
TEST_CASE("ewsin forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSIN_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3595
3596
/*
 * Runs EWCOS forward on a 10 x 100 32F tensor, once on the device tensors and
 * once on the host tensors, and compares the raw float arrays with a 1e-3 tolerance.
 */
TEST_CASE("ewcos forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWCOS_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3620
3621
/*
 * Runs EWLOG forward on a 10 x 100 32F tensor, once on the device tensors and
 * once on the host tensors, and requires both results to be equal.
 */
TEST_CASE("ewlog forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3645
3646
/*
 * Runs EWSQRT forward on a 10 x 100 32F tensor, once on the device tensors and
 * once on the host tensors, and requires both results to be equal.
 */
TEST_CASE("ewsqrt forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3670
3671
/*
 * Runs EWABS forward on a 10 x 100 32F tensor, once on the device tensors and
 * once on the host tensors, and requires both results to be equal.
 */
TEST_CASE("ewabs forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5 + 0.0001;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3695
3696
/*
 * Runs CLAMP forward with parameters (0, 6) on a 10 x 100 32F tensor, once on
 * the device tensors and once on the host tensors, and requires equal results.
 */
TEST_CASE("clamp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3720
3721
/*
 * Runs CLAMP forward with parameters (NAN, 6) on a 10 x 100 32F tensor, once on
 * the device tensors and once on the host tensors, and requires equal results.
 * Per the test name, NAN in the first slot leaves the lower bound unset.
 */
TEST_CASE("clamp forward with only max")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3745
3746
/*
 * Runs CLAMP forward with parameters (0, NAN) on a 10 x 100 32F tensor, once on
 * the device tensors and once on the host tensors, and requires equal results.
 * Per the test name, NAN in the second slot leaves the upper bound unset.
 */
TEST_CASE("clamp forward with only min")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  const int count = 1000;
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host: input (ha), downloaded device result (hb), host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device: input (a) and output (b).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  float* const input = ha->data.f32;
  for (int i = 0; i < count; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3770
3771
/*
 * Runs SET forward with value 10 on an 11 x 10 x 9 x 8 device tensor (NHWC) and
 * on a host tensor of the same dimensions (NCHW), downloads the device tensor
 * into an NHWC host tensor, and requires the two host tensors to be equal.
 */
TEST_CASE("compare set with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host reference (ga) and host landing buffer for the device result (ha).
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 11, 10, 9, 8), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 11, 10, 9, 8), 0);
  // Device tensor under test.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 11, 10, 9, 8), 0);

  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(ga), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ha), 0);
  REQUIRE_TENSOR_EQ(ha, ga, "format transform result should be the same");

  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(a);
}
3785
3786
// Cross-checks the MPS scaled dot product attention forward pass against the
// result computed from CPU tensors, over eight (B, R, C, Hq, Hk, D, is_causal)
// combinations.
//
// The CPU reference is run with the trial's own is_causal flag. The MPS run
// always passes is_causal = 0; for causal trials it is instead given an
// explicit (1, 1, R, C) additive mask as a fourth input, holding 0 for visible
// positions and -FLT_MAX for hidden ones. The two outputs are compared with
// REQUIRE_ARRAY_EQ_WITH_TOLERANCE at 1e-3.
TEST_CASE("scaled dot product attention with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  // The trial counts are macros rather than const ints so that the candidate
  // tables below are fixed-size arrays and may carry initializers
  // (bypasses "variable-sized object may not be initialized").
#define num_long_trials 6
#define num_short_trials 2
#define num_trials (num_long_trials + num_short_trials)

  // One column per trial. The tables never change, so they are built once
  // instead of on every loop iteration.
  static const int B_candidates[num_trials] =         {  32, 1, 1, 1,  32,   3, 2, 1 };
  static const int R_candidates[num_trials] =         { 128, 4128, 4098, 4162, 128,  61, 6, 2 };
  static const int C_candidates[num_trials] =         { 128, 4128, 4098, 4162, 128,  49, 2, 1 };
  static const int Hq_candidates[num_trials] =        {   8, 32, 32, 32,  32,  13, 3, 1 };
  static const int Hk_candidates[num_trials] =        {   8, 8, 8, 8,   8,  13, 3, 1 };
  static const int D_candidates[num_trials] =         {  64, 32, 32, 32, 128, 191, 4, 8 };
  static const int is_causal_candidates[num_trials] = {   0, 0, 0, 0,   1,   0, 1, 0 };

  for (int trial = 0; trial < num_trials; ++trial) {
    const int B = B_candidates[trial];
    const int R = R_candidates[trial];
    const int C = C_candidates[trial];
    const int Hq = Hq_candidates[trial];
    const int Hk = Hk_candidates[trial];
    const int D = D_candidates[trial];
    const int is_causal = is_causal_candidates[trial];
    const float scale = 1.0 / sqrt((float)D);

    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);

    // Deterministic ramps in [0, 1): element i holds i / element_count.
    for (int i = 0; i < B * R * Hq * D; ++i) {
      q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
    }
    for (int i = 0; i < B * C * Hk * D; ++i) {
      k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
    }
    for (int i = 0; i < B * C * Hk * D; ++i) {
      v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
    }

    // Reference result, computed from the CPU tensors with the built-in causal flag.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);

    // NOTE(review): the leading 000 passed to GPU_TENSOR_NHWC presumably selects
    // the GPU device (device 0) — confirm against the macro definition.
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);

    if (is_causal)
    {
      ccv_nnc_tensor_t* const causal_mask = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, R, C), 0);
      ccv_nnc_tensor_t* const gpu_causal_mask = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, R, C), 0);
      for (int i = 0; i < R; i++)
        for (int j = 0; j < C; j++)
          causal_mask->data.f32[i * C + j] = 0;
      // Row i hides every column j >= i - R + C + 1; the last row hides nothing.
      // When R > C that first hidden column is negative for the leading rows.
      // Clamp it to 0 so the whole row is hidden and nothing is written before
      // the start of the mask buffer (the unclamped loop wrote to negative
      // offsets for trial 6, where R = 6 and C = 2).
      for (int i = 0; i < R - 1; i++)
      {
        const int first_hidden = i - R + C + 1;
        for (int j = first_hidden > 0 ? first_hidden : 0; j < C; j++)
          causal_mask->data.f32[i * C + j] = -FLT_MAX;
      }
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(causal_mask), TENSOR_LIST(gpu_causal_mask), 0);
      // The mask carries the causality, so the command itself is run with is_causal = 0.
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_causal_mask), TENSOR_LIST(gpu_o_tensor), 0);
      ccv_nnc_tensor_free(gpu_causal_mask);
      ccv_nnc_tensor_free(causal_mask);
    } else {
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
    }

    // Bring the GPU output back to a CPU tensor for comparison.
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");

    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
  }
#undef num_long_trials
#undef num_short_trials
#undef num_trials
}
3874
3875
// Compares MPS scaled dot product attention, run with the
// CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I flags set on the command, against a
// reference computed from 32F CPU tensors without those flags.
// Shapes are fixed at B=1, R=C=128, H=24; the head dimension D sweeps Ds[] and
// each D is run with 16F, 16BF and 32F GPU tensors.
// Pass criterion per (dtype, D): max over elements of
// |cpu - gpu| / max(|cpu|, |gpu|, 1) must not exceed tolerances[dtype].
// NOTE(review): "NA" in the test name is not defined in this block — confirm
// which kernel path the flags are meant to select.
TEST_CASE("scaled dot product attention with quantized NA mps")
3876
1
{
3877
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
3878
0
  const int B = 1;
3879
0
  const int R = 128;
3880
0
  const int C = 128;
3881
0
  const int H = 24;
3882
0
  const int Ds[] = { 64, 80, 128, 130, 160, 192, 224, 256 };
3883
0
  // datatypes, tolerances and datatype_names are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
3884
0
  const float tolerances[] = { 2e-2, 3e-2, 2e-2 };
3885
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
3886
0
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
3887
0
  {
3888
0
    const int D = Ds[d_idx];
3889
0
    const float scale = 1.0 / sqrt((float)D);
3890
3891
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3892
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3893
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3894
0
    // 16-bit staging buffers, reused for both the half-precision and the bfloat runs below.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3895
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3896
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3897
0
    const int q_count = B * R * H * D;
3898
0
    const int kv_count = B * C * H * D;
3899
0
    dsfmt_t dsfmt;
3900
0
    // The seed varies with d_idx, so every D gets its own input sequence.
    dsfmt_init_gen_rand(&dsfmt, 11 + d_idx);
3901
0
    for (int i = 0; i < q_count; ++i)
3902
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3903
0
    for (int i = 0; i < kv_count; ++i)
3904
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3905
0
    for (int i = 0; i < kv_count; ++i)
3906
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3907
3908
0
    // Reference output: computed once per D from the 32F CPU tensors and reused for every datatype.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3909
0
    ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
3910
0
    ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
3911
3912
0
    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
3913
0
    {
3914
0
      const int datatype = datatypes[datatype_idx];
3915
0
      // Default to the 32F inputs; the 16-bit branches redirect these to the staging buffers.
      ccv_nnc_tensor_t* q_input = q_tensor;
3916
0
      ccv_nnc_tensor_t* k_input = k_tensor;
3917
0
      ccv_nnc_tensor_t* v_input = v_tensor;
3918
0
      ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
3919
0
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
3920
0
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
3921
0
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
3922
0
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
3923
0
      if (datatype == CCV_16F)
3924
0
      {
3925
0
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3926
0
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3927
0
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3928
0
        q_input = q_tensor_f16;
3929
0
        k_input = k_tensor_f16;
3930
0
        v_input = v_tensor_f16;
3931
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3932
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3933
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3934
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3935
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3936
0
      } else if (datatype == CCV_16BF) {
3937
0
        // The bfloat bits are written into the CPU buffers that were declared 16F; only the GPU
        // tensors and the read-back tensor are declared 16BF.
        // NOTE(review): this assumes the data transfer copies the 16-bit payload unchanged — confirm.
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3938
0
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3939
0
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3940
0
        q_input = q_tensor_f16;
3941
0
        k_input = k_tensor_f16;
3942
0
        v_input = v_tensor_f16;
3943
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3944
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3945
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3946
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3947
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
3948
0
      } else {
3949
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3950
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3951
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3952
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3953
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3954
0
      }
3955
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
3956
0
      // Same command as the reference, except for the flags set on the next line.
      ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
3957
0
      gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
3958
0
      ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
3959
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
3960
3961
0
      // Same product as q_count: the output has the shape of q.
      const int count = B * R * H * D;
3962
0
      // Both results are brought to plain float arrays so they can be compared element by element.
      float* const cpu_f32 = (float*)ccmalloc(sizeof(float) * count);
3963
0
      float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * count);
3964
0
      memcpy(cpu_f32, o_tensor->data.f32, sizeof(float) * count);
3965
0
      if (datatype == CCV_16F)
3966
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
3967
0
      else if (datatype == CCV_16BF)
3968
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
3969
0
      else
3970
0
        memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * count);
3971
0
      float max_relative_diff = 0;
3972
0
      int max_diff_idx = 0;
3973
0
      for (int i = 0; i < count; ++i)
3974
0
      {
3975
0
        // The denominator is floored at 1, so for magnitudes below 1 this is the absolute difference.
        const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
3976
0
        const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
3977
0
        if (relative_diff > max_relative_diff)
3978
0
          max_relative_diff = relative_diff, max_diff_idx = i;
3979
0
      }
3980
0
      // The message reports the worst element so a failure can be located.
      REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized attention result should match CPU reference for dtype=%s D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);
3981
3982
0
      ccfree(cpu_f32);
3983
0
      ccfree(gpu_f32);
3984
0
      ccv_nnc_tensor_free(gpu_o_tensor);
3985
0
      ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
3986
0
      ccv_nnc_tensor_free(gpu_q_tensor);
3987
0
      ccv_nnc_tensor_free(gpu_k_tensor);
3988
0
      ccv_nnc_tensor_free(gpu_v_tensor);
3989
0
    }
3990
0
    ccv_nnc_tensor_free(o_tensor);
3991
0
    ccv_nnc_tensor_free(q_tensor);
3992
0
    ccv_nnc_tensor_free(k_tensor);
3993
0
    ccv_nnc_tensor_free(v_tensor);
3994
0
    ccv_nnc_tensor_free(q_tensor_f16);
3995
0
    ccv_nnc_tensor_free(k_tensor_f16);
3996
0
    ccv_nnc_tensor_free(v_tensor_f16);
3997
0
  }
3998
0
}
3999
4000
// Batched variant of the flagged (CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I) MPS
// attention check: B=3, R=C=128, H=8, with D taken from Ds[] and each D run
// with 16F, 16BF and 32F GPU tensors.
// The reference is computed once per D from 32F CPU tensors without the flags.
// Pass criterion per (dtype, D): max over elements of
// |cpu - gpu| / max(|cpu|, |gpu|, 1) must not exceed tolerances[dtype].
TEST_CASE("scaled dot product attention with quantized NA mps batched")
4001
1
{
4002
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
4003
0
  const int B = 3;
4004
0
  const int R = 128;
4005
0
  const int C = 128;
4006
0
  const int H = 8;
4007
0
  const int Ds[] = { 64, 128 };
4008
0
  // datatypes, tolerances and datatype_names are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
4009
0
  const float tolerances[] = { 2e-2, 3e-2, 2e-2 };
4010
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
4011
0
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
4012
0
  {
4013
0
    const int D = Ds[d_idx];
4014
0
    const float scale = 1.0 / sqrt((float)D);
4015
4016
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4017
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4018
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4019
0
    // 16-bit staging buffers, reused for both the half-precision and the bfloat runs below.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4020
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4021
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4022
0
    const int q_count = B * R * H * D;
4023
0
    const int kv_count = B * C * H * D;
4024
0
    dsfmt_t dsfmt;
4025
0
    // The seed varies with d_idx, so every D gets its own input sequence.
    dsfmt_init_gen_rand(&dsfmt, 101 + d_idx);
4026
0
    for (int i = 0; i < q_count; ++i)
4027
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4028
0
    for (int i = 0; i < kv_count; ++i)
4029
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4030
0
    for (int i = 0; i < kv_count; ++i)
4031
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4032
4033
0
    // Reference output: computed once per D from the 32F CPU tensors and reused for every datatype.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4034
0
    ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4035
0
    ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
4036
4037
0
    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
4038
0
    {
4039
0
      const int datatype = datatypes[datatype_idx];
4040
0
      // Default to the 32F inputs; the 16-bit branches redirect these to the staging buffers.
      ccv_nnc_tensor_t* q_input = q_tensor;
4041
0
      ccv_nnc_tensor_t* k_input = k_tensor;
4042
0
      ccv_nnc_tensor_t* v_input = v_tensor;
4043
0
      ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
4044
0
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
4045
0
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
4046
0
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
4047
0
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
4048
0
      if (datatype == CCV_16F)
4049
0
      {
4050
0
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4051
0
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4052
0
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4053
0
        q_input = q_tensor_f16;
4054
0
        k_input = k_tensor_f16;
4055
0
        v_input = v_tensor_f16;
4056
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4057
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4058
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4059
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4060
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4061
0
      } else if (datatype == CCV_16BF) {
4062
0
        // The bfloat bits are written into the CPU buffers that were declared 16F; only the GPU
        // tensors and the read-back tensor are declared 16BF.
        // NOTE(review): this assumes the data transfer copies the 16-bit payload unchanged — confirm.
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4063
0
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4064
0
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4065
0
        q_input = q_tensor_f16;
4066
0
        k_input = k_tensor_f16;
4067
0
        v_input = v_tensor_f16;
4068
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4069
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4070
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4071
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4072
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
4073
0
      } else {
4074
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4075
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4076
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4077
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4078
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4079
0
      }
4080
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
4081
0
      // Same command as the reference, except for the flags set on the next line.
      ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4082
0
      gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4083
0
      ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
4084
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
4085
4086
0
      // Same product as q_count: the output has the shape of q.
      const int count = B * R * H * D;
4087
0
      // Both results are brought to plain float arrays so they can be compared element by element.
      float* const cpu_f32 = (float*)ccmalloc(sizeof(float) * count);
4088
0
      float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * count);
4089
0
      memcpy(cpu_f32, o_tensor->data.f32, sizeof(float) * count);
4090
0
      if (datatype == CCV_16F)
4091
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
4092
0
      else if (datatype == CCV_16BF)
4093
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
4094
0
      else
4095
0
        memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * count);
4096
0
      float max_relative_diff = 0;
4097
0
      int max_diff_idx = 0;
4098
0
      for (int i = 0; i < count; ++i)
4099
0
      {
4100
0
        // The denominator is floored at 1, so for magnitudes below 1 this is the absolute difference.
        const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
4101
0
        const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
4102
0
        if (relative_diff > max_relative_diff)
4103
0
          max_relative_diff = relative_diff, max_diff_idx = i;
4104
0
      }
4105
0
      // The message reports the worst element so a failure can be located.
      REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized batched attention result should match CPU reference for dtype=%s D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);
4106
4107
0
      ccfree(cpu_f32);
4108
0
      ccfree(gpu_f32);
4109
0
      ccv_nnc_tensor_free(gpu_o_tensor);
4110
0
      ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
4111
0
      ccv_nnc_tensor_free(gpu_q_tensor);
4112
0
      ccv_nnc_tensor_free(gpu_k_tensor);
4113
0
      ccv_nnc_tensor_free(gpu_v_tensor);
4114
0
    }
4115
0
    ccv_nnc_tensor_free(o_tensor);
4116
0
    ccv_nnc_tensor_free(q_tensor);
4117
0
    ccv_nnc_tensor_free(k_tensor);
4118
0
    ccv_nnc_tensor_free(v_tensor);
4119
0
    ccv_nnc_tensor_free(q_tensor_f16);
4120
0
    ccv_nnc_tensor_free(k_tensor_f16);
4121
0
    ccv_nnc_tensor_free(v_tensor_f16);
4122
0
  }
4123
0
}
4124
4125
// Flagged (CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I) MPS attention check for
// key/value sequence lengths that are not multiples of 64: B=1, R=128, H=24,
// with C taken from Cs[] and D from Ds[]; every (C, D) pair is run with 16F,
// 16BF and 32F GPU tensors.
// The reference is computed once per (C, D) from 32F CPU tensors without the flags.
// Pass criterion per (dtype, C, D): max over elements of
// |cpu - gpu| / max(|cpu|, |gpu|, 1) must not exceed tolerances[dtype].
TEST_CASE("scaled dot product attention with quantized NA mps for non-multiple-of-64 sequence")
4126
1
{
4127
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
4128
0
  const int B = 1;
4129
0
  const int R = 128;
4130
0
  const int H = 24;
4131
0
  // Neither 130 nor 224 is a multiple of 64.
  const int Cs[] = { 130, 224 };
4132
0
  const int Ds[] = { 128, 130, 224 };
4133
0
  // datatypes, tolerances and datatype_names are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
4134
0
  const float tolerances[] = { 4e-2, 5e-2, 4e-2 };
4135
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
4136
0
  for (int c_idx = 0; c_idx < (int)(sizeof(Cs) / sizeof(Cs[0])); ++c_idx)
4137
0
  {
4138
0
    const int C = Cs[c_idx];
4139
0
    for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
4140
0
    {
4141
0
      const int D = Ds[d_idx];
4142
0
      const float scale = 1.0 / sqrt((float)D);
4143
4144
0
      ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4145
0
      ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4146
0
      ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4147
0
      // 16-bit staging buffers, reused for both the half-precision and the bfloat runs below.
      ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4148
0
      ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4149
0
      ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4150
0
      const int q_count = B * R * H * D;
4151
0
      const int kv_count = B * C * H * D;
4152
0
      dsfmt_t dsfmt;
4153
0
      // The seed varies with both c_idx and d_idx, so every (C, D) pair gets its own input sequence.
      dsfmt_init_gen_rand(&dsfmt, 211 + c_idx * 17 + d_idx);
4154
0
      for (int i = 0; i < q_count; ++i)
4155
0
        q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4156
0
      for (int i = 0; i < kv_count; ++i)
4157
0
        k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4158
0
      for (int i = 0; i < kv_count; ++i)
4159
0
        v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4160
4161
0
      // Reference output: computed once per (C, D) from the 32F CPU tensors and reused for every datatype.
      ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4162
0
      ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4163
0
      ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
4164
4165
0
      for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
4166
0
      {
4167
0
        const int datatype = datatypes[datatype_idx];
4168
0
        // Default to the 32F inputs; the 16-bit branches redirect these to the staging buffers.
        ccv_nnc_tensor_t* q_input = q_tensor;
4169
0
        ccv_nnc_tensor_t* k_input = k_tensor;
4170
0
        ccv_nnc_tensor_t* v_input = v_tensor;
4171
0
        ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
4172
0
        ccv_nnc_tensor_t* gpu_q_tensor = 0;
4173
0
        ccv_nnc_tensor_t* gpu_k_tensor = 0;
4174
0
        ccv_nnc_tensor_t* gpu_v_tensor = 0;
4175
0
        ccv_nnc_tensor_t* gpu_o_tensor = 0;
4176
0
        if (datatype == CCV_16F)
4177
0
        {
4178
0
          ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4179
0
          ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4180
0
          ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4181
0
          q_input = q_tensor_f16;
4182
0
          k_input = k_tensor_f16;
4183
0
          v_input = v_tensor_f16;
4184
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4185
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4186
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4187
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4188
0
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4189
0
        } else if (datatype == CCV_16BF) {
4190
0
          // The bfloat bits are written into the CPU buffers that were declared 16F; only the GPU
          // tensors and the read-back tensor are declared 16BF.
          // NOTE(review): this assumes the data transfer copies the 16-bit payload unchanged — confirm.
          ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4191
0
          ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4192
0
          ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4193
0
          q_input = q_tensor_f16;
4194
0
          k_input = k_tensor_f16;
4195
0
          v_input = v_tensor_f16;
4196
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4197
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4198
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4199
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4200
0
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
4201
0
        } else {
4202
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4203
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4204
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4205
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4206
0
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4207
0
        }
4208
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
4209
0
        // Same command as the reference, except for the flags set on the next line.
        ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4210
0
        gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4211
0
        ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
4212
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
4213
4214
0
        // Same product as q_count: the output has the shape of q.
        const int count = B * R * H * D;
4215
0
        // Both results are brought to plain float arrays so they can be compared element by element.
        float* const cpu_f32 = (float*)ccmalloc(sizeof(float) * count);
4216
0
        float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * count);
4217
0
        memcpy(cpu_f32, o_tensor->data.f32, sizeof(float) * count);
4218
0
        if (datatype == CCV_16F)
4219
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
4220
0
        else if (datatype == CCV_16BF)
4221
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
4222
0
        else
4223
0
          memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * count);
4224
0
        float max_relative_diff = 0;
4225
0
        int max_diff_idx = 0;
4226
0
        for (int i = 0; i < count; ++i)
4227
0
        {
4228
0
          // The denominator is floored at 1, so for magnitudes below 1 this is the absolute difference.
          const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
4229
0
          const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
4230
0
          if (relative_diff > max_relative_diff)
4231
0
            max_relative_diff = relative_diff, max_diff_idx = i;
4232
0
        }
4233
0
        // The message reports the worst element so a failure can be located.
        REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized attention result should match CPU reference for dtype=%s C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], C, D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);
4234
4235
0
        ccfree(cpu_f32);
4236
0
        ccfree(gpu_f32);
4237
0
        ccv_nnc_tensor_free(gpu_o_tensor);
4238
0
        ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
4239
0
        ccv_nnc_tensor_free(gpu_q_tensor);
4240
0
        ccv_nnc_tensor_free(gpu_k_tensor);
4241
0
        ccv_nnc_tensor_free(gpu_v_tensor);
4242
0
      }
4243
0
      ccv_nnc_tensor_free(o_tensor);
4244
0
      ccv_nnc_tensor_free(q_tensor);
4245
0
      ccv_nnc_tensor_free(k_tensor);
4246
0
      ccv_nnc_tensor_free(v_tensor);
4247
0
      ccv_nnc_tensor_free(q_tensor_f16);
4248
0
      ccv_nnc_tensor_free(k_tensor_f16);
4249
0
      ccv_nnc_tensor_free(v_tensor_f16);
4250
0
    }
4251
0
  }
4252
0
}
4253
4254
TEST_CASE("scaled dot product attention gradient with quantized NA mps")
4255
1
{
4256
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4257
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4258
0
  const int B = 2;
4259
0
  const int R = 128;
4260
0
  const int C = 128;
4261
0
  const int H = 8;
4262
0
  const int Ds[] = { 64, 80, 96, 128 };
4263
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
4264
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
4265
0
  const float dq_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4266
0
  const float dk_tolerances[] = { 1e-1, 1e-1, 1e-1 };
4267
0
  const float dv_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4268
0
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
4269
0
  {
4270
0
    const int D = Ds[d_idx];
4271
0
    const int q_count = B * R * H * D;
4272
0
    const int kv_count = B * C * H * D;
4273
0
    const float scale = 1.0 / sqrt((float)D);
4274
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4275
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4276
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4277
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4278
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4279
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4280
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4281
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4282
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4283
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4284
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4285
0
    dsfmt_t dsfmt;
4286
0
    dsfmt_init_gen_rand(&dsfmt, 181 + d_idx);
4287
0
    for (int i = 0; i < q_count; ++i)
4288
0
    {
4289
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4290
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4291
0
    }
4292
0
    for (int i = 0; i < kv_count; ++i)
4293
0
    {
4294
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4295
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4296
0
    }
4297
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4298
4299
0
    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
4300
0
    {
4301
0
      const int datatype = datatypes[datatype_idx];
4302
0
      ccv_nnc_tensor_t* q_input = q_tensor;
4303
0
      ccv_nnc_tensor_t* k_input = k_tensor;
4304
0
      ccv_nnc_tensor_t* v_input = v_tensor;
4305
0
      ccv_nnc_tensor_t* do_input = do_tensor;
4306
0
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
4307
0
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
4308
0
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
4309
0
      ccv_nnc_tensor_t* gpu_do_tensor = 0;
4310
0
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
4311
0
      ccv_nnc_tensor_t* gpu_dq_tensor = 0;
4312
0
      ccv_nnc_tensor_t* gpu_dk_tensor = 0;
4313
0
      ccv_nnc_tensor_t* gpu_dv_tensor = 0;
4314
0
      ccv_nnc_tensor_t* copy_of_gpu_dq_tensor = 0;
4315
0
      ccv_nnc_tensor_t* copy_of_gpu_dk_tensor = 0;
4316
0
      ccv_nnc_tensor_t* copy_of_gpu_dv_tensor = 0;
4317
0
      if (datatype == CCV_16F)
4318
0
      {
4319
0
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4320
0
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4321
0
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4322
0
        ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4323
0
        q_input = q_tensor_f16;
4324
0
        k_input = k_tensor_f16;
4325
0
        v_input = v_tensor_f16;
4326
0
        do_input = do_tensor_f16;
4327
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4328
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4329
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4330
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4331
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4332
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4333
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4334
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4335
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4336
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4337
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4338
0
      } else if (datatype == CCV_16BF) {
4339
0
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4340
0
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4341
0
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4342
0
        ccv_float_to_bfloat(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4343
0
        q_input = q_tensor_f16;
4344
0
        k_input = k_tensor_f16;
4345
0
        v_input = v_tensor_f16;
4346
0
        do_input = do_tensor_f16;
4347
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4348
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4349
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4350
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4351
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4352
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4353
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4354
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4355
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
4356
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4357
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4358
0
      } else {
4359
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4360
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4361
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4362
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4363
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4364
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4365
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4366
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4367
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4368
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4369
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4370
0
      }
4371
0
      ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4372
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input, do_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4373
0
      ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4374
0
      gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4375
0
      ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4376
0
      ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4377
0
      ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4378
0
      gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4379
0
      gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4380
0
      ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4381
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4382
4383
0
      float* const dq_cpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4384
0
      float* const dk_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4385
0
      float* const dv_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4386
0
      float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4387
0
      float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4388
0
      float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4389
0
      memcpy(dq_cpu_f32, dq_tensor->data.f32, sizeof(float) * q_count);
4390
0
      memcpy(dk_cpu_f32, dk_tensor->data.f32, sizeof(float) * kv_count);
4391
0
      memcpy(dv_cpu_f32, dv_tensor->data.f32, sizeof(float) * kv_count);
4392
0
      if (datatype == CCV_16F)
4393
0
      {
4394
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4395
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4396
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4397
0
      } else if (datatype == CCV_16BF) {
4398
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4399
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4400
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4401
0
      } else {
4402
0
        memcpy(dq_gpu_f32, copy_of_gpu_dq_tensor->data.f32, sizeof(float) * q_count);
4403
0
        memcpy(dk_gpu_f32, copy_of_gpu_dk_tensor->data.f32, sizeof(float) * kv_count);
4404
0
        memcpy(dv_gpu_f32, copy_of_gpu_dv_tensor->data.f32, sizeof(float) * kv_count);
4405
0
      }
4406
0
      float dq_max_relative_diff = 0;
4407
0
      float dk_max_relative_diff = 0;
4408
0
      float dv_max_relative_diff = 0;
4409
0
      int dq_max_diff_idx = 0;
4410
0
      int dk_max_diff_idx = 0;
4411
0
      int dv_max_diff_idx = 0;
4412
0
      for (int i = 0; i < q_count; ++i)
4413
0
      {
4414
0
        const float denom = fmaxf(fmaxf(fabsf(dq_cpu_f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4415
0
        const float relative_diff = fabsf(dq_cpu_f32[i] - dq_gpu_f32[i]) / denom;
4416
0
        if (relative_diff > dq_max_relative_diff)
4417
0
          dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4418
0
      }
4419
0
      for (int i = 0; i < kv_count; ++i)
4420
0
      {
4421
0
        float denom = fmaxf(fmaxf(fabsf(dk_cpu_f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4422
0
        float relative_diff = fabsf(dk_cpu_f32[i] - dk_gpu_f32[i]) / denom;
4423
0
        if (relative_diff > dk_max_relative_diff)
4424
0
          dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4425
0
        denom = fmaxf(fmaxf(fabsf(dv_cpu_f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4426
0
        relative_diff = fabsf(dv_cpu_f32[i] - dv_gpu_f32[i]) / denom;
4427
0
        if (relative_diff > dv_max_relative_diff)
4428
0
          dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4429
0
      }
4430
0
      REQUIRE(dq_max_relative_diff <= dq_tolerances[datatype_idx], "quantized attention dQ should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dq_max_relative_diff, dq_max_diff_idx, dq_cpu_f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4431
0
      REQUIRE(dk_max_relative_diff <= dk_tolerances[datatype_idx], "quantized attention dK should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dk_max_relative_diff, dk_max_diff_idx, dk_cpu_f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4432
0
      REQUIRE(dv_max_relative_diff <= dv_tolerances[datatype_idx], "quantized attention dV should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dv_max_relative_diff, dv_max_diff_idx, dv_cpu_f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4433
4434
0
      ccfree(dq_cpu_f32);
4435
0
      ccfree(dk_cpu_f32);
4436
0
      ccfree(dv_cpu_f32);
4437
0
      ccfree(dq_gpu_f32);
4438
0
      ccfree(dk_gpu_f32);
4439
0
      ccfree(dv_gpu_f32);
4440
0
      ccv_nnc_tensor_free(gpu_q_tensor);
4441
0
      ccv_nnc_tensor_free(gpu_k_tensor);
4442
0
      ccv_nnc_tensor_free(gpu_v_tensor);
4443
0
      ccv_nnc_tensor_free(gpu_do_tensor);
4444
0
      ccv_nnc_tensor_free(gpu_o_tensor);
4445
0
      ccv_nnc_tensor_free(gpu_dq_tensor);
4446
0
      ccv_nnc_tensor_free(gpu_dk_tensor);
4447
0
      ccv_nnc_tensor_free(gpu_dv_tensor);
4448
0
      ccv_nnc_tensor_free(gpu_softmax_lse);
4449
0
      ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4450
0
      ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4451
0
      ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4452
0
    }
4453
4454
0
    ccv_nnc_tensor_free(q_tensor);
4455
0
    ccv_nnc_tensor_free(k_tensor);
4456
0
    ccv_nnc_tensor_free(v_tensor);
4457
0
    ccv_nnc_tensor_free(do_tensor);
4458
0
    ccv_nnc_tensor_free(dq_tensor);
4459
0
    ccv_nnc_tensor_free(dk_tensor);
4460
0
    ccv_nnc_tensor_free(dv_tensor);
4461
0
    ccv_nnc_tensor_free(q_tensor_f16);
4462
0
    ccv_nnc_tensor_free(k_tensor_f16);
4463
0
    ccv_nnc_tensor_free(v_tensor_f16);
4464
0
    ccv_nnc_tensor_free(do_tensor_f16);
4465
0
  }
4466
0
}
4467
4468
TEST_CASE("scaled dot product attention gradient with quantized NA mps for rectangular and edge sequence lengths")
4469
1
{
4470
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4471
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4472
0
  typedef struct {
4473
0
    int R;
4474
0
    int C;
4475
0
  } qna_backward_shape_t;
4476
0
  const int B = 1;
4477
0
  const int H = 8;
4478
0
  const int Ds[] = { 64, 128 };
4479
0
  const qna_backward_shape_t shapes[] = {
4480
0
    { .R = 32, .C = 64 },
4481
0
    { .R = 40, .C = 72 },
4482
0
    { .R = 80, .C = 64 },
4483
0
    { .R = 96, .C = 88 },
4484
0
    { .R = 64, .C = 192 },
4485
0
    { .R = 144, .C = 64 },
4486
0
  };
4487
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
4488
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
4489
0
  const float dq_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4490
0
  const float dk_tolerances[] = { 1e-1, 1e-1, 1e-1 };
4491
0
  const float dv_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4492
0
  for (int shape_idx = 0; shape_idx < (int)(sizeof(shapes) / sizeof(shapes[0])); ++shape_idx)
4493
0
  {
4494
0
    const int R = shapes[shape_idx].R;
4495
0
    const int C = shapes[shape_idx].C;
4496
0
    for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
4497
0
    {
4498
0
      const int D = Ds[d_idx];
4499
0
      const int q_count = B * R * H * D;
4500
0
      const int kv_count = B * C * H * D;
4501
0
      const float scale = 1.0 / sqrt((float)D);
4502
0
      ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4503
0
      ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4504
0
      ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4505
0
      ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4506
0
      ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4507
0
      ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4508
0
      ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4509
0
      ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4510
0
      ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4511
0
      ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4512
0
      ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4513
0
      dsfmt_t dsfmt;
4514
0
      dsfmt_init_gen_rand(&dsfmt, 281 + shape_idx * 17 + d_idx);
4515
0
      for (int i = 0; i < q_count; ++i)
4516
0
      {
4517
0
        q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4518
0
        do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4519
0
      }
4520
0
      for (int i = 0; i < kv_count; ++i)
4521
0
      {
4522
0
        k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4523
0
        v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4524
0
      }
4525
0
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4526
4527
0
      for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
4528
0
      {
4529
0
        const int datatype = datatypes[datatype_idx];
4530
0
        ccv_nnc_tensor_t* q_input = q_tensor;
4531
0
        ccv_nnc_tensor_t* k_input = k_tensor;
4532
0
        ccv_nnc_tensor_t* v_input = v_tensor;
4533
0
        ccv_nnc_tensor_t* do_input = do_tensor;
4534
0
        ccv_nnc_tensor_t* gpu_q_tensor = 0;
4535
0
        ccv_nnc_tensor_t* gpu_k_tensor = 0;
4536
0
        ccv_nnc_tensor_t* gpu_v_tensor = 0;
4537
0
        ccv_nnc_tensor_t* gpu_do_tensor = 0;
4538
0
        ccv_nnc_tensor_t* gpu_o_tensor = 0;
4539
0
        ccv_nnc_tensor_t* gpu_dq_tensor = 0;
4540
0
        ccv_nnc_tensor_t* gpu_dk_tensor = 0;
4541
0
        ccv_nnc_tensor_t* gpu_dv_tensor = 0;
4542
0
        ccv_nnc_tensor_t* copy_of_gpu_dq_tensor = 0;
4543
0
        ccv_nnc_tensor_t* copy_of_gpu_dk_tensor = 0;
4544
0
        ccv_nnc_tensor_t* copy_of_gpu_dv_tensor = 0;
4545
0
        if (datatype == CCV_16F)
4546
0
        {
4547
0
          ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4548
0
          ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4549
0
          ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4550
0
          ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4551
0
          q_input = q_tensor_f16;
4552
0
          k_input = k_tensor_f16;
4553
0
          v_input = v_tensor_f16;
4554
0
          do_input = do_tensor_f16;
4555
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4556
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4557
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4558
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4559
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4560
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4561
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4562
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4563
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4564
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4565
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4566
0
        } else if (datatype == CCV_16BF) {
4567
0
          ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4568
0
          ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4569
0
          ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4570
0
          ccv_float_to_bfloat(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4571
0
          q_input = q_tensor_f16;
4572
0
          k_input = k_tensor_f16;
4573
0
          v_input = v_tensor_f16;
4574
0
          do_input = do_tensor_f16;
4575
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4576
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4577
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4578
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4579
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4580
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4581
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4582
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4583
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
4584
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4585
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4586
0
        } else {
4587
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4588
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4589
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4590
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4591
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4592
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4593
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4594
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4595
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4596
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4597
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4598
0
        }
4599
0
        ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4600
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input, do_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4601
0
        ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4602
0
        gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4603
0
        ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4604
0
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4605
0
        ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4606
0
        gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4607
0
        gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4608
0
        ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4609
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4610
4611
0
        float* const dq_cpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4612
0
        float* const dk_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4613
0
        float* const dv_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4614
0
        float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4615
0
        float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4616
0
        float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4617
0
        memcpy(dq_cpu_f32, dq_tensor->data.f32, sizeof(float) * q_count);
4618
0
        memcpy(dk_cpu_f32, dk_tensor->data.f32, sizeof(float) * kv_count);
4619
0
        memcpy(dv_cpu_f32, dv_tensor->data.f32, sizeof(float) * kv_count);
4620
0
        if (datatype == CCV_16F)
4621
0
        {
4622
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4623
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4624
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4625
0
        } else if (datatype == CCV_16BF) {
4626
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4627
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4628
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4629
0
        } else {
4630
0
          memcpy(dq_gpu_f32, copy_of_gpu_dq_tensor->data.f32, sizeof(float) * q_count);
4631
0
          memcpy(dk_gpu_f32, copy_of_gpu_dk_tensor->data.f32, sizeof(float) * kv_count);
4632
0
          memcpy(dv_gpu_f32, copy_of_gpu_dv_tensor->data.f32, sizeof(float) * kv_count);
4633
0
        }
4634
0
        float dq_max_relative_diff = 0;
4635
0
        float dk_max_relative_diff = 0;
4636
0
        float dv_max_relative_diff = 0;
4637
0
        int dq_max_diff_idx = 0;
4638
0
        int dk_max_diff_idx = 0;
4639
0
        int dv_max_diff_idx = 0;
4640
0
        for (int i = 0; i < q_count; ++i)
4641
0
        {
4642
0
          const float denom = fmaxf(fmaxf(fabsf(dq_cpu_f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4643
0
          const float relative_diff = fabsf(dq_cpu_f32[i] - dq_gpu_f32[i]) / denom;
4644
0
          if (relative_diff > dq_max_relative_diff)
4645
0
            dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4646
0
        }
4647
0
        for (int i = 0; i < kv_count; ++i)
4648
0
        {
4649
0
          float denom = fmaxf(fmaxf(fabsf(dk_cpu_f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4650
0
          float relative_diff = fabsf(dk_cpu_f32[i] - dk_gpu_f32[i]) / denom;
4651
0
          if (relative_diff > dk_max_relative_diff)
4652
0
            dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4653
0
          denom = fmaxf(fmaxf(fabsf(dv_cpu_f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4654
0
          relative_diff = fabsf(dv_cpu_f32[i] - dv_gpu_f32[i]) / denom;
4655
0
          if (relative_diff > dv_max_relative_diff)
4656
0
            dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4657
0
        }
4658
0
        REQUIRE(dq_max_relative_diff <= dq_tolerances[datatype_idx], "quantized attention dQ should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dq_max_relative_diff, dq_max_diff_idx, dq_cpu_f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4659
0
        REQUIRE(dk_max_relative_diff <= dk_tolerances[datatype_idx], "quantized attention dK should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dk_max_relative_diff, dk_max_diff_idx, dk_cpu_f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4660
0
        REQUIRE(dv_max_relative_diff <= dv_tolerances[datatype_idx], "quantized attention dV should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dv_max_relative_diff, dv_max_diff_idx, dv_cpu_f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4661
4662
0
        ccfree(dq_cpu_f32);
4663
0
        ccfree(dk_cpu_f32);
4664
0
        ccfree(dv_cpu_f32);
4665
0
        ccfree(dq_gpu_f32);
4666
0
        ccfree(dk_gpu_f32);
4667
0
        ccfree(dv_gpu_f32);
4668
0
        ccv_nnc_tensor_free(gpu_q_tensor);
4669
0
        ccv_nnc_tensor_free(gpu_k_tensor);
4670
0
        ccv_nnc_tensor_free(gpu_v_tensor);
4671
0
        ccv_nnc_tensor_free(gpu_do_tensor);
4672
0
        ccv_nnc_tensor_free(gpu_o_tensor);
4673
0
        ccv_nnc_tensor_free(gpu_dq_tensor);
4674
0
        ccv_nnc_tensor_free(gpu_dk_tensor);
4675
0
        ccv_nnc_tensor_free(gpu_dv_tensor);
4676
0
        ccv_nnc_tensor_free(gpu_softmax_lse);
4677
0
        ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4678
0
        ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4679
0
        ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4680
0
      }
4681
4682
0
      ccv_nnc_tensor_free(q_tensor);
4683
0
      ccv_nnc_tensor_free(k_tensor);
4684
0
      ccv_nnc_tensor_free(v_tensor);
4685
0
      ccv_nnc_tensor_free(do_tensor);
4686
0
      ccv_nnc_tensor_free(dq_tensor);
4687
0
      ccv_nnc_tensor_free(dk_tensor);
4688
0
      ccv_nnc_tensor_free(dv_tensor);
4689
0
      ccv_nnc_tensor_free(q_tensor_f16);
4690
0
      ccv_nnc_tensor_free(k_tensor_f16);
4691
0
      ccv_nnc_tensor_free(v_tensor_f16);
4692
0
      ccv_nnc_tensor_free(do_tensor_f16);
4693
0
    }
4694
0
  }
4695
0
}
4696
4697
TEST_CASE("scaled dot product attention gradient with quantized NA mps on 1536 square surface")
4698
1
{
4699
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4700
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4701
0
  const int B = 1;
4702
0
  const int R = 1536;
4703
0
  const int C = 1536;
4704
0
  const int H = 24;
4705
0
  const int D = 128;
4706
0
  const int q_count = B * R * H * D;
4707
0
  const int kv_count = B * C * H * D;
4708
0
  const float scale = 1.0 / sqrt((float)D);
4709
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4710
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4711
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4712
0
  ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4713
0
  ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4714
0
  ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4715
0
  ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4716
0
  ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4717
0
  ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4718
0
  ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4719
0
  ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4720
0
  dsfmt_t dsfmt;
4721
0
  dsfmt_init_gen_rand(&dsfmt, 4177);
4722
0
  for (int i = 0; i < q_count; ++i)
4723
0
  {
4724
    // Use a stronger shared Q / K signal on this surface so QK^T produces
4725
    // sharper rows than the fully diffuse random-input case.
4726
0
    const float q = 2.f * (dsfmt_genrand_open_close(&dsfmt) - 0.5f);
4727
0
    q_tensor->data.f32[i] = q;
4728
0
    k_tensor->data.f32[i] = q + 0.125f * (dsfmt_genrand_open_close(&dsfmt) - 0.5f);
4729
0
    do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4730
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4731
0
  }
4732
0
  ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4733
4734
0
  ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4735
0
  ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4736
0
  ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4737
0
  ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4738
4739
0
  ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4740
0
  ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4741
0
  ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4742
0
  ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4743
0
  ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4744
0
  ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4745
0
  ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4746
0
  ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4747
0
  ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4748
0
  ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4749
0
  ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4750
0
  ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4751
4752
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4753
0
  ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4754
0
  gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4755
0
  ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4756
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4757
0
  ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4758
0
  gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4759
0
  gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4760
0
  ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4761
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4762
4763
0
  float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4764
0
  float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4765
0
  float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4766
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4767
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4768
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4769
4770
0
  float dq_max_relative_diff = 0;
4771
0
  float dk_max_relative_diff = 0;
4772
0
  float dv_max_relative_diff = 0;
4773
0
  float dq_cpu_max_abs = 0;
4774
0
  float dq_gpu_max_abs = 0;
4775
0
  float dk_cpu_max_abs = 0;
4776
0
  float dk_gpu_max_abs = 0;
4777
0
  float dv_cpu_max_abs = 0;
4778
0
  float dv_gpu_max_abs = 0;
4779
0
  int dq_max_diff_idx = 0;
4780
0
  int dk_max_diff_idx = 0;
4781
0
  int dv_max_diff_idx = 0;
4782
0
  for (int i = 0; i < q_count; ++i)
4783
0
  {
4784
0
    dq_cpu_max_abs = fmaxf(dq_cpu_max_abs, fabsf(dq_tensor->data.f32[i]));
4785
0
    dq_gpu_max_abs = fmaxf(dq_gpu_max_abs, fabsf(dq_gpu_f32[i]));
4786
0
    const float denom = fmaxf(fmaxf(fabsf(dq_tensor->data.f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4787
0
    const float relative_diff = fabsf(dq_tensor->data.f32[i] - dq_gpu_f32[i]) / denom;
4788
0
    if (relative_diff > dq_max_relative_diff)
4789
0
      dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4790
0
  }
4791
0
  for (int i = 0; i < kv_count; ++i)
4792
0
  {
4793
0
    dk_cpu_max_abs = fmaxf(dk_cpu_max_abs, fabsf(dk_tensor->data.f32[i]));
4794
0
    dk_gpu_max_abs = fmaxf(dk_gpu_max_abs, fabsf(dk_gpu_f32[i]));
4795
0
    float denom = fmaxf(fmaxf(fabsf(dk_tensor->data.f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4796
0
    float relative_diff = fabsf(dk_tensor->data.f32[i] - dk_gpu_f32[i]) / denom;
4797
0
    if (relative_diff > dk_max_relative_diff)
4798
0
      dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4799
0
    dv_cpu_max_abs = fmaxf(dv_cpu_max_abs, fabsf(dv_tensor->data.f32[i]));
4800
0
    dv_gpu_max_abs = fmaxf(dv_gpu_max_abs, fabsf(dv_gpu_f32[i]));
4801
0
    denom = fmaxf(fmaxf(fabsf(dv_tensor->data.f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4802
0
    relative_diff = fabsf(dv_tensor->data.f32[i] - dv_gpu_f32[i]) / denom;
4803
0
    if (relative_diff > dv_max_relative_diff)
4804
0
      dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4805
0
  }
4806
0
  REQUIRE(dq_gpu_max_abs >= dq_cpu_max_abs * 0.5f && dq_gpu_max_abs <= dq_cpu_max_abs * 2.0f,
4807
0
    "quantized attention dQ magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4808
0
    dq_cpu_max_abs, dq_gpu_max_abs);
4809
0
  REQUIRE(dk_gpu_max_abs >= dk_cpu_max_abs * 0.5f && dk_gpu_max_abs <= dk_cpu_max_abs * 2.0f,
4810
0
    "quantized attention dK magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4811
0
    dk_cpu_max_abs, dk_gpu_max_abs);
4812
0
  REQUIRE(dv_gpu_max_abs >= dv_cpu_max_abs * 0.5f && dv_gpu_max_abs <= dv_cpu_max_abs * 2.0f,
4813
0
    "quantized attention dV magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4814
0
    dv_cpu_max_abs, dv_gpu_max_abs);
4815
0
  REQUIRE(dq_max_relative_diff <= 8e-2, "quantized attention dQ should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dq_max_relative_diff, dq_max_diff_idx, dq_tensor->data.f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4816
0
  REQUIRE(dk_max_relative_diff <= 1e-1, "quantized attention dK should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dk_max_relative_diff, dk_max_diff_idx, dk_tensor->data.f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4817
0
  REQUIRE(dv_max_relative_diff <= 8e-2, "quantized attention dV should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dv_max_relative_diff, dv_max_diff_idx, dv_tensor->data.f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4818
4819
0
  ccfree(dq_gpu_f32);
4820
0
  ccfree(dk_gpu_f32);
4821
0
  ccfree(dv_gpu_f32);
4822
0
  ccv_nnc_tensor_free(gpu_q_tensor);
4823
0
  ccv_nnc_tensor_free(gpu_k_tensor);
4824
0
  ccv_nnc_tensor_free(gpu_v_tensor);
4825
0
  ccv_nnc_tensor_free(gpu_do_tensor);
4826
0
  ccv_nnc_tensor_free(gpu_o_tensor);
4827
0
  ccv_nnc_tensor_free(gpu_dq_tensor);
4828
0
  ccv_nnc_tensor_free(gpu_dk_tensor);
4829
0
  ccv_nnc_tensor_free(gpu_dv_tensor);
4830
0
  ccv_nnc_tensor_free(gpu_softmax_lse);
4831
0
  ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4832
0
  ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4833
0
  ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4834
0
  ccv_nnc_tensor_free(q_tensor);
4835
0
  ccv_nnc_tensor_free(k_tensor);
4836
0
  ccv_nnc_tensor_free(v_tensor);
4837
0
  ccv_nnc_tensor_free(do_tensor);
4838
0
  ccv_nnc_tensor_free(dq_tensor);
4839
0
  ccv_nnc_tensor_free(dk_tensor);
4840
0
  ccv_nnc_tensor_free(dv_tensor);
4841
0
  ccv_nnc_tensor_free(q_tensor_f16);
4842
0
  ccv_nnc_tensor_free(k_tensor_f16);
4843
0
  ccv_nnc_tensor_free(v_tensor_f16);
4844
0
  ccv_nnc_tensor_free(do_tensor_f16);
4845
0
}
4846
4847
TEST_CASE("scaled dot product attention with mps in bfloat precision")
{
  // Skip unless the MPS backend reports both the forward and backward attention commands.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
#define num_long_trials 8
#define num_short_trials 4
#define num_trials (num_long_trials + num_short_trials)

  // One column per trial. q is shaped (B, R, Hq, D); k and v are shaped (B, C, Hk, D).
  // Hk differs from Hq in some of the trials.
  const int B_list[num_trials] = { 32, 12, 16, 1, 2, 1, 32, 12, 16, 1, 2, 1 };
  const int R_list[num_trials] = { 160, 256, 128, 77, 77, 5, 160, 256, 128, 77, 77, 5 };
  const int C_list[num_trials] = { 128, 128, 128, 128, 128, 5, 128, 128, 128, 128, 128, 5 };
  const int Hq_list[num_trials] = { 8, 8, 8, 8, 8, 32, 8, 8, 8, 8, 8, 32 };
  const int Hk_list[num_trials] = { 8, 8, 4, 2, 8, 32, 8, 8, 8, 8, 8, 32 };
  const int D_list[num_trials] = { 64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };

  // A single generator is shared by all trials, so each trial continues the same random stream.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 10);
  int trial;
  for (trial = 0; trial < num_trials; trial++)
  {
    const int B = B_list[trial];
    const int R = R_list[trial];
    const int C = C_list[trial];
    const int Hq = Hq_list[trial];
    const int Hk = Hk_list[trial];
    const int D = D_list[trial];
    const int is_causal = 0;
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * Hq * D;
    const int kv_count = B * C * Hk * D;
    int i;

    // 32-bit float inputs on the CPU, filled in the order q, k, v.
    ccv_nnc_tensor_t* const q_f32 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_f32 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_f32 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    for (i = 0; i < q_count; i++)
      q_f32->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      k_f32->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      v_f32->data.f32[i] = dsfmt_genrand_open_close(&rng);

    // Reference output computed from the 32-bit float CPU tensors.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_f32, k_f32, v_f32), TENSOR_LIST(o_tensor), 0);

    // Convert the inputs to 16BF on the CPU before they are transferred to the GPU tensors.
    ccv_nnc_tensor_t* const q_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_f32, k_f32, v_f32), TENSOR_LIST(q_bf16, k_bf16, v_bf16), 0);

    ccv_nnc_tensor_t* const q_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const o_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_bf16, k_bf16, v_bf16), TENSOR_LIST(q_gpu, k_gpu, v_gpu), 0);

    // The second output of the GPU forward pass is a 32-bit float tensor shaped (B, Hq, R).
    ccv_nnc_tensor_t* const softmax_lse_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_gpu, k_gpu, v_gpu, NULL, NULL, NULL), TENSOR_LIST(o_gpu, softmax_lse_gpu), 0);

    // Copy the GPU output back as 16BF, then widen it to 32-bit float for the comparison.
    ccv_nnc_tensor_t* const o_gpu_copy_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(o_gpu), TENSOR_LIST(o_gpu_copy_bf16), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(o_gpu_copy_bf16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 8e-3, "scaled dot product attention result should be the same");

    // Release every tensor created in this trial.
    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(o_gpu);
    ccv_nnc_tensor_free(o_gpu_copy_bf16);
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
    ccv_nnc_tensor_free(q_f32);
    ccv_nnc_tensor_free(k_f32);
    ccv_nnc_tensor_free(v_f32);
    ccv_nnc_tensor_free(q_bf16);
    ccv_nnc_tensor_free(k_bf16);
    ccv_nnc_tensor_free(v_bf16);
    ccv_nnc_tensor_free(q_gpu);
    ccv_nnc_tensor_free(k_gpu);
    ccv_nnc_tensor_free(v_gpu);
    ccv_nnc_tensor_free(softmax_lse_gpu);
  }
#undef num_long_trials
#undef num_short_trials
#undef num_trials
}
4930
4931
TEST_CASE("scaled dot product attention + unify head with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  // Element counts used by the fill loops and by the final comparison.
  const int qkv_count = 32 * 8 * 128 * 64;
  const int w_count = 512 * 512;
  const int bias_count = 512;
  const int r_count = 32 * 128 * 512;

  // First graph: a single attention node, with weight and bias inputs, over CPU tensor symbols.
  ccv_nnc_symbolic_graph_t* const cpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t q_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
  ccv_nnc_tensor_symbol_t k_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
  ccv_nnc_tensor_symbol_t v_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
  ccv_nnc_tensor_symbol_t w_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
  ccv_nnc_tensor_symbol_t bias_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
  ccv_nnc_tensor_symbol_t c_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
  ccv_nnc_tensor_symbol_t r_sym = ccv_nnc_tensor_symbol_new(cpu_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
  ccv_nnc_graph_exec_symbol_new(cpu_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q_sym, k_sym, v_sym, NO_TENSOR_SYMBOL, w_sym, bias_sym), TENSOR_SYMBOL_LIST(r_sym, NO_TENSOR_SYMBOL, c_sym), "scaled_dot_product_attention");
  ccv_nnc_graph_exec_symbol_autogen(cpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_graph_t* cpu_graph = 0;
  ccv_nnc_tensor_arena_t* cpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* cpu_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(cpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(cpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(cpu_symbolic_graph), &cpu_graph, &cpu_tensor_arena, &cpu_exec_arena);

  // Fill the CPU graph inputs; the draw order is q, k, v, w, bias.
  ccv_nnc_tensor_t* const cpu_q = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, q_sym);
  ccv_nnc_tensor_t* const cpu_k = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, k_sym);
  ccv_nnc_tensor_t* const cpu_v = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, v_sym);
  ccv_nnc_tensor_t* const cpu_w = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, w_sym);
  ccv_nnc_tensor_t* const cpu_bias = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, bias_sym);
  dsfmt_t rng;
  int i;
  dsfmt_init_gen_rand(&rng, 1);
  for (i = 0; i < qkv_count; i++)
    cpu_q->data.f32[i] = dsfmt_genrand_open_close(&rng);
  for (i = 0; i < qkv_count; i++)
    cpu_k->data.f32[i] = dsfmt_genrand_open_close(&rng);
  for (i = 0; i < qkv_count; i++)
    cpu_v->data.f32[i] = dsfmt_genrand_open_close(&rng);
  for (i = 0; i < w_count; i++)
    cpu_w->data.f32[i] = dsfmt_genrand_open_close(&rng);
  for (i = 0; i < bias_count; i++)
    cpu_bias->data.f32[i] = dsfmt_genrand_open_close(&rng);

  // Second graph: the same node declared over GPU tensor symbols.
  ccv_nnc_symbolic_graph_t* const gpu_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t gq_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "q");
  ccv_nnc_tensor_symbol_t gk_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "k");
  ccv_nnc_tensor_symbol_t gv_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "v");
  ccv_nnc_tensor_symbol_t gw_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512, 512), "w");
  ccv_nnc_tensor_symbol_t gbias_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512), "bias");
  ccv_nnc_tensor_symbol_t gc_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "c");
  ccv_nnc_tensor_symbol_t gr_sym = ccv_nnc_tensor_symbol_new(gpu_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 512), "r");
  ccv_nnc_graph_exec_symbol_new(gpu_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq_sym, gk_sym, gv_sym, NO_TENSOR_SYMBOL, gw_sym, gbias_sym), TENSOR_SYMBOL_LIST(gr_sym, NO_TENSOR_SYMBOL, gc_sym), "scaled_dot_product_attention");
  ccv_nnc_graph_exec_symbol_autogen(gpu_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_graph_t* gpu_graph = 0;
  ccv_nnc_tensor_arena_t* gpu_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* gpu_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(gpu_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(gpu_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(gpu_symbolic_graph), &gpu_graph, &gpu_tensor_arena, &gpu_exec_arena);

  // Copy the filled CPU inputs into the GPU graph's tensors, then run both graphs.
  ccv_nnc_tensor_t* const gpu_q = ccv_nnc_tensor_from_symbol(gpu_tensor_arena, gq_sym);
  ccv_nnc_tensor_t* const gpu_k = ccv_nnc_tensor_from_symbol(gpu_tensor_arena, gk_sym);
  ccv_nnc_tensor_t* const gpu_v = ccv_nnc_tensor_from_symbol(gpu_tensor_arena, gv_sym);
  ccv_nnc_tensor_t* const gpu_w = ccv_nnc_tensor_from_symbol(gpu_tensor_arena, gw_sym);
  ccv_nnc_tensor_t* const gpu_bias = ccv_nnc_tensor_from_symbol(gpu_tensor_arena, gbias_sym);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(cpu_q, cpu_k, cpu_v, cpu_w, cpu_bias), TENSOR_LIST(gpu_q, gpu_k, gpu_v, gpu_w, gpu_bias), 0);
  ccv_nnc_graph_run(cpu_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_graph_run(gpu_graph, 0, TRAVERSE_FULL, 0, 0);

  // Bring the GPU "r" output back to a CPU tensor and compare it element-wise with the CPU "r" output.
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(cpu_tensor_arena, r_sym);
  ccv_nnc_tensor_t* const gpu_r = ccv_nnc_tensor_from_symbol(gpu_tensor_arena, gr_sym);
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_r), TENSOR_LIST(hr), 0);
  float max_relative_diff = 0;
  int max_diff_idx = 0;
  for (i = 0; i < r_count; i++)
  {
    const float expected = r_tensor->data.f32[i];
    const float actual = hr->data.f32[i];
    // The denominator is floored at 1, so for magnitudes below 1 this is the absolute difference.
    const float denom = fmaxf(fmaxf(fabsf(expected), fabsf(actual)), 1.0f);
    const float relative_diff = fabsf(expected - actual) / denom;
    if (relative_diff > max_relative_diff)
    {
      max_relative_diff = relative_diff;
      max_diff_idx = i;
    }
  }
  REQUIRE(max_relative_diff <= 2e-3, "graph computed result should match scaled dot product attention op result (max relative diff %g at %d: %g vs %g)", max_relative_diff, max_diff_idx, r_tensor->data.f32[max_diff_idx], hr->data.f32[max_diff_idx]);

  ccv_nnc_symbolic_graph_free(cpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(cpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(cpu_exec_arena);
  ccv_nnc_graph_free(cpu_graph);
  ccv_nnc_symbolic_graph_free(gpu_symbolic_graph);
  ccv_nnc_tensor_arena_free(gpu_tensor_arena);
  ccv_nnc_graph_exec_arena_free(gpu_exec_arena);
  ccv_nnc_graph_free(gpu_graph);
  ccv_nnc_tensor_free(hr);
}
5012
5013
TEST_CASE("scaled dot product attention + row-wise 8i unify head with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int B = 2;
  const int R = 16;
  const int H = 4;
  const int D = 32;
  const int K = H * D;
  const int qkv_count = B * R * H * D;
  const int w_count = K * K;
  const int r_count = B * R * K;
  int i;

  // Host-side inputs: q, k, v, a dense K x K weight, its row-wise 8i container, and a bias of length K.
  ccv_nnc_tensor_t* const host_q = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const host_k = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const host_v = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const host_w_dense = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, K, K), 0);
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, K, K)), 0);
  ccv_nnc_tensor_t* const host_bias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, K), 0);

  // Each value is a random draw minus 0.5. q, k and v are drawn interleaved per element, then the weight, then the bias.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 1);
  for (i = 0; i < qkv_count; i++)
  {
    host_q->data.f32[i] = (float)(dsfmt_genrand_open_close(&rng) - 0.5);
    host_k->data.f32[i] = (float)(dsfmt_genrand_open_close(&rng) - 0.5);
    host_v->data.f32[i] = (float)(dsfmt_genrand_open_close(&rng) - 0.5);
  }
  for (i = 0; i < w_count; i++)
    host_w_dense->data.f32[i] = (float)(dsfmt_genrand_open_close(&rng) - 0.5);
  for (i = 0; i < K; i++)
    host_bias->data.f32[i] = (float)(dsfmt_genrand_open_close(&rng) - 0.5);

  // Quantize the dense weight on the CPU; the returned size must equal the unpadded size of the 8i tensor.
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(host_w_dense->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, K * K, K, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(hwq->info), "row-wise 8i weight quantization should fit the output tensor");

  // GPU tensors: shared q / k / v / bias, plus separate weight and output tensors for the quantized and dequantized runs.
  ccv_nnc_tensor_t* const gpu_q = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const gpu_k = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const gpu_v = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const gpu_w_quantized = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, K, K)), 0);
  ccv_nnc_tensor_t* const gpu_w_dequantized = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, K, K), 0);
  ccv_nnc_tensor_t* const gpu_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, K), 0);
  ccv_nnc_tensor_t* const gpu_r_quantized = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, K), 0);
  ccv_nnc_tensor_t* const gpu_r_dequantized = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, K), 0);
  ccv_nnc_tensor_t* const gpu_c_quantized = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
  ccv_nnc_tensor_t* const gpu_c_dequantized = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_q, host_k, host_v, hwq, host_bias), TENSOR_LIST(gpu_q, gpu_k, gpu_v, gpu_w_quantized, gpu_bias), 0);
  // Dequantize the transferred weight in GPU memory so both runs start from the same quantized values.
  ccv_nnc_dequantize_8i_rowwise(gpu_w_quantized->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, qsize, K, gpu_w_dequantized->data.u8, K * K);

  // Run the same command twice: once with the 8i weight, once with its dequantized 32-bit float copy.
  ccv_nnc_cmd_t attention_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0);
  attention_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_8I;
  ccv_nnc_cmd_exec(attention_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q, gpu_k, gpu_v, NULL, gpu_w_quantized, gpu_bias), TENSOR_LIST(gpu_r_quantized, NULL, gpu_c_quantized), 0);
  ccv_nnc_cmd_exec(attention_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q, gpu_k, gpu_v, NULL, gpu_w_dequantized, gpu_bias), TENSOR_LIST(gpu_r_dequantized, NULL, gpu_c_dequantized), 0);

  // Copy both results back to the CPU and track the largest relative difference.
  ccv_nnc_tensor_t* const hrq = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, K), 0);
  ccv_nnc_tensor_t* const hrd = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, K), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_r_quantized, gpu_r_dequantized), TENSOR_LIST(hrq, hrd), 0);
  float max_relative_diff = 0;
  int max_diff_idx = 0;
  for (i = 0; i < r_count; i++)
  {
    const float from_quantized = hrq->data.f32[i];
    const float from_dequantized = hrd->data.f32[i];
    // The denominator is floored at 1, so for magnitudes below 1 this is the absolute difference.
    const float denom = fmaxf(fmaxf(fabsf(from_quantized), fabsf(from_dequantized)), 1.0f);
    const float relative_diff = fabsf(from_quantized - from_dequantized) / denom;
    if (relative_diff > max_relative_diff)
    {
      max_relative_diff = relative_diff;
      max_diff_idx = i;
    }
  }
  REQUIRE(max_relative_diff <= 5e-2, "row-wise 8i unify head result should match dequantized weight result (max relative diff %g at %d: %g vs %g)", max_relative_diff, max_diff_idx, hrq->data.f32[max_diff_idx], hrd->data.f32[max_diff_idx]);

  ccv_nnc_tensor_free(host_q);
  ccv_nnc_tensor_free(host_k);
  ccv_nnc_tensor_free(host_v);
  ccv_nnc_tensor_free(host_w_dense);
  ccv_nnc_tensor_free(hwq);
  ccv_nnc_tensor_free(host_bias);
  ccv_nnc_tensor_free(gpu_q);
  ccv_nnc_tensor_free(gpu_k);
  ccv_nnc_tensor_free(gpu_v);
  ccv_nnc_tensor_free(gpu_w_quantized);
  ccv_nnc_tensor_free(gpu_w_dequantized);
  ccv_nnc_tensor_free(gpu_bias);
  ccv_nnc_tensor_free(gpu_r_quantized);
  ccv_nnc_tensor_free(gpu_r_dequantized);
  ccv_nnc_tensor_free(gpu_c_quantized);
  ccv_nnc_tensor_free(gpu_c_dequantized);
  ccv_nnc_tensor_free(hrq);
  ccv_nnc_tensor_free(hrd);
}
5090
5091
TEST_CASE("scaled dot product attention gradient with mps")
{
  // Skip unless the MPS backend reports both the forward and backward attention commands.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
#define num_long_trials 2
#define num_short_trials 2
#define num_trials (num_long_trials + num_short_trials)

  // One column per trial. q and its gradients are shaped (B, R, H, D); k, v and their gradients are shaped (B, C, H, D).
  const int B_list[num_trials] = { 32, 3, 2, 1 };
  const int R_list[num_trials] = { 128, 61, 6, 2 };
  const int C_list[num_trials] = { 128, 49, 2, 1 };
  const int H_list[num_trials] = { 8, 13, 3, 1 };
  const int D_list[num_trials] = { 64, 191, 4, 8 };

  // A single generator is shared by all trials, so each trial continues the same random stream.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 10);
  int trial;
  for (trial = 0; trial < num_trials; trial++)
  {
    const int B = B_list[trial];
    const int R = R_list[trial];
    const int C = C_list[trial];
    const int H = H_list[trial];
    const int D = D_list[trial];
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * H * D;
    const int kv_count = B * C * H * D;
    int i;

    // CPU inputs and the tensors that receive the CPU gradients.
    ccv_nnc_tensor_t* const q_cpu = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_cpu = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_cpu = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);

    // Random draws are consumed in the order q, k, v, then the incoming output gradient.
    for (i = 0; i < q_count; i++)
      q_cpu->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      k_cpu->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      v_cpu->data.f32[i] = dsfmt_genrand_open_close(&rng);
    ccv_nnc_tensor_t* const do_cpu = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    for (i = 0; i < q_count; i++)
      do_cpu->data.f32[i] = dsfmt_genrand_open_close(&rng);

    // Reference gradients computed from the CPU tensors.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_cpu, 0, 0, q_cpu, k_cpu, v_cpu), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);

    // GPU copies of the inputs, plus GPU tensors for the forward output and the three gradients.
    ccv_nnc_tensor_t* const q_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const o_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const dq_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const dk_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const dv_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const do_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_cpu, k_cpu, v_cpu, do_cpu), TENSOR_LIST(q_gpu, k_gpu, v_gpu, do_gpu), 0);

    // The GPU forward pass writes the output and a (B, H, R) tensor; both are passed to the GPU backward pass.
    ccv_nnc_tensor_t* const softmax_lse_gpu = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(q_gpu, k_gpu, v_gpu, NULL, NULL, NULL), TENSOR_LIST(o_gpu, softmax_lse_gpu), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_gpu, 0, 0, q_gpu, k_gpu, v_gpu, 0, 0, 0, o_gpu, softmax_lse_gpu), TENSOR_LIST(dq_gpu, dk_gpu, dv_gpu), 0);

    // Copy the GPU gradients back to CPU tensors and compare them with the CPU reference.
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dq_gpu, dk_gpu, dv_gpu), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);

    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * H * D, 5e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * H * D, 5e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * H * D, 5e-3, "scaled dot product attention result should be the same");

    // Release every tensor created in this trial.
    ccv_nnc_tensor_free(do_cpu);
    ccv_nnc_tensor_free(do_gpu);
    ccv_nnc_tensor_free(o_gpu);
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
    ccv_nnc_tensor_free(q_cpu);
    ccv_nnc_tensor_free(k_cpu);
    ccv_nnc_tensor_free(v_cpu);
    ccv_nnc_tensor_free(q_gpu);
    ccv_nnc_tensor_free(k_gpu);
    ccv_nnc_tensor_free(v_gpu);
    ccv_nnc_tensor_free(dq_tensor);
    ccv_nnc_tensor_free(dk_tensor);
    ccv_nnc_tensor_free(dv_tensor);
    ccv_nnc_tensor_free(dq_gpu);
    ccv_nnc_tensor_free(dk_gpu);
    ccv_nnc_tensor_free(dv_gpu);
    ccv_nnc_tensor_free(softmax_lse_gpu);
  }
#undef num_long_trials
#undef num_short_trials
#undef num_trials
}
5186
5187
TEST_CASE("scaled dot product attention gradient with mps in half precision")
5188
1
{
5189
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
5190
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
5191
0
#define num_long_trials 8
5192
0
#define num_short_trials 4
5193
0
#define num_trials (num_long_trials + num_short_trials)
5194
5195
0
  dsfmt_t dsfmt;
5196
0
  dsfmt_init_gen_rand(&dsfmt, 10);
5197
0
  for (int trial = 0; trial < num_trials; ++trial) {
5198
0
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
5199
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
5200
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
5201
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
5202
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };
5203
5204
0
    const int B = B_candidates[trial];
5205
0
    const int R = R_candidates[trial];
5206
0
    const int C = C_candidates[trial];
5207
0
    const int Hq = Hq_candidates[trial];
5208
0
    const int Hk = Hq_candidates[trial];
5209
0
    const int D = D_candidates[trial];
5210
0
    const int is_causal = 0;
5211
0
    const float scale = 1.0 / sqrt((float)D);
5212
5213
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5214
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5215
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5216
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5217
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5218
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5219
5220
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
5221
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5222
0
    }
5223
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
5224
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5225
0
    }
5226
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
5227
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5228
0
    }
5229
5230
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5231
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
5232
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5233
0
    }
5234
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
5235
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
5236
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
5237
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
5238
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
5239
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);
5240
5241
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
5242
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
5243
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
5244
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
5245
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
5246
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
5247
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
5248
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
5249
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
5250
5251
0
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
5252
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
5253
5254
0
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
5255
0
    cmd.info.scaled_dot_product_attention.deterministic = 0;
5256
0
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
5257
5258
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
5259
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
5260
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
5261
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
5262
5263
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5264
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5265
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5266
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
5267
5268
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");
5269
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 3e-3, "scaled dot product attention result should be the same");
5270
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 6e-3, "GPU computed output should be the same as CPU computed ones");
5271
5272
0
    ccv_nnc_tensor_free(do_tensor);
5273
0
    ccv_nnc_tensor_free(gpu_do_tensor);
5274
0
    ccv_nnc_tensor_free(gpu_o_tensor);
5275
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
5276
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
5277
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
5278
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
5279
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
5280
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
5281
0
    ccv_nnc_tensor_free(q_tensor);
5282
0
    ccv_nnc_tensor_free(k_tensor);
5283
0
    ccv_nnc_tensor_free(v_tensor);
5284
0
    ccv_nnc_tensor_free(q_tensor_f16);
5285
0
    ccv_nnc_tensor_free(k_tensor_f16);
5286
0
    ccv_nnc_tensor_free(v_tensor_f16);
5287
0
    ccv_nnc_tensor_free(do_tensor_f16);
5288
0
    ccv_nnc_tensor_free(gpu_q_tensor);
5289
0
    ccv_nnc_tensor_free(gpu_k_tensor);
5290
0
    ccv_nnc_tensor_free(gpu_v_tensor);
5291
0
    ccv_nnc_tensor_free(dq_tensor);
5292
0
    ccv_nnc_tensor_free(dk_tensor);
5293
0
    ccv_nnc_tensor_free(dv_tensor);
5294
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
5295
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
5296
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
5297
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
5298
0
  }
5299
0
#undef num_long_trials
5300
0
#undef num_short_trials
5301
0
#undef num_trials
5302
0
}
5303
5304
TEST_CASE("scaled dot product attention gradient with mps in bfloat precision")
5305
1
{
5306
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
5307
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
5308
0
#define num_long_trials 8
5309
0
#define num_short_trials 4
5310
0
#define num_trials (num_long_trials + num_short_trials)
5311
5312
0
  dsfmt_t dsfmt;
5313
0
  dsfmt_init_gen_rand(&dsfmt, 10);
5314
0
  for (int trial = 0; trial < num_trials; ++trial) {
5315
0
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
5316
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
5317
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
5318
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
5319
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };
5320
5321
0
    const int B = B_candidates[trial];
5322
0
    const int R = R_candidates[trial];
5323
0
    const int C = C_candidates[trial];
5324
0
    const int Hq = Hq_candidates[trial];
5325
0
    const int Hk = Hq_candidates[trial];
5326
0
    const int D = D_candidates[trial];
5327
0
    const int is_causal = 0;
5328
0
    const float scale = 1.0 / sqrt((float)D);
5329
5330
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5331
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5332
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5333
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5334
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5335
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5336
5337
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
5338
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5339
0
    }
5340
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
5341
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5342
0
    }
5343
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
5344
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5345
0
    }
5346
5347
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5348
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
5349
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5350
0
    }
5351
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
5352
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
5353
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5354
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5355
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
5356
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);
5357
5358
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5359
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5360
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5361
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5362
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5363
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5364
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5365
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5366
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
5367
5368
0
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
5369
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
5370
5371
0
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
5372
0
    cmd.info.scaled_dot_product_attention.deterministic = 0;
5373
0
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
5374
5375
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
5376
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5377
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5378
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
5379
5380
0
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5381
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5382
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5383
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
5384
5385
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 5e-3, "scaled dot product attention result should be the same");
5386
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 1e-2, "scaled dot product attention result should be the same");
5387
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 2e-2, "GPU computed output should be the same as CPU computed ones");
5388
5389
0
    ccv_nnc_tensor_free(do_tensor);
5390
0
    ccv_nnc_tensor_free(gpu_do_tensor);
5391
0
    ccv_nnc_tensor_free(gpu_o_tensor);
5392
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
5393
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
5394
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
5395
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
5396
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
5397
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
5398
0
    ccv_nnc_tensor_free(q_tensor);
5399
0
    ccv_nnc_tensor_free(k_tensor);
5400
0
    ccv_nnc_tensor_free(v_tensor);
5401
0
    ccv_nnc_tensor_free(q_tensor_f16);
5402
0
    ccv_nnc_tensor_free(k_tensor_f16);
5403
0
    ccv_nnc_tensor_free(v_tensor_f16);
5404
0
    ccv_nnc_tensor_free(do_tensor_f16);
5405
0
    ccv_nnc_tensor_free(gpu_q_tensor);
5406
0
    ccv_nnc_tensor_free(gpu_k_tensor);
5407
0
    ccv_nnc_tensor_free(gpu_v_tensor);
5408
0
    ccv_nnc_tensor_free(dq_tensor);
5409
0
    ccv_nnc_tensor_free(dk_tensor);
5410
0
    ccv_nnc_tensor_free(dv_tensor);
5411
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
5412
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
5413
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
5414
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
5415
0
  }
5416
0
#undef num_long_trials
5417
0
#undef num_short_trials
5418
0
#undef num_trials
5419
0
}
5420
5421
TEST_CASE("backward gemm with no transpose")
5422
1
{
5423
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5424
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5425
0
  float gp[] = {
5426
0
    1, 2, 3,
5427
0
    4, 5, 6,
5428
0
    7, 8, 9,
5429
0
    10, 11, 12,
5430
0
  };
5431
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
5432
5433
0
  float ap[] = {
5434
0
    13, 14,
5435
0
    15, 16,
5436
0
    17, 18,
5437
0
    19, 20,
5438
0
  };
5439
5440
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5441
5442
0
  float bp[] = {
5443
0
    21, 22, 23,
5444
0
    24, 25, 26,
5445
0
  };
5446
5447
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5448
5449
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
5450
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5451
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5452
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5453
5454
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5455
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5456
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5457
0
  ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
5458
0
  cmd.backend = CCV_NNC_BACKEND_MPS;
5459
0
  cmd.algorithm = 1; // This is cblas.
5460
5461
0
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(h, db, dbias), 0);
5462
5463
0
  ccv_nnc_tensor_t* const ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC( 32F, 4, 2), 0);
5464
0
  ccv_nnc_tensor_t* const cdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC( 32F, 2, 3), 0);
5465
0
  ccv_nnc_tensor_t* const cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC( 32F, 3), 0);
5466
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(h, db, dbias), TENSOR_LIST(ch, cdb, cdbias), 0);
5467
5468
0
  float dbiastp[] = {
5469
0
    22, 26, 30,
5470
0
  };
5471
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5472
5473
0
  REQUIRE_TENSOR_EQ(cdbias, &dbiast, "bias should be equal");
5474
0
  float htp[] = {
5475
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5476
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5477
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5478
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5479
0
  };
5480
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5481
5482
0
  REQUIRE_TENSOR_EQ(ch, &ht, "h should be equal");
5483
0
  float dbtp[] = {
5484
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
5485
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5486
0
  };
5487
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5488
0
  REQUIRE_TENSOR_EQ(cdb, &dbt, "db should be equal");
5489
0
  ccv_nnc_tensor_free(g);
5490
0
  ccv_nnc_tensor_free(a);
5491
0
  ccv_nnc_tensor_free(b);
5492
0
  ccv_nnc_tensor_free(h);
5493
0
  ccv_nnc_tensor_free(db);
5494
0
  ccv_nnc_tensor_free(dbias);
5495
0
}
5496
5497
TEST_CASE("backward gemm with transpose a")
5498
1
{
5499
1
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5500
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5501
0
  float gp[] = {
5502
0
    1, 2, 3,
5503
0
    4, 5, 6,
5504
0
    7, 8, 9,
5505
0
    10, 11, 12,
5506
0
  };
5507
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
5508
0
  float ap[] = {
5509
0
    13, 15, 17, 19,
5510
0
    14, 16, 18, 20,
5511
0
  };
5512
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
5513
0
  float bp[] = {
5514
0
    21, 22, 23,
5515
0
    24, 25, 26,
5516
0
  };
5517
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5518
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
5519
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5520
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5521
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
5522
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
5523
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5524
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
5525
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5526
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5527
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5528
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5529
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5530
0
  float dbiastp[] = {
5531
0
    22, 26, 30,
5532
0
  };
5533
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5534
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5535
0
  float htp[] = {
5536
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
5537
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
5538
0
  };
5539
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
5540
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5541
0
  float dbtp[] = {
5542
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
5543
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5544
0
  };
5545
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5546
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5547
0
  ccv_nnc_tensor_free(g);
5548
0
  ccv_nnc_tensor_free(a);
5549
0
  ccv_nnc_tensor_free(b);
5550
0
  ccv_nnc_tensor_free(h);
5551
0
  ccv_nnc_tensor_free(db);
5552
0
  ccv_nnc_tensor_free(dbias);
5553
0
  ccv_nnc_tensor_free(gg);
5554
0
  ccv_nnc_tensor_free(ga);
5555
0
  ccv_nnc_tensor_free(gb);
5556
0
  ccv_nnc_tensor_free(gh);
5557
0
  ccv_nnc_tensor_free(gdb);
5558
0
  ccv_nnc_tensor_free(gdbias);
5559
0
}
5560
5561
TEST_CASE("backward gemm with transpose b")
5562
1
{
5563
1
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5564
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5565
0
  float gp[] = {
5566
0
    1, 2, 3,
5567
0
    4, 5, 6,
5568
0
    7, 8, 9,
5569
0
    10, 11, 12,
5570
0
  };
5571
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
5572
0
  float ap[] = {
5573
0
    13, 14,
5574
0
    15, 16,
5575
0
    17, 18,
5576
0
    19, 20,
5577
0
  };
5578
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5579
0
  float bp[] = {
5580
0
    21, 24,
5581
0
    22, 25,
5582
0
    23, 26,
5583
0
  };
5584
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5585
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5586
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5587
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5588
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
5589
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5590
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
5591
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
5592
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
5593
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5594
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5595
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5596
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5597
0
  float dbiastp[] = {
5598
0
    22, 26, 30,
5599
0
  };
5600
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5601
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5602
0
  float htp[] = {
5603
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5604
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5605
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5606
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5607
0
  };
5608
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
5609
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5610
0
  float dbtp[] = {
5611
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
5612
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
5613
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5614
0
  };
5615
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5616
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5617
0
  ccv_nnc_tensor_free(g);
5618
0
  ccv_nnc_tensor_free(a);
5619
0
  ccv_nnc_tensor_free(b);
5620
0
  ccv_nnc_tensor_free(h);
5621
0
  ccv_nnc_tensor_free(db);
5622
0
  ccv_nnc_tensor_free(dbias);
5623
0
  ccv_nnc_tensor_free(gg);
5624
0
  ccv_nnc_tensor_free(ga);
5625
0
  ccv_nnc_tensor_free(gb);
5626
0
  ccv_nnc_tensor_free(gh);
5627
0
  ccv_nnc_tensor_free(gdb);
5628
0
  ccv_nnc_tensor_free(gdbias);
5629
0
}
5630
5631
TEST_CASE("backward gemm with transpose a and b")
5632
1
{
5633
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5634
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5635
0
  float gp[] = {
5636
0
    1, 2, 3,
5637
0
    4, 5, 6,
5638
0
    7, 8, 9,
5639
0
    10, 11, 12,
5640
0
  };
5641
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
5642
0
  float ap[] = {
5643
0
    13, 15, 17, 19,
5644
0
    14, 16, 18, 20,
5645
0
  };
5646
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
5647
0
  float bp[] = {
5648
0
    21, 24,
5649
0
    22, 25,
5650
0
    23, 26,
5651
0
  };
5652
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5653
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
5654
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5655
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5656
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
5657
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
5658
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
5659
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
5660
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
5661
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5662
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5663
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5664
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5665
0
  float dbiastp[] = {
5666
0
    22, 26, 30,
5667
0
  };
5668
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5669
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5670
0
  float htp[] = {
5671
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
5672
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
5673
0
  };
5674
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
5675
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5676
0
  float dbtp[] = {
5677
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
5678
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
5679
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5680
0
  };
5681
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5682
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5683
0
  ccv_nnc_tensor_free(g);
5684
0
  ccv_nnc_tensor_free(a);
5685
0
  ccv_nnc_tensor_free(b);
5686
0
  ccv_nnc_tensor_free(h);
5687
0
  ccv_nnc_tensor_free(db);
5688
0
  ccv_nnc_tensor_free(dbias);
5689
0
  ccv_nnc_tensor_free(gg);
5690
0
  ccv_nnc_tensor_free(ga);
5691
0
  ccv_nnc_tensor_free(gb);
5692
0
  ccv_nnc_tensor_free(gh);
5693
0
  ccv_nnc_tensor_free(gdb);
5694
0
  ccv_nnc_tensor_free(gdbias);
5695
0
}
5696
5697
5698
TEST_CASE("backward gemm large data set")
5699
1
{
5700
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5701
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5702
0
  dsfmt_t dsfmt;
5703
0
  dsfmt_init_gen_rand(&dsfmt, 0);
5704
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5705
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5706
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
5707
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5708
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5709
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5710
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
5711
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5712
5713
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5714
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5715
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5716
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5717
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5718
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5719
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5720
0
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5721
0
  int i;
5722
0
  for (i = 0; i < 64 * 128; i++)
5723
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
5724
0
  for (i = 0; i < 64; i++)
5725
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5726
0
  for (i = 0; i < 10 * 128; i++)
5727
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5728
0
  for (i = 0; i < 10 * 64; i++)
5729
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5730
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
5731
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
5732
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);
5733
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5734
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);
5735
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5736
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5737
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5738
0
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5739
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);
5740
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5741
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
5742
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5743
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5744
0
  ccv_nnc_tensor_free(a);
5745
0
  ccv_nnc_tensor_free(w);
5746
0
  ccv_nnc_tensor_free(bias);
5747
0
  ccv_nnc_tensor_free(b);
5748
0
  ccv_nnc_tensor_free(g);
5749
0
  ccv_nnc_tensor_free(dw);
5750
0
  ccv_nnc_tensor_free(dbias);
5751
0
  ccv_nnc_tensor_free(h);
5752
0
  ccv_nnc_tensor_free(ha);
5753
0
  ccv_nnc_tensor_free(hw);
5754
0
  ccv_nnc_tensor_free(hbias);
5755
0
  ccv_nnc_tensor_free(hb);
5756
0
  ccv_nnc_tensor_free(hg);
5757
0
  ccv_nnc_tensor_free(hdw);
5758
0
  ccv_nnc_tensor_free(hdbias);
5759
0
  ccv_nnc_tensor_free(hh);
5760
0
  ccv_nnc_tensor_free(tb);
5761
0
  ccv_nnc_tensor_free(th);
5762
0
  ccv_nnc_tensor_free(tdw);
5763
0
  ccv_nnc_tensor_free(tdbias);
5764
0
}
5765
5766
TEST_CASE("backward gemm no bias")
5767
1
{
5768
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5769
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5770
0
  dsfmt_t dsfmt;
5771
0
  dsfmt_init_gen_rand(&dsfmt, 0);
5772
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5773
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5774
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5775
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5776
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5777
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5778
5779
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5780
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5781
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5782
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5783
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5784
0
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5785
0
  int i;
5786
0
  for (i = 0; i < 64 * 128; i++)
5787
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
5788
0
  for (i = 0; i < 10 * 128; i++)
5789
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5790
0
  for (i = 0; i < 10 * 64; i++)
5791
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5792
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);
5793
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
5794
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);
5795
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
5796
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);
5797
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5798
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5799
0
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5800
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);
5801
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5802
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
5803
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5804
0
  ccv_nnc_tensor_free(a);
5805
0
  ccv_nnc_tensor_free(w);
5806
0
  ccv_nnc_tensor_free(b);
5807
0
  ccv_nnc_tensor_free(g);
5808
0
  ccv_nnc_tensor_free(dw);
5809
0
  ccv_nnc_tensor_free(h);
5810
0
  ccv_nnc_tensor_free(ha);
5811
0
  ccv_nnc_tensor_free(hw);
5812
0
  ccv_nnc_tensor_free(hb);
5813
0
  ccv_nnc_tensor_free(hg);
5814
0
  ccv_nnc_tensor_free(hdw);
5815
0
  ccv_nnc_tensor_free(hh);
5816
0
  ccv_nnc_tensor_free(tb);
5817
0
  ccv_nnc_tensor_free(th);
5818
0
  ccv_nnc_tensor_free(tdw);
5819
0
}
5820
5821
TEST_CASE("backward gemm no h")
5822
1
{
5823
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5824
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5825
0
  dsfmt_t dsfmt;
5826
0
  dsfmt_init_gen_rand(&dsfmt, 0);
5827
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5828
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5829
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
5830
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5831
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5832
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5833
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
5834
5835
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5836
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5837
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5838
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5839
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5840
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5841
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5842
0
  int i;
5843
0
  for (i = 0; i < 64 * 128; i++)
5844
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
5845
0
  for (i = 0; i < 64; i++)
5846
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5847
0
  for (i = 0; i < 10 * 128; i++)
5848
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5849
0
  for (i = 0; i < 10 * 64; i++)
5850
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5851
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
5852
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
5853
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(0, hdw, hdbias), 0);
5854
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5855
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(0, dw, dbias), 0);
5856
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5857
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5858
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5859
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, 0), TENSOR_LIST(tb, tdw, tdbias, 0), 0);
5860
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5861
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
5862
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5863
0
  ccv_nnc_tensor_free(a);
5864
0
  ccv_nnc_tensor_free(w);
5865
0
  ccv_nnc_tensor_free(bias);
5866
0
  ccv_nnc_tensor_free(b);
5867
0
  ccv_nnc_tensor_free(g);
5868
0
  ccv_nnc_tensor_free(dw);
5869
0
  ccv_nnc_tensor_free(dbias);
5870
0
  ccv_nnc_tensor_free(ha);
5871
0
  ccv_nnc_tensor_free(hw);
5872
0
  ccv_nnc_tensor_free(hbias);
5873
0
  ccv_nnc_tensor_free(hb);
5874
0
  ccv_nnc_tensor_free(hg);
5875
0
  ccv_nnc_tensor_free(hdw);
5876
0
  ccv_nnc_tensor_free(hdbias);
5877
0
  ccv_nnc_tensor_free(tb);
5878
0
  ccv_nnc_tensor_free(tdw);
5879
0
  ccv_nnc_tensor_free(tdbias);
5880
0
}
5881
5882
TEST_CASE("backward gemm no dw")
5883
1
{
5884
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5885
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5886
0
  dsfmt_t dsfmt;
5887
0
  dsfmt_init_gen_rand(&dsfmt, 0);
5888
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5889
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
5890
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
5891
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5892
0
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
5893
0
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
5894
0
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
5895
5896
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5897
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
5898
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5899
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5900
0
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5901
0
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5902
0
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5903
0
  int i;
5904
0
  for (i = 0; i < 64 * 128; i++)
5905
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
5906
0
  for (i = 0; i < 64; i++)
5907
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5908
0
  for (i = 0; i < 10 * 128; i++)
5909
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5910
0
  for (i = 0; i < 10 * 64; i++)
5911
0
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5912
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
5913
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
5914
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, 0, hdbias), 0);
5915
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
5916
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, 0, dbias), 0);
5917
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
5918
0
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
5919
0
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
5920
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, 0, dbias, h), TENSOR_LIST(tb, 0, tdbias, th), 0);
5921
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5922
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5923
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
5924
0
  ccv_nnc_tensor_free(a);
5925
0
  ccv_nnc_tensor_free(w);
5926
0
  ccv_nnc_tensor_free(bias);
5927
0
  ccv_nnc_tensor_free(b);
5928
0
  ccv_nnc_tensor_free(g);
5929
0
  ccv_nnc_tensor_free(dbias);
5930
0
  ccv_nnc_tensor_free(h);
5931
0
  ccv_nnc_tensor_free(ha);
5932
0
  ccv_nnc_tensor_free(hw);
5933
0
  ccv_nnc_tensor_free(hbias);
5934
0
  ccv_nnc_tensor_free(hb);
5935
0
  ccv_nnc_tensor_free(hg);
5936
0
  ccv_nnc_tensor_free(hdbias);
5937
0
  ccv_nnc_tensor_free(hh);
5938
0
  ccv_nnc_tensor_free(tb);
5939
0
  ccv_nnc_tensor_free(th);
5940
0
  ccv_nnc_tensor_free(tdbias);
5941
0
}
5942
5943
TEST_CASE("backwar gemm with no transpose batch 2, same b")
5944
1
{
5945
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5946
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5947
0
  float gp[] = {
5948
0
    1, 2, 3,
5949
0
    4, 5, 6,
5950
0
    7, 8, 9,
5951
0
    10, 11, 12,
5952
0
    10, 20, 30,
5953
0
    40, 50, 60,
5954
0
    70, 80, 90,
5955
0
    100, 110, 120,
5956
0
  };
5957
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5958
0
  float ap[] = {
5959
0
    13, 14,
5960
0
    15, 16,
5961
0
    17, 18,
5962
0
    19, 20,
5963
0
    131, 141,
5964
0
    151, 161,
5965
0
    171, 181,
5966
0
    191, 201,
5967
0
  };
5968
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5969
0
  float bp[] = {
5970
0
    21, 22, 23,
5971
0
    24, 25, 26,
5972
0
  };
5973
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5974
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5975
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5976
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5977
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5978
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5979
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5980
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5981
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5982
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5983
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5984
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5985
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5986
0
  float dbiastp[] = {
5987
0
    22 + 220, 26 + 260, 30 + 300,
5988
0
  };
5989
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5990
  
5991
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5992
0
  float htp[] = {
5993
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5994
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5995
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5996
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5997
0
    10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
5998
0
    40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
5999
0
    70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
6000
0
    100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
6001
0
  };
6002
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6003
  
6004
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6005
0
  float dbtp[] = {
6006
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
6007
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6008
0
  };
6009
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
6010
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6011
0
  ccv_nnc_tensor_free(g);
6012
0
  ccv_nnc_tensor_free(a);
6013
0
  ccv_nnc_tensor_free(b);
6014
0
  ccv_nnc_tensor_free(h);
6015
0
  ccv_nnc_tensor_free(db);
6016
0
  ccv_nnc_tensor_free(dbias);
6017
0
  ccv_nnc_tensor_free(gg);
6018
0
  ccv_nnc_tensor_free(ga);
6019
0
  ccv_nnc_tensor_free(gb);
6020
0
  ccv_nnc_tensor_free(gh);
6021
0
  ccv_nnc_tensor_free(gdb);
6022
0
  ccv_nnc_tensor_free(gdbias);
6023
0
}
6024
6025
TEST_CASE("backward gemm with no transpose batch 2, batched b")
6026
1
{
6027
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6028
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6029
0
  float gp[] = {
6030
0
    1, 2, 3,
6031
0
    4, 5, 6,
6032
0
    7, 8, 9,
6033
0
    10, 11, 12,
6034
0
    10, 20, 30,
6035
0
    40, 50, 60,
6036
0
    70, 80, 90,
6037
0
    100, 110, 120,
6038
0
  };
6039
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6040
0
  float ap[] = {
6041
0
    13, 14,
6042
0
    15, 16,
6043
0
    17, 18,
6044
0
    19, 20,
6045
0
    131, 141,
6046
0
    151, 161,
6047
0
    171, 181,
6048
0
    191, 201,
6049
0
  };
6050
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6051
0
  float bp[] = {
6052
0
    21, 22, 23,
6053
0
    24, 25, 26,
6054
0
    212, 222, 232,
6055
0
    242, 252, 262,
6056
0
  };
6057
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6058
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6059
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6060
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
6061
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6062
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6063
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
6064
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6065
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
6066
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
6067
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6068
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
6069
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
6070
0
  float dbiastp[] = {
6071
0
    22, 26, 30,
6072
0
    220, 260, 300,
6073
0
  };
6074
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
6075
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
6076
0
  float htp[] = {
6077
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
6078
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
6079
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
6080
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
6081
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
6082
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
6083
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
6084
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
6085
0
  };
6086
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6087
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6088
0
  float dbtp[] = {
6089
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
6090
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6091
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
6092
0
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6093
0
  };
6094
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6095
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6096
0
  ccv_nnc_tensor_free(g);
6097
0
  ccv_nnc_tensor_free(a);
6098
0
  ccv_nnc_tensor_free(b);
6099
0
  ccv_nnc_tensor_free(h);
6100
0
  ccv_nnc_tensor_free(db);
6101
0
  ccv_nnc_tensor_free(dbias);
6102
0
  ccv_nnc_tensor_free(gg);
6103
0
  ccv_nnc_tensor_free(ga);
6104
0
  ccv_nnc_tensor_free(gb);
6105
0
  ccv_nnc_tensor_free(gh);
6106
0
  ccv_nnc_tensor_free(gdb);
6107
0
  ccv_nnc_tensor_free(gdbias);
6108
0
}
6109
6110
TEST_CASE("backward gemm with transpose a batch 2, same b")
6111
1
{
6112
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6113
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6114
0
  float gp[] = {
6115
0
    1, 2, 3,
6116
0
    4, 5, 6,
6117
0
    7, 8, 9,
6118
0
    10, 11, 12,
6119
0
    10, 20, 30,
6120
0
    40, 50, 60,
6121
0
    70, 80, 90,
6122
0
    100, 110, 120,
6123
0
  };
6124
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6125
0
  float ap[] = {
6126
0
    13, 15, 17, 19,
6127
0
    14, 16, 18, 20,
6128
0
    131, 151, 171, 191,
6129
0
    141, 161, 181, 201,
6130
0
  };
6131
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6132
0
  float bp[] = {
6133
0
    21, 22, 23,
6134
0
    24, 25, 26,
6135
0
  };
6136
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
6137
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6138
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
6139
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
6140
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6141
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6142
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
6143
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6144
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
6145
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
6146
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6147
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
6148
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
6149
0
  float dbiastp[] = {
6150
0
    22 + 220, 26 + 260, 30 + 300,
6151
0
  };
6152
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
6153
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
6154
0
  float htp[] = {
6155
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
6156
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
6157
0
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
6158
0
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
6159
0
  };
6160
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6161
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6162
0
  float dbtp[] = {
6163
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
6164
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6165
0
  };
6166
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
6167
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6168
0
  ccv_nnc_tensor_free(g);
6169
0
  ccv_nnc_tensor_free(a);
6170
0
  ccv_nnc_tensor_free(b);
6171
0
  ccv_nnc_tensor_free(h);
6172
0
  ccv_nnc_tensor_free(db);
6173
0
  ccv_nnc_tensor_free(dbias);
6174
0
  ccv_nnc_tensor_free(gg);
6175
0
  ccv_nnc_tensor_free(ga);
6176
0
  ccv_nnc_tensor_free(gb);
6177
0
  ccv_nnc_tensor_free(gh);
6178
0
  ccv_nnc_tensor_free(gdb);
6179
0
  ccv_nnc_tensor_free(gdbias);
6180
0
}
6181
6182
TEST_CASE("backward gemm with transpose b batch 2, batched b")
6183
1
{
6184
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6185
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6186
0
  float gp[] = {
6187
0
    1, 2, 3,
6188
0
    4, 5, 6,
6189
0
    7, 8, 9,
6190
0
    10, 11, 12,
6191
0
    10, 20, 30,
6192
0
    40, 50, 60,
6193
0
    70, 80, 90,
6194
0
    100, 110, 120,
6195
0
  };
6196
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6197
0
  float ap[] = {
6198
0
    13, 14,
6199
0
    15, 16,
6200
0
    17, 18,
6201
0
    19, 20,
6202
0
    131, 141,
6203
0
    151, 161,
6204
0
    171, 181,
6205
0
    191, 201,
6206
0
  };
6207
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6208
0
  float bp[] = {
6209
0
    21, 24,
6210
0
    22, 25,
6211
0
    23, 26,
6212
0
    212, 242,
6213
0
    222, 252,
6214
0
    232, 262,
6215
0
  };
6216
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6217
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6218
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6219
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
6220
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6221
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6222
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6223
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6224
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6225
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
6226
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6227
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
6228
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
6229
0
  float dbiastp[] = {
6230
0
    22, 26, 30,
6231
0
    220, 260, 300,
6232
0
  };
6233
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
6234
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
6235
0
  float htp[] = {
6236
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
6237
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
6238
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
6239
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
6240
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
6241
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
6242
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
6243
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
6244
0
  };
6245
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6246
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6247
0
  float dbtp[] = {
6248
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
6249
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
6250
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6251
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
6252
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
6253
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6254
0
  };
6255
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6256
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6257
0
  ccv_nnc_tensor_free(g);
6258
0
  ccv_nnc_tensor_free(a);
6259
0
  ccv_nnc_tensor_free(b);
6260
0
  ccv_nnc_tensor_free(h);
6261
0
  ccv_nnc_tensor_free(db);
6262
0
  ccv_nnc_tensor_free(dbias);
6263
0
  ccv_nnc_tensor_free(gg);
6264
0
  ccv_nnc_tensor_free(ga);
6265
0
  ccv_nnc_tensor_free(gb);
6266
0
  ccv_nnc_tensor_free(gh);
6267
0
  ccv_nnc_tensor_free(gdb);
6268
0
  ccv_nnc_tensor_free(gdbias);
6269
0
}
6270
6271
TEST_CASE("backward gemm with transpose a and b batch 2, same b")
6272
1
{
6273
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6274
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6275
0
  float gp[] = {
6276
0
    1, 2, 3,
6277
0
    4, 5, 6,
6278
0
    7, 8, 9,
6279
0
    10, 11, 12,
6280
0
    10, 20, 30,
6281
0
    40, 50, 60,
6282
0
    70, 80, 90,
6283
0
    100, 110, 120,
6284
0
  };
6285
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6286
0
  float ap[] = {
6287
0
    13, 15, 17, 19,
6288
0
    14, 16, 18, 20,
6289
0
    131, 151, 171, 191,
6290
0
    141, 161, 181, 201,
6291
0
  };
6292
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6293
0
  float bp[] = {
6294
0
    21, 24,
6295
0
    22, 25,
6296
0
    23, 26,
6297
0
  };
6298
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
6299
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6300
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
6301
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
6302
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6303
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6304
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
6305
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6306
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
6307
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
6308
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6309
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
6310
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
6311
0
  float dbiastp[] = {
6312
0
    22 + 220, 26 + 260, 30 + 300,
6313
0
  };
6314
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
6315
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
6316
0
  float htp[] = {
6317
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
6318
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
6319
0
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
6320
0
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
6321
0
  };
6322
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6323
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6324
0
  float dbtp[] = {
6325
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
6326
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
6327
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6328
0
  };
6329
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
6330
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6331
0
  ccv_nnc_tensor_free(g);
6332
0
  ccv_nnc_tensor_free(a);
6333
0
  ccv_nnc_tensor_free(b);
6334
0
  ccv_nnc_tensor_free(h);
6335
0
  ccv_nnc_tensor_free(db);
6336
0
  ccv_nnc_tensor_free(dbias);
6337
0
  ccv_nnc_tensor_free(gg);
6338
0
  ccv_nnc_tensor_free(ga);
6339
0
  ccv_nnc_tensor_free(gb);
6340
0
  ccv_nnc_tensor_free(gh);
6341
0
  ccv_nnc_tensor_free(gdb);
6342
0
  ccv_nnc_tensor_free(gdbias);
6343
0
}
6344
6345
TEST_CASE("backward gemm with no transpose batch 2, batched b, no bias")
6346
1
{
6347
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6348
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6349
0
  float gp[] = {
6350
0
    1, 2, 3,
6351
0
    4, 5, 6,
6352
0
    7, 8, 9,
6353
0
    10, 11, 12,
6354
0
    10, 20, 30,
6355
0
    40, 50, 60,
6356
0
    70, 80, 90,
6357
0
    100, 110, 120,
6358
0
  };
6359
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6360
0
  float ap[] = {
6361
0
    13, 14,
6362
0
    15, 16,
6363
0
    17, 18,
6364
0
    19, 20,
6365
0
    131, 141,
6366
0
    151, 161,
6367
0
    171, 181,
6368
0
    191, 201,
6369
0
  };
6370
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6371
0
  float bp[] = {
6372
0
    21, 22, 23,
6373
0
    24, 25, 26,
6374
0
    212, 222, 232,
6375
0
    242, 252, 262,
6376
0
  };
6377
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6378
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6379
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6380
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6381
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6382
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
6383
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6384
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
6385
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6386
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
6387
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
6388
0
  float htp[] = {
6389
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
6390
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
6391
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
6392
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
6393
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
6394
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
6395
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
6396
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
6397
0
  };
6398
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6399
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6400
0
  float dbtp[] = {
6401
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
6402
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6403
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
6404
0
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6405
0
  };
6406
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6407
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6408
0
  ccv_nnc_tensor_free(g);
6409
0
  ccv_nnc_tensor_free(a);
6410
0
  ccv_nnc_tensor_free(b);
6411
0
  ccv_nnc_tensor_free(h);
6412
0
  ccv_nnc_tensor_free(db);
6413
0
  ccv_nnc_tensor_free(gg);
6414
0
  ccv_nnc_tensor_free(ga);
6415
0
  ccv_nnc_tensor_free(gb);
6416
0
  ccv_nnc_tensor_free(gh);
6417
0
  ccv_nnc_tensor_free(gdb);
6418
0
}
6419
6420
TEST_CASE("backward gemm with transpose b batch 2, batched b, no bias")
6421
1
{
6422
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6423
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6424
0
  float gp[] = {
6425
0
    1, 2, 3,
6426
0
    4, 5, 6,
6427
0
    7, 8, 9,
6428
0
    10, 11, 12,
6429
0
    10, 20, 30,
6430
0
    40, 50, 60,
6431
0
    70, 80, 90,
6432
0
    100, 110, 120,
6433
0
  };
6434
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6435
0
  float ap[] = {
6436
0
    13, 14,
6437
0
    15, 16,
6438
0
    17, 18,
6439
0
    19, 20,
6440
0
    131, 141,
6441
0
    151, 161,
6442
0
    171, 181,
6443
0
    191, 201,
6444
0
  };
6445
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6446
0
  float bp[] = {
6447
0
    21, 24,
6448
0
    22, 25,
6449
0
    23, 26,
6450
0
    212, 242,
6451
0
    222, 252,
6452
0
    232, 262,
6453
0
  };
6454
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6455
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6456
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6457
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6458
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6459
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6460
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6461
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6462
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6463
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
6464
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
6465
0
  float htp[] = {
6466
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
6467
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
6468
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
6469
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
6470
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
6471
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
6472
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
6473
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
6474
0
  };
6475
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6476
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6477
0
  float dbtp[] = {
6478
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
6479
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
6480
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6481
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
6482
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
6483
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6484
0
  };
6485
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6486
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6487
0
  ccv_nnc_tensor_free(g);
6488
0
  ccv_nnc_tensor_free(a);
6489
0
  ccv_nnc_tensor_free(b);
6490
0
  ccv_nnc_tensor_free(h);
6491
0
  ccv_nnc_tensor_free(db);
6492
0
  ccv_nnc_tensor_free(gg);
6493
0
  ccv_nnc_tensor_free(ga);
6494
0
  ccv_nnc_tensor_free(gb);
6495
0
  ccv_nnc_tensor_free(gh);
6496
0
  ccv_nnc_tensor_free(gdb);
6497
0
}
6498
6499
TEST_CASE("backward gemm with transpose a and b batch 2, batch b, no bias")
6500
1
{
6501
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6502
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6503
0
  float gp[] = {
6504
0
    1, 2, 3,
6505
0
    4, 5, 6,
6506
0
    7, 8, 9,
6507
0
    10, 11, 12,
6508
0
    10, 20, 30,
6509
0
    40, 50, 60,
6510
0
    70, 80, 90,
6511
0
    100, 110, 120,
6512
0
  };
6513
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6514
0
  float ap[] = {
6515
0
    13, 15, 17, 19,
6516
0
    14, 16, 18, 20,
6517
0
    131, 151, 171, 191,
6518
0
    141, 161, 181, 201,
6519
0
  };
6520
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6521
0
  float bp[] = {
6522
0
    21, 24,
6523
0
    22, 25,
6524
0
    23, 26,
6525
0
    212, 242,
6526
0
    222, 252,
6527
0
    232, 262,
6528
0
  };
6529
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6530
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6531
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6532
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6533
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6534
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6535
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6536
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6537
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6538
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
6539
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
6540
0
  float htp[] = {
6541
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
6542
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
6543
0
    10 * 212 + 20 * 222 + 30 * 232, 40 * 212 + 50 * 222 + 60 * 232, 70 * 212 + 80 * 222 + 90 * 232, 100 * 212 + 110 * 222 + 120 * 232,
6544
0
    10 * 242 + 20 * 252 + 30 * 262, 40 * 242 + 50 * 252 + 60 * 262, 70 * 242 + 80 * 252 + 90 * 262, 100 * 242 + 110 * 252 + 120 * 262,
6545
0
  };
6546
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6547
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6548
0
  float dbtp[] = {
6549
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
6550
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
6551
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6552
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
6553
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
6554
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6555
0
  };
6556
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6557
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6558
0
  ccv_nnc_tensor_free(g);
6559
0
  ccv_nnc_tensor_free(a);
6560
0
  ccv_nnc_tensor_free(b);
6561
0
  ccv_nnc_tensor_free(h);
6562
0
  ccv_nnc_tensor_free(db);
6563
0
  ccv_nnc_tensor_free(gg);
6564
0
  ccv_nnc_tensor_free(ga);
6565
0
  ccv_nnc_tensor_free(gb);
6566
0
  ccv_nnc_tensor_free(gh);
6567
0
  ccv_nnc_tensor_free(gdb);
6568
0
}
6569
6570
TEST_CASE("mps segmented gemm")
6571
1
{
6572
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
6573
0
  dsfmt_t dsfmt;
6574
0
  dsfmt_init_gen_rand(&dsfmt, 11);
6575
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 256), 0);
6576
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
6577
0
  hindices->data.i32[0] = 1;
6578
0
  hindices->data.i32[1] = 0;
6579
0
  hindices->data.i32[2] = 2;
6580
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
6581
0
  hcounts->data.i32[0] = 129;
6582
0
  hcounts->data.i32[1] = 131;
6583
0
  hcounts->data.i32[2] = 124;
6584
0
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 128, 256), 0);
6585
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 128), 0);
6586
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 128), 0);
6587
0
  int i;
6588
0
  for (i = 0; i < 3 * 128 * 256; i++)
6589
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 256;
6590
0
  for (i = 0; i < 384 * 256; i++)
6591
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
6592
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 384, 256), 0);
6593
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
6594
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
6595
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 128, 256), 0);
6596
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 384, 128), 0);
6597
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(a, indices, counts, w), 0);
6598
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
6599
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
6600
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
6601
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 384 * 128, 3e-4, "segmented GEMM result should match CPU reference");
6602
0
  ccv_nnc_tensor_free(a);
6603
0
  ccv_nnc_tensor_free(indices);
6604
0
  ccv_nnc_tensor_free(counts);
6605
0
  ccv_nnc_tensor_free(w);
6606
0
  ccv_nnc_tensor_free(b);
6607
0
  ccv_nnc_tensor_free(ha);
6608
0
  ccv_nnc_tensor_free(hindices);
6609
0
  ccv_nnc_tensor_free(hcounts);
6610
0
  ccv_nnc_tensor_free(hw);
6611
0
  ccv_nnc_tensor_free(hb);
6612
0
  ccv_nnc_tensor_free(bt);
6613
0
}
6614
6615
TEST_CASE("mps segmented gemm with bias in half precision, split-k")
6616
1
{
6617
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
6618
0
  dsfmt_t dsfmt;
6619
0
  dsfmt_init_gen_rand(&dsfmt, 13);
6620
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 4096), 0);
6621
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
6622
0
  hindices->data.i32[0] = 1;
6623
0
  hindices->data.i32[1] = 0;
6624
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
6625
0
  hcounts->data.i32[0] = 136;
6626
0
  hcounts->data.i32[1] = 136;
6627
0
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 128, 4096), 0);
6628
0
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 128), 0);
6629
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 128), 0);
6630
0
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 272, 128), 0);
6631
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 128), 0);
6632
0
  int i;
6633
0
  for (i = 0; i < 2 * 128 * 4096; i++)
6634
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 4096;
6635
0
  for (i = 0; i < 2 * 128; i++)
6636
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 128;
6637
0
  for (i = 0; i < 272 * 4096; i++)
6638
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
6639
0
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 272, 4096), 0);
6640
0
  ccv_nnc_tensor_t* const hw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 128, 4096), 0);
6641
0
  ccv_nnc_tensor_t* const hbias16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 128), 0);
6642
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(ha16, hw16, hbias16), 0);
6643
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 272, 4096), 0);
6644
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2), 0);
6645
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2), 0);
6646
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 2, 128, 4096), 0);
6647
0
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 2, 128), 0);
6648
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 272, 128), 0);
6649
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hindices, hcounts, hw16, hbias16), TENSOR_LIST(a, indices, counts, w, bias), 0);
6650
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
6651
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb16), 0);
6652
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hb16), TENSOR_LIST(hb), 0);
6653
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw, hbias), TENSOR_LIST(bt), 0);
6654
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 272 * 128, 2e-2, "half-precision segmented GEMM result should match CPU reference");
6655
0
  ccv_nnc_tensor_free(a);
6656
0
  ccv_nnc_tensor_free(indices);
6657
0
  ccv_nnc_tensor_free(counts);
6658
0
  ccv_nnc_tensor_free(w);
6659
0
  ccv_nnc_tensor_free(bias);
6660
0
  ccv_nnc_tensor_free(b);
6661
0
  ccv_nnc_tensor_free(ha);
6662
0
  ccv_nnc_tensor_free(hindices);
6663
0
  ccv_nnc_tensor_free(hcounts);
6664
0
  ccv_nnc_tensor_free(hw);
6665
0
  ccv_nnc_tensor_free(hbias);
6666
0
  ccv_nnc_tensor_free(hb);
6667
0
  ccv_nnc_tensor_free(hb16);
6668
0
  ccv_nnc_tensor_free(bt);
6669
0
  ccv_nnc_tensor_free(ha16);
6670
0
  ccv_nnc_tensor_free(hw16);
6671
0
  ccv_nnc_tensor_free(hbias16);
6672
0
}
6673
6674
// Derived from shapes.txt NA lines, assuming the call shape is C = A @ B^T.
6675
1
NA_GEMM_SHAPE_TEST(306, 2048, 3840)
6676
1
NA_GEMM_SHAPE_TEST(306, 4096, 3840)
6677
1
NA_GEMM_SHAPE_TEST(306, 3840, 4096)
6678
1
NA_GEMM_SHAPE_TEST(306, 15360, 3840)
6679
1
NA_GEMM_SHAPE_TEST(306, 3840, 15360)
6680
1
NA_GEMM_SHAPE_TEST(1024, 4096, 4096)
6681
1
NA_GEMM_SHAPE_TEST(1024, 32, 4096)
6682
1
NA_GEMM_SHAPE_TEST(1024, 16384, 4096)
6683
1
NA_GEMM_SHAPE_TEST(1024, 4096, 16384)
6684
1
NA_GEMM_SHAPE_TEST(1024, 2048, 2048)
6685
1
NA_GEMM_SHAPE_TEST(1024, 32, 2048)
6686
1
NA_GEMM_SHAPE_TEST(1024, 8192, 2048)
6687
1
NA_GEMM_SHAPE_TEST(1024, 2048, 8192)
6688
1
NA_GEMM_SHAPE_TEST(1, 2048, 256)
6689
1
NA_GEMM_SHAPE_TEST(1, 2048, 2048)
6690
1
NA_GEMM_SHAPE_TEST(1, 4096, 256)
6691
1
NA_GEMM_SHAPE_TEST(1, 4096, 4096)
6692
1
NA_GEMM_SHAPE_TEST(2, 2048, 2048)
6693
1
NA_GEMM_SHAPE_TEST(2, 4096, 4096)
6694
1
NA_GEMM_SHAPE_TEST(3, 4096, 4096)
6695
1
NA_GEMM_SHAPE_TEST(4, 4096, 4096)
6696
1
NA_GEMM_SHAPE_TEST(5, 4096, 4096)
6697
1
NA_GEMM_SHAPE_TEST(6, 1024, 3072)
6698
1
NA_GEMM_SHAPE_TEST(6, 4096, 4096)
6699
1
NA_GEMM_SHAPE_TEST(7, 4096, 4096)
6700
1
NA_GEMM_SHAPE_TEST(8, 4096, 4096)
6701
1
NA_GEMM_SHAPE_TEST(16, 4096, 4096)
6702
1
NA_GEMM_SHAPE_TEST(32, 4096, 4096)
6703
1
NA_GEMM_SHAPE_TEST(48, 4096, 4096)
6704
1
NA_GEMM_SHAPE_TEST(48, 4096, 15360)
6705
1
NA_GEMM_SHAPE_TEST(16, 4096, 24576)
6706
1
NA_GEMM_SHAPE_TEST(3, 4096, 32768)
6707
1
NA_GEMM_SHAPE_TEST(6, 4096, 32768)
6708
1
NA_GEMM_SHAPE_TEST(8, 4096, 32768)
6709
1
NA_GEMM_SHAPE_TEST(16, 4096, 32768)
6710
1
NA_GEMM_SHAPE_TEST(1024, 4096, 128)
6711
1
NA_GEMM_SHAPE_TEST(257, 2048, 128)
6712
1
NA_GEMM_SHAPE_TEST(33792, 4096, 4096)
6713
1
NA_GEMM_SHAPE_TEST(33792, 32, 4096)
6714
1
NA_GEMM_SHAPE_TEST(257, 2048, 2048)
6715
1
NA_GEMM_SHAPE_TEST(257, 32, 2048)
6716
1
NA_GEMM_SHAPE_TEST(33792, 2048, 4096)
6717
1
NA_GEMM_SHAPE_TEST(33792, 4096, 2048)
6718
1
NA_GEMM_SHAPE_TEST(33792, 16384, 4096)
6719
1
NA_GEMM_SHAPE_TEST(33792, 4096, 16384)
6720
1
NA_GEMM_SHAPE_TEST(257, 8192, 2048)
6721
1
NA_GEMM_SHAPE_TEST(257, 2048, 8192)
6722
1
NA_GEMM_SHAPE_TEST(33792, 128, 4096)
6723
1
NA_GEMM_SHAPE_TEST(257, 128, 2048)
6724
1
NA_GEMM_BIAS_SHAPE_TEST(306, 2048, 3840)
6725
1
NA_GEMM_BIAS_SHAPE_TEST(306, 4096, 3840)
6726
1
NA_GEMM_BIAS_SHAPE_TEST(306, 3840, 4096)
6727
1
NA_GEMM_BIAS_SHAPE_TEST(306, 15360, 3840)
6728
1
NA_GEMM_BIAS_SHAPE_TEST(306, 3840, 15360)
6729
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 4096)
6730
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 32, 4096)
6731
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 16384, 4096)
6732
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 16384)
6733
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 2048, 2048)
6734
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 32, 2048)
6735
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 8192, 2048)
6736
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 2048, 8192)
6737
1
NA_GEMM_BIAS_SHAPE_TEST(1, 2048, 256)
6738
1
NA_GEMM_BIAS_SHAPE_TEST(1, 2048, 2048)
6739
1
NA_GEMM_BIAS_SHAPE_TEST(1, 4096, 256)
6740
1
NA_GEMM_BIAS_SHAPE_TEST(1, 4096, 4096)
6741
1
NA_GEMM_BIAS_SHAPE_TEST(2, 2048, 2048)
6742
1
NA_GEMM_BIAS_SHAPE_TEST(2, 4096, 4096)
6743
1
NA_GEMM_BIAS_SHAPE_TEST(3, 4096, 4096)
6744
1
NA_GEMM_BIAS_SHAPE_TEST(4, 4096, 4096)
6745
1
NA_GEMM_BIAS_SHAPE_TEST(5, 4096, 4096)
6746
1
NA_GEMM_BIAS_SHAPE_TEST(6, 1024, 3072)
6747
1
NA_GEMM_BIAS_SHAPE_TEST(6, 4096, 4096)
6748
1
NA_GEMM_BIAS_SHAPE_TEST(7, 4096, 4096)
6749
1
NA_GEMM_BIAS_SHAPE_TEST(8, 4096, 4096)
6750
1
NA_GEMM_BIAS_SHAPE_TEST(3, 4096, 32768)
6751
1
NA_GEMM_BIAS_SHAPE_TEST(16, 4096, 32768)
6752
1
NA_GEMM_BIAS_SHAPE_TEST(32, 4096, 4096)
6753
1
NA_GEMM_BIAS_SHAPE_TEST(48, 4096, 4096)
6754
1
NA_GEMM_BFLOAT_SHAPE_TEST(3, 4096, 4096)
6755
1
NA_GEMM_BFLOAT_SHAPE_TEST(5, 4096, 4096)
6756
1
NA_GEMM_BFLOAT_SHAPE_TEST(6, 1024, 3072)
6757
1
NA_GEMM_BFLOAT_SHAPE_TEST(7, 4096, 4096)
6758
1
NA_GEMM_BFLOAT_SHAPE_TEST(8, 4096, 4096)
6759
1
NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(3, 4096, 4096)
6760
1
NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(5, 4096, 4096)
6761
1
NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(6, 1024, 3072)
6762
1
NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(7, 4096, 4096)
6763
1
NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(8, 4096, 4096)
6764
1
NA_GEMM_BFLOAT_SHAPE_TEST(48, 4096, 4096)
6765
1
NA_GEMM_BFLOAT_BIAS_SHAPE_TEST(48, 4096, 4096)
6766
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 128)
6767
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 128)
6768
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 4096)
6769
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 32, 4096)
6770
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 2048)
6771
1
NA_GEMM_BIAS_SHAPE_TEST(257, 32, 2048)
6772
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 2048, 4096)
6773
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 2048)
6774
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 16384, 4096)
6775
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 16384)
6776
1
NA_GEMM_BIAS_SHAPE_TEST(257, 8192, 2048)
6777
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 8192)
6778
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 128, 4096)
6779
NA_GEMM_BIAS_SHAPE_TEST(257, 128, 2048)
6780
6781
#include "case_main.h"