Coverage Report

Created: 2026-04-03 17:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsblas.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <nnc/mps/ccv_nnc_mps.h>
8
#include <3rdparty/dsfmt/dSFMT.h>
9
#include <math.h>
10
#include <stdlib.h>
11
12
// Body of the test harness TEST_SETUP() hook: the only work done here is the
// call to ccv_nnc_init().
TEST_SETUP()
{
  ccv_nnc_init();
}
16
17
/**
 * Deterministic fill value for element (row, k) of the left GEMM operand.
 *
 * Computes (((row * 17 + k * 13) % 23) + 1) / 512. For non-negative indices
 * the numerator lies in [1, 23], so the result is an exact multiple of 1/512.
 */
static float _mps_forward_na_gemm_a_value(const int row, const int k)
{
  const int pattern = ((row * 17 + k * 13) % 23) + 1;
  return (float)pattern / 512.0f;
}
21
22
/**
 * Deterministic fill value for element (col, k) of the weight operand.
 *
 * Computes (((col * 19 + k * 7) % 29) + 1) / 512. For non-negative indices
 * the numerator lies in [1, 29], so the result is an exact multiple of 1/512.
 */
static float _mps_forward_na_gemm_b_value(const int col, const int k)
{
  const int pattern = ((col * 19 + k * 7) % 29) + 1;
  return (float)pattern / 512.0f;
}
26
27
/**
 * Deterministic bias value for output column `col`.
 *
 * Computes (((col * 5) % 17) - 8) / 256. For non-negative `col` the numerator
 * lies in [-8, 8], so the bias is signed and an exact multiple of 1/256.
 */
static float _mps_forward_na_gemm_bias_value(const int col)
{
  const int pattern = ((col * 5) % 17) - 8;
  return (float)pattern / 256.0f;
}
31
32
/**
 * Fill a row-major rows x cols half-precision matrix with the deterministic
 * test pattern.
 *
 * @param data   Destination buffer; rows * cols 16-bit elements are written.
 * @param rows   Number of rows to generate.
 * @param cols   Number of columns per row.
 * @param for_a  Non-zero selects _mps_forward_na_gemm_a_value(), zero selects
 *               _mps_forward_na_gemm_b_value(); both are called as (row, col).
 */
static void _mps_forward_na_gemm_fill_half(ccv_float16_t* const data, const int rows, const int cols, const int for_a)
{
  // Only one float row is staged at a time; it is converted straight into `data`.
  float* const row_buffer = (float*)ccmalloc(sizeof(float) * cols);
  int i, j;
  for (i = 0; i < rows; i++)
  {
    for (j = 0; j < cols; j++)
      row_buffer[j] = for_a ? _mps_forward_na_gemm_a_value(i, j) : _mps_forward_na_gemm_b_value(i, j);
    // The size_t cast makes the row offset be computed in size_t rather than int.
    ccv_float_to_half_precision(row_buffer, (uint16_t*)data + (size_t)i * cols, cols);
  }
  ccfree(row_buffer);
}
44
45
/**
 * Fill a half-precision bias vector of `cols` elements with
 * _mps_forward_na_gemm_bias_value(j) for j in [0, cols).
 *
 * @param data  Destination buffer; `cols` 16-bit elements are written.
 * @param cols  Number of bias entries to generate.
 */
static void _mps_forward_na_gemm_fill_bias_half(ccv_float16_t* const data, const int cols)
{
  // Values are generated in float first and converted in a single call.
  float* const row_buffer = (float*)ccmalloc(sizeof(float) * cols);
  int j;
  for (j = 0; j < cols; j++)
    row_buffer[j] = _mps_forward_na_gemm_bias_value(j);
  ccv_float_to_half_precision(row_buffer, (uint16_t*)data, cols);
  ccfree(row_buffer);
}
54
55
/**
 * Float reference for one output element of the pattern GEMM.
 *
 * Accumulates a_value(row, k) * b_value(col, k) for k in [0, k_dim) in
 * ascending k order, then adds bias_value(col) when `use_bias` is non-zero.
 * The generators' float values are used directly; no half-precision rounding
 * is applied here.
 */
static float _mps_forward_na_gemm_expected(const int row, const int col, const int k_dim, const int use_bias)
{
  float sum = 0;
  int k;
  for (k = 0; k < k_dim; k++)
    sum += _mps_forward_na_gemm_a_value(row, k) * _mps_forward_na_gemm_b_value(col, k);
  if (use_bias)
    sum += _mps_forward_na_gemm_bias_value(col);
  return sum;
}
65
66
/**
 * Pick a small set of distinct indices in [0, dim) to spot-check.
 *
 * Candidates, in order: 0, 1, boundary - 1, boundary, then 32767 and 32768
 * (only when `include_large_m_boundary` is non-zero), dim / 2 and dim - 1.
 * Candidates outside [0, dim) and duplicates of earlier picks are dropped;
 * the order of the remaining ones is preserved.
 *
 * @param indices  Output array with room for 8 entries.
 * @return Number of indices written (0..8).
 */
static int _mps_forward_na_gemm_sample_indices(const int dim, const int boundary, const int include_large_m_boundary, int indices[8])
{
  const int candidates[] = {
    0, 1, boundary - 1, boundary,
    include_large_m_boundary ? 32767 : -1, // -1 is rejected by the range check below
    include_large_m_boundary ? 32768 : -1,
    dim / 2, dim - 1,
  };
  // Derive the bound from the table so the loop cannot drift from the candidate list.
  const int candidate_count = (int)(sizeof(candidates) / sizeof(candidates[0]));
  int i, j;
  int count = 0;
  for (i = 0; i < candidate_count; i++)
  {
    const int candidate = candidates[i];
    int seen = 0;
    if (candidate < 0 || candidate >= dim)
      continue;
    for (j = 0; j < count && !seen; j++)
      seen = (indices[j] == candidate);
    if (!seen)
      indices[count++] = candidate;
  }
  return count;
}
89
90
// Record of one sampled output element, filled in by the shape validators.
typedef struct {
  int row; // Row index of the sampled output element.
  int col; // Column index of the sampled output element.
  float actual; // Value read back from the computed output.
  float expected; // Value from the float reference computation.
} _mps_forward_na_gemm_mismatch_t;
96
97
/**
 * Run a half-precision GEMM (no bias) of shape (m_dim x k_dim) * (n_dim x k_dim)^T
 * and spot-check sampled output elements against the float reference.
 *
 * Inputs are the deterministic a/b patterns, staged in CPU tensors and copied
 * into tensors created with GPU_TENSOR_NHWC. Rows are sampled around boundary
 * 128 (plus 32767/32768 when in range), columns around boundary 64.
 *
 * @param mismatch  Overwritten for every sampled element before it is checked.
 *                  On failure it holds the first failing sample; on success it
 *                  holds the last sample examined (untouched if nothing was sampled).
 * @return 1 when every sample is within 2e-1 absolute error, 0 otherwise.
 */
static int _mps_forward_na_gemm_validate_shape(const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, k_dim), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, n_dim, k_dim), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, k_dim), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim, k_dim), 0);
  _mps_forward_na_gemm_fill_half(ha->data.f16, m_dim, k_dim, 1);
  _mps_forward_na_gemm_fill_half(hw->data.f16, n_dim, k_dim, 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  // The host staging tensors are not needed once the transfer has been issued.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  // w is stored as n_dim x k_dim, hence the transpose of its two axes.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);

  int row_samples[8];
  int col_samples[8];
  const int row_sample_size = _mps_forward_na_gemm_sample_indices(m_dim, 128, 1, row_samples);
  const int col_sample_size = _mps_forward_na_gemm_sample_indices(n_dim, 64, 0, col_samples);
  // 1x1 scratch tensors: one element is read back per sample, then widened to float.
  ccv_nnc_tensor_t* const sample_h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 1), 0);
  ccv_nnc_tensor_t* const sample_f = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
  int ok = 1;
  int i, j;
  for (i = 0; i < row_sample_size; i++)
    for (j = 0; j < col_sample_size; j++)
    {
      // 1x1 view into b at (row, col), using b's row stride of n_dim.
      ccv_nnc_tensor_view_t* const bv = ccv_nnc_tensor_view_new(b, GPU_TENSOR_NHWC(000, 16F, 1, 1), DIM_ALLOC(row_samples[i], col_samples[j]), DIM_ALLOC(n_dim, 1));
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)bv), TENSOR_LIST(sample_h), 0);
      ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(sample_h), TENSOR_LIST(sample_f), 0);
      mismatch->row = row_samples[i];
      mismatch->col = col_samples[j];
      mismatch->actual = sample_f->data.f32[0];
      mismatch->expected = _mps_forward_na_gemm_expected(row_samples[i], col_samples[j], k_dim, 0);
      ccv_nnc_tensor_view_free(bv);
      if (fabsf(mismatch->actual - mismatch->expected) > 2e-1f)
      {
        ok = 0;
        goto cleanup;
      }
    }

cleanup:
  ccv_nnc_tensor_free(sample_h);
  ccv_nnc_tensor_free(sample_f);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(b);
  return ok;
}
145
146
/**
 * Same check as the bias-free shape validator, but the GEMM also receives a
 * half-precision bias vector of n_dim elements and the reference adds
 * _mps_forward_na_gemm_bias_value(col).
 *
 * @param mismatch  Overwritten for every sampled element before it is checked.
 *                  On failure it holds the first failing sample; on success it
 *                  holds the last sample examined (untouched if nothing was sampled).
 * @return 1 when every sample is within 2e-1 absolute error, 0 otherwise.
 */
static int _mps_forward_na_gemm_validate_shape_with_bias(const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, k_dim), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, n_dim, k_dim), 0);
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, n_dim), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, k_dim), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim, k_dim), 0);
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim), 0);
  _mps_forward_na_gemm_fill_half(ha->data.f16, m_dim, k_dim, 1);
  _mps_forward_na_gemm_fill_half(hw->data.f16, n_dim, k_dim, 0);
  _mps_forward_na_gemm_fill_bias_half(hbias->data.f16, n_dim);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  // The host staging tensors are not needed once the transfer has been issued.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  // w is stored as n_dim x k_dim, hence the transpose of its two axes.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);

  int row_samples[8];
  int col_samples[8];
  const int row_sample_size = _mps_forward_na_gemm_sample_indices(m_dim, 128, 1, row_samples);
  const int col_sample_size = _mps_forward_na_gemm_sample_indices(n_dim, 64, 0, col_samples);
  // 1x1 scratch tensors: one element is read back per sample, then widened to float.
  ccv_nnc_tensor_t* const sample_h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 1), 0);
  ccv_nnc_tensor_t* const sample_f = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
  int ok = 1;
  int i, j;
  for (i = 0; i < row_sample_size; i++)
    for (j = 0; j < col_sample_size; j++)
    {
      // 1x1 view into b at (row, col), using b's row stride of n_dim.
      ccv_nnc_tensor_view_t* const bv = ccv_nnc_tensor_view_new(b, GPU_TENSOR_NHWC(000, 16F, 1, 1), DIM_ALLOC(row_samples[i], col_samples[j]), DIM_ALLOC(n_dim, 1));
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)bv), TENSOR_LIST(sample_h), 0);
      ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(sample_h), TENSOR_LIST(sample_f), 0);
      mismatch->row = row_samples[i];
      mismatch->col = col_samples[j];
      mismatch->actual = sample_f->data.f32[0];
      mismatch->expected = _mps_forward_na_gemm_expected(row_samples[i], col_samples[j], k_dim, 1);
      ccv_nnc_tensor_view_free(bv);
      if (fabsf(mismatch->actual - mismatch->expected) > 2e-1f)
      {
        ok = 0;
        goto cleanup;
      }
    }

cleanup:
  ccv_nnc_tensor_free(sample_h);
  ccv_nnc_tensor_free(sample_f);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b);
  return ok;
}
199
200
static void _mps_forward_scaled_gemm_fill_matrix(const int datatype, void* const data, const int rows, const int cols, const int for_a)
201
0
{
202
0
  float* const values = (float*)ccmalloc(sizeof(float) * rows * cols);
203
0
  int i, j;
204
0
  for (i = 0; i < rows; i++)
205
0
    for (j = 0; j < cols; j++)
206
0
      values[i * cols + j] = for_a ? _mps_forward_na_gemm_a_value(i, j) : _mps_forward_na_gemm_b_value(i, j);
207
0
  if (datatype == CCV_16F)
208
0
    ccv_float_to_half_precision(values, (uint16_t*)data, rows * cols);
209
0
  else if (datatype == CCV_16BF)
210
0
    ccv_float_to_bfloat(values, (uint16_t*)data, rows * cols);
211
0
  else
212
0
    memcpy(data, values, sizeof(float) * rows * cols);
213
0
  ccfree(values);
214
0
}
215
216
static void _mps_forward_scaled_gemm_fill_bias(const int datatype, void* const data, const int cols)
217
0
{
218
0
  float* const values = (float*)ccmalloc(sizeof(float) * cols);
219
0
  int j;
220
0
  for (j = 0; j < cols; j++)
221
0
    values[j] = _mps_forward_na_gemm_bias_value(j);
222
0
  if (datatype == CCV_16F)
223
0
    ccv_float_to_half_precision(values, (uint16_t*)data, cols);
224
0
  else if (datatype == CCV_16BF)
225
0
    ccv_float_to_bfloat(values, (uint16_t*)data, cols);
226
0
  else
227
0
    memcpy(data, values, sizeof(float) * cols);
228
0
  ccfree(values);
229
0
}
230
231
static void _mps_forward_scaled_gemm_to_float(const int datatype, const void* const data, const int count, float* const values)
232
0
{
233
0
  if (datatype == CCV_16F)
234
0
    ccv_half_precision_to_float((const uint16_t*)data, values, count);
235
0
  else if (datatype == CCV_16BF)
236
0
    ccv_bfloat_to_float((const uint16_t*)data, values, count);
237
0
  else
238
0
    memcpy(values, data, sizeof(float) * count);
239
0
}
240
241
static void _mps_forward_scaled_gemm_quantized_reference(const int datatype, const void* const data, const int rows, const int cols, float* const values)
242
0
{
243
0
  ccv_nnc_tensor_param_t params = {
244
0
    .type = CCV_TENSOR_CPU_MEMORY,
245
0
    .format = CCV_TENSOR_FORMAT_NHWC,
246
0
    .datatype = datatype,
247
0
    .dim = { rows, cols, 0 },
248
0
  };
249
0
  const ccv_nnc_tensor_param_t qparams = ccv_nnc_tensor_8i_rowwise(params);
250
0
  const size_t qsize = ccv_nnc_tensor_data_size_without_padding(qparams);
251
0
  uint8_t* const qdata = (uint8_t*)ccmalloc(qsize);
252
0
  const size_t encoded = ccv_nnc_quantize_8i_rowwise(data, datatype, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, qdata, qsize);
253
0
  void* dequantized = 0;
254
0
  if (datatype == CCV_16F || datatype == CCV_16BF)
255
0
    dequantized = ccmalloc(sizeof(uint16_t) * rows * cols);
256
0
  else
257
0
    dequantized = ccmalloc(sizeof(float) * rows * cols);
258
0
  ccv_nnc_dequantize_8i_rowwise(qdata, datatype, CCV_TENSOR_CPU_MEMORY, encoded, cols, dequantized, rows * cols);
259
0
  _mps_forward_scaled_gemm_to_float(datatype, dequantized, rows * cols, values);
260
0
  ccfree(dequantized);
261
0
  ccfree(qdata);
262
0
}
263
264
/**
 * Float reference GEMM: out = a * w^T (+ bias).
 *
 * @param a     Row-major m_dim x k_dim matrix.
 * @param w     Row-major n_dim x k_dim matrix (used transposed).
 * @param bias  Optional vector of n_dim entries; pass NULL for no bias.
 * @param out   Row-major m_dim x n_dim result.
 *
 * Each accumulator starts from the bias (or 0) and adds products in
 * ascending k order.
 */
static void _mps_forward_scaled_gemm_reference(const float* const a, const float* const w, const float* const bias, const int m_dim, const int n_dim, const int k_dim, float* const out)
{
  int i, j, k;
  for (i = 0; i < m_dim; i++)
  {
    const float* const a_row = a + i * k_dim;
    float* const out_row = out + i * n_dim;
    for (j = 0; j < n_dim; j++)
    {
      const float* const w_row = w + j * k_dim;
      float sum = bias ? bias[j] : 0;
      for (k = 0; k < k_dim; k++)
        sum += a_row[k] * w_row[k];
      out_row[j] = sum;
    }
  }
}
276
277
/**
 * Deterministic, signed fill value for element (batch, row, k) of the batched
 * left operand: (((batch * 11 + row * 17 + k * 13) % 41) - 20) / 256.
 * For non-negative indices the numerator lies in [-20, 20].
 */
static float _mps_forward_scaled_gemm_a_batched_value(const int batch, const int row, const int k)
{
  const int pattern = ((batch * 11 + row * 17 + k * 13) % 41) - 20;
  return (float)pattern / 256.0f;
}
281
282
/**
 * Deterministic, signed fill value for element (batch, col, k) of the batched
 * weight operand: (((batch * 7 + col * 19 + k * 5) % 43) - 21) / 256.
 * For non-negative indices the numerator lies in [-21, 21].
 */
static float _mps_forward_scaled_gemm_w_batched_value(const int batch, const int col, const int k)
{
  const int pattern = ((batch * 7 + col * 19 + k * 5) % 43) - 21;
  return (float)pattern / 256.0f;
}
286
287
/**
 * Deterministic, signed bias value for (batch, col):
 * (((batch * 3 + col * 5) % 23) - 11) / 256.
 * For non-negative indices the numerator lies in [-11, 11].
 */
static float _mps_forward_scaled_gemm_bias_batched_value(const int batch, const int col)
{
  const int pattern = ((batch * 3 + col * 5) % 23) - 11;
  return (float)pattern / 256.0f;
}
291
292
static void _mps_forward_scaled_gemm_fill_matrix_batched(const int datatype, void* const data, const int batch_dim, const int rows, const int cols, const int for_a)
293
0
{
294
0
  float* const values = (float*)ccmalloc(sizeof(float) * batch_dim * rows * cols);
295
0
  int b, i, j;
296
0
  for (b = 0; b < batch_dim; b++)
297
0
    for (i = 0; i < rows; i++)
298
0
      for (j = 0; j < cols; j++)
299
0
        values[((b * rows) + i) * cols + j] = for_a ? _mps_forward_scaled_gemm_a_batched_value(b, i, j) : _mps_forward_scaled_gemm_w_batched_value(b, i, j);
300
0
  if (datatype == CCV_16F)
301
0
    ccv_float_to_half_precision(values, (uint16_t*)data, batch_dim * rows * cols);
302
0
  else if (datatype == CCV_16BF)
303
0
    ccv_float_to_bfloat(values, (uint16_t*)data, batch_dim * rows * cols);
304
0
  else
305
0
    memcpy(data, values, sizeof(float) * batch_dim * rows * cols);
306
0
  ccfree(values);
307
0
}
308
309
static void _mps_forward_scaled_gemm_fill_bias_batched(const int datatype, void* const data, const int batch_dim, const int cols)
310
0
{
311
0
  float* const values = (float*)ccmalloc(sizeof(float) * batch_dim * cols);
312
0
  int b, j;
313
0
  for (b = 0; b < batch_dim; b++)
314
0
    for (j = 0; j < cols; j++)
315
0
      values[b * cols + j] = _mps_forward_scaled_gemm_bias_batched_value(b, j);
316
0
  if (datatype == CCV_16F)
317
0
    ccv_float_to_half_precision(values, (uint16_t*)data, batch_dim * cols);
318
0
  else if (datatype == CCV_16BF)
319
0
    ccv_float_to_bfloat(values, (uint16_t*)data, batch_dim * cols);
320
0
  else
321
0
    memcpy(data, values, sizeof(float) * batch_dim * cols);
322
0
  ccfree(values);
323
0
}
324
325
/**
 * Float reference for a batched GEMM: out[b] = a[b] * w[wb]^T (+ bias[bb]).
 *
 * @param a               batch_dim x m_dim x k_dim, batch-major.
 * @param w               w_batch_dim x n_dim x k_dim (used transposed).
 * @param bias            Optional, bias_batch_dim x n_dim; NULL for no bias.
 * @param w_batch_dim     When > 1 batch b uses w[b]; otherwise w[0] is shared.
 * @param bias_batch_dim  When > 1 batch b uses bias[b]; otherwise bias[0] is shared.
 * @param out             batch_dim x m_dim x n_dim result.
 *
 * Each accumulator starts from the bias (or 0) and adds products in
 * ascending k order.
 */
static void _mps_forward_scaled_gemm_reference_batched(const float* const a, const float* const w, const float* const bias, const int batch_dim, const int w_batch_dim, const int bias_batch_dim, const int m_dim, const int n_dim, const int k_dim, float* const out)
{
  int b, i, j, k;
  for (b = 0; b < batch_dim; b++)
  {
    // Which weight / bias slice to use depends only on the batch, so pick it once per batch.
    const int w_batch = (w_batch_dim > 1) ? b : 0;
    const int bias_batch = (bias_batch_dim > 1) ? b : 0;
    for (i = 0; i < m_dim; i++)
      for (j = 0; j < n_dim; j++)
      {
        float sum = bias ? bias[bias_batch * n_dim + j] : 0;
        for (k = 0; k < k_dim; k++)
          sum += a[((b * m_dim) + i) * k_dim + k] * w[((w_batch * n_dim) + j) * k_dim + k];
        out[((b * m_dim) + i) * n_dim + j] = sum;
      }
  }
}
340
341
static int _mps_forward_scaled_gemm_validate(const int datatype, const int use_bias, double* const max_abs_ref, double* const max_rel_ref)
342
0
{
343
0
  const int m_dim = 257;
344
0
  const int n_dim = 384;
345
0
  const int k_dim = 128;
346
0
  ccv_nnc_tensor_param_t ga_params = {
347
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
348
0
    .format = CCV_TENSOR_FORMAT_NHWC,
349
0
    .datatype = datatype,
350
0
    .dim = { m_dim, k_dim, 0 },
351
0
  };
352
0
  ccv_nnc_tensor_param_t gw_params = {
353
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
354
0
    .format = CCV_TENSOR_FORMAT_NHWC,
355
0
    .datatype = datatype,
356
0
    .dim = { n_dim, k_dim, 0 },
357
0
  };
358
0
  ccv_nnc_tensor_param_t gb_params = {
359
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
360
0
    .format = CCV_TENSOR_FORMAT_NHWC,
361
0
    .datatype = datatype,
362
0
    .dim = { m_dim, n_dim, 0 },
363
0
  };
364
0
  ccv_nnc_tensor_param_t gbias_params = {
365
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
366
0
    .format = CCV_TENSOR_FORMAT_NHWC,
367
0
    .datatype = datatype,
368
0
    .dim = { n_dim, 0 },
369
0
  };
370
0
  ccv_nnc_tensor_param_t a_params = {
371
0
    .type = CCV_TENSOR_CPU_MEMORY,
372
0
    .format = CCV_TENSOR_FORMAT_NHWC,
373
0
    .datatype = datatype,
374
0
    .dim = { m_dim, k_dim, 0 },
375
0
  };
376
0
  ccv_nnc_tensor_param_t w_params = {
377
0
    .type = CCV_TENSOR_CPU_MEMORY,
378
0
    .format = CCV_TENSOR_FORMAT_NHWC,
379
0
    .datatype = datatype,
380
0
    .dim = { n_dim, k_dim, 0 },
381
0
  };
382
0
  ccv_nnc_tensor_param_t b_params = {
383
0
    .type = CCV_TENSOR_CPU_MEMORY,
384
0
    .format = CCV_TENSOR_FORMAT_NHWC,
385
0
    .datatype = datatype,
386
0
    .dim = { m_dim, n_dim, 0 },
387
0
  };
388
0
  ccv_nnc_tensor_param_t bias_params = {
389
0
    .type = CCV_TENSOR_CPU_MEMORY,
390
0
    .format = CCV_TENSOR_FORMAT_NHWC,
391
0
    .datatype = datatype,
392
0
    .dim = { n_dim, 0 },
393
0
  };
394
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
395
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(w_params), 0);
396
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
397
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
398
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
399
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
400
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
401
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);
402
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, ha->data.u8, m_dim, k_dim, 1);
403
0
  if (use_bias)
404
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
405
0
  void* const w_dense = ccmalloc(CCV_GET_DATA_TYPE_SIZE(datatype) * n_dim * k_dim);
406
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, w_dense, n_dim, k_dim, 0);
407
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w_dense, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
408
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
409
0
  {
410
0
    ccfree(w_dense);
411
0
    ccv_nnc_tensor_free(ha);
412
0
    ccv_nnc_tensor_free(hwq);
413
0
    if (hbias)
414
0
      ccv_nnc_tensor_free(hbias);
415
0
    ccv_nnc_tensor_free(a);
416
0
    ccv_nnc_tensor_free(w);
417
0
    if (bias)
418
0
      ccv_nnc_tensor_free(bias);
419
0
    ccv_nnc_tensor_free(b);
420
0
    ccv_nnc_tensor_free(hb);
421
0
    return -1;
422
0
  }
423
0
  if (use_bias)
424
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, w, bias), 0);
425
0
  else
426
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, w), 0);
427
0
  if (use_bias)
428
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
429
0
  else
430
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
431
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
432
433
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * m_dim * k_dim);
434
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * n_dim * k_dim);
435
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * n_dim) : 0;
436
0
  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
437
0
  float* const expected = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
438
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, m_dim, k_dim, a_ref);
439
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, w_dense, n_dim, k_dim, w_ref);
440
0
  if (use_bias)
441
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, n_dim, bias_ref);
442
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, m_dim * n_dim, actual);
443
0
  _mps_forward_scaled_gemm_reference(a_ref, w_ref, bias_ref, m_dim, n_dim, k_dim, expected);
444
0
  double max_abs = 0;
445
0
  double max_rel = 0;
446
0
  int i;
447
0
  for (i = 0; i < m_dim * n_dim; i++)
448
0
  {
449
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
450
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
451
0
    max_abs = ccv_max(max_abs, diff);
452
0
    max_rel = ccv_max(max_rel, diff / denom);
453
0
  }
454
0
  if (max_abs_ref)
455
0
    *max_abs_ref = max_abs;
456
0
  if (max_rel_ref)
457
0
    *max_rel_ref = max_rel;
458
459
0
  ccfree(expected);
460
0
  ccfree(actual);
461
0
  if (bias_ref)
462
0
    ccfree(bias_ref);
463
0
  ccfree(w_ref);
464
0
  ccfree(a_ref);
465
0
  ccfree(w_dense);
466
0
  ccv_nnc_tensor_free(ha);
467
0
  ccv_nnc_tensor_free(hwq);
468
0
  if (hbias)
469
0
    ccv_nnc_tensor_free(hbias);
470
0
  ccv_nnc_tensor_free(a);
471
0
  ccv_nnc_tensor_free(w);
472
0
  if (bias)
473
0
    ccv_nnc_tensor_free(bias);
474
0
  ccv_nnc_tensor_free(b);
475
0
  ccv_nnc_tensor_free(hb);
476
0
  return 0;
477
0
}
478
479
static int _mps_forward_scaled_gemm_validate_batched(const int datatype, const int use_bias, const int weight_batched, const int bias_batched, double* const max_abs_ref, double* const max_rel_ref)
480
0
{
481
0
  const int batch_dim = 2;
482
0
  const int m_dim = 129;
483
0
  const int n_dim = 384;
484
0
  const int k_dim = 128;
485
0
  ccv_nnc_tensor_param_t ga_params = {
486
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
487
0
    .format = CCV_TENSOR_FORMAT_NHWC,
488
0
    .datatype = datatype,
489
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
490
0
  };
491
0
  ccv_nnc_tensor_param_t gw_params = {
492
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
493
0
    .format = CCV_TENSOR_FORMAT_NHWC,
494
0
    .datatype = datatype,
495
0
    .dim = { weight_batched ? batch_dim : n_dim, weight_batched ? n_dim : k_dim, weight_batched ? k_dim : 0, 0 },
496
0
  };
497
0
  ccv_nnc_tensor_param_t gb_params = {
498
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
499
0
    .format = CCV_TENSOR_FORMAT_NHWC,
500
0
    .datatype = datatype,
501
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
502
0
  };
503
0
  ccv_nnc_tensor_param_t gbias_params = {
504
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
505
0
    .format = CCV_TENSOR_FORMAT_NHWC,
506
0
    .datatype = datatype,
507
0
    .dim = { bias_batched ? batch_dim : n_dim, bias_batched ? n_dim : 0, 0, 0 },
508
0
  };
509
0
  ccv_nnc_tensor_param_t a_params = {
510
0
    .type = CCV_TENSOR_CPU_MEMORY,
511
0
    .format = CCV_TENSOR_FORMAT_NHWC,
512
0
    .datatype = datatype,
513
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
514
0
  };
515
0
  ccv_nnc_tensor_param_t w_params = {
516
0
    .type = CCV_TENSOR_CPU_MEMORY,
517
0
    .format = CCV_TENSOR_FORMAT_NHWC,
518
0
    .datatype = datatype,
519
0
    .dim = { weight_batched ? batch_dim : n_dim, weight_batched ? n_dim : k_dim, weight_batched ? k_dim : 0, 0 },
520
0
  };
521
0
  ccv_nnc_tensor_param_t b_params = {
522
0
    .type = CCV_TENSOR_CPU_MEMORY,
523
0
    .format = CCV_TENSOR_FORMAT_NHWC,
524
0
    .datatype = datatype,
525
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
526
0
  };
527
0
  ccv_nnc_tensor_param_t bias_params = {
528
0
    .type = CCV_TENSOR_CPU_MEMORY,
529
0
    .format = CCV_TENSOR_FORMAT_NHWC,
530
0
    .datatype = datatype,
531
0
    .dim = { bias_batched ? batch_dim : n_dim, bias_batched ? n_dim : 0, 0, 0 },
532
0
  };
533
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
534
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(w_params), 0);
535
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
536
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
537
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
538
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
539
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
540
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);
541
0
  _mps_forward_scaled_gemm_fill_matrix_batched(datatype, ha->data.u8, batch_dim, m_dim, k_dim, 1);
542
0
  if (use_bias)
543
0
  {
544
0
    if (bias_batched)
545
0
      _mps_forward_scaled_gemm_fill_bias_batched(datatype, hbias->data.u8, batch_dim, n_dim);
546
0
    else
547
0
      _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
548
0
  }
549
0
  const int w_batch_dim = weight_batched ? batch_dim : 1;
550
0
  void* const w_dense = ccmalloc(CCV_GET_DATA_TYPE_SIZE(datatype) * w_batch_dim * n_dim * k_dim);
551
0
  if (weight_batched)
552
0
    _mps_forward_scaled_gemm_fill_matrix_batched(datatype, w_dense, batch_dim, n_dim, k_dim, 0);
553
0
  else
554
0
    _mps_forward_scaled_gemm_fill_matrix(datatype, w_dense, n_dim, k_dim, 0);
555
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w_dense, datatype, CCV_TENSOR_CPU_MEMORY, w_batch_dim * n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
556
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
557
0
  {
558
0
    ccfree(w_dense);
559
0
    ccv_nnc_tensor_free(ha);
560
0
    ccv_nnc_tensor_free(hwq);
561
0
    if (hbias)
562
0
      ccv_nnc_tensor_free(hbias);
563
0
    ccv_nnc_tensor_free(a);
564
0
    ccv_nnc_tensor_free(w);
565
0
    if (bias)
566
0
      ccv_nnc_tensor_free(bias);
567
0
    ccv_nnc_tensor_free(b);
568
0
    ccv_nnc_tensor_free(hb);
569
0
    return -1;
570
0
  }
571
0
  if (use_bias)
572
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, w, bias), 0);
573
0
  else
574
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, w), 0);
575
0
  if (weight_batched)
576
0
  {
577
0
    if (use_bias)
578
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
579
0
    else
580
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
581
0
  } else {
582
0
    if (use_bias)
583
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
584
0
    else
585
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
586
0
  }
587
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
588
589
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * k_dim);
590
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * w_batch_dim * n_dim * k_dim);
591
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * (bias_batched ? batch_dim : 1) * n_dim) : 0;
592
0
  float* const actual = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
593
0
  float* const expected = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
594
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, batch_dim * m_dim, k_dim, a_ref);
595
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, w_dense, w_batch_dim * n_dim, k_dim, w_ref);
596
0
  if (use_bias)
597
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, (bias_batched ? batch_dim : 1) * n_dim, bias_ref);
598
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, batch_dim * m_dim * n_dim, actual);
599
0
  _mps_forward_scaled_gemm_reference_batched(a_ref, w_ref, bias_ref, batch_dim, w_batch_dim, bias_batched ? batch_dim : 1, m_dim, n_dim, k_dim, expected);
600
0
  double max_abs = 0;
601
0
  double max_rel = 0;
602
0
  int i;
603
0
  for (i = 0; i < batch_dim * m_dim * n_dim; i++)
604
0
  {
605
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
606
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
607
0
    max_abs = ccv_max(max_abs, diff);
608
0
    max_rel = ccv_max(max_rel, diff / denom);
609
0
  }
610
0
  if (max_abs_ref)
611
0
    *max_abs_ref = max_abs;
612
0
  if (max_rel_ref)
613
0
    *max_rel_ref = max_rel;
614
615
0
  ccfree(expected);
616
0
  ccfree(actual);
617
0
  if (bias_ref)
618
0
    ccfree(bias_ref);
619
0
  ccfree(w_ref);
620
0
  ccfree(a_ref);
621
0
  ccfree(w_dense);
622
0
  ccv_nnc_tensor_free(ha);
623
0
  ccv_nnc_tensor_free(hwq);
624
0
  if (hbias)
625
0
    ccv_nnc_tensor_free(hbias);
626
0
  ccv_nnc_tensor_free(a);
627
0
  ccv_nnc_tensor_free(w);
628
0
  if (bias)
629
0
    ccv_nnc_tensor_free(bias);
630
0
  ccv_nnc_tensor_free(b);
631
0
  ccv_nnc_tensor_free(hb);
632
0
  return 0;
633
0
}
634
635
static int _mps_forward_scaled_gemm_compare_dense(const int datatype, const int use_bias, const int m_dim, const int n_dim, const int k_dim, double* const max_abs_ref, double* const max_rel_ref)
636
0
{
637
0
  ccv_nnc_tensor_param_t ga_params = {
638
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
639
0
    .format = CCV_TENSOR_FORMAT_NHWC,
640
0
    .datatype = datatype,
641
0
    .dim = { m_dim, k_dim, 0 },
642
0
  };
643
0
  ccv_nnc_tensor_param_t gwq_params = {
644
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
645
0
    .format = CCV_TENSOR_FORMAT_NHWC,
646
0
    .datatype = ((datatype >> 12) & 0xff) | CCV_QX | CCV_NNC_QX_8I_ROWWISE,
647
0
    .dim = { n_dim, k_dim, 0 },
648
0
  };
649
0
  ccv_nnc_tensor_param_t gwd_params = {
650
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
651
0
    .format = CCV_TENSOR_FORMAT_NHWC,
652
0
    .datatype = datatype,
653
0
    .dim = { n_dim, k_dim, 0 },
654
0
  };
655
0
  ccv_nnc_tensor_param_t gb_params = {
656
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
657
0
    .format = CCV_TENSOR_FORMAT_NHWC,
658
0
    .datatype = datatype,
659
0
    .dim = { m_dim, n_dim, 0 },
660
0
  };
661
0
  ccv_nnc_tensor_param_t gbias_params = {
662
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
663
0
    .format = CCV_TENSOR_FORMAT_NHWC,
664
0
    .datatype = datatype,
665
0
    .dim = { n_dim, 0 },
666
0
  };
667
0
  ccv_nnc_tensor_param_t a_params = {
668
0
    .type = CCV_TENSOR_CPU_MEMORY,
669
0
    .format = CCV_TENSOR_FORMAT_NHWC,
670
0
    .datatype = datatype,
671
0
    .dim = { m_dim, k_dim, 0 },
672
0
  };
673
0
  ccv_nnc_tensor_param_t wd_params = {
674
0
    .type = CCV_TENSOR_CPU_MEMORY,
675
0
    .format = CCV_TENSOR_FORMAT_NHWC,
676
0
    .datatype = datatype,
677
0
    .dim = { n_dim, k_dim, 0 },
678
0
  };
679
0
  ccv_nnc_tensor_param_t b_params = {
680
0
    .type = CCV_TENSOR_CPU_MEMORY,
681
0
    .format = CCV_TENSOR_FORMAT_NHWC,
682
0
    .datatype = datatype,
683
0
    .dim = { m_dim, n_dim, 0 },
684
0
  };
685
0
  ccv_nnc_tensor_param_t bias_params = {
686
0
    .type = CCV_TENSOR_CPU_MEMORY,
687
0
    .format = CCV_TENSOR_FORMAT_NHWC,
688
0
    .datatype = datatype,
689
0
    .dim = { n_dim, 0 },
690
0
  };
691
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
692
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, wd_params, 0);
693
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(wd_params), 0);
694
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
695
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
696
0
  ccv_nnc_tensor_t* const wq = ccv_nnc_tensor_new(0, gwq_params, 0);
697
0
  ccv_nnc_tensor_t* const wd = ccv_nnc_tensor_new(0, gwd_params, 0);
698
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
699
0
  ccv_nnc_tensor_t* const bq = ccv_nnc_tensor_new(0, gb_params, 0);
700
0
  ccv_nnc_tensor_t* const bd = ccv_nnc_tensor_new(0, gb_params, 0);
701
0
  ccv_nnc_tensor_t* const hbq = ccv_nnc_tensor_new(0, b_params, 0);
702
0
  ccv_nnc_tensor_t* const hbd = ccv_nnc_tensor_new(0, b_params, 0);
703
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, ha->data.u8, m_dim, k_dim, 1);
704
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, hwd->data.u8, n_dim, k_dim, 0);
705
0
  if (use_bias)
706
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
707
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
708
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
709
0
    return -1;
710
0
  if (use_bias)
711
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, wq, bias), 0);
712
0
  else
713
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, wq), 0);
714
0
  ccv_nnc_dequantize_8i_rowwise(wq->data.u8, datatype, CCV_TENSOR_GPU_MEMORY, qsize, k_dim, wd->data.u8, n_dim * k_dim);
715
0
  if (use_bias) {
716
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq, bias), TENSOR_LIST(bq), 0);
717
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wd, bias), TENSOR_LIST(bd), 0);
718
0
  } else {
719
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq), TENSOR_LIST(bq), 0);
720
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wd), TENSOR_LIST(bd), 0);
721
0
  }
722
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bq, bd), TENSOR_LIST(hbq, hbd), 0);
723
0
  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
724
0
  float* const expected = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
725
0
  _mps_forward_scaled_gemm_to_float(datatype, hbq->data.u8, m_dim * n_dim, actual);
726
0
  _mps_forward_scaled_gemm_to_float(datatype, hbd->data.u8, m_dim * n_dim, expected);
727
0
  double max_abs = 0;
728
0
  double max_rel = 0;
729
0
  int i;
730
0
  for (i = 0; i < m_dim * n_dim; i++)
731
0
  {
732
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
733
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
734
0
    max_abs = ccv_max(max_abs, diff);
735
0
    max_rel = ccv_max(max_rel, diff / denom);
736
0
  }
737
0
  if (max_abs_ref)
738
0
    *max_abs_ref = max_abs;
739
0
  if (max_rel_ref)
740
0
    *max_rel_ref = max_rel;
741
0
  ccfree(expected);
742
0
  ccfree(actual);
743
0
  ccv_nnc_tensor_free(hbq);
744
0
  ccv_nnc_tensor_free(hbd);
745
0
  ccv_nnc_tensor_free(bq);
746
0
  ccv_nnc_tensor_free(bd);
747
0
  ccv_nnc_tensor_free(a);
748
0
  ccv_nnc_tensor_free(wq);
749
0
  ccv_nnc_tensor_free(wd);
750
0
  ccv_nnc_tensor_free(ha);
751
0
  ccv_nnc_tensor_free(hwd);
752
0
  ccv_nnc_tensor_free(hwq);
753
0
  if (hbias)
754
0
    ccv_nnc_tensor_free(hbias);
755
0
  if (bias)
756
0
    ccv_nnc_tensor_free(bias);
757
0
  return 0;
758
0
}
759
760
// Deterministic fill value for element (row, k) of the segmented-GEMM input matrix.
// The index hash is folded into 0..60, recentred to -30..30 and scaled by 1/128.
static float _mps_segmented_scaled_gemm_a_value(const int row, const int k)
{
  const int hashed = (row * 17 + k * 13) % 61;
  const int centred = hashed - 30;
  return (float)centred / 128.0f;
}
764
765
// Deterministic fill value for weight element (col, k) of the given segment.
// The index hash is folded into 0..66, recentred to -33..33 and scaled by 1/256.
static float _mps_segmented_scaled_gemm_w_value(const int segment, const int col, const int k)
{
  const int hashed = (segment * 23 + col * 11 + k * 7) % 67;
  const int centred = hashed - 33;
  return (float)centred / 256.0f;
}
769
770
// Deterministic fill value for bias element `col` of the given segment.
// The index hash is folded into 0..28, recentred to -14..14 and scaled by 1/256.
static float _mps_segmented_scaled_gemm_bias_value(const int segment, const int col)
{
  const int hashed = (segment * 5 + col * 3) % 29;
  const int centred = hashed - 14;
  return (float)centred / 256.0f;
}
774
775
static int _mps_segmented_scaled_gemm_validate(const int datatype, const int use_bias, const int force_fallback, double* const max_abs_ref, double* const max_rel_ref)
776
0
{
777
0
  const int total_m = 384;
778
0
  const int n_dim = 128;
779
0
  const int k_dim = 256;
780
0
  const int segments = 3;
781
0
  const int counts_data[] = {129, 131, 124};
782
0
  const int indices_data[] = {1, 0, 2};
783
0
  const ccv_nnc_tensor_param_t ha_params = {
784
0
    .type = CCV_TENSOR_CPU_MEMORY,
785
0
    .format = CCV_TENSOR_FORMAT_NHWC,
786
0
    .datatype = datatype,
787
0
    .dim = { total_m, k_dim, 0 },
788
0
  };
789
0
  const ccv_nnc_tensor_param_t hwd_params = {
790
0
    .type = CCV_TENSOR_CPU_MEMORY,
791
0
    .format = CCV_TENSOR_FORMAT_NHWC,
792
0
    .datatype = datatype,
793
0
    .dim = { segments, n_dim, k_dim, 0 },
794
0
  };
795
0
  const ccv_nnc_tensor_param_t hbias_params = {
796
0
    .type = CCV_TENSOR_CPU_MEMORY,
797
0
    .format = CCV_TENSOR_FORMAT_NHWC,
798
0
    .datatype = datatype,
799
0
    .dim = { segments, n_dim, 0 },
800
0
  };
801
0
  const ccv_nnc_tensor_param_t ga_params = {
802
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
803
0
    .format = CCV_TENSOR_FORMAT_NHWC,
804
0
    .datatype = datatype,
805
0
    .dim = { total_m, k_dim, 0 },
806
0
  };
807
0
  const ccv_nnc_tensor_param_t gw_params = {
808
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
809
0
    .format = CCV_TENSOR_FORMAT_NHWC,
810
0
    .datatype = datatype,
811
0
    .dim = { segments, n_dim, k_dim, 0 },
812
0
  };
813
0
  const ccv_nnc_tensor_param_t gbias_params = {
814
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
815
0
    .format = CCV_TENSOR_FORMAT_NHWC,
816
0
    .datatype = datatype,
817
0
    .dim = { segments, n_dim, 0 },
818
0
  };
819
0
  const ccv_nnc_tensor_param_t gb_params = {
820
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
821
0
    .format = CCV_TENSOR_FORMAT_NHWC,
822
0
    .datatype = datatype,
823
0
    .dim = { total_m, n_dim, 0 },
824
0
  };
825
0
  const ccv_nnc_tensor_param_t hb_params = {
826
0
    .type = CCV_TENSOR_CPU_MEMORY,
827
0
    .format = CCV_TENSOR_FORMAT_NHWC,
828
0
    .datatype = datatype,
829
0
    .dim = { total_m, n_dim, 0 },
830
0
  };
831
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, ha_params, 0);
832
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, segments), 0);
833
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, segments), 0);
834
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, hwd_params, 0);
835
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(hwd_params), 0);
836
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, hbias_params, 0) : 0;
837
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
838
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, segments), 0);
839
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, segments), 0);
840
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
841
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
842
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
843
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, hb_params, 0);
844
0
  float* const a_values = (float*)ccmalloc(sizeof(float) * total_m * k_dim);
845
0
  float* const w_values = (float*)ccmalloc(sizeof(float) * segments * n_dim * k_dim);
846
0
  float* const bias_values = use_bias ? (float*)ccmalloc(sizeof(float) * segments * n_dim) : 0;
847
0
  int i, j, k;
848
0
  for (i = 0; i < total_m; i++)
849
0
    for (k = 0; k < k_dim; k++)
850
0
      a_values[i * k_dim + k] = _mps_segmented_scaled_gemm_a_value(i, k);
851
0
  for (i = 0; i < segments; i++)
852
0
    for (j = 0; j < n_dim; j++)
853
0
      for (k = 0; k < k_dim; k++)
854
0
        w_values[((i * n_dim) + j) * k_dim + k] = _mps_segmented_scaled_gemm_w_value(i, j, k);
855
0
  if (use_bias)
856
0
    for (i = 0; i < segments; i++)
857
0
      for (j = 0; j < n_dim; j++)
858
0
        bias_values[i * n_dim + j] = _mps_segmented_scaled_gemm_bias_value(i, j);
859
0
  if (datatype == CCV_16F)
860
0
  {
861
0
    ccv_float_to_half_precision(a_values, (uint16_t*)ha->data.u8, total_m * k_dim);
862
0
    ccv_float_to_half_precision(w_values, (uint16_t*)hwd->data.u8, segments * n_dim * k_dim);
863
0
    if (use_bias)
864
0
      ccv_float_to_half_precision(bias_values, (uint16_t*)hbias->data.u8, segments * n_dim);
865
0
  } else if (datatype == CCV_16BF) {
866
0
    ccv_float_to_bfloat(a_values, (uint16_t*)ha->data.u8, total_m * k_dim);
867
0
    ccv_float_to_bfloat(w_values, (uint16_t*)hwd->data.u8, segments * n_dim * k_dim);
868
0
    if (use_bias)
869
0
      ccv_float_to_bfloat(bias_values, (uint16_t*)hbias->data.u8, segments * n_dim);
870
0
  } else {
871
0
    memcpy(ha->data.f32, a_values, sizeof(float) * total_m * k_dim);
872
0
    memcpy(hwd->data.f32, w_values, sizeof(float) * segments * n_dim * k_dim);
873
0
    if (use_bias)
874
0
      memcpy(hbias->data.f32, bias_values, sizeof(float) * segments * n_dim);
875
0
  }
876
0
  memcpy(hindices->data.i32, indices_data, sizeof(indices_data));
877
0
  memcpy(hcounts->data.i32, counts_data, sizeof(counts_data));
878
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, (size_t)segments * n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
879
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
880
0
    return -1;
881
0
  if (use_bias)
882
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hwq, hbias), TENSOR_LIST(a, indices, counts, w, bias), 0);
883
0
  else
884
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hwq), TENSOR_LIST(a, indices, counts, w), 0);
885
0
  const uint64_t old_flags = ccv_nnc_flags();
886
0
  if (force_fallback)
887
0
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
888
0
  if (use_bias)
889
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
890
0
  else
891
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
892
0
  if (force_fallback && !(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
893
0
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
894
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
895
896
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * total_m * k_dim);
897
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * segments * n_dim * k_dim);
898
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * segments * n_dim) : 0;
899
0
  float* const actual = (float*)ccmalloc(sizeof(float) * total_m * n_dim);
900
0
  ccv_nnc_tensor_t* const ha_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, total_m, k_dim), 0);
901
0
  ccv_nnc_tensor_t* const hw_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, segments, n_dim, k_dim), 0);
902
0
  ccv_nnc_tensor_t* const hbias_ref = use_bias ? ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, segments, n_dim), 0) : 0;
903
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, total_m, n_dim), 0);
904
0
  if (force_fallback)
905
0
    _mps_forward_scaled_gemm_to_float(datatype, ha->data.u8, total_m * k_dim, a_ref);
906
0
  else
907
0
    _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, total_m, k_dim, a_ref);
908
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, hwd->data.u8, segments * n_dim, k_dim, w_ref);
909
0
  if (use_bias)
910
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, segments * n_dim, bias_ref);
911
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, total_m * n_dim, actual);
912
0
  memcpy(ha_ref->data.f32, a_ref, sizeof(float) * total_m * k_dim);
913
0
  memcpy(hw_ref->data.f32, w_ref, sizeof(float) * segments * n_dim * k_dim);
914
0
  if (use_bias)
915
0
    memcpy(hbias_ref->data.f32, bias_ref, sizeof(float) * segments * n_dim);
916
0
  if (use_bias)
917
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_ref, hindices, hcounts, hw_ref, hbias_ref), TENSOR_LIST(bt), 0);
918
0
  else
919
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_ref, hindices, hcounts, hw_ref), TENSOR_LIST(bt), 0);
920
0
  double max_abs = 0;
921
0
  double max_rel = 0;
922
0
  for (i = 0; i < total_m * n_dim; i++)
923
0
  {
924
0
    const double diff = fabs((double)actual[i] - (double)bt->data.f32[i]);
925
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)bt->data.f32[i])));
926
0
    max_abs = ccv_max(max_abs, diff);
927
0
    max_rel = ccv_max(max_rel, diff / denom);
928
0
  }
929
0
  if (max_abs_ref)
930
0
    *max_abs_ref = max_abs;
931
0
  if (max_rel_ref)
932
0
    *max_rel_ref = max_rel;
933
0
  ccv_nnc_tensor_free(bt);
934
0
  if (hbias_ref)
935
0
    ccv_nnc_tensor_free(hbias_ref);
936
0
  ccv_nnc_tensor_free(hw_ref);
937
0
  ccv_nnc_tensor_free(ha_ref);
938
0
  ccfree(actual);
939
0
  if (bias_ref)
940
0
    ccfree(bias_ref);
941
0
  ccfree(w_ref);
942
0
  ccfree(a_ref);
943
0
  ccfree(a_values);
944
0
  ccfree(w_values);
945
0
  if (bias_values)
946
0
    ccfree(bias_values);
947
0
  ccv_nnc_tensor_free(hb);
948
0
  ccv_nnc_tensor_free(b);
949
0
  if (bias)
950
0
    ccv_nnc_tensor_free(bias);
951
0
  ccv_nnc_tensor_free(w);
952
0
  ccv_nnc_tensor_free(counts);
953
0
  ccv_nnc_tensor_free(indices);
954
0
  ccv_nnc_tensor_free(a);
955
0
  if (hbias)
956
0
    ccv_nnc_tensor_free(hbias);
957
0
  ccv_nnc_tensor_free(hwq);
958
0
  ccv_nnc_tensor_free(hwd);
959
0
  ccv_nnc_tensor_free(hcounts);
960
0
  ccv_nnc_tensor_free(hindices);
961
0
  ccv_nnc_tensor_free(ha);
962
0
  return 0;
963
0
}
964
965
// Runs the quantized GEMM validator without bias for fp16, fp32 and bf16 in turn.
// bf16 is held to a looser relative-error bound (5e-3) than the other two (2e-3).
TEST_CASE("mps forward gemm with row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;

  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16F, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_32F, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16BF, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
  REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul should match row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
981
982
// Runs the quantized GEMM validator with bias enabled for fp16, fp32 and bf16 in turn.
// bf16 is held to a looser relative-error bound (5e-3) than the other two (2e-3).
TEST_CASE("mps forward gemm with row-wise 8i weight and bias NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;

  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16F, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_32F, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16BF, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
  REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul with bias should match row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
998
999
// Runs the segmented quantized GEMM validator (no bias, no forced fallback) for fp16, fp32 and
// bf16 in turn. bf16 is held to a looser relative-error bound (6e-3) than the other two (3e-3).
TEST_CASE("mps segmented gemm with row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;

  // fp16
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16F, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
  REQUIRE(max_rel < 3e-3, "segmented row-wise 8i NA fp16 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_32F, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
  REQUIRE(max_rel < 3e-3, "segmented row-wise 8i NA fp32 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16BF, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
  REQUIRE(max_rel < 6e-3, "segmented row-wise 8i NA bf16 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1015
1016
// Runs the segmented quantized GEMM validator with bias and with the fallback path forced, for
// fp16, fp32 and bf16 in turn. bf16 is held to a looser relative-error bound (6e-3) than the
// other two (3e-3).
TEST_CASE("mps segmented gemm with row-wise 8i weight and bias fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;

  // fp16
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16F, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
  REQUIRE(max_rel < 3e-3, "segmented fallback row-wise 8i fp16 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_32F, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
  REQUIRE(max_rel < 3e-3, "segmented fallback row-wise 8i fp32 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16BF, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
  REQUIRE(max_rel < 6e-3, "segmented fallback row-wise 8i bf16 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1032
1033
// Compares quantized-weight GEMM against dense-weight GEMM (257 x 384 x 128, no bias) with
// CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS set, for fp16, fp32 and bf16. For each datatype the
// flag is put back to its prior state before any assertion runs.
TEST_CASE("mps forward gemm with row-wise 8i weight fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  double max_abs = 0, max_rel = 0;

  // fp16
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16f = _mps_forward_scaled_gemm_compare_dense(CCV_16F, 0, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status16f, 0, "fallback row-wise 8i GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM should match dense GPU fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  max_abs = max_rel = 0;
  const int status32f = _mps_forward_scaled_gemm_compare_dense(CCV_32F, 0, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status32f, 0, "fallback row-wise 8i GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM should match dense GPU fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  max_abs = max_rel = 0;
  const int status16bf = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 0, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status16bf, 0, "fallback row-wise 8i GEMM validation should run");
  REQUIRE(max_rel < 5e-3, "fallback row-wise 8i GEMM should match dense GPU bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1067
1068
// Compares quantized-weight GEMM against dense-weight GEMM (257 x 384 x 128, with bias) with
// CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS set, for fp16, fp32 and bf16.
//
// The comparison status is captured first and the flag is put back to its prior state before
// any REQUIRE runs, so a failing check cannot leave the process-wide flag enabled for the test
// cases that follow. This mirrors the no-bias fallback test in this file.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  double max_abs = 0;
  double max_rel = 0;

  // fp16
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16f = _mps_forward_scaled_gemm_compare_dense(CCV_16F, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  }
  REQUIRE_EQ(status16f, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM with bias should match dense GPU fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = 0;
  max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status32f = _mps_forward_scaled_gemm_compare_dense(CCV_32F, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  }
  REQUIRE_EQ(status32f, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM with bias should match dense GPU fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = 0;
  max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16bf = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  }
  REQUIRE_EQ(status16bf, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 5e-3, "fallback row-wise 8i GEMM with bias should match dense GPU bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1097
1098
// Compares quantized-weight GEMM against dense-weight GEMM in bf16 with bias for three large
// (m, n, k) shapes, with CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS set during each comparison.
//
// The flag is enabled and put back to its prior state around every individual comparison, and
// the REQUIREs run only after the restore, so a failing shape cannot leave the process-wide
// flag enabled for the test cases that follow.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias fallback dequantize large shapes")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  static const int shapes[][3] = {
    {32, 3840, 3840},
    {32, 10240, 3840},
    {32, 3840, 10240},
  };
  int i;
  for (i = 0; i < (int)(sizeof(shapes) / sizeof(shapes[0])); i++)
  {
    double max_abs = 0;
    double max_rel = 0;
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    const int status = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 1, shapes[i][0], shapes[i][1], shapes[i][2], &max_abs, &max_rel);
    if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS)) {
      ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    }
    REQUIRE_EQ(status, 0, "large fallback row-wise 8i GEMM with bias validation should run");
    REQUIRE(max_rel < 5e-3, "large fallback row-wise 8i GEMM with bias should match dense GPU bf16 reference for shape %d x %d x %d, max_abs=%g max_rel=%g", shapes[i][0], shapes[i][1], shapes[i][2], max_abs, max_rel);
  }
}
1120
1121
// Runs the batched quantized GEMM validator with all three option arguments set to 0, for
// fp16, fp32 and bf16 in turn. bf16 is held to a looser relative-error bound (5e-3) than the
// other two (2e-3).
TEST_CASE("mps forward batched gemm with broadcast row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;

  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16F, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match broadcast-weight fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_32F, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match broadcast-weight fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16BF, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
  REQUIRE(max_rel < 5e-3, "batched quantized NAInt8MatMul should match broadcast-weight bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1137
1138
// Runs the batched quantized GEMM validator with all three option arguments set to 1, for
// fp16, fp32 and bf16 in turn. bf16 is held to a looser relative-error bound (5e-3) than the
// other two (2e-3).
TEST_CASE("mps forward batched gemm with batched row-wise 8i weight and bias NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;

  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16F, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match batched-weight fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_32F, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match batched-weight fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16BF, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
  REQUIRE(max_rel < 5e-3, "batched quantized NAInt8MatMul should match batched-weight bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1154
1155
// Turns the argument's tokens into a string literal exactly as written (no macro expansion).
// NOTE(review): a leading underscore followed by an uppercase letter is a reserved identifier
// in C; consider renaming if nothing outside this file depends on the name.
#define _STRINGIFY(x) #x
1156
// Stringifies x after macro expansion: the extra level of indirection lets x be expanded
// before _STRINGIFY applies the # operator to it.
#define STRINGIFY(x) _STRINGIFY(x)
1157
// Declares a test case named after the M x N x K shape that checks the bias-free GEMM via
// _mps_forward_na_gemm_validate_shape. The case returns immediately unless the environment
// variable CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS is set, and is then still subject to the usual
// GUARD_ELSE_RETURN backend check. On failure the message reports the mismatch location and
// the two values recorded in `mismatch`.
#define NA_GEMM_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm no bias NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    if (!getenv("CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS")) \
      return; \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {0}; \
    REQUIRE(_mps_forward_na_gemm_validate_shape(M, N, K, &mismatch), "sampled GEMM result should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g", M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected); \
  }
1166
1167
#define NA_GEMM_BIAS_SHAPE_TEST(M, N, K) \
1168
  TEST_CASE("mps forward gemm with bias NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
1169
31
  { \
1170
31
    if (!getenv("CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS")) \
1171
31
      return; \
1172
31
    
GUARD_ELSE_RETURN0
(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS))0
; \
1173
0
    _mps_forward_na_gemm_mismatch_t mismatch = {}; \
1174
0
    REQUIRE(_mps_forward_na_gemm_validate_shape_with_bias(M, N, K, &mismatch), "sampled GEMM result with bias should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g", M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected); \
1175
0
  }
1176
1177
TEST_CASE("gemm no transpose")
1178
1
{
1179
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1180
0
  float ap[] = {
1181
0
    1, 2,
1182
0
    3, 4,
1183
0
    5, 6,
1184
0
    7, 8,
1185
0
  };
1186
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1187
0
  float bp[] = {
1188
0
    7, 8, 9,
1189
0
    10, 11, 12,
1190
0
  };
1191
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1192
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1193
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1194
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1195
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1196
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1197
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1198
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1199
0
  float ctp[] = {
1200
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1201
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1202
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1203
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1204
0
  };
1205
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1206
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1207
0
  ccv_nnc_tensor_free(a);
1208
0
  ccv_nnc_tensor_free(b);
1209
0
  ccv_nnc_tensor_free(c);
1210
0
  ccv_nnc_tensor_free(ga);
1211
0
  ccv_nnc_tensor_free(gb);
1212
0
  ccv_nnc_tensor_free(gc);
1213
0
}
1214
1215
TEST_CASE("gemm transpose a")
1216
1
{
1217
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1218
0
  float ap[] = {
1219
0
    1, 3, 5, 7,
1220
0
    2, 4, 6, 8,
1221
0
  };
1222
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
1223
0
  float bp[] = {
1224
0
    7, 8, 9,
1225
0
    10, 11, 12,
1226
0
  };
1227
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1228
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1229
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
1230
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1231
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1232
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1233
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1234
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1235
0
  float ctp[] = {
1236
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1237
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1238
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1239
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1240
0
  };
1241
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1242
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1243
0
  ccv_nnc_tensor_free(a);
1244
0
  ccv_nnc_tensor_free(b);
1245
0
  ccv_nnc_tensor_free(c);
1246
0
  ccv_nnc_tensor_free(ga);
1247
0
  ccv_nnc_tensor_free(gb);
1248
0
  ccv_nnc_tensor_free(gc);
1249
0
}
1250
1251
TEST_CASE("gemm transpose b")
1252
1
{
1253
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1254
0
  float ap[] = {
1255
0
    1, 2,
1256
0
    3, 4,
1257
0
    5, 6,
1258
0
    7, 8,
1259
0
  };
1260
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1261
0
  float bp[] = {
1262
0
    7, 10,
1263
0
    8, 11,
1264
0
    9, 12,
1265
0
  };
1266
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1267
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1268
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1269
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
1270
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1271
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1272
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1273
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1274
0
  float ctp[] = {
1275
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1276
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1277
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1278
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1279
0
  };
1280
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1281
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1282
0
  ccv_nnc_tensor_free(a);
1283
0
  ccv_nnc_tensor_free(b);
1284
0
  ccv_nnc_tensor_free(c);
1285
0
  ccv_nnc_tensor_free(ga);
1286
0
  ccv_nnc_tensor_free(gb);
1287
0
  ccv_nnc_tensor_free(gc);
1288
0
}
1289
1290
TEST_CASE("gemm transpose a and b")
1291
1
{
1292
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1293
0
  float ap[] = {
1294
0
    1, 3, 5, 7,
1295
0
    2, 4, 6, 8,
1296
0
  };
1297
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
1298
0
  float bp[] = {
1299
0
    7, 10,
1300
0
    8, 11,
1301
0
    9, 12,
1302
0
  };
1303
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
1304
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1305
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
1306
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
1307
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1308
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1309
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1310
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1311
0
  float ctp[] = {
1312
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1313
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1314
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1315
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1316
0
  };
1317
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1318
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1319
0
  ccv_nnc_tensor_free(a);
1320
0
  ccv_nnc_tensor_free(b);
1321
0
  ccv_nnc_tensor_free(c);
1322
0
  ccv_nnc_tensor_free(ga);
1323
0
  ccv_nnc_tensor_free(gb);
1324
0
  ccv_nnc_tensor_free(gc);
1325
0
}
1326
1327
TEST_CASE("gemm no transpose with bias")
1328
1
{
1329
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1330
0
  float ap[] = {
1331
0
    1, 2,
1332
0
    3, 4,
1333
0
    5, 6,
1334
0
    7, 8,
1335
0
  };
1336
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
1337
0
  float bp[] = {
1338
0
    7, 8, 9,
1339
0
    10, 11, 12,
1340
0
  };
1341
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1342
0
  float dp[] = {
1343
0
    1, -1, 1,
1344
0
    1, -1, 1,
1345
0
    1, -1, 1,
1346
0
    1, -1, 1,
1347
0
  };
1348
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1349
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1350
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
1351
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1352
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1353
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
1354
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
1355
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
1356
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1357
0
  float ctp[] = {
1358
0
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
1359
0
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
1360
0
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
1361
0
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
1362
0
  };
1363
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
1364
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1365
0
  ccv_nnc_tensor_free(a);
1366
0
  ccv_nnc_tensor_free(b);
1367
0
  ccv_nnc_tensor_free(c);
1368
0
  ccv_nnc_tensor_free(d);
1369
0
  ccv_nnc_tensor_free(ga);
1370
0
  ccv_nnc_tensor_free(gb);
1371
0
  ccv_nnc_tensor_free(gc);
1372
0
  ccv_nnc_tensor_free(gd);
1373
0
}
1374
1375
TEST_CASE("gemm no transpose batch 2, no batch b")
1376
1
{
1377
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1378
0
  float ap[] = {
1379
0
    1, 2,
1380
0
    3, 4,
1381
0
    5, 6,
1382
0
    7, 8,
1383
0
    2, 3,
1384
0
    4, 5,
1385
0
    6, 7,
1386
0
    8, 9
1387
0
  };
1388
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1389
0
  float bp[] = {
1390
0
    7, 8, 9,
1391
0
    10, 11, 12,
1392
0
  };
1393
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1394
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1395
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1396
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1397
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1398
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1399
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1400
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1401
0
  float ctp[] = {
1402
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1403
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1404
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1405
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1406
0
    2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
1407
0
    4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
1408
0
    6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
1409
0
    8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
1410
0
  };
1411
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1412
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1413
0
  ccv_nnc_tensor_free(a);
1414
0
  ccv_nnc_tensor_free(b);
1415
0
  ccv_nnc_tensor_free(c);
1416
0
  ccv_nnc_tensor_free(ga);
1417
0
  ccv_nnc_tensor_free(gb);
1418
0
  ccv_nnc_tensor_free(gc);
1419
0
}
1420
1421
TEST_CASE("gemm no transpose batch 2")
1422
1
{
1423
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1424
0
  float ap[] = {
1425
0
    1, 2,
1426
0
    3, 4,
1427
0
    5, 6,
1428
0
    7, 8,
1429
0
    2, 3,
1430
0
    4, 5,
1431
0
    6, 7,
1432
0
    8, 9
1433
0
  };
1434
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1435
0
  float bp[] = {
1436
0
    7, 8, 9,
1437
0
    10, 11, 12,
1438
0
    8, 9, 10,
1439
0
    11, 12, 13,
1440
0
  };
1441
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
1442
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1443
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1444
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
1445
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1446
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1447
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1448
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1449
0
  float ctp[] = {
1450
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1451
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1452
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1453
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1454
0
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
1455
0
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
1456
0
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
1457
0
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
1458
0
  };
1459
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1460
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1461
0
  ccv_nnc_tensor_free(a);
1462
0
  ccv_nnc_tensor_free(b);
1463
0
  ccv_nnc_tensor_free(c);
1464
0
  ccv_nnc_tensor_free(ga);
1465
0
  ccv_nnc_tensor_free(gb);
1466
0
  ccv_nnc_tensor_free(gc);
1467
0
}
1468
1469
TEST_CASE("gemm transpose a batch 2, no batch b, with bias")
1470
1
{
1471
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1472
0
  float ap[] = {
1473
0
    1, 3, 5, 7,
1474
0
    2, 4, 6, 8,
1475
0
    2, 4, 6, 8,
1476
0
    3, 5, 7, 9,
1477
0
  };
1478
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1479
0
  float bp[] = {
1480
0
    7, 8, 9,
1481
0
    10, 11, 12,
1482
0
  };
1483
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1484
0
  float dp[] = {
1485
0
    -1, 0, 1,
1486
0
  };
1487
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 3), 0);
1488
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1489
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1490
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1491
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1492
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1493
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
1494
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
1495
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1496
0
  float ctp[] = {
1497
0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
1498
0
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
1499
0
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
1500
0
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
1501
0
    2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
1502
0
    4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
1503
0
    6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
1504
0
    8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
1505
0
  };
1506
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1507
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1508
0
  ccv_nnc_tensor_free(a);
1509
0
  ccv_nnc_tensor_free(b);
1510
0
  ccv_nnc_tensor_free(c);
1511
0
  ccv_nnc_tensor_free(d);
1512
0
  ccv_nnc_tensor_free(ga);
1513
0
  ccv_nnc_tensor_free(gb);
1514
0
  ccv_nnc_tensor_free(gc);
1515
0
  ccv_nnc_tensor_free(gd);
1516
0
}
1517
1518
TEST_CASE("gemm transpose a batch 2, with bias")
1519
1
{
1520
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1521
0
  float ap[] = {
1522
0
    1, 3, 5, 7,
1523
0
    2, 4, 6, 8,
1524
0
    2, 4, 6, 8,
1525
0
    3, 5, 7, 9,
1526
0
  };
1527
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1528
0
  float bp[] = {
1529
0
    7, 8, 9,
1530
0
    10, 11, 12,
1531
0
    8, 9, 10,
1532
0
    11, 12, 13,
1533
0
  };
1534
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
1535
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1536
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1537
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
1538
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1539
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1540
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1541
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1542
0
  float ctp[] = {
1543
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1544
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1545
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1546
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1547
0
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
1548
0
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
1549
0
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
1550
0
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
1551
0
  };
1552
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1553
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1554
0
  ccv_nnc_tensor_free(a);
1555
0
  ccv_nnc_tensor_free(b);
1556
0
  ccv_nnc_tensor_free(c);
1557
0
  ccv_nnc_tensor_free(ga);
1558
0
  ccv_nnc_tensor_free(gb);
1559
0
  ccv_nnc_tensor_free(gc);
1560
0
}
1561
1562
TEST_CASE("gemm transpose b batch 2, with bias")
1563
1
{
1564
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1565
0
  float ap[] = {
1566
0
    1, 2,
1567
0
    3, 4,
1568
0
    5, 6,
1569
0
    7, 8,
1570
0
    2, 3,
1571
0
    4, 5,
1572
0
    6, 7,
1573
0
    8, 9
1574
0
  };
1575
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1576
0
  float bp[] = {
1577
0
    7, 10,
1578
0
    8, 11,
1579
0
    9, 12,
1580
0
    80, 110,
1581
0
    90, 120,
1582
0
    10, 13,
1583
0
  };
1584
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1585
0
  float dp[] = {
1586
0
    -1, 0, 1,
1587
0
    2, 3, -4,
1588
0
  };
1589
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1590
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1591
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1592
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1593
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1594
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
1595
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
1596
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
1597
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1598
0
  float ctp[] = {
1599
0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
1600
0
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
1601
0
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
1602
0
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
1603
0
    2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
1604
0
    4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
1605
0
    6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
1606
0
    8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
1607
0
  };
1608
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1609
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1610
0
  ccv_nnc_tensor_free(a);
1611
0
  ccv_nnc_tensor_free(b);
1612
0
  ccv_nnc_tensor_free(c);
1613
0
  ccv_nnc_tensor_free(d);
1614
0
  ccv_nnc_tensor_free(ga);
1615
0
  ccv_nnc_tensor_free(gb);
1616
0
  ccv_nnc_tensor_free(gc);
1617
0
  ccv_nnc_tensor_free(gd);
1618
0
}
1619
1620
TEST_CASE("gemm transpose b batch 2")
1621
1
{
1622
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1623
0
  float ap[] = {
1624
0
    1, 2,
1625
0
    3, 4,
1626
0
    5, 6,
1627
0
    7, 8,
1628
0
    2, 3,
1629
0
    4, 5,
1630
0
    6, 7,
1631
0
    8, 9
1632
0
  };
1633
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1634
0
  float bp[] = {
1635
0
    7, 10,
1636
0
    8, 11,
1637
0
    9, 12,
1638
0
    80, 110,
1639
0
    90, 120,
1640
0
    10, 13,
1641
0
  };
1642
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1643
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1644
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1645
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1646
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1647
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1648
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1649
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1650
0
  float ctp[] = {
1651
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1652
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1653
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1654
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1655
0
    2 * 80 + 3 * 110, 2 * 90 + 3 * 120, 2 * 10 + 3 * 13,
1656
0
    4 * 80 + 5 * 110, 4 * 90 + 5 * 120, 4 * 10 + 5 * 13,
1657
0
    6 * 80 + 7 * 110, 6 * 90 + 7 * 120, 6 * 10 + 7 * 13,
1658
0
    8 * 80 + 9 * 110, 8 * 90 + 9 * 120, 8 * 10 + 9 * 13,
1659
0
  };
1660
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1661
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1662
0
  ccv_nnc_tensor_free(a);
1663
0
  ccv_nnc_tensor_free(b);
1664
0
  ccv_nnc_tensor_free(c);
1665
0
  ccv_nnc_tensor_free(ga);
1666
0
  ccv_nnc_tensor_free(gb);
1667
0
  ccv_nnc_tensor_free(gc);
1668
0
}
1669
1670
TEST_CASE("mps forward gemm")
1671
1
{
1672
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1673
0
  dsfmt_t dsfmt;
1674
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1675
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
1676
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
1677
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
1678
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
1679
1680
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1681
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1682
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1683
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1684
0
  int i;
1685
0
  for (i = 0; i < 64 * 128; i++)
1686
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1687
0
  for (i = 0; i < 64; i++)
1688
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1689
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1690
0
  for (i = 0; i < 10 * 128; i++)
1691
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1692
0
  for (i = 0; i < 128; i++)
1693
0
    ha->data.f32[i] = ha1->data.f32[i];
1694
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(a, w, bias), 0);
1695
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1696
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1697
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1698
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1699
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1700
0
  for (i = 0; i < 64; i++)
1701
0
    tb1->data.f32[i] = tb->data.f32[i];
1702
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-6, "GPU computed output should be numerically close to CPU computed ones");
1703
0
  ccv_nnc_tensor_free(a);
1704
0
  ccv_nnc_tensor_free(w);
1705
0
  ccv_nnc_tensor_free(bias);
1706
0
  ccv_nnc_tensor_free(tb);
1707
0
  ccv_nnc_tensor_free(b);
1708
0
  ccv_nnc_tensor_free(ha);
1709
0
  ccv_nnc_tensor_free(ha1);
1710
0
  ccv_nnc_tensor_free(tb1);
1711
0
  ccv_nnc_tensor_free(hw);
1712
0
  ccv_nnc_tensor_free(hbias);
1713
0
  ccv_nnc_tensor_free(hb);
1714
0
}
1715
1716
TEST_CASE("mps forward gemm in half precision")
1717
1
{
1718
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1719
0
  dsfmt_t dsfmt;
1720
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1721
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
1722
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1723
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
1724
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
1725
1726
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1727
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1728
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1729
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1730
0
  int i;
1731
0
  for (i = 0; i < 64 * 128; i++)
1732
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1733
0
  for (i = 0; i < 64; i++)
1734
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1735
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1736
0
  for (i = 0; i < 10 * 128; i++)
1737
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1738
0
  for (i = 0; i < 128; i++)
1739
0
    ha->data.f32[i] = ha1->data.f32[i];
1740
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
1741
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1742
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
1743
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
1744
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
1745
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1746
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1747
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
1748
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1749
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1750
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1751
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-3, "GPU computed output should be the same as CPU computed ones");
1752
0
  ccv_nnc_tensor_free(a);
1753
0
  ccv_nnc_tensor_free(w);
1754
0
  ccv_nnc_tensor_free(bias);
1755
0
  ccv_nnc_tensor_free(b);
1756
0
  ccv_nnc_tensor_free(tb);
1757
0
  ccv_nnc_tensor_free(ha);
1758
0
  ccv_nnc_tensor_free(ha1);
1759
0
  ccv_nnc_tensor_free(tb1);
1760
0
  ccv_nnc_tensor_free(hw);
1761
0
  ccv_nnc_tensor_free(hbias);
1762
0
  ccv_nnc_tensor_free(hb);
1763
0
  ccv_nnc_tensor_free(ha2);
1764
0
  ccv_nnc_tensor_free(hw2);
1765
0
  ccv_nnc_tensor_free(hbias2);
1766
0
}
1767
1768
TEST_CASE("mps forward gemm in bfloat precision")
1769
1
{
1770
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1771
0
  dsfmt_t dsfmt;
1772
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1773
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
1774
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
1775
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
1776
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);
1777
1778
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1779
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1780
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1781
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1782
0
  int i;
1783
0
  for (i = 0; i < 64 * 128; i++)
1784
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1785
0
  for (i = 0; i < 64; i++)
1786
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1787
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
1788
0
  for (i = 0; i < 10 * 128; i++)
1789
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1790
0
  for (i = 0; i < 128; i++)
1791
0
    ha->data.f32[i] = ha1->data.f32[i];
1792
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
1793
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
1794
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
1795
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
1796
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
1797
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1798
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1799
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
1800
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1801
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
1802
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1803
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 8e-3, "GPU computed output should be the same as CPU computed ones");
1804
0
  ccv_nnc_tensor_free(a);
1805
0
  ccv_nnc_tensor_free(w);
1806
0
  ccv_nnc_tensor_free(bias);
1807
0
  ccv_nnc_tensor_free(b);
1808
0
  ccv_nnc_tensor_free(tb);
1809
0
  ccv_nnc_tensor_free(ha);
1810
0
  ccv_nnc_tensor_free(ha1);
1811
0
  ccv_nnc_tensor_free(tb1);
1812
0
  ccv_nnc_tensor_free(hw);
1813
0
  ccv_nnc_tensor_free(hbias);
1814
0
  ccv_nnc_tensor_free(hb);
1815
0
  ccv_nnc_tensor_free(ha2);
1816
0
  ccv_nnc_tensor_free(hw2);
1817
0
  ccv_nnc_tensor_free(hbias2);
1818
0
}
1819
1820
TEST_CASE("mps forward gemv in half precision, variant 1")
1821
1
{
1822
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1823
0
  dsfmt_t dsfmt;
1824
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1825
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
1826
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
1827
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
1828
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
1829
1830
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1831
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1832
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1833
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1834
0
  int i;
1835
0
  for (i = 0; i < 64 * 128; i++)
1836
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1837
0
  for (i = 0; i < 64; i++)
1838
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1839
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1840
0
  for (i = 0; i < 128; i++)
1841
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1842
0
  for (i = 0; i < 128; i++)
1843
0
    ha->data.f32[i] = ha1->data.f32[i];
1844
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
1845
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
1846
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
1847
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
1848
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
1849
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1850
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1851
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
1852
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1853
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1854
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1855
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
1856
0
  ccv_nnc_tensor_free(a);
1857
0
  ccv_nnc_tensor_free(w);
1858
0
  ccv_nnc_tensor_free(bias);
1859
0
  ccv_nnc_tensor_free(b);
1860
0
  ccv_nnc_tensor_free(tb);
1861
0
  ccv_nnc_tensor_free(ha);
1862
0
  ccv_nnc_tensor_free(ha1);
1863
0
  ccv_nnc_tensor_free(tb1);
1864
0
  ccv_nnc_tensor_free(hw);
1865
0
  ccv_nnc_tensor_free(hbias);
1866
0
  ccv_nnc_tensor_free(hb);
1867
0
  ccv_nnc_tensor_free(ha2);
1868
0
  ccv_nnc_tensor_free(hw2);
1869
0
  ccv_nnc_tensor_free(hbias2);
1870
0
}
1871
1872
TEST_CASE("mps forward gemv in bfloat precision, variant 1")
1873
1
{
1874
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1875
0
  dsfmt_t dsfmt;
1876
0
  dsfmt_init_gen_rand(&dsfmt, 0);
1877
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 128), 0);
1878
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
1879
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
1880
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 64), 0);
1881
1882
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1883
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
1884
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
1885
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1886
0
  int i;
1887
0
  for (i = 0; i < 64 * 128; i++)
1888
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
1889
0
  for (i = 0; i < 64; i++)
1890
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1891
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
1892
0
  for (i = 0; i < 128; i++)
1893
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
1894
0
  for (i = 0; i < 128; i++)
1895
0
    ha->data.f32[i] = ha1->data.f32[i];
1896
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 128), 0);
1897
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
1898
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
1899
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
1900
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
1901
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
1902
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
1903
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 64), 0);
1904
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
1905
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
1906
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
1907
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 8e-3, "GPU computed output should be the same as CPU computed ones");
1908
0
  ccv_nnc_tensor_free(a);
1909
0
  ccv_nnc_tensor_free(w);
1910
0
  ccv_nnc_tensor_free(bias);
1911
0
  ccv_nnc_tensor_free(b);
1912
0
  ccv_nnc_tensor_free(tb);
1913
0
  ccv_nnc_tensor_free(ha);
1914
0
  ccv_nnc_tensor_free(ha1);
1915
0
  ccv_nnc_tensor_free(tb1);
1916
0
  ccv_nnc_tensor_free(hw);
1917
0
  ccv_nnc_tensor_free(hbias);
1918
0
  ccv_nnc_tensor_free(hb);
1919
0
  ccv_nnc_tensor_free(ha2);
1920
0
  ccv_nnc_tensor_free(hw2);
1921
0
  ccv_nnc_tensor_free(hbias2);
1922
0
}
1923
1924
TEST_CASE("mps depalettize 5-bit half precision")
1925
1
{
1926
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
1927
0
  float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
1928
0
  uint16_t lut[32];
1929
0
  ccv_float_to_half_precision(lut_f32, lut, 32);
1930
0
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 3072);
1931
0
  int i;
1932
0
  for (i = 0; i < 3072; i++)
1933
0
    values[i] = lut[i % 32];
1934
0
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2112 + 3) / 4), 0);
1935
0
  uint8_t* compressed = tensor->data.u8;
1936
0
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 3072, 5, 1024, compressed, 2112);
1937
0
  REQUIRE_EQ(output_size, 2112, "output size should match");
1938
0
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2112 + 3) / 4), 0);
1939
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
1940
0
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 3072), 0);
1941
0
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 1024, gv_tensor->data.u8, 3072);
1942
0
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 3072), 0);
1943
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
1944
0
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 3072, "GPU computed output should match CPU depalettize");
1945
0
  ccfree(values);
1946
0
  ccv_nnc_tensor_free(tensor);
1947
0
  ccv_nnc_tensor_free(g_tensor);
1948
0
  ccv_nnc_tensor_free(gv_tensor);
1949
0
  ccv_nnc_tensor_free(v_tensor);
1950
0
}
1951
1952
TEST_CASE("mps depalettize 6-bit float precision")
1953
1
{
1954
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
1955
0
  float lut[64];
1956
0
  int i;
1957
0
  for (i = 0; i < 64; i++)
1958
0
    lut[i] = (float)i;
1959
0
  float* const values = ccmalloc(sizeof(float) * 8192);
1960
0
  for (i = 0; i < 8192; i++)
1961
0
    values[i] = lut[i % 64];
1962
0
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 4 + 3) / 4), 0);
1963
0
  uint8_t* compressed = tensor->data.u8;
1964
0
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, compressed, 6144 + 2 * 64 * 4);
1965
0
  REQUIRE_EQ(output_size, 6144 + 2 * 64 * 4, "output size should match");
1966
0
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 4 + 3) / 4), 0);
1967
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
1968
0
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0);
1969
0
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, gv_tensor->data.u8, 8192);
1970
0
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0);
1971
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
1972
0
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "GPU computed output should match CPU depalettize");
1973
0
  ccfree(values);
1974
0
  ccv_nnc_tensor_free(tensor);
1975
0
  ccv_nnc_tensor_free(g_tensor);
1976
0
  ccv_nnc_tensor_free(gv_tensor);
1977
0
  ccv_nnc_tensor_free(v_tensor);
1978
0
}
1979
1980
TEST_CASE("mps depalettize 8-bit float precision with partial block")
1981
1
{
1982
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
1983
0
  float lut[256];
1984
0
  int i;
1985
0
  for (i = 0; i < 256; i++)
1986
0
    lut[i] = (float)i;
1987
0
  float* const values = ccmalloc(sizeof(float) * 3072);
1988
0
  for (i = 0; i < 3072; i++)
1989
0
    values[i] = lut[i % 256];
1990
0
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 3) / 4), 0);
1991
0
  uint8_t* compressed = tensor->data.u8;
1992
0
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 3072, 8, 2048, compressed, 6144);
1993
0
  REQUIRE(output_size <= 6144, "output size should fit the allocated buffer");
1994
0
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 3) / 4), 0);
1995
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
1996
0
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 3072), 0);
1997
0
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 2048, gv_tensor->data.u8, 3072);
1998
0
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 3072), 0);
1999
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
2000
0
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 3072, "GPU computed output should match CPU depalettize");
2001
0
  ccfree(values);
2002
0
  ccv_nnc_tensor_free(tensor);
2003
0
  ccv_nnc_tensor_free(g_tensor);
2004
0
  ccv_nnc_tensor_free(gv_tensor);
2005
0
  ccv_nnc_tensor_free(v_tensor);
2006
0
}
2007
2008
TEST_CASE("mps dequantize row-wise 8i half precision")
2009
1
{
2010
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2011
0
  const int rows = 17;
2012
0
  const int cols = 64;
2013
0
  float* const values = ccmalloc(sizeof(float) * rows * cols);
2014
0
  int i;
2015
0
  for (i = 0; i < rows * cols; i++)
2016
0
    values[i] = ((i * 13) % 41 - 20) / 32.0f;
2017
0
  ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
2018
0
  ccv_float_to_half_precision(values, (uint16_t*)source->data.f16, rows * cols);
2019
0
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16F, rows, cols)), 0);
2020
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f16, CCV_16F, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2021
0
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2022
0
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
2023
0
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16F, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, rows * cols);
2024
0
  ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16F, rows, cols)), 0);
2025
0
  ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, rows, cols), 0);
2026
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2027
0
  ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, rows * cols);
2028
0
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
2029
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2030
0
  float* const expected_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2031
0
  float* const actual_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2032
0
  ccv_half_precision_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
2033
0
  ccv_half_precision_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
2034
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 1e-3, "GPU row-wise 8i dequantize should match CPU dequantize");
2035
0
  ccfree(actual_f32);
2036
0
  ccfree(expected_f32);
2037
0
  ccv_nnc_tensor_free(actual);
2038
0
  ccv_nnc_tensor_free(gout);
2039
0
  ccv_nnc_tensor_free(gq);
2040
0
  ccv_nnc_tensor_free(expected);
2041
0
  ccv_nnc_tensor_free(q);
2042
0
  ccv_nnc_tensor_free(source);
2043
0
  ccfree(values);
2044
0
}
2045
2046
TEST_CASE("mps dequantize row-wise 8i float precision")
2047
1
{
2048
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2049
0
  const int rows = 11;
2050
0
  const int cols = 128;
2051
0
  ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
2052
0
  int i;
2053
0
  for (i = 0; i < rows * cols; i++)
2054
0
    source->data.f32[i] = ((i * 17) % 53 - 26) / 64.0f;
2055
0
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, rows, cols)), 0);
2056
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f32, CCV_32F, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2057
0
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2058
0
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
2059
0
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f32, rows * cols);
2060
0
  ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, rows, cols)), 0);
2061
0
  ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, rows, cols), 0);
2062
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2063
0
  ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, rows * cols);
2064
0
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
2065
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2066
0
  REQUIRE_ARRAY_EQ(float, expected->data.f32, actual->data.f32, rows * cols, "GPU row-wise 8i dequantize should match CPU dequantize");
2067
0
  ccv_nnc_tensor_free(actual);
2068
0
  ccv_nnc_tensor_free(gout);
2069
0
  ccv_nnc_tensor_free(gq);
2070
0
  ccv_nnc_tensor_free(expected);
2071
0
  ccv_nnc_tensor_free(q);
2072
0
  ccv_nnc_tensor_free(source);
2073
0
}
2074
2075
TEST_CASE("mps dequantize row-wise 8i bfloat precision")
2076
1
{
2077
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2078
0
  const int rows = 257;
2079
0
  const int cols = 130;
2080
0
  float* const values = ccmalloc(sizeof(float) * rows * cols);
2081
0
  int i;
2082
0
  for (i = 0; i < rows * cols; i++)
2083
0
    values[i] = ((i * 29) % 97 - 48) / 64.0f;
2084
0
  ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2085
0
  ccv_float_to_bfloat(values, (uint16_t*)source->data.f16, rows * cols);
2086
0
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, rows, cols)), 0);
2087
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2088
0
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2089
0
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2090
0
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, rows * cols);
2091
0
  ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16BF, rows, cols)), 0);
2092
0
  ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, rows, cols), 0);
2093
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2094
0
  ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_16BF, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, rows * cols);
2095
0
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2096
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2097
0
  float* const expected_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2098
0
  float* const actual_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2099
0
  ccv_bfloat_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
2100
0
  ccv_bfloat_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
2101
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 5e-3, "GPU row-wise 8i bf16 dequantize should match CPU dequantize");
2102
0
  ccfree(actual_f32);
2103
0
  ccfree(expected_f32);
2104
0
  ccv_nnc_tensor_free(actual);
2105
0
  ccv_nnc_tensor_free(gout);
2106
0
  ccv_nnc_tensor_free(gq);
2107
0
  ccv_nnc_tensor_free(expected);
2108
0
  ccv_nnc_tensor_free(q);
2109
0
  ccv_nnc_tensor_free(source);
2110
0
  ccfree(values);
2111
0
}
2112
2113
TEST_CASE("mps dequantize row-wise 8i bfloat precision large shapes")
2114
1
{
2115
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2116
0
  static const int shapes[][2] = {
2117
0
    {3840, 3840},
2118
0
    {10240, 3840},
2119
0
    {3840, 10240},
2120
0
  };
2121
0
  int s;
2122
0
  for (s = 0; s < (int)(sizeof(shapes) / sizeof(shapes[0])); s++)
2123
0
  {
2124
0
    const int rows = shapes[s][0];
2125
0
    const int cols = shapes[s][1];
2126
0
    float* const values = ccmalloc(sizeof(float) * (size_t)rows * cols);
2127
0
    int i;
2128
0
    for (i = 0; i < rows * cols; i++)
2129
0
      values[i] = ((i * 29) % 97 - 48) / 64.0f;
2130
0
    ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2131
0
    ccv_float_to_bfloat(values, (uint16_t*)source->data.f16, rows * cols);
2132
0
    ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, rows, cols)), 0);
2133
0
    const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, (size_t)rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2134
0
    REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2135
0
    ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2136
0
    ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, (size_t)rows * cols);
2137
0
    ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16BF, rows, cols)), 0);
2138
0
    ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, rows, cols), 0);
2139
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2140
0
    ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_16BF, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, (size_t)rows * cols);
2141
0
    ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2142
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2143
0
    float* const expected_f32 = (float*)ccmalloc(sizeof(float) * (size_t)rows * cols);
2144
0
    float* const actual_f32 = (float*)ccmalloc(sizeof(float) * (size_t)rows * cols);
2145
0
    ccv_bfloat_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
2146
0
    ccv_bfloat_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
2147
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 5e-3, "GPU row-wise 8i bf16 dequantize should match CPU dequantize on large shape");
2148
0
    ccfree(actual_f32);
2149
0
    ccfree(expected_f32);
2150
0
    ccv_nnc_tensor_free(actual);
2151
0
    ccv_nnc_tensor_free(gout);
2152
0
    ccv_nnc_tensor_free(gq);
2153
0
    ccv_nnc_tensor_free(expected);
2154
0
    ccv_nnc_tensor_free(q);
2155
0
    ccv_nnc_tensor_free(source);
2156
0
    ccfree(values);
2157
0
  }
2158
0
}
2159
2160
TEST_CASE("mps forward gemm no bias")
2161
1
{
2162
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2163
0
  dsfmt_t dsfmt;
2164
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2165
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2166
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2167
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2168
2169
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2170
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2171
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2172
0
  int i;
2173
0
  for (i = 0; i < 64 * 128; i++)
2174
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2175
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2176
0
  for (i = 0; i < 10 * 128; i++)
2177
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2178
0
  for (i = 0; i < 128; i++)
2179
0
    ha->data.f32[i] = ha1->data.f32[i];
2180
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(a, w), 0);
2181
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2182
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2183
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2184
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2185
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2186
0
  for (i = 0; i < 64; i++)
2187
0
    tb1->data.f32[i] = tb->data.f32[i];
2188
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-6, "GPU computed output should be numerically close to CPU computed ones");
2189
0
  ccv_nnc_tensor_free(a);
2190
0
  ccv_nnc_tensor_free(w);
2191
0
  ccv_nnc_tensor_free(b);
2192
0
  ccv_nnc_tensor_free(tb);
2193
0
  ccv_nnc_tensor_free(ha);
2194
0
  ccv_nnc_tensor_free(ha1);
2195
0
  ccv_nnc_tensor_free(tb1);
2196
0
  ccv_nnc_tensor_free(hw);
2197
0
  ccv_nnc_tensor_free(hb);
2198
0
}
2199
2200
TEST_CASE("mps forward gemm no bias in half precision")
2201
1
{
2202
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2203
0
  dsfmt_t dsfmt;
2204
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2205
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
2206
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2207
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
2208
2209
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2210
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2211
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2212
0
  int i;
2213
0
  for (i = 0; i < 64 * 128; i++)
2214
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2215
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2216
0
  for (i = 0; i < 10 * 128; i++)
2217
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2218
0
  for (i = 0; i < 128; i++)
2219
0
    ha->data.f32[i] = ha1->data.f32[i];
2220
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
2221
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2222
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2223
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2224
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2225
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2226
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
2227
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2228
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2229
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2230
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2231
0
  ccv_nnc_tensor_free(a);
2232
0
  ccv_nnc_tensor_free(w);
2233
0
  ccv_nnc_tensor_free(b);
2234
0
  ccv_nnc_tensor_free(tb);
2235
0
  ccv_nnc_tensor_free(ha);
2236
0
  ccv_nnc_tensor_free(ha1);
2237
0
  ccv_nnc_tensor_free(tb1);
2238
0
  ccv_nnc_tensor_free(hw);
2239
0
  ccv_nnc_tensor_free(hb);
2240
0
  ccv_nnc_tensor_free(ha2);
2241
0
  ccv_nnc_tensor_free(hw2);
2242
0
}
2243
2244
// Runs a bias-free GEMM on bfloat16 GPU tensors and checks it against a float32 GEMM on CPU tensors.
// The GPU input `a` has 10 rows while the CPU reference `ha` holds only the first row, so only the
// first 64 output values (one output row) are compared, with a tolerance of 1e-3.
TEST_CASE("mps forward gemm no bias in bfloat precision")
2245
1
{
2246
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2247
0
  dsfmt_t dsfmt;
2248
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2249
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
2250
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
2251
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);
2252
2253
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2254
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2255
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2256
0
  int i;
  // Weights are random values scaled down by 64 * 128.
2257
0
  for (i = 0; i < 64 * 128; i++)
2258
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2259
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2260
0
  for (i = 0; i < 10 * 128; i++)
2261
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // The CPU reference input is the first 128 values (first row) of the 10-row input.
2262
0
  for (i = 0; i < 128; i++)
2263
0
    ha->data.f32[i] = ha1->data.f32[i];
2264
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
2265
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
  // Convert the float32 inputs to bfloat16 on the CPU side, then copy them into the GPU tensors.
2266
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2267
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
  // Same GEMM command twice: first on the float32 CPU tensors (reference), then on the bfloat16 GPU tensors.
2268
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2269
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  // Bring the GPU result back as bfloat16, then widen it to float32 for the comparison.
2270
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
2271
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2272
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2273
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
  // Only the first 64 values are checked, matching the single-row CPU reference in hb.
2274
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2275
0
  ccv_nnc_tensor_free(a);
2276
0
  ccv_nnc_tensor_free(w);
2277
0
  ccv_nnc_tensor_free(b);
2278
0
  ccv_nnc_tensor_free(tb);
2279
0
  ccv_nnc_tensor_free(ha);
2280
0
  ccv_nnc_tensor_free(ha1);
2281
0
  ccv_nnc_tensor_free(tb1);
2282
0
  ccv_nnc_tensor_free(hw);
2283
0
  ccv_nnc_tensor_free(hb);
2284
0
  ccv_nnc_tensor_free(ha2);
2285
0
  ccv_nnc_tensor_free(hw2);
2286
0
}
2287
2288
// Matrix-vector product with the vector as the first operand: a (1 x 128) times the transpose of
// w (64 x 128), giving b (1 x 64). The GPU tensors are half precision (16F); the reference uses
// float32 CPU tensors. All 64 outputs are compared with a tolerance of 1e-3.
TEST_CASE("mps forward gemv in half precision no bias, variant 1")
2289
1
{
2290
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2291
0
  dsfmt_t dsfmt;
2292
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2293
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
2294
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2295
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
2296
2297
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2298
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2299
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2300
0
  int i;
  // Weights are random values scaled down by 64 * 128.
2301
0
  for (i = 0; i < 64 * 128; i++)
2302
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2303
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2304
0
  for (i = 0; i < 128; i++)
2305
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // ha (CPU reference input) receives the same 128 values as ha1 (source for the GPU input).
2306
0
  for (i = 0; i < 128; i++)
2307
0
    ha->data.f32[i] = ha1->data.f32[i];
2308
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
2309
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
  // Convert the float32 inputs to half precision on the CPU side, then copy them into the GPU tensors.
2310
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2311
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
  // Same GEMM command twice: first on the float32 CPU tensors (reference), then on the half-precision GPU tensors.
2312
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2313
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  // Bring the GPU result back as half precision, then widen it to float32 for the comparison.
2314
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
2315
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2316
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2317
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2318
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2319
0
  ccv_nnc_tensor_free(a);
2320
0
  ccv_nnc_tensor_free(w);
2321
0
  ccv_nnc_tensor_free(b);
2322
0
  ccv_nnc_tensor_free(tb);
2323
0
  ccv_nnc_tensor_free(ha);
2324
0
  ccv_nnc_tensor_free(ha1);
2325
0
  ccv_nnc_tensor_free(tb1);
2326
0
  ccv_nnc_tensor_free(hw);
2327
0
  ccv_nnc_tensor_free(hb);
2328
0
  ccv_nnc_tensor_free(ha2);
2329
0
  ccv_nnc_tensor_free(hw2);
2330
0
}
2331
2332
// Matrix-vector product with the vector as the second operand: w (64 x 128) times a (128 x 1),
// giving b (64 x 1), with no transpose on either input. The GPU tensors are half precision (16F);
// the reference uses float32 CPU tensors. All 64 outputs are compared with a tolerance of 1e-3.
TEST_CASE("mps forward gemv in half precision no bias, variant 2")
2333
1
{
2334
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2335
0
  dsfmt_t dsfmt;
2336
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2337
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2338
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 128, 1), 0);
2339
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 1), 0);
2340
2341
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2342
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
2343
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
2344
0
  int i;
  // Weights are random values scaled down by 64 * 128.
2345
0
  for (i = 0; i < 64 * 128; i++)
2346
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2347
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
2348
0
  for (i = 0; i < 128; i++)
2349
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // ha (CPU reference input) receives the same 128 values as ha1 (source for the GPU input).
2350
0
  for (i = 0; i < 128; i++)
2351
0
    ha->data.f32[i] = ha1->data.f32[i];
2352
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2353
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128, 1), 0);
  // Convert the float32 inputs to half precision on the CPU side, then copy them into the GPU tensors.
2354
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2355
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
  // Note the operand order: the matrix comes first and the column vector second.
2356
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(hw, ha), TENSOR_LIST(hb), 0);
2357
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(w, a), TENSOR_LIST(b), 0);
  // Bring the GPU result back as half precision, then widen it to float32 for the comparison.
2358
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 1), 0);
2359
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2360
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
2361
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2362
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2363
0
  ccv_nnc_tensor_free(a);
2364
0
  ccv_nnc_tensor_free(w);
2365
0
  ccv_nnc_tensor_free(b);
2366
0
  ccv_nnc_tensor_free(tb);
2367
0
  ccv_nnc_tensor_free(ha);
2368
0
  ccv_nnc_tensor_free(ha1);
2369
0
  ccv_nnc_tensor_free(tb1);
2370
0
  ccv_nnc_tensor_free(hw);
2371
0
  ccv_nnc_tensor_free(hb);
2372
0
  ccv_nnc_tensor_free(ha2);
2373
0
  ccv_nnc_tensor_free(hw2);
2374
0
}
2375
2376
// Checks that a batched GEMM fed with permuted tensor views gives the same result as one fed with
// explicitly transposed copies. a (10, 2, 128) and w (64, 2, 128) are transposed on dims (0, 1) into
// at and wt, and are also wrapped in views av and wv with the same (2, N, 128) shape. Both GEMM
// results are copied to CPU tensors and compared over 2 * 10 * 64 values with a tolerance of 1e-5.
TEST_CASE("mps handle permute")
2377
1
{
2378
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2379
0
  dsfmt_t dsfmt;
2380
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2381
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 2, 128), 0);
2382
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 2, 128), 0);
2383
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 2, 128), 0);
2384
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 2, 128), 0);
2385
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
2386
2387
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), 0);
2388
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), 0);
2389
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
2390
0
  int i;
2391
0
  for (i = 0; i < 2 * 64 * 128; i++)
2392
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2393
0
  for (i = 0; i < 2 * 10 * 128; i++)
2394
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2395
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  // Reference path: materialize the transposed tensors, then run the GEMM on them.
2396
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(at), 0);
2397
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wt), 0);
2398
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
  // Permuted path: views over the original a and w with the transposed shape.
  // NOTE(review): DIM_ALLOC(128, 2 * 128, 1) looks like the per-dimension strides that swap the
  // first two dimensions of the (N, 2, 128) layout; confirm against ccv_nnc_tensor_view_new.
2399
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
2400
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
2401
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
  // Copy both GPU results into CPU tensors and compare them element by element.
2402
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
2403
0
  ccv_nnc_tensor_t* hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
2404
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, bt), TENSOR_LIST(hb, hbt), 0);
2405
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, hbt->data.f32, 2 * 10 * 64, 1e-5, "permute computed output should be numerically close to non-permute computed ones");
2406
0
  ccv_nnc_tensor_free(ha);
2407
0
  ccv_nnc_tensor_free(hw);
2408
0
  ccv_nnc_tensor_free(a);
2409
0
  ccv_nnc_tensor_free(w);
2410
0
  ccv_nnc_tensor_free(b);
2411
0
  ccv_nnc_tensor_view_free(av);
2412
0
  ccv_nnc_tensor_view_free(wv);
2413
0
  ccv_nnc_tensor_free(at);
2414
0
  ccv_nnc_tensor_free(wt);
2415
0
  ccv_nnc_tensor_free(bt);
2416
0
  ccv_nnc_tensor_free(hb);
2417
0
  ccv_nnc_tensor_free(hbt);
2418
0
}
2419
2420
// Forward GEMM with two leading batch dimensions (2, 4). The GPU path runs on permuted views of
// a (2, 10, 4, 128) and w (2, 64, 4, 128); the reference path runs on CPU tensors at and wt that
// hold explicitly transposed copies. The GPU output is copied to hb and compared with bt.
TEST_CASE("generalized batched gemm with batch (2, 4) compare mps")
2421
1
{
2422
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2423
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2424
0
  dsfmt_t dsfmt;
2425
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2426
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2427
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2428
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2429
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2430
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2431
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2432
2433
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2434
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2435
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2436
0
  int i;
2437
0
  for (i = 0; i < 8 * 64 * 128; i++)
2438
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2439
0
  for (i = 0; i < 8 * 10 * 128; i++)
2440
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference inputs: transposed copies on the CPU with dims 1 and 2 swapped.
2441
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2442
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
2443
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  // GPU inputs: views over the untransposed a and w that present the (2, 4, N, 128) shape.
  // NOTE(review): the DIM_ALLOC(...) arguments look like per-dimension strides; confirm against ccv_nnc_tensor_view_new.
2444
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2445
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2446
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
2447
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
2448
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2449
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
2450
0
  ccv_nnc_tensor_free(ha);
2451
0
  ccv_nnc_tensor_free(hw);
2452
0
  ccv_nnc_tensor_free(hb);
2453
0
  ccv_nnc_tensor_free(a);
2454
0
  ccv_nnc_tensor_free(w);
2455
0
  ccv_nnc_tensor_free(b);
2456
0
  ccv_nnc_tensor_view_free(av);
2457
0
  ccv_nnc_tensor_view_free(wv);
2458
0
  ccv_nnc_tensor_free(at);
2459
0
  ccv_nnc_tensor_free(wt);
2460
0
  ccv_nnc_tensor_free(bt);
2461
0
}
2462
2463
// Forward GEMM with two leading batch dimensions (2, 4) on the input and a single 2-D weight
// (64, 128) shared by the whole batch. The GPU path uses a permuted view of a (2, 10, 4, 128);
// the reference path uses the CPU tensor at, an explicitly transposed copy, with the CPU weight hw.
// The GPU output is copied to hb and compared with bt.
TEST_CASE("generalized batched gemm with batch (2, 4) and broadcast compare mps")
2464
1
{
2465
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2466
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2467
0
  dsfmt_t dsfmt;
2468
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2469
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2470
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2471
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2472
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2473
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2474
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2475
2476
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2477
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2478
0
  int i;
2479
0
  for (i = 0; i < 64 * 128; i++)
2480
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2481
0
  for (i = 0; i < 8 * 10 * 128; i++)
2482
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference input: transposed copy on the CPU with dims 1 and 2 swapped.
2483
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2484
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  // GPU input: a view over the untransposed a that presents the (2, 4, 10, 128) shape.
  // NOTE(review): the DIM_ALLOC(...) argument looks like per-dimension strides; confirm against ccv_nnc_tensor_view_new.
2485
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // The weight is 2-D, so its transpose is given on dims (0, 1) rather than (2, 3).
2486
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w), TENSOR_LIST(b), 0);
2487
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw), TENSOR_LIST(bt), 0);
2488
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2489
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
2490
0
  ccv_nnc_tensor_free(ha);
2491
0
  ccv_nnc_tensor_free(hw);
2492
0
  ccv_nnc_tensor_free(hb);
2493
0
  ccv_nnc_tensor_free(a);
2494
0
  ccv_nnc_tensor_free(w);
2495
0
  ccv_nnc_tensor_free(b);
2496
0
  ccv_nnc_tensor_view_free(av);
2497
0
  ccv_nnc_tensor_free(at);
2498
0
  ccv_nnc_tensor_free(bt);
2499
0
}
2500
2501
// Forward GEMM with two leading batch dimensions (2, 4) and a 64-element bias passed as the third
// input. The GPU path runs on permuted views of a and w plus the GPU bias; the reference path runs
// on CPU tensors at and wt (explicitly transposed copies) plus hbias. The GPU output is copied to
// hb and compared with bt.
TEST_CASE("generalized batched gemm with batch (2, 4) with bias compare mps")
2502
1
{
2503
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2504
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2505
0
  dsfmt_t dsfmt;
2506
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2507
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2508
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2509
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2510
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2511
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2512
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2513
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2514
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2515
2516
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2517
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2518
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2519
0
  int i;
2520
0
  for (i = 0; i < 8 * 64 * 128; i++)
2521
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2522
0
  for (i = 0; i < 64; i++)
2523
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
2524
0
  for (i = 0; i < 8 * 10 * 128; i++)
2525
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference inputs: transposed copies on the CPU with dims 1 and 2 swapped.
2526
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2527
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
2528
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  // GPU inputs: views over the untransposed a and w that present the (2, 4, N, 128) shape.
  // NOTE(review): the DIM_ALLOC(...) arguments look like per-dimension strides; confirm against ccv_nnc_tensor_view_new.
2529
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2530
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2531
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv, bias), TENSOR_LIST(b), 0);
2532
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt, hbias), TENSOR_LIST(bt), 0);
2533
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2534
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
2535
0
  ccv_nnc_tensor_free(ha);
2536
0
  ccv_nnc_tensor_free(hw);
2537
0
  ccv_nnc_tensor_free(hbias);
2538
0
  ccv_nnc_tensor_free(hb);
2539
0
  ccv_nnc_tensor_free(a);
2540
0
  ccv_nnc_tensor_free(w);
2541
0
  ccv_nnc_tensor_free(bias);
2542
0
  ccv_nnc_tensor_free(b);
2543
0
  ccv_nnc_tensor_view_free(av);
2544
0
  ccv_nnc_tensor_view_free(wv);
2545
0
  ccv_nnc_tensor_free(at);
2546
0
  ccv_nnc_tensor_free(wt);
2547
0
  ccv_nnc_tensor_free(bt);
2548
0
}
2549
2550
// Forward GEMM with two leading batch dimensions (2, 4) on the input, a single 2-D weight (64, 128)
// shared by the whole batch, and a 64-element bias passed as the third input. The GPU path uses a
// permuted view of a; the reference path uses the CPU tensor at (an explicitly transposed copy)
// with hw and hbias. The GPU output is copied to hb and compared with bt.
TEST_CASE("generalized batched gemm with batch (2, 4) with bias and broadcast compare mps")
2551
1
{
2552
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2553
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2554
0
  dsfmt_t dsfmt;
2555
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2556
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2557
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2558
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2559
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2560
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2561
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2562
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2563
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2564
2565
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2566
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2567
0
  int i;
2568
0
  for (i = 0; i < 64 * 128; i++)
2569
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2570
0
  for (i = 0; i < 64; i++)
2571
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 64;
2572
0
  for (i = 0; i < 8 * 10 * 128; i++)
2573
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference input: transposed copy on the CPU with dims 1 and 2 swapped.
2574
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2575
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  // GPU input: a view over the untransposed a that presents the (2, 4, 10, 128) shape.
  // NOTE(review): the DIM_ALLOC(...) argument looks like per-dimension strides; confirm against ccv_nnc_tensor_view_new.
2576
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // The weight is 2-D, so its transpose is given on dims (0, 1) rather than (2, 3).
2577
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, w, bias), TENSOR_LIST(b), 0);
2578
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, hw, hbias), TENSOR_LIST(bt), 0);
2579
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2580
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
2581
0
  ccv_nnc_tensor_free(ha);
2582
0
  ccv_nnc_tensor_free(hw);
2583
0
  ccv_nnc_tensor_free(hbias);
2584
0
  ccv_nnc_tensor_free(hb);
2585
0
  ccv_nnc_tensor_free(a);
2586
0
  ccv_nnc_tensor_free(w);
2587
0
  ccv_nnc_tensor_free(bias);
2588
0
  ccv_nnc_tensor_free(b);
2589
0
  ccv_nnc_tensor_view_free(av);
2590
0
  ccv_nnc_tensor_free(at);
2591
0
  ccv_nnc_tensor_free(bt);
2592
0
}
2593
2594
// Backward GEMM with two leading batch dimensions (2, 4). The GPU path takes b plus permuted views
// of a and w as inputs and writes into permuted views of da and dw. The reference path runs on CPU
// tensors hb, at and wt, writes dat and dwt, and transposes those back into tda and tdw. The GPU
// outputs are copied to hda and hdw and compared with tda and tdw.
TEST_CASE("generalized batched backward gemm with batch (2, 4) compare mps")
2595
1
{
2596
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2597
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2598
0
  dsfmt_t dsfmt;
2599
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2600
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2601
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2602
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2603
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2604
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2605
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2606
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2607
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2608
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2609
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2610
2611
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2612
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2613
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2614
0
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2615
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2616
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2617
0
  int i;
2618
0
  for (i = 0; i < 8 * 64 * 128; i++)
2619
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2620
0
  for (i = 0; i < 8 * 10 * 128; i++)
2621
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // hb is the first input of the backward command below, so it is filled with random values too.
2622
0
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
2623
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference inputs: transposed copies on the CPU with dims 1 and 2 swapped.
2624
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2625
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
2626
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // GPU inputs and outputs: views over a, w, da and dw that present the (2, 4, N, 128) shape.
  // NOTE(review): the DIM_ALLOC(...) arguments look like per-dimension strides; confirm against ccv_nnc_tensor_view_new.
2627
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2628
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2629
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2630
0
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2631
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv), 0);
2632
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt), 0);
  // Transpose the reference outputs back to the original layout so they line up with hda and hdw.
2633
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
2634
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
2635
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
2636
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
2637
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
2638
0
  ccv_nnc_tensor_free(ha);
2639
0
  ccv_nnc_tensor_free(hw);
2640
0
  ccv_nnc_tensor_free(hda);
2641
0
  ccv_nnc_tensor_free(hdw);
2642
0
  ccv_nnc_tensor_free(hb);
2643
0
  ccv_nnc_tensor_free(a);
2644
0
  ccv_nnc_tensor_free(w);
2645
0
  ccv_nnc_tensor_free(da);
2646
0
  ccv_nnc_tensor_free(dw);
2647
0
  ccv_nnc_tensor_free(b);
2648
0
  ccv_nnc_tensor_view_free(av);
2649
0
  ccv_nnc_tensor_view_free(wv);
2650
0
  ccv_nnc_tensor_view_free(dav);
2651
0
  ccv_nnc_tensor_view_free(dwv);
2652
0
  ccv_nnc_tensor_free(at);
2653
0
  ccv_nnc_tensor_free(wt);
2654
0
  ccv_nnc_tensor_free(dat);
2655
0
  ccv_nnc_tensor_free(tda);
2656
0
  ccv_nnc_tensor_free(dwt);
2657
0
  ccv_nnc_tensor_free(tdw);
2658
0
}
2659
2660
// Backward GEMM with two leading batch dimensions (2, 4) on the input and a single 2-D weight
// (64, 128) shared by the whole batch. The GPU path takes b, a permuted view of a, and w, and
// writes into a permuted view of da and into dw. The reference path runs on CPU tensors hb, at and
// hw, writing dat (transposed back into tda) and tdw. The GPU outputs are copied to hda and hdw and
// compared with tda and tdw.
TEST_CASE("generalized batched backward gemm with batch (2, 4) and broadcast compare mps")
2661
1
{
2662
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
2663
  // This is a particular batched gemm which treats every dimension other than the last two as a batch dimension.
2664
0
  dsfmt_t dsfmt;
2665
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2666
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2667
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2668
0
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2669
0
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2670
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2671
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2672
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2673
0
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2674
0
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2675
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2676
2677
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2678
0
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2679
0
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2680
0
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2681
0
  int i;
2682
0
  for (i = 0; i < 64 * 128; i++)
2683
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2684
0
  for (i = 0; i < 8 * 10 * 128; i++)
2685
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // hb is the first input of the backward command below, so it is filled with random values too.
2686
0
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
2687
0
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference input: transposed copy on the CPU with dims 1 and 2 swapped.
2688
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2689
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // GPU input and output: views over a and da that present the (2, 4, 10, 128) shape.
  // NOTE(review): the DIM_ALLOC(...) arguments look like per-dimension strides; confirm against ccv_nnc_tensor_view_new.
2690
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2691
0
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // The weight and its gradient are 2-D, so the transpose is given on dims (0, 1).
2692
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw), 0);
2693
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw), TENSOR_LIST(dat, tdw), 0);
  // Only the input gradient needs transposing back; tdw is already in the (64, 128) layout of hdw.
2694
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
2695
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
2696
0
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
2697
0
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
2698
0
  ccv_nnc_tensor_free(ha);
2699
0
  ccv_nnc_tensor_free(hw);
2700
0
  ccv_nnc_tensor_free(hda);
2701
0
  ccv_nnc_tensor_free(hdw);
2702
0
  ccv_nnc_tensor_free(hb);
2703
0
  ccv_nnc_tensor_free(a);
2704
0
  ccv_nnc_tensor_free(w);
2705
0
  ccv_nnc_tensor_free(da);
2706
0
  ccv_nnc_tensor_free(dw);
2707
0
  ccv_nnc_tensor_free(b);
2708
0
  ccv_nnc_tensor_view_free(av);
2709
0
  ccv_nnc_tensor_view_free(dav);
2710
0
  ccv_nnc_tensor_free(at);
2711
0
  ccv_nnc_tensor_free(dat);
2712
0
  ccv_nnc_tensor_free(tda);
2713
0
  ccv_nnc_tensor_free(tdw);
2714
0
}
2715
2716
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // This is a particular batched gemm that treats every dimension other than the last two as batching.
  // The device path runs GEMM backward on strided views that swap dims 1 and 2 of
  // (2, 10, 4, 128) / (2, 64, 4, 128) tensors; the host path runs it on explicitly
  // transposed copies. Both results are then compared in the un-transposed layout.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side inputs and landing buffers for the device results.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // Host reference tensors: transposed inputs, transposed gradients, and the
  // gradients transposed back for comparison.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  // Fill order (hw, ha, hb) determines which random draws each tensor receives.
  float* const hw_data = hw->data.f32;
  for (int i = 0; i < 8 * 64 * 128; i++)
    hw_data[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  float* const ha_data = ha->data.f32;
  for (int i = 0; i < 8 * 10 * 128; i++)
    ha_data[i] = dsfmt_genrand_open_close(&dsfmt);
  float* const hb_data = hb->data.f32;
  for (int i = 0; i < 2 * 4 * 10 * 64; i++)
    hb_data[i] = dsfmt_genrand_open_close(&dsfmt);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Views over the device tensors with the strides of dims 1 and 2 exchanged, so
  // no data is moved to obtain the (2, 4, rows, 128) layout.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  // Device run on the views, then host reference run on the transposed copies.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt, tdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
  // Release everything in the same order as before: plain tensors, views, references.
  ccv_nnc_tensor_t* const plain_tensors[] = { ha, hw, hda, hdw, hdbias, hb, a, w, da, dw, dbias, b };
  for (int t = 0; t < (int)(sizeof(plain_tensors) / sizeof(plain_tensors[0])); t++)
    ccv_nnc_tensor_free(plain_tensors[t]);
  ccv_nnc_tensor_view_t* const views[] = { av, wv, dav, dwv };
  for (int t = 0; t < (int)(sizeof(views) / sizeof(views[0])); t++)
    ccv_nnc_tensor_view_free(views[t]);
  ccv_nnc_tensor_t* const reference_tensors[] = { at, wt, dat, dwt, tda, tdw, tdbias };
  for (int t = 0; t < (int)(sizeof(reference_tensors) / sizeof(reference_tensors[0])); t++)
    ccv_nnc_tensor_free(reference_tensors[t]);
}
2788
2789
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias and broadcast compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // This is a particular batched gemm that treats every dimension other than the last two as batching.
  // Here the weight is a plain (64, 128) matrix shared by every batch entry, so
  // only the activation side needs the permuted view / transposed copy.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side inputs and landing buffers for the device results.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // Host reference tensors: transposed activation, its transposed gradient, and
  // the reference gradients in the un-transposed layout.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  // Fill order (hw, ha, hb) determines which random draws each tensor receives.
  int i;
  for (i = 0; i < 64 * 128; i++)
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (i = 0; i < 8 * 10 * 128; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Views over the device tensors with the strides of dims 1 and 2 exchanged, so
  // no data is moved to obtain the (2, 4, 10, 128) layout.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // NOTE(review): the 4th input (dbias / hdbias) sits in the bias slot and is not
  // initialized here; presumably the backward pass does not read it -- confirm.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w, dbias), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw, hdbias), TENSOR_LIST(dat, tdw, tdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(dat);
  // tda was previously leaked: it is allocated above but had no matching free.
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
2850
2851
TEST_CASE("ewdiv forward with reciprocal")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  // The numerator slot is left as 0 (no tensor); this is the "reciprocal" case
  // the test is named after.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(0, ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
2875
2876
TEST_CASE("ewdiv forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
  // a, b, c live on the device; ha / hb are the host inputs, hc receives the
  // device result and ct holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // ha is filled before hb so each tensor receives the same random draws as before.
  float* const numerator = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    numerator[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  float* const denominator = hb->data.f32;
  for (int i = 0; i < 1000; i++)
    denominator[i] = dsfmt_genrand_open_close(&dsfmt) * 0.01;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hb), TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, c, ha, hb, hc, ct };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
2906
2907
TEST_CASE("exp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
2931
2932
TEST_CASE("ewpow forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWPOW_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / c live on the device; ha is the host input, hc receives the device
  // result and ct holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 2 + 0.1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, c, ha, hc, ct };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
2956
2957
TEST_CASE("ewsin forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSIN_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  // Compared element-wise with a 1e-3 tolerance rather than via REQUIRE_TENSOR_EQ.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
2981
2982
TEST_CASE("ewcos forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWCOS_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  // Compared element-wise with a 1e-3 tolerance rather than via REQUIRE_TENSOR_EQ.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3006
3007
TEST_CASE("ewlog forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // The + 0.0001 offset is added to every scaled random draw.
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3031
3032
TEST_CASE("ewsqrt forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // The + 0.0001 offset is added to every scaled random draw.
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 + 0.0001;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3056
3057
TEST_CASE("ewabs forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Inputs are spread on both sides of zero so both signs are exercised.
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 5 + 0.0001;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3081
3082
TEST_CASE("clamp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Scaled by 10 and shifted by -1 so values fall on both sides of the (0, 6) bounds.
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3106
3107
TEST_CASE("clamp forward with only max")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  // NAN in the min slot: per the test name, only the max bound (6) is in effect.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3131
3132
TEST_CASE("clamp forward with only min")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  // a / b live on the device; ha is the host input, hb receives the device
  // result and bt holds the host reference result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  float* const input = ha->data.f32;
  for (int i = 0; i < 1000; i++)
    input[i] = dsfmt_genrand_open_close(&dsfmt) * 10 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(a), 0);
  // NAN in the max slot: per the test name, only the min bound (0) is in effect.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_t* const tensors[] = { a, b, ha, hb, bt };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3156
3157
TEST_CASE("compare set with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // a is the device tensor, ha its host copy, ga the host tensor filled directly.
  // Note that ha is declared NHWC while ga is declared NCHW, with the same dims.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 11, 10, 9, 8), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 11, 10, 9, 8), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 11, 10, 9, 8), 0);
  // Fill both with the constant 10: once on the device, once on the host.
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(a), 0);
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(ga), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(ha), 0);
  REQUIRE_TENSOR_EQ(ha, ga, "format transform result should be the same");
  ccv_nnc_tensor_t* const tensors[] = { a, ha, ga };
  for (int t = 0; t < (int)(sizeof(tensors) / sizeof(tensors[0])); t++)
    ccv_nnc_tensor_free(tensors[t]);
}
3171
3172
TEST_CASE("scaled dot product attention with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  // The trial counts are macros rather than variables to bypass the error:
  // variable-sized object may not be initialized
#define num_long_trials 6
#define num_short_trials 2
#define num_trials (num_long_trials + num_short_trials)

  // One column per trial: batch, query rows, key/value rows, query heads,
  // key/value heads, head dimension, and whether the trial is causal.
  static const int B_candidates[num_trials] =         {  32, 1, 1, 1,  32,   3, 2, 1 };
  static const int R_candidates[num_trials] =         { 128, 4128, 4098, 4162, 128,  61, 6, 2 };
  static const int C_candidates[num_trials] =         { 128, 4128, 4098, 4162, 128,  49, 2, 1 };
  static const int Hq_candidates[num_trials] =        {   8, 32, 32, 32,  32,  13, 3, 1 };
  static const int Hk_candidates[num_trials] =        {   8, 8, 8, 8,   8,  13, 3, 1 };
  static const int D_candidates[num_trials] =         {  64, 32, 32, 32, 128, 191, 4, 8 };
  static const int is_causal_candidates[num_trials] = {   0, 0, 0, 0,   1,   0, 1, 0 };

  for (int trial = 0; trial < num_trials; ++trial) {
    int B = B_candidates[trial];
    int R = R_candidates[trial];
    int C = C_candidates[trial];
    int Hq = Hq_candidates[trial];
    int Hk = Hk_candidates[trial];
    int D = D_candidates[trial];
    int is_causal = is_causal_candidates[trial];
    float scale = 1.0 / sqrt((float)D);

    // Backend availability was already checked once at the top of the test, so
    // it is not re-checked per trial.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);

    // Deterministic ramps in [0, 1) so the CPU and device runs see identical inputs.
    for (int i = 0; i < B * R * Hq * D; ++i) {
      q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
    }
    for (int i = 0; i < B * C * Hk * D; ++i) {
      k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
    }
    for (int i = 0; i < B * C * Hk * D; ++i) {
      v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
    }

    // CPU reference: the causal flag is passed straight to the command.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);

    // Why is there 000 at the beginning of the argument list for GPU_TENSOR_NHWC?
    // NOTE(review): presumably it is the device id token -- confirm against the macro definition.
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);

    if (is_causal)
    {
      // On the device the causal case is expressed as an explicit additive mask
      // with the command's own causal flag left at 0.
      ccv_nnc_tensor_t* const causal_mask = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, R, C), 0);
      ccv_nnc_tensor_t* const gpu_causal_mask = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, R, C), 0);
      for (int i = 0; i < R; i++)
        for (int j = 0; j < C; j++)
          causal_mask->data.f32[i * C + j] = 0;
      for (int i = 0; i < R - 1; i++)
      {
        // First masked column of row i. It is negative when R > C + 1; clamp it
        // to 0 so the write never lands before the row (or before the buffer
        // for row 0). The in-bounds mask contents are unchanged by the clamp.
        const int first_masked = i - R + C + 1;
        for (int j = first_masked < 0 ? 0 : first_masked; j < C; j++)
          causal_mask->data.f32[i * C + j] = -FLT_MAX;
      }
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(causal_mask), TENSOR_LIST(gpu_causal_mask), 0);
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_causal_mask), TENSOR_LIST(gpu_o_tensor), 0);
      ccv_nnc_tensor_free(gpu_causal_mask);
      ccv_nnc_tensor_free(causal_mask);
    } else {
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
    }

    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");

    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
  }
#undef num_long_trials
#undef num_short_trials
#undef num_trials
}
3260
3261
// Compares the MPS scaled-dot-product-attention forward pass, executed with the
// CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I flags set, against a 32F reference computed on
// CPU tensors with default flags. Covers a single batch (B = 1), several head
// dimensions D, and three storage types (16F, 16BF, 32F), each with its own tolerance.
TEST_CASE("scaled dot product attention with quantized NA mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int B = 1;
  const int R = 128;
  const int C = 128;
  const int H = 24;
  const int Ds[] = { 64, 80, 128, 130, 160, 192, 224, 256 };
  // datatypes, tolerances and datatype_names are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
  const float tolerances[] = { 2e-2, 3e-2, 2e-2 };
  const char* datatype_names[] = { "16F", "16BF", "32F" };
  const int datatype_count = (int)(sizeof(datatypes) / sizeof(datatypes[0]));
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
  {
    const int D = Ds[d_idx];
    const float scale = 1.0 / sqrt((float)D);

    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    // 16-bit staging tensors; the 16BF pass below reuses them as raw 16-bit storage for bfloat values.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
    const int q_count = B * R * H * D;
    const int kv_count = B * C * H * D;
    // Fixed per-D seed keeps the inputs reproducible between runs.
    dsfmt_t dsfmt;
    dsfmt_init_gen_rand(&dsfmt, 11 + d_idx);
    for (int i = 0; i < q_count; ++i)
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
    for (int i = 0; i < kv_count; ++i)
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
    for (int i = 0; i < kv_count; ++i)
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;

    // Reference output, computed once per D on the 32F CPU tensors.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
    ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
    const float* const cpu_f32 = o_tensor->data.f32;

    for (int datatype_idx = 0; datatype_idx < datatype_count; ++datatype_idx)
    {
      const int datatype = datatypes[datatype_idx];
      ccv_nnc_tensor_t* q_input = q_tensor;
      ccv_nnc_tensor_t* k_input = k_tensor;
      ccv_nnc_tensor_t* v_input = v_tensor;
      ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
      if (datatype == CCV_16F)
      {
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
        q_input = q_tensor_f16;
        k_input = k_tensor_f16;
        v_input = v_tensor_f16;
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
      } else if (datatype == CCV_16BF) {
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
        q_input = q_tensor_f16;
        k_input = k_tensor_f16;
        v_input = v_tensor_f16;
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
      } else {
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      }
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
      ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
      // NOTE(review): these flags presumably select the quantized path named in the test title; confirm against the MPS backend.
      gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
      ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

      // Widen the GPU result to 32F so every storage type is compared the same way.
      float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
      if (datatype == CCV_16F)
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
      else if (datatype == CCV_16BF)
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
      else
        memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * q_count);
      float max_relative_diff = 0;
      int max_diff_idx = 0;
      for (int i = 0; i < q_count; ++i)
      {
        // The denominator is floored at 1, so values below 1 in magnitude are effectively compared absolutely.
        const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
        const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
        // A NaN or infinite output makes the ratio NaN, and NaN never compares greater than
        // the running maximum. Record it explicitly so the REQUIRE below fails instead of
        // silently skipping the element.
        if (isnan(relative_diff))
        {
          max_relative_diff = relative_diff, max_diff_idx = i;
          break;
        }
        if (relative_diff > max_relative_diff)
          max_relative_diff = relative_diff, max_diff_idx = i;
      }
      REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized attention result should match CPU reference for dtype=%s D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);

      ccfree(gpu_f32);
      ccv_nnc_tensor_free(gpu_o_tensor);
      ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
      ccv_nnc_tensor_free(gpu_q_tensor);
      ccv_nnc_tensor_free(gpu_k_tensor);
      ccv_nnc_tensor_free(gpu_v_tensor);
    }
    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(q_tensor_f16);
    ccv_nnc_tensor_free(k_tensor_f16);
    ccv_nnc_tensor_free(v_tensor_f16);
  }
}
3385
3386
// Batched variant (B = 3) of the quantized-flag attention forward check: the MPS
// result, executed with CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I set, is compared against
// a 32F reference computed on CPU tensors, for two head dimensions and three
// storage types (16F, 16BF, 32F).
TEST_CASE("scaled dot product attention with quantized NA mps batched")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int B = 3;
  const int R = 128;
  const int C = 128;
  const int H = 8;
  const int Ds[] = { 64, 128 };
  // datatypes, tolerances and datatype_names are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
  const float tolerances[] = { 2e-2, 3e-2, 2e-2 };
  const char* datatype_names[] = { "16F", "16BF", "32F" };
  const int datatype_count = (int)(sizeof(datatypes) / sizeof(datatypes[0]));
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
  {
    const int D = Ds[d_idx];
    const float scale = 1.0 / sqrt((float)D);

    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    // 16-bit staging tensors; the 16BF pass below reuses them as raw 16-bit storage for bfloat values.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
    const int q_count = B * R * H * D;
    const int kv_count = B * C * H * D;
    // Fixed per-D seed keeps the inputs reproducible between runs.
    dsfmt_t dsfmt;
    dsfmt_init_gen_rand(&dsfmt, 101 + d_idx);
    for (int i = 0; i < q_count; ++i)
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
    for (int i = 0; i < kv_count; ++i)
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
    for (int i = 0; i < kv_count; ++i)
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;

    // Reference output, computed once per D on the 32F CPU tensors.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
    ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
    const float* const cpu_f32 = o_tensor->data.f32;

    for (int datatype_idx = 0; datatype_idx < datatype_count; ++datatype_idx)
    {
      const int datatype = datatypes[datatype_idx];
      ccv_nnc_tensor_t* q_input = q_tensor;
      ccv_nnc_tensor_t* k_input = k_tensor;
      ccv_nnc_tensor_t* v_input = v_tensor;
      ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
      if (datatype == CCV_16F)
      {
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
        q_input = q_tensor_f16;
        k_input = k_tensor_f16;
        v_input = v_tensor_f16;
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
      } else if (datatype == CCV_16BF) {
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
        q_input = q_tensor_f16;
        k_input = k_tensor_f16;
        v_input = v_tensor_f16;
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
      } else {
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      }
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
      ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
      // NOTE(review): these flags presumably select the quantized path named in the test title; confirm against the MPS backend.
      gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
      ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

      // Widen the GPU result to 32F so every storage type is compared the same way.
      float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
      if (datatype == CCV_16F)
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
      else if (datatype == CCV_16BF)
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
      else
        memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * q_count);
      float max_relative_diff = 0;
      int max_diff_idx = 0;
      for (int i = 0; i < q_count; ++i)
      {
        // The denominator is floored at 1, so values below 1 in magnitude are effectively compared absolutely.
        const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
        const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
        // A NaN or infinite output makes the ratio NaN, and NaN never compares greater than
        // the running maximum. Record it explicitly so the REQUIRE below fails instead of
        // silently skipping the element.
        if (isnan(relative_diff))
        {
          max_relative_diff = relative_diff, max_diff_idx = i;
          break;
        }
        if (relative_diff > max_relative_diff)
          max_relative_diff = relative_diff, max_diff_idx = i;
      }
      REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized batched attention result should match CPU reference for dtype=%s D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);

      ccfree(gpu_f32);
      ccv_nnc_tensor_free(gpu_o_tensor);
      ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
      ccv_nnc_tensor_free(gpu_q_tensor);
      ccv_nnc_tensor_free(gpu_k_tensor);
      ccv_nnc_tensor_free(gpu_v_tensor);
    }
    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(q_tensor_f16);
    ccv_nnc_tensor_free(k_tensor_f16);
    ccv_nnc_tensor_free(v_tensor_f16);
  }
}
3510
3511
// Quantized-flag attention forward check for key/value sequence lengths C that are
// not multiples of 64 (130 and 224), crossed with several head dimensions D. The
// MPS result, executed with CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I set, is compared
// against a 32F reference computed on CPU tensors for 16F, 16BF and 32F storage.
TEST_CASE("scaled dot product attention with quantized NA mps for non-multiple-of-64 sequence")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int B = 1;
  const int R = 128;
  const int H = 24;
  const int Cs[] = { 130, 224 };
  const int Ds[] = { 128, 130, 224 };
  // datatypes, tolerances and datatype_names are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
  const float tolerances[] = { 4e-2, 5e-2, 4e-2 };
  const char* datatype_names[] = { "16F", "16BF", "32F" };
  const int datatype_count = (int)(sizeof(datatypes) / sizeof(datatypes[0]));
  for (int c_idx = 0; c_idx < (int)(sizeof(Cs) / sizeof(Cs[0])); ++c_idx)
  {
    const int C = Cs[c_idx];
    for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
    {
      const int D = Ds[d_idx];
      const float scale = 1.0 / sqrt((float)D);

      ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
      ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
      // 16-bit staging tensors; the 16BF pass below reuses them as raw 16-bit storage for bfloat values.
      ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
      ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
      ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
      const int q_count = B * R * H * D;
      const int kv_count = B * C * H * D;
      // Fixed seed per (C, D) combination keeps the inputs reproducible between runs.
      dsfmt_t dsfmt;
      dsfmt_init_gen_rand(&dsfmt, 211 + c_idx * 17 + d_idx);
      for (int i = 0; i < q_count; ++i)
        q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
      for (int i = 0; i < kv_count; ++i)
        k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
      for (int i = 0; i < kv_count; ++i)
        v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;

      // Reference output, computed once per (C, D) on the 32F CPU tensors.
      ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
      ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
      const float* const cpu_f32 = o_tensor->data.f32;

      for (int datatype_idx = 0; datatype_idx < datatype_count; ++datatype_idx)
      {
        const int datatype = datatypes[datatype_idx];
        ccv_nnc_tensor_t* q_input = q_tensor;
        ccv_nnc_tensor_t* k_input = k_tensor;
        ccv_nnc_tensor_t* v_input = v_tensor;
        ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
        ccv_nnc_tensor_t* gpu_q_tensor = 0;
        ccv_nnc_tensor_t* gpu_k_tensor = 0;
        ccv_nnc_tensor_t* gpu_v_tensor = 0;
        ccv_nnc_tensor_t* gpu_o_tensor = 0;
        if (datatype == CCV_16F)
        {
          ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
          ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
          ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
          q_input = q_tensor_f16;
          k_input = k_tensor_f16;
          v_input = v_tensor_f16;
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
        } else if (datatype == CCV_16BF) {
          ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
          ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
          ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
          q_input = q_tensor_f16;
          k_input = k_tensor_f16;
          v_input = v_tensor_f16;
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
        } else {
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
        }
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
        ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
        // NOTE(review): these flags presumably select the quantized path named in the test title; confirm against the MPS backend.
        gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
        ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

        // Widen the GPU result to 32F so every storage type is compared the same way.
        float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
        if (datatype == CCV_16F)
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
        else if (datatype == CCV_16BF)
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
        else
          memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * q_count);
        float max_relative_diff = 0;
        int max_diff_idx = 0;
        for (int i = 0; i < q_count; ++i)
        {
          // The denominator is floored at 1, so values below 1 in magnitude are effectively compared absolutely.
          const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
          const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
          // A NaN or infinite output makes the ratio NaN, and NaN never compares greater than
          // the running maximum. Record it explicitly so the REQUIRE below fails instead of
          // silently skipping the element.
          if (isnan(relative_diff))
          {
            max_relative_diff = relative_diff, max_diff_idx = i;
            break;
          }
          if (relative_diff > max_relative_diff)
            max_relative_diff = relative_diff, max_diff_idx = i;
        }
        REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized attention result should match CPU reference for dtype=%s C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], C, D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);

        ccfree(gpu_f32);
        ccv_nnc_tensor_free(gpu_o_tensor);
        ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
        ccv_nnc_tensor_free(gpu_q_tensor);
        ccv_nnc_tensor_free(gpu_k_tensor);
        ccv_nnc_tensor_free(gpu_v_tensor);
      }
      ccv_nnc_tensor_free(o_tensor);
      ccv_nnc_tensor_free(q_tensor);
      ccv_nnc_tensor_free(k_tensor);
      ccv_nnc_tensor_free(v_tensor);
      ccv_nnc_tensor_free(q_tensor_f16);
      ccv_nnc_tensor_free(k_tensor_f16);
      ccv_nnc_tensor_free(v_tensor_f16);
    }
  }
}
3639
3640
TEST_CASE("scaled dot product attention gradient with quantized NA mps")
3641
1
{
3642
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
3643
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
3644
0
  const int B = 2;
3645
0
  const int R = 128;
3646
0
  const int C = 128;
3647
0
  const int H = 8;
3648
0
  const int Ds[] = { 64, 80, 96, 128 };
3649
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
3650
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
3651
0
  const float dq_tolerances[] = { 8e-2, 8e-2, 8e-2 };
3652
0
  const float dk_tolerances[] = { 1e-1, 1e-1, 1e-1 };
3653
0
  const float dv_tolerances[] = { 8e-2, 8e-2, 8e-2 };
3654
0
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
3655
0
  {
3656
0
    const int D = Ds[d_idx];
3657
0
    const int q_count = B * R * H * D;
3658
0
    const int kv_count = B * C * H * D;
3659
0
    const float scale = 1.0 / sqrt((float)D);
3660
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3661
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3662
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3663
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3664
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3665
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3666
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3667
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3668
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3669
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3670
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3671
0
    dsfmt_t dsfmt;
3672
0
    dsfmt_init_gen_rand(&dsfmt, 181 + d_idx);
3673
0
    for (int i = 0; i < q_count; ++i)
3674
0
    {
3675
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3676
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3677
0
    }
3678
0
    for (int i = 0; i < kv_count; ++i)
3679
0
    {
3680
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3681
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3682
0
    }
3683
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
3684
3685
0
    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
3686
0
    {
3687
0
      const int datatype = datatypes[datatype_idx];
3688
0
      ccv_nnc_tensor_t* q_input = q_tensor;
3689
0
      ccv_nnc_tensor_t* k_input = k_tensor;
3690
0
      ccv_nnc_tensor_t* v_input = v_tensor;
3691
0
      ccv_nnc_tensor_t* do_input = do_tensor;
3692
0
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
3693
0
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
3694
0
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
3695
0
      ccv_nnc_tensor_t* gpu_do_tensor = 0;
3696
0
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
3697
0
      ccv_nnc_tensor_t* gpu_dq_tensor = 0;
3698
0
      ccv_nnc_tensor_t* gpu_dk_tensor = 0;
3699
0
      ccv_nnc_tensor_t* gpu_dv_tensor = 0;
3700
0
      ccv_nnc_tensor_t* copy_of_gpu_dq_tensor = 0;
3701
0
      ccv_nnc_tensor_t* copy_of_gpu_dk_tensor = 0;
3702
0
      ccv_nnc_tensor_t* copy_of_gpu_dv_tensor = 0;
3703
0
      if (datatype == CCV_16F)
3704
0
      {
3705
0
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3706
0
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3707
0
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3708
0
        ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
3709
0
        q_input = q_tensor_f16;
3710
0
        k_input = k_tensor_f16;
3711
0
        v_input = v_tensor_f16;
3712
0
        do_input = do_tensor_f16;
3713
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3714
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3715
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3716
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3717
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3718
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3719
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3720
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3721
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3722
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3723
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3724
0
      } else if (datatype == CCV_16BF) {
3725
0
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3726
0
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3727
0
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3728
0
        ccv_float_to_bfloat(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
3729
0
        q_input = q_tensor_f16;
3730
0
        k_input = k_tensor_f16;
3731
0
        v_input = v_tensor_f16;
3732
0
        do_input = do_tensor_f16;
3733
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3734
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3735
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3736
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3737
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3738
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3739
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3740
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3741
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
3742
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
3743
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
3744
0
      } else {
3745
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3746
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3747
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3748
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3749
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3750
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3751
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3752
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3753
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3754
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3755
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3756
0
      }
3757
0
      ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
3758
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input, do_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
3759
0
      ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
3760
0
      gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
3761
0
      ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
3762
0
      ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
3763
0
      ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
3764
0
      gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
3765
0
      gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
3766
0
      ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
3767
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
3768
3769
0
      float* const dq_cpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
3770
0
      float* const dk_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
3771
0
      float* const dv_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
3772
0
      float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
3773
0
      float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
3774
0
      float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
3775
0
      memcpy(dq_cpu_f32, dq_tensor->data.f32, sizeof(float) * q_count);
3776
0
      memcpy(dk_cpu_f32, dk_tensor->data.f32, sizeof(float) * kv_count);
3777
0
      memcpy(dv_cpu_f32, dv_tensor->data.f32, sizeof(float) * kv_count);
3778
0
      if (datatype == CCV_16F)
3779
0
      {
3780
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
3781
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
3782
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
3783
0
      } else if (datatype == CCV_16BF) {
3784
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
3785
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
3786
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
3787
0
      } else {
3788
0
        memcpy(dq_gpu_f32, copy_of_gpu_dq_tensor->data.f32, sizeof(float) * q_count);
3789
0
        memcpy(dk_gpu_f32, copy_of_gpu_dk_tensor->data.f32, sizeof(float) * kv_count);
3790
0
        memcpy(dv_gpu_f32, copy_of_gpu_dv_tensor->data.f32, sizeof(float) * kv_count);
3791
0
      }
3792
0
      float dq_max_relative_diff = 0;
3793
0
      float dk_max_relative_diff = 0;
3794
0
      float dv_max_relative_diff = 0;
3795
0
      int dq_max_diff_idx = 0;
3796
0
      int dk_max_diff_idx = 0;
3797
0
      int dv_max_diff_idx = 0;
3798
0
      for (int i = 0; i < q_count; ++i)
3799
0
      {
3800
0
        const float denom = fmaxf(fmaxf(fabsf(dq_cpu_f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
3801
0
        const float relative_diff = fabsf(dq_cpu_f32[i] - dq_gpu_f32[i]) / denom;
3802
0
        if (relative_diff > dq_max_relative_diff)
3803
0
          dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
3804
0
      }
3805
0
      for (int i = 0; i < kv_count; ++i)
3806
0
      {
3807
0
        float denom = fmaxf(fmaxf(fabsf(dk_cpu_f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
3808
0
        float relative_diff = fabsf(dk_cpu_f32[i] - dk_gpu_f32[i]) / denom;
3809
0
        if (relative_diff > dk_max_relative_diff)
3810
0
          dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
3811
0
        denom = fmaxf(fmaxf(fabsf(dv_cpu_f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
3812
0
        relative_diff = fabsf(dv_cpu_f32[i] - dv_gpu_f32[i]) / denom;
3813
0
        if (relative_diff > dv_max_relative_diff)
3814
0
          dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
3815
0
      }
3816
0
      REQUIRE(dq_max_relative_diff <= dq_tolerances[datatype_idx], "quantized attention dQ should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dq_max_relative_diff, dq_max_diff_idx, dq_cpu_f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
3817
0
      REQUIRE(dk_max_relative_diff <= dk_tolerances[datatype_idx], "quantized attention dK should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dk_max_relative_diff, dk_max_diff_idx, dk_cpu_f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
3818
0
      REQUIRE(dv_max_relative_diff <= dv_tolerances[datatype_idx], "quantized attention dV should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dv_max_relative_diff, dv_max_diff_idx, dv_cpu_f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
3819
3820
0
      ccfree(dq_cpu_f32);
3821
0
      ccfree(dk_cpu_f32);
3822
0
      ccfree(dv_cpu_f32);
3823
0
      ccfree(dq_gpu_f32);
3824
0
      ccfree(dk_gpu_f32);
3825
0
      ccfree(dv_gpu_f32);
3826
0
      ccv_nnc_tensor_free(gpu_q_tensor);
3827
0
      ccv_nnc_tensor_free(gpu_k_tensor);
3828
0
      ccv_nnc_tensor_free(gpu_v_tensor);
3829
0
      ccv_nnc_tensor_free(gpu_do_tensor);
3830
0
      ccv_nnc_tensor_free(gpu_o_tensor);
3831
0
      ccv_nnc_tensor_free(gpu_dq_tensor);
3832
0
      ccv_nnc_tensor_free(gpu_dk_tensor);
3833
0
      ccv_nnc_tensor_free(gpu_dv_tensor);
3834
0
      ccv_nnc_tensor_free(gpu_softmax_lse);
3835
0
      ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
3836
0
      ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
3837
0
      ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
3838
0
    }
3839
3840
0
    ccv_nnc_tensor_free(q_tensor);
3841
0
    ccv_nnc_tensor_free(k_tensor);
3842
0
    ccv_nnc_tensor_free(v_tensor);
3843
0
    ccv_nnc_tensor_free(do_tensor);
3844
0
    ccv_nnc_tensor_free(dq_tensor);
3845
0
    ccv_nnc_tensor_free(dk_tensor);
3846
0
    ccv_nnc_tensor_free(dv_tensor);
3847
0
    ccv_nnc_tensor_free(q_tensor_f16);
3848
0
    ccv_nnc_tensor_free(k_tensor_f16);
3849
0
    ccv_nnc_tensor_free(v_tensor_f16);
3850
0
    ccv_nnc_tensor_free(do_tensor_f16);
3851
0
  }
3852
0
}
3853
3854
TEST_CASE("scaled dot product attention gradient with quantized NA mps for rectangular and edge sequence lengths")
3855
1
{
3856
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
3857
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
3858
0
  typedef struct {
3859
0
    int R;
3860
0
    int C;
3861
0
  } qna_backward_shape_t;
3862
0
  const int B = 1;
3863
0
  const int H = 8;
3864
0
  const int Ds[] = { 64, 128 };
3865
0
  const qna_backward_shape_t shapes[] = {
3866
0
    { .R = 32, .C = 64 },
3867
0
    { .R = 40, .C = 72 },
3868
0
    { .R = 80, .C = 64 },
3869
0
    { .R = 96, .C = 88 },
3870
0
    { .R = 64, .C = 192 },
3871
0
    { .R = 144, .C = 64 },
3872
0
  };
3873
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
3874
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
3875
0
  const float dq_tolerances[] = { 8e-2, 8e-2, 8e-2 };
3876
0
  const float dk_tolerances[] = { 1e-1, 1e-1, 1e-1 };
3877
0
  const float dv_tolerances[] = { 8e-2, 8e-2, 8e-2 };
3878
0
  for (int shape_idx = 0; shape_idx < (int)(sizeof(shapes) / sizeof(shapes[0])); ++shape_idx)
3879
0
  {
3880
0
    const int R = shapes[shape_idx].R;
3881
0
    const int C = shapes[shape_idx].C;
3882
0
    for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
3883
0
    {
3884
0
      const int D = Ds[d_idx];
3885
0
      const int q_count = B * R * H * D;
3886
0
      const int kv_count = B * C * H * D;
3887
0
      const float scale = 1.0 / sqrt((float)D);
3888
0
      ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3889
0
      ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3890
0
      ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3891
0
      ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3892
0
      ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3893
0
      ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3894
0
      ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3895
0
      ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3896
0
      ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3897
0
      ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3898
0
      ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3899
0
      dsfmt_t dsfmt;
3900
0
      dsfmt_init_gen_rand(&dsfmt, 281 + shape_idx * 17 + d_idx);
3901
0
      for (int i = 0; i < q_count; ++i)
3902
0
      {
3903
0
        q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3904
0
        do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3905
0
      }
3906
0
      for (int i = 0; i < kv_count; ++i)
3907
0
      {
3908
0
        k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3909
0
        v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3910
0
      }
3911
0
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
3912
3913
0
      for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
3914
0
      {
3915
0
        const int datatype = datatypes[datatype_idx];
3916
0
        ccv_nnc_tensor_t* q_input = q_tensor;
3917
0
        ccv_nnc_tensor_t* k_input = k_tensor;
3918
0
        ccv_nnc_tensor_t* v_input = v_tensor;
3919
0
        ccv_nnc_tensor_t* do_input = do_tensor;
3920
0
        ccv_nnc_tensor_t* gpu_q_tensor = 0;
3921
0
        ccv_nnc_tensor_t* gpu_k_tensor = 0;
3922
0
        ccv_nnc_tensor_t* gpu_v_tensor = 0;
3923
0
        ccv_nnc_tensor_t* gpu_do_tensor = 0;
3924
0
        ccv_nnc_tensor_t* gpu_o_tensor = 0;
3925
0
        ccv_nnc_tensor_t* gpu_dq_tensor = 0;
3926
0
        ccv_nnc_tensor_t* gpu_dk_tensor = 0;
3927
0
        ccv_nnc_tensor_t* gpu_dv_tensor = 0;
3928
0
        ccv_nnc_tensor_t* copy_of_gpu_dq_tensor = 0;
3929
0
        ccv_nnc_tensor_t* copy_of_gpu_dk_tensor = 0;
3930
0
        ccv_nnc_tensor_t* copy_of_gpu_dv_tensor = 0;
3931
0
        if (datatype == CCV_16F)
3932
0
        {
3933
0
          ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3934
0
          ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3935
0
          ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3936
0
          ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
3937
0
          q_input = q_tensor_f16;
3938
0
          k_input = k_tensor_f16;
3939
0
          v_input = v_tensor_f16;
3940
0
          do_input = do_tensor_f16;
3941
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3942
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3943
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3944
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3945
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3946
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3947
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3948
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3949
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3950
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3951
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3952
0
        } else if (datatype == CCV_16BF) {
3953
0
          ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3954
0
          ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3955
0
          ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3956
0
          ccv_float_to_bfloat(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
3957
0
          q_input = q_tensor_f16;
3958
0
          k_input = k_tensor_f16;
3959
0
          v_input = v_tensor_f16;
3960
0
          do_input = do_tensor_f16;
3961
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3962
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3963
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3964
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3965
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3966
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3967
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3968
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3969
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
3970
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
3971
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
3972
0
        } else {
3973
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3974
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3975
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3976
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3977
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3978
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3979
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3980
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3981
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3982
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3983
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3984
0
        }
3985
0
        ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
3986
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input, do_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
3987
0
        ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
3988
0
        gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
3989
0
        ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
3990
0
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
3991
0
        ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
3992
0
        gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
3993
0
        gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
3994
0
        ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
3995
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
3996
3997
0
        float* const dq_cpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
3998
0
        float* const dk_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
3999
0
        float* const dv_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4000
0
        float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4001
0
        float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4002
0
        float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4003
0
        memcpy(dq_cpu_f32, dq_tensor->data.f32, sizeof(float) * q_count);
4004
0
        memcpy(dk_cpu_f32, dk_tensor->data.f32, sizeof(float) * kv_count);
4005
0
        memcpy(dv_cpu_f32, dv_tensor->data.f32, sizeof(float) * kv_count);
4006
0
        if (datatype == CCV_16F)
4007
0
        {
4008
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4009
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4010
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4011
0
        } else if (datatype == CCV_16BF) {
4012
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4013
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4014
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4015
0
        } else {
4016
0
          memcpy(dq_gpu_f32, copy_of_gpu_dq_tensor->data.f32, sizeof(float) * q_count);
4017
0
          memcpy(dk_gpu_f32, copy_of_gpu_dk_tensor->data.f32, sizeof(float) * kv_count);
4018
0
          memcpy(dv_gpu_f32, copy_of_gpu_dv_tensor->data.f32, sizeof(float) * kv_count);
4019
0
        }
4020
0
        float dq_max_relative_diff = 0;
4021
0
        float dk_max_relative_diff = 0;
4022
0
        float dv_max_relative_diff = 0;
4023
0
        int dq_max_diff_idx = 0;
4024
0
        int dk_max_diff_idx = 0;
4025
0
        int dv_max_diff_idx = 0;
4026
0
        for (int i = 0; i < q_count; ++i)
4027
0
        {
4028
0
          const float denom = fmaxf(fmaxf(fabsf(dq_cpu_f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4029
0
          const float relative_diff = fabsf(dq_cpu_f32[i] - dq_gpu_f32[i]) / denom;
4030
0
          if (relative_diff > dq_max_relative_diff)
4031
0
            dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4032
0
        }
4033
0
        for (int i = 0; i < kv_count; ++i)
4034
0
        {
4035
0
          float denom = fmaxf(fmaxf(fabsf(dk_cpu_f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4036
0
          float relative_diff = fabsf(dk_cpu_f32[i] - dk_gpu_f32[i]) / denom;
4037
0
          if (relative_diff > dk_max_relative_diff)
4038
0
            dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4039
0
          denom = fmaxf(fmaxf(fabsf(dv_cpu_f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4040
0
          relative_diff = fabsf(dv_cpu_f32[i] - dv_gpu_f32[i]) / denom;
4041
0
          if (relative_diff > dv_max_relative_diff)
4042
0
            dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4043
0
        }
4044
0
        REQUIRE(dq_max_relative_diff <= dq_tolerances[datatype_idx], "quantized attention dQ should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dq_max_relative_diff, dq_max_diff_idx, dq_cpu_f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4045
0
        REQUIRE(dk_max_relative_diff <= dk_tolerances[datatype_idx], "quantized attention dK should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dk_max_relative_diff, dk_max_diff_idx, dk_cpu_f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4046
0
        REQUIRE(dv_max_relative_diff <= dv_tolerances[datatype_idx], "quantized attention dV should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dv_max_relative_diff, dv_max_diff_idx, dv_cpu_f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4047
4048
0
        ccfree(dq_cpu_f32);
4049
0
        ccfree(dk_cpu_f32);
4050
0
        ccfree(dv_cpu_f32);
4051
0
        ccfree(dq_gpu_f32);
4052
0
        ccfree(dk_gpu_f32);
4053
0
        ccfree(dv_gpu_f32);
4054
0
        ccv_nnc_tensor_free(gpu_q_tensor);
4055
0
        ccv_nnc_tensor_free(gpu_k_tensor);
4056
0
        ccv_nnc_tensor_free(gpu_v_tensor);
4057
0
        ccv_nnc_tensor_free(gpu_do_tensor);
4058
0
        ccv_nnc_tensor_free(gpu_o_tensor);
4059
0
        ccv_nnc_tensor_free(gpu_dq_tensor);
4060
0
        ccv_nnc_tensor_free(gpu_dk_tensor);
4061
0
        ccv_nnc_tensor_free(gpu_dv_tensor);
4062
0
        ccv_nnc_tensor_free(gpu_softmax_lse);
4063
0
        ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4064
0
        ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4065
0
        ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4066
0
      }
4067
4068
0
      ccv_nnc_tensor_free(q_tensor);
4069
0
      ccv_nnc_tensor_free(k_tensor);
4070
0
      ccv_nnc_tensor_free(v_tensor);
4071
0
      ccv_nnc_tensor_free(do_tensor);
4072
0
      ccv_nnc_tensor_free(dq_tensor);
4073
0
      ccv_nnc_tensor_free(dk_tensor);
4074
0
      ccv_nnc_tensor_free(dv_tensor);
4075
0
      ccv_nnc_tensor_free(q_tensor_f16);
4076
0
      ccv_nnc_tensor_free(k_tensor_f16);
4077
0
      ccv_nnc_tensor_free(v_tensor_f16);
4078
0
      ccv_nnc_tensor_free(do_tensor_f16);
4079
0
    }
4080
0
  }
4081
0
}
4082
4083
TEST_CASE("scaled dot product attention gradient with quantized NA mps on 1536 square surface")
4084
1
{
4085
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4086
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4087
0
  const int B = 1;
4088
0
  const int R = 1536;
4089
0
  const int C = 1536;
4090
0
  const int H = 24;
4091
0
  const int D = 128;
4092
0
  const int q_count = B * R * H * D;
4093
0
  const int kv_count = B * C * H * D;
4094
0
  const float scale = 1.0 / sqrt((float)D);
4095
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4096
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4097
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4098
0
  ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4099
0
  ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4100
0
  ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4101
0
  ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4102
0
  ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4103
0
  ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4104
0
  ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4105
0
  ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4106
0
  dsfmt_t dsfmt;
4107
0
  dsfmt_init_gen_rand(&dsfmt, 4177);
4108
0
  for (int i = 0; i < q_count; ++i)
4109
0
  {
4110
0
    q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4111
0
    do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4112
0
  }
4113
0
  for (int i = 0; i < kv_count; ++i)
4114
0
  {
4115
0
    k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4116
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4117
0
  }
4118
0
  ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4119
4120
0
  ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4121
0
  ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4122
0
  ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4123
0
  ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4124
4125
0
  ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4126
0
  ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4127
0
  ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4128
0
  ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4129
0
  ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4130
0
  ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4131
0
  ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4132
0
  ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4133
0
  ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4134
0
  ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4135
0
  ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4136
0
  ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4137
4138
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4139
0
  ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4140
0
  gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4141
0
  ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4142
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4143
0
  ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4144
0
  gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4145
0
  gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4146
0
  ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4147
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4148
4149
0
  float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4150
0
  float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4151
0
  float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4152
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4153
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4154
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4155
4156
0
  float dq_max_relative_diff = 0;
4157
0
  float dk_max_relative_diff = 0;
4158
0
  float dv_max_relative_diff = 0;
4159
0
  float dq_cpu_max_abs = 0;
4160
0
  float dq_gpu_max_abs = 0;
4161
0
  float dk_cpu_max_abs = 0;
4162
0
  float dk_gpu_max_abs = 0;
4163
0
  float dv_cpu_max_abs = 0;
4164
0
  float dv_gpu_max_abs = 0;
4165
0
  int dq_max_diff_idx = 0;
4166
0
  int dk_max_diff_idx = 0;
4167
0
  int dv_max_diff_idx = 0;
4168
0
  for (int i = 0; i < q_count; ++i)
4169
0
  {
4170
0
    dq_cpu_max_abs = fmaxf(dq_cpu_max_abs, fabsf(dq_tensor->data.f32[i]));
4171
0
    dq_gpu_max_abs = fmaxf(dq_gpu_max_abs, fabsf(dq_gpu_f32[i]));
4172
0
    const float denom = fmaxf(fmaxf(fabsf(dq_tensor->data.f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4173
0
    const float relative_diff = fabsf(dq_tensor->data.f32[i] - dq_gpu_f32[i]) / denom;
4174
0
    if (relative_diff > dq_max_relative_diff)
4175
0
      dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4176
0
  }
4177
0
  for (int i = 0; i < kv_count; ++i)
4178
0
  {
4179
0
    dk_cpu_max_abs = fmaxf(dk_cpu_max_abs, fabsf(dk_tensor->data.f32[i]));
4180
0
    dk_gpu_max_abs = fmaxf(dk_gpu_max_abs, fabsf(dk_gpu_f32[i]));
4181
0
    float denom = fmaxf(fmaxf(fabsf(dk_tensor->data.f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4182
0
    float relative_diff = fabsf(dk_tensor->data.f32[i] - dk_gpu_f32[i]) / denom;
4183
0
    if (relative_diff > dk_max_relative_diff)
4184
0
      dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4185
0
    dv_cpu_max_abs = fmaxf(dv_cpu_max_abs, fabsf(dv_tensor->data.f32[i]));
4186
0
    dv_gpu_max_abs = fmaxf(dv_gpu_max_abs, fabsf(dv_gpu_f32[i]));
4187
0
    denom = fmaxf(fmaxf(fabsf(dv_tensor->data.f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4188
0
    relative_diff = fabsf(dv_tensor->data.f32[i] - dv_gpu_f32[i]) / denom;
4189
0
    if (relative_diff > dv_max_relative_diff)
4190
0
      dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4191
0
  }
4192
0
  REQUIRE(dq_gpu_max_abs >= dq_cpu_max_abs * 0.5f && dq_gpu_max_abs <= dq_cpu_max_abs * 2.0f,
4193
0
    "quantized attention dQ magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4194
0
    dq_cpu_max_abs, dq_gpu_max_abs);
4195
0
  REQUIRE(dk_gpu_max_abs >= dk_cpu_max_abs * 0.5f && dk_gpu_max_abs <= dk_cpu_max_abs * 2.0f,
4196
0
    "quantized attention dK magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4197
0
    dk_cpu_max_abs, dk_gpu_max_abs);
4198
0
  REQUIRE(dv_gpu_max_abs >= dv_cpu_max_abs * 0.5f && dv_gpu_max_abs <= dv_cpu_max_abs * 2.0f,
4199
0
    "quantized attention dV magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4200
0
    dv_cpu_max_abs, dv_gpu_max_abs);
4201
0
  REQUIRE(dq_max_relative_diff <= 8e-2, "quantized attention dQ should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dq_max_relative_diff, dq_max_diff_idx, dq_tensor->data.f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4202
0
  REQUIRE(dk_max_relative_diff <= 1e-1, "quantized attention dK should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dk_max_relative_diff, dk_max_diff_idx, dk_tensor->data.f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4203
0
  REQUIRE(dv_max_relative_diff <= 8e-2, "quantized attention dV should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dv_max_relative_diff, dv_max_diff_idx, dv_tensor->data.f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4204
4205
0
  ccfree(dq_gpu_f32);
4206
0
  ccfree(dk_gpu_f32);
4207
0
  ccfree(dv_gpu_f32);
4208
0
  ccv_nnc_tensor_free(gpu_q_tensor);
4209
0
  ccv_nnc_tensor_free(gpu_k_tensor);
4210
0
  ccv_nnc_tensor_free(gpu_v_tensor);
4211
0
  ccv_nnc_tensor_free(gpu_do_tensor);
4212
0
  ccv_nnc_tensor_free(gpu_o_tensor);
4213
0
  ccv_nnc_tensor_free(gpu_dq_tensor);
4214
0
  ccv_nnc_tensor_free(gpu_dk_tensor);
4215
0
  ccv_nnc_tensor_free(gpu_dv_tensor);
4216
0
  ccv_nnc_tensor_free(gpu_softmax_lse);
4217
0
  ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4218
0
  ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4219
0
  ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4220
0
  ccv_nnc_tensor_free(q_tensor);
4221
0
  ccv_nnc_tensor_free(k_tensor);
4222
0
  ccv_nnc_tensor_free(v_tensor);
4223
0
  ccv_nnc_tensor_free(do_tensor);
4224
0
  ccv_nnc_tensor_free(dq_tensor);
4225
0
  ccv_nnc_tensor_free(dk_tensor);
4226
0
  ccv_nnc_tensor_free(dv_tensor);
4227
0
  ccv_nnc_tensor_free(q_tensor_f16);
4228
0
  ccv_nnc_tensor_free(k_tensor_f16);
4229
0
  ccv_nnc_tensor_free(v_tensor_f16);
4230
0
  ccv_nnc_tensor_free(do_tensor_f16);
4231
0
}
4232
4233
// Runs forward scaled dot product attention on bfloat16 GPU tensors and compares the
// output, converted back to 32-bit float, with the output computed from 32-bit float
// CPU tensors. The test returns early unless both the forward and the backward
// attention commands are reported as available for the MPS backend.
TEST_CASE("scaled dot product attention with mps in bfloat precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
  // One row per trial. q and o are B x R x Hq x D; k and v are B x C x Hk x D.
  static const struct {
    int B, R, C, Hq, Hk, D;
  } shapes[] = {
    { 32, 160, 128, 8, 8, 64 },
    { 12, 256, 128, 8, 8, 40 },
    { 16, 128, 128, 8, 4, 160 },
    { 1, 77, 128, 8, 2, 192 },
    { 2, 77, 128, 8, 8, 256 },
    { 1, 5, 5, 32, 32, 128 },
    { 32, 160, 128, 8, 8, 48 },
    { 12, 256, 128, 8, 8, 96 },
    { 16, 128, 128, 8, 8, 160 },
    { 1, 77, 128, 8, 8, 192 },
    { 2, 77, 128, 8, 8, 256 },
    { 1, 5, 5, 32, 32, 128 },
  };
  const int shape_count = (int)(sizeof(shapes) / sizeof(shapes[0]));

  // A single generator is shared by all trials, so each trial continues the same sequence.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 10);
  int trial;
  for (trial = 0; trial < shape_count; trial++)
  {
    const int B = shapes[trial].B;
    const int R = shapes[trial].R;
    const int C = shapes[trial].C;
    const int Hq = shapes[trial].Hq;
    const int Hk = shapes[trial].Hk;
    const int D = shapes[trial].D;
    const int is_causal = 0;
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * Hq * D;
    const int kv_count = B * C * Hk * D;
    int i;

    // 32-bit float inputs on the CPU; random values are drawn for q, then k, then v.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    float* const q_values = q_tensor->data.f32;
    float* const k_values = k_tensor->data.f32;
    float* const v_values = v_tensor->data.f32;
    for (i = 0; i < q_count; i++)
      q_values[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      k_values[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      v_values[i] = dsfmt_genrand_open_close(&rng);

    // Output computed from the 32-bit float CPU tensors; the GPU result is compared against it.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);

    // Convert the inputs to 16BF on the CPU, then copy them into GPU tensors.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0);

    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);

    // The GPU run also writes a second, 32-bit float output of shape B x Hq x R.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);

    // Copy the GPU output back to the CPU and widen it to 32-bit float for the comparison.
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 8e-3, "scaled dot product attention result should be the same");

    // Release every tensor created for this trial.
    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor_f16);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(q_tensor_f16);
    ccv_nnc_tensor_free(k_tensor_f16);
    ccv_nnc_tensor_free(v_tensor_f16);
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(gpu_softmax_lse);
  }
}
4316
4317
// Builds the same scaled dot product attention node (q, k, v plus the w and bias inputs,
// producing r and the extra c output) twice: once over CPU tensor symbols and once over
// GPU tensor symbols. Both graphs run on identical random inputs and the two r outputs
// must agree within a relative tolerance.
TEST_CASE("scaled dot product attention + unify head with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  // Element counts of the tensors filled or compared below.
  const int qkv_count = 32 * 8 * 128 * 64;
  const int w_count = 512 * 512;
  const int bias_count = 512;
  const int r_count = 32 * 128 * 512;

  // Graph over CPU tensor symbols.
  ccv_nnc_symbolic_graph_t* const sdp_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t q = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
  ccv_nnc_tensor_symbol_t k = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
  ccv_nnc_tensor_symbol_t v = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
  ccv_nnc_tensor_symbol_t w = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
  ccv_nnc_tensor_symbol_t r = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
  ccv_nnc_graph_exec_symbol_new(sdp_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q, k, v, NO_TENSOR_SYMBOL, w, bias), TENSOR_SYMBOL_LIST(r, NO_TENSOR_SYMBOL, c), "scaled_dot_product_attention");
  ccv_nnc_graph_exec_symbol_autogen(sdp_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_graph_t* sdp_graph = 0;
  ccv_nnc_tensor_arena_t* sdp_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* sdp_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(sdp_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(sdp_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(sdp_symbolic_graph), &sdp_graph, &sdp_tensor_arena, &sdp_graph_exec_arena);
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, q);
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, k);
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, v);
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, w);
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, bias);

  // Random inputs, drawn in q, k, v, w, bias order from a generator seeded with 1.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 1);
  int idx;
  for (idx = 0; idx < qkv_count; idx++)
    q_tensor->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (idx = 0; idx < qkv_count; idx++)
    k_tensor->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (idx = 0; idx < qkv_count; idx++)
    v_tensor->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (idx = 0; idx < w_count; idx++)
    w_tensor->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (idx = 0; idx < bias_count; idx++)
    bias_tensor->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Same node again, this time over GPU tensor symbols.
  ccv_nnc_symbolic_graph_t* const g_symbolic_graph = ccv_nnc_symbolic_graph_new();
  ccv_nnc_tensor_symbol_t gq = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "q");
  ccv_nnc_tensor_symbol_t gk = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "k");
  ccv_nnc_tensor_symbol_t gv = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "v");
  ccv_nnc_tensor_symbol_t gw = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512, 512), "w");
  ccv_nnc_tensor_symbol_t gbias = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512), "bias");
  ccv_nnc_tensor_symbol_t gc = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "c");
  ccv_nnc_tensor_symbol_t gr = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 512), "r");
  ccv_nnc_graph_exec_symbol_new(g_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq, gk, gv, NO_TENSOR_SYMBOL, gw, gbias), TENSOR_SYMBOL_LIST(gr, NO_TENSOR_SYMBOL, gc), "scaled_dot_product_attention");
  ccv_nnc_graph_exec_symbol_autogen(g_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_graph_t* g_graph = 0;
  ccv_nnc_tensor_arena_t* g_tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* g_graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(g_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(g_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(g_symbolic_graph), &g_graph, &g_tensor_arena, &g_graph_exec_arena);
  ccv_nnc_tensor_t* const gq_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gq);
  ccv_nnc_tensor_t* const gk_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gk);
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gv);
  ccv_nnc_tensor_t* const gw_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gw);
  ccv_nnc_tensor_t* const gbias_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gbias);

  // Copy the CPU inputs into the GPU graph's tensors, then run both graphs.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, w_tensor, bias_tensor), TENSOR_LIST(gq_tensor, gk_tensor, gv_tensor, gw_tensor, gbias_tensor), 0);
  ccv_nnc_graph_run(sdp_graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_graph_run(g_graph, 0, TRAVERSE_FULL, 0, 0);

  // Bring the GPU r output back into a CPU tensor for the comparison.
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, r);
  ccv_nnc_tensor_t* const gr_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gr);
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gr_tensor), TENSOR_LIST(hr), 0);

  // Track the largest difference, divided by the larger magnitude of the pair but never by less than 1.
  float max_relative_diff = 0;
  int max_diff_idx = 0;
  for (idx = 0; idx < r_count; idx++)
  {
    const float cpu_value = r_tensor->data.f32[idx];
    const float gpu_value = hr->data.f32[idx];
    const float denom = fmaxf(fmaxf(fabsf(cpu_value), fabsf(gpu_value)), 1.0f);
    const float relative_diff = fabsf(cpu_value - gpu_value) / denom;
    if (relative_diff > max_relative_diff)
    {
      max_relative_diff = relative_diff;
      max_diff_idx = idx;
    }
  }
  REQUIRE(max_relative_diff <= 2e-3, "graph computed result should match scaled dot product attention op result (max relative diff %g at %d: %g vs %g)", max_relative_diff, max_diff_idx, r_tensor->data.f32[max_diff_idx], hr->data.f32[max_diff_idx]);

  // Tear down both graphs and the standalone comparison tensor.
  ccv_nnc_symbolic_graph_free(sdp_symbolic_graph);
  ccv_nnc_tensor_arena_free(sdp_tensor_arena);
  ccv_nnc_graph_exec_arena_free(sdp_graph_exec_arena);
  ccv_nnc_graph_free(sdp_graph);
  ccv_nnc_symbolic_graph_free(g_symbolic_graph);
  ccv_nnc_tensor_arena_free(g_tensor_arena);
  ccv_nnc_graph_exec_arena_free(g_graph_exec_arena);
  ccv_nnc_graph_free(g_graph);
  ccv_nnc_tensor_free(hr);
}
4398
4399
// Compares the dq, dk and dv produced by the scaled dot product attention backward
// command on 32-bit float GPU tensors with the ones computed from CPU tensors, over a
// small table of shapes. The GPU backward call is fed the output and the second
// (softmax_lse) output of a GPU forward pass. Returns early unless both the forward
// and backward commands are reported as available for the MPS backend.
TEST_CASE("scaled dot product attention gradient with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
  // One row per trial. q, o and their gradients are B x R x H x D; k, v and their gradients are B x C x H x D.
  static const struct {
    int B, R, C, H, D;
  } shapes[] = {
    { 32, 128, 128, 8, 64 },
    { 3, 61, 49, 13, 191 },
    { 2, 6, 2, 3, 4 },
    { 1, 2, 1, 1, 8 },
  };
  const int shape_count = (int)(sizeof(shapes) / sizeof(shapes[0]));

  // A single generator is shared by all trials, so each trial continues the same sequence.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 10);
  int trial;
  for (trial = 0; trial < shape_count; trial++)
  {
    const int B = shapes[trial].B;
    const int R = shapes[trial].R;
    const int C = shapes[trial].C;
    const int H = shapes[trial].H;
    const int D = shapes[trial].D;
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * H * D;
    const int kv_count = B * C * H * D;
    int i;

    // CPU inputs and gradient outputs.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);

    // Random values are drawn for q, then k, then v, then the incoming gradient do.
    for (i = 0; i < q_count; i++)
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    for (i = 0; i < q_count; i++)
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);

    // Gradients computed from the CPU tensors.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);

    // GPU counterparts of every tensor, with the inputs copied over.
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);

    // GPU forward pass first: its two outputs are inputs to the GPU backward pass.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);

    // Copy the GPU gradients back to the CPU and compare them (dv, dq, dk in that order).
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * H * D, 5e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * H * D, 5e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * H * D, 5e-3, "scaled dot product attention result should be the same");

    // Release every tensor created for this trial.
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(do_tensor);
    ccv_nnc_tensor_free(dq_tensor);
    ccv_nnc_tensor_free(dk_tensor);
    ccv_nnc_tensor_free(dv_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
    ccv_nnc_tensor_free(gpu_do_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(gpu_dq_tensor);
    ccv_nnc_tensor_free(gpu_dk_tensor);
    ccv_nnc_tensor_free(gpu_dv_tensor);
    ccv_nnc_tensor_free(gpu_softmax_lse);
  }
}
4494
4495
// Compares the dq, dk and dv produced by the scaled dot product attention backward
// command on 16F GPU tensors (converted back to 32-bit float) with the ones computed
// from 32-bit float CPU tensors, over a table of shapes. The GPU backward command has
// its deterministic flag cleared before it runs. Returns early unless both the forward
// and backward commands are reported as available for the MPS backend.
TEST_CASE("scaled dot product attention gradient with mps in half precision")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
  // One row per trial. The single head count H is used for both Hq and Hk below.
  static const struct {
    int B, R, C, H, D;
  } shapes[] = {
    { 32, 160, 128, 8, 64 },
    { 12, 256, 128, 8, 40 },
    { 16, 128, 128, 8, 160 },
    { 1, 77, 128, 8, 192 },
    { 2, 77, 128, 8, 256 },
    { 1, 5, 5, 32, 128 },
    { 32, 160, 128, 8, 48 },
    { 12, 256, 128, 8, 96 },
    { 16, 128, 128, 8, 160 },
    { 1, 77, 128, 8, 192 },
    { 2, 77, 128, 8, 256 },
    { 1, 5, 5, 32, 128 },
  };
  const int shape_count = (int)(sizeof(shapes) / sizeof(shapes[0]));

  // A single generator is shared by all trials, so each trial continues the same sequence.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 10);
  int trial;
  for (trial = 0; trial < shape_count; trial++)
  {
    const int B = shapes[trial].B;
    const int R = shapes[trial].R;
    const int C = shapes[trial].C;
    const int Hq = shapes[trial].H;
    const int Hk = shapes[trial].H;
    const int D = shapes[trial].D;
    const int is_causal = 0;
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * Hq * D;
    const int kv_count = B * C * Hk * D;
    int i;

    // 32-bit float CPU inputs and gradient outputs.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);

    // Random values are drawn for q, then k, then v, then the incoming gradient do.
    for (i = 0; i < q_count; i++)
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);
    for (i = 0; i < kv_count; i++)
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    for (i = 0; i < q_count; i++)
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&rng);

    // Gradients computed from the 32-bit float CPU tensors.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);

    // Convert the inputs to 16F on the CPU.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);

    // 16F GPU tensors, with the converted inputs copied over.
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);

    // GPU forward pass first: its two outputs are inputs to the GPU backward pass.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);

    // Backward pass on the GPU with the deterministic flag explicitly cleared.
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
    cmd.info.scaled_dot_product_attention.deterministic = 0;
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);

    // Copy the 16F gradients back to the CPU, then widen them to 32-bit float.
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);

    // Each gradient has its own tolerance.
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 3e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 6e-3, "GPU computed output should be the same as CPU computed ones");

    // Release every tensor created for this trial.
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(do_tensor);
    ccv_nnc_tensor_free(dq_tensor);
    ccv_nnc_tensor_free(dk_tensor);
    ccv_nnc_tensor_free(dv_tensor);
    ccv_nnc_tensor_free(q_tensor_f16);
    ccv_nnc_tensor_free(k_tensor_f16);
    ccv_nnc_tensor_free(v_tensor_f16);
    ccv_nnc_tensor_free(do_tensor_f16);
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
    ccv_nnc_tensor_free(gpu_do_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(gpu_dq_tensor);
    ccv_nnc_tensor_free(gpu_dk_tensor);
    ccv_nnc_tensor_free(gpu_dv_tensor);
    ccv_nnc_tensor_free(gpu_softmax_lse);
  }
}
4611
4612
TEST_CASE("scaled dot product attention gradient with mps in bfloat precision")
{
  // Nothing to check unless both attention passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
#define num_long_trials 8
#define num_short_trials 4
#define num_trials (num_long_trials + num_short_trials)

  // Per-trial extents, indexed by trial number. q / dq / do are (B, R, Hq, D),
  // k / v / dk / dv are (B, C, Hk, D), with Hk taken from the same table as Hq.
  const int B_candidates[num_trials] = { 32, 12, 16, 1, 2, 1, 32, 12, 16, 1, 2, 1 };
  const int R_candidates[num_trials] = { 160, 256, 128, 77, 77, 5, 160, 256, 128, 77, 77, 5 };
  const int C_candidates[num_trials] = { 128, 128, 128, 128, 128, 5, 128, 128, 128, 128, 128, 5 };
  const int Hq_candidates[num_trials] = { 8, 8, 8, 8, 8, 32, 8, 8, 8, 8, 8, 32 };
  const int D_candidates[num_trials] = { 64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };

  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 10);
  int trial;
  for (trial = 0; trial < num_trials; trial++)
  {
    const int B = B_candidates[trial];
    const int R = R_candidates[trial];
    const int C = C_candidates[trial];
    const int Hq = Hq_candidates[trial];
    const int Hk = Hq_candidates[trial];
    const int D = D_candidates[trial];
    const int is_causal = 0;
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * Hq * D;
    const int kv_count = B * C * Hk * D;
    int i;

    // 32-bit float tensors: inputs and the reference gradients.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);

    // Draw order (q, k, v, then do) determines the generated values; keep it fixed.
    for (i = 0; i < q_count; i++)
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
    for (i = 0; i < kv_count; i++)
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
    for (i = 0; i < kv_count; i++)
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
    for (i = 0; i < q_count; i++)
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);

    // Reference gradients computed on the 32-bit float tensors.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);

    // Convert the inputs to 16BF on the host before uploading them.
    ccv_nnc_tensor_t* const q_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const do_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_bf16, k_tensor_bf16, v_tensor_bf16, do_tensor_bf16), 0);

    // Device-side tensors.
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_bf16, k_tensor_bf16, v_tensor_bf16, do_tensor_bf16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);

    // The forward pass fills gpu_o_tensor and gpu_softmax_lse, both of which are fed to the backward pass.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);

    ccv_nnc_cmd_t backward_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
    backward_cmd.info.scaled_dot_product_attention.deterministic = 0;
    ccv_nnc_cmd_exec(backward_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);

    // Download the device gradients as 16BF, then widen them to 32-bit float for comparison.
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_bf16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_bf16, copy_of_gpu_dk_tensor_bf16, copy_of_gpu_dv_tensor_bf16), 0);

    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_bf16, copy_of_gpu_dk_tensor_bf16, copy_of_gpu_dv_tensor_bf16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);

    // Each gradient has its own tolerance (dq 5e-3, dk 1e-2, dv 2e-2).
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 5e-3, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 1e-2, "scaled dot product attention result should be the same");
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 2e-2, "GPU computed output should be the same as CPU computed ones");

    // Release the 32-bit float host tensors.
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(do_tensor);
    ccv_nnc_tensor_free(dq_tensor);
    ccv_nnc_tensor_free(dk_tensor);
    ccv_nnc_tensor_free(dv_tensor);
    // Release the 16BF host tensors.
    ccv_nnc_tensor_free(q_tensor_bf16);
    ccv_nnc_tensor_free(k_tensor_bf16);
    ccv_nnc_tensor_free(v_tensor_bf16);
    ccv_nnc_tensor_free(do_tensor_bf16);
    // Release the device tensors.
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(gpu_do_tensor);
    ccv_nnc_tensor_free(gpu_dq_tensor);
    ccv_nnc_tensor_free(gpu_dk_tensor);
    ccv_nnc_tensor_free(gpu_dv_tensor);
    ccv_nnc_tensor_free(gpu_softmax_lse);
    // Release the downloaded copies.
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_bf16);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_bf16);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_bf16);
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
  }
#undef num_long_trials
#undef num_short_trials
#undef num_trials
}
4728
4729
TEST_CASE("backward gemm with no transpose")
{
  // Nothing to check unless both gemm passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Incoming gradient g (4x3), input a (4x2) and weight b (2x3), wrapped around stack arrays.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);

  float ap[] = {
    13, 14,
    15, 16,
    17, 18,
    19, 20,
  };

  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);

  float bp[] = {
    21, 22, 23,
    24, 25, 26,
  };

  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // Device copies of the three inputs.
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);

  // Device outputs: gradient w.r.t. a (h), w.r.t. b (db) and w.r.t. the bias (dbias).
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
  cmd.backend = CCV_NNC_BACKEND_MPS;
  // NOTE(review): this line used to be commented "This is cblas.", which looks inherited from a
  // CPU test; confirm which MPS implementation algorithm 1 actually selects.
  cmd.algorithm = 1;

  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(h, db, dbias), 0);

  // Bring the results back to the host for comparison.
  ccv_nnc_tensor_t* const ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const cdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(h, db, dbias), TENSOR_LIST(ch, cdb, cdbias), 0);

  // Expected dbias: the column sums of g.
  float dbiastp[] = {
    22, 26, 30,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);

  REQUIRE_TENSOR_EQ(cdbias, &dbiast, "bias should be equal");
  // Expected h: each row of g dotted with each row of b.
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);

  REQUIRE_TENSOR_EQ(ch, &ht, "h should be equal");
  // Expected db: each column of a dotted with each column of g.
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  REQUIRE_TENSOR_EQ(cdb, &dbt, "db should be equal");
  // Release every tensor created with ccv_nnc_tensor_new. The device inputs (gg, ga, gb) and the
  // host copies (ch, cdb, cdbias) were previously leaked.
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(ch);
  ccv_nnc_tensor_free(cdb);
  ccv_nnc_tensor_free(cdbias);
}
4804
4805
TEST_CASE("backward gemm with transpose a")
{
  // Nothing to check unless both gemm passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host inputs: gradient g (4x3), a stored as 2x4, b (2x3).
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  float ap[] = {
    13, 15, 17, 19,
    14, 16, 18, 20,
  };
  float bp[] = {
    21, 22, 23,
    24, 25, 26,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // Expected results.
  float dbiastp[] = {
    22, 26, 30,
  };
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
  };
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // Host tensors that receive the downloaded results.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);

  // Device inputs and outputs.
  ccv_nnc_tensor_t* gpu_g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gpu_db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run the backward pass with a transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gpu_g, gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_g, gpu_a, gpu_b), TENSOR_LIST(gpu_h, gpu_db, gpu_dbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_h, gpu_db, gpu_dbias), TENSOR_LIST(h, db, dbias), 0);

  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // Host tensors first, then the device tensors.
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(gpu_g);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_h);
  ccv_nnc_tensor_free(gpu_db);
  ccv_nnc_tensor_free(gpu_dbias);
}
4868
4869
TEST_CASE("backward gemm with transpose b")
{
  // Nothing to check unless both gemm passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host inputs: gradient g (4x3), a (4x2), b stored as 3x2.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  float ap[] = {
    13, 14,
    15, 16,
    17, 18,
    19, 20,
  };
  float bp[] = {
    21, 24,
    22, 25,
    23, 26,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // Expected results.
  float dbiastp[] = {
    22, 26, 30,
  };
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
  };
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // Host tensors that receive the downloaded results.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);

  // Device inputs and outputs.
  ccv_nnc_tensor_t* gpu_g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gpu_h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gpu_dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run the backward pass with b transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gpu_g, gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_g, gpu_a, gpu_b), TENSOR_LIST(gpu_h, gpu_db, gpu_dbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_h, gpu_db, gpu_dbias), TENSOR_LIST(h, db, dbias), 0);

  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // Host tensors first, then the device tensors.
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(gpu_g);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_h);
  ccv_nnc_tensor_free(gpu_db);
  ccv_nnc_tensor_free(gpu_dbias);
}
4938
4939
TEST_CASE("backward gemm with transpose a and b")
{
  // Nothing to check unless both gemm passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host inputs: gradient g (4x3), a stored as 2x4, b stored as 3x2.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  float ap[] = {
    13, 15, 17, 19,
    14, 16, 18, 20,
  };
  float bp[] = {
    21, 24,
    22, 25,
    23, 26,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // Expected results.
  float dbiastp[] = {
    22, 26, 30,
  };
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
  };
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // Host tensors that receive the downloaded results.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);

  // Device inputs and outputs.
  ccv_nnc_tensor_t* gpu_g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gpu_h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gpu_db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gpu_dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run the backward pass with both operands transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gpu_g, gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_g, gpu_a, gpu_b), TENSOR_LIST(gpu_h, gpu_db, gpu_dbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_h, gpu_db, gpu_dbias), TENSOR_LIST(h, db, dbias), 0);

  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // Host tensors first, then the device tensors.
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(gpu_g);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_h);
  ccv_nnc_tensor_free(gpu_db);
  ccv_nnc_tensor_free(gpu_dbias);
}
5004
5005
5006
TEST_CASE("backward gemm large data set")
{
  // Nothing to check unless both gemm passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Device tensors: input (10x128), weight (64x128), bias (64), output (10x64),
  // incoming gradient (10x64) and the three gradients produced by the backward pass.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* gpu_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* gpu_bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* gpu_g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* gpu_dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* gpu_dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* gpu_h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // Host tensors with the same extents, used for the reference computation.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Host tensors that receive the device results.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Draw order (weight, bias, input, gradient) determines the generated values; keep it fixed.
  int idx;
  for (idx = 0; idx < 64 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (idx = 0; idx < 64; idx++)
    hbias->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt);
  for (idx = 0; idx < 10 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt);
  for (idx = 0; idx < 10 * 64; idx++)
    hg->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt);

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(gpu_a, gpu_w, gpu_bias, gpu_g), 0);

  // Forward and backward on the host tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);

  // The same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_w, gpu_bias), TENSOR_LIST(gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_g, gpu_a, gpu_w, 0), TENSOR_LIST(gpu_h, gpu_dw, gpu_dbias), 0);

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_b, gpu_dw, gpu_dbias, gpu_h), TENSOR_LIST(tb, tdw, tdbias, th), 0);

  // dw is compared with a looser tolerance (5e-3) than the other outputs (1e-5).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Device tensors.
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_w);
  ccv_nnc_tensor_free(gpu_bias);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_g);
  ccv_nnc_tensor_free(gpu_dw);
  ccv_nnc_tensor_free(gpu_dbias);
  ccv_nnc_tensor_free(gpu_h);
  // Host reference tensors.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hh);
  // Downloaded copies.
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
  ccv_nnc_tensor_free(th);
}
5073
5074
TEST_CASE("backward gemm no bias")
{
  // Nothing to check unless both gemm passes are reported OK for the MPS backend.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Device tensors: input (10x128), weight (64x128), output (10x64), incoming
  // gradient (10x64) and the two gradients requested from the backward pass.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* gpu_w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* gpu_g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* gpu_dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* gpu_h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // Host tensors with the same extents, used for the reference computation.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Host tensors that receive the device results.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Draw order (weight, input, gradient) determines the generated values; keep it fixed.
  int idx;
  for (idx = 0; idx < 64 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (idx = 0; idx < 10 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt);
  for (idx = 0; idx < 10 * 64; idx++)
    hg->data.f32[idx] = dsfmt_genrand_open_close(&dsfmt);

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(gpu_a, gpu_w, gpu_g), 0);

  // Forward and backward on the host tensors; no bias input and no bias-gradient output.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);

  // The same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_w), TENSOR_LIST(gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_g, gpu_a, gpu_w, 0), TENSOR_LIST(gpu_h, gpu_dw, 0), 0);

  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_b, gpu_dw, gpu_h), TENSOR_LIST(tb, tdw, th), 0);

  // dw is compared with a looser tolerance (5e-3) than the other outputs (1e-5).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Device tensors.
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_w);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_g);
  ccv_nnc_tensor_free(gpu_dw);
  ccv_nnc_tensor_free(gpu_h);
  // Host reference tensors.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hh);
  // Downloaded copies.
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(th);
}
5128
5129
// GEMM forward + backward with the first backward output slot left empty (the "no h"
// case): the forward output, dw and dbias computed on GPU tensors are read back and
// compared with the same commands executed on host tensors.
TEST_CASE("backward gemm no h")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host operands and host reference results.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);

  // Device tensors with matching shapes.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);

  // Host buffers that receive the device results.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);

  // Seeded generator; the fill order (hw, hbias, ha, hg) determines which values land where.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
    hw->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (k = 0; k < 64; k++)
    hbias->data.f32[k] = dsfmt_genrand_open_close(&rng);
  for (k = 0; k < 10 * 128; k++)
    ha->data.f32[k] = dsfmt_genrand_open_close(&rng);
  for (k = 0; k < 10 * 64; k++)
    hg->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Upload the operands, then run forward/backward on host tensors followed by device tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(0, hdw, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(0, dw, dbias), 0);

  // Read the device results back and compare; dw uses the looser 5e-3 tolerance.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, 0), TENSOR_LIST(tb, tdw, tdbias, 0), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Release host tensors, then device tensors, then readback buffers.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
5189
5190
// GEMM forward + backward with the second backward output slot left empty (the "no dw"
// case): the forward output, dbias and h computed on GPU tensors are read back and
// compared with the same commands executed on host tensors.
TEST_CASE("backward gemm no dw")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host operands and host reference results.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Device tensors with matching shapes.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // Host buffers that receive the device results.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Seeded generator; the fill order (hw, hbias, ha, hg) determines which values land where.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  int k;
  for (k = 0; k < 64 * 128; k++)
    hw->data.f32[k] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (k = 0; k < 64; k++)
    hbias->data.f32[k] = dsfmt_genrand_open_close(&rng);
  for (k = 0; k < 10 * 128; k++)
    ha->data.f32[k] = dsfmt_genrand_open_close(&rng);
  for (k = 0; k < 10 * 64; k++)
    hg->data.f32[k] = dsfmt_genrand_open_close(&rng);

  // Upload the operands, then run forward/backward on host tensors followed by device tensors.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, 0, hdbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, 0, dbias), 0);

  // Read the device results back (the empty dw slot is kept in both lists) and compare.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, 0, dbias, h), TENSOR_LIST(tb, 0, tdbias, th), 0);
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Release host tensors, then device tensors, then readback buffers.
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hg);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hh);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(tb);
  ccv_nnc_tensor_free(tdbias);
  ccv_nnc_tensor_free(th);
}
5250
5251
// Batched (batch = 2) GEMM backward on MPS without transposes, where both batches share
// a single 2x3 `b`. The expected db and dbias below are therefore summed over the two
// batches, while h keeps one 4x2 slice per batch.
TEST_CASE("backward gemm with no transpose batch 2, same b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Incoming gradient, 2 x 4 x 3.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
    10, 20, 30,
    40, 50, 60,
    70, 80, 90,
    100, 110, 120,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  // Forward input, 2 x 4 x 2.
  float ap[] = {
    13, 14,
    15, 16,
    17, 18,
    19, 20,
    131, 141,
    151, 161,
    171, 181,
    191, 201,
  };
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  // Shared second operand, 2 x 3.
  float bp[] = {
    21, 22, 23,
    24, 25, 26,
  };
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  // Host tensors that receive the gradients computed on the device.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  // Device tensors; declared `* const` to match the sibling batched GEMM tests.
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  // Upload, run backward on the device tensors, then download the three gradients.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
  // Column sums of g, added across both batches.
  float dbiastp[] = {
    22 + 220, 26 + 260, 30 + 300,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  // Each row of g dotted with each row of b, per batch.
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
    10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
    40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
    70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
    100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  // Columns of a dotted with columns of g, accumulated over both batches.
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
  };
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(gdb);
  ccv_nnc_tensor_free(gdbias);
}
5332
5333
// Batched (batch = 2) GEMM backward on MPS without transposes, where `b` carries its own
// 2x3 slice per batch; the expected db and dbias are therefore kept per batch.
TEST_CASE("backward gemm with no transpose batch 2, batched b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // ---- inputs ----
  float g_values[] = {
    // batch 0
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
    // batch 1
    10, 20, 30,
    40, 50, 60,
    70, 80, 90,
    100, 110, 120,
  };
  float a_values[] = {
    // batch 0
    13, 14,
    15, 16,
    17, 18,
    19, 20,
    // batch 1
    131, 141,
    151, 161,
    171, 181,
    191, 201,
  };
  float b_values[] = {
    // batch 0
    21, 22, 23,
    24, 25, 26,
    // batch 1
    212, 222, 232,
    242, 252, 262,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(g_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);

  // ---- expected gradients ----
  float dbias_expected[] = {
    22, 26, 30,
    220, 260, 300,
  };
  float h_expected[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
  };
  float db_expected[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbias_expected, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(h_expected, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(db_expected, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);

  // ---- host result buffers and device tensors ----
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);

  // ---- upload, backward on device tensors, download ----
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // ---- checks (bias, then h, then db) ----
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // ---- cleanup ----
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(gdb);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
}
5417
5418
// Batched (batch = 2) GEMM backward on MPS with `a` stored transposed (2 x 2 x 4,
// TRANSPOSE(1, 2)) and a single 2x3 `b` shared by both batches; the expected db and
// dbias are summed over the batches and h comes back in the transposed layout.
TEST_CASE("backward gemm with transpose a batch 2, same b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // ---- inputs ----
  float g_values[] = {
    // batch 0
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
    // batch 1
    10, 20, 30,
    40, 50, 60,
    70, 80, 90,
    100, 110, 120,
  };
  float a_values[] = {
    // batch 0
    13, 15, 17, 19,
    14, 16, 18, 20,
    // batch 1
    131, 151, 171, 191,
    141, 161, 181, 201,
  };
  float b_values[] = {
    21, 22, 23,
    24, 25, 26,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(g_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // ---- expected gradients ----
  float dbias_expected[] = {
    22 + 220, 26 + 260, 30 + 300,
  };
  float h_expected[] = {
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
  };
  float db_expected[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbias_expected, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(h_expected, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(db_expected, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // ---- host result buffers and device tensors ----
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // ---- upload, backward on device tensors, download ----
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // ---- checks (bias, then h, then db) ----
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // ---- cleanup ----
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(gdb);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
}
5489
5490
// Batched (batch = 2) GEMM backward on MPS with `b` stored transposed (2 x 3 x 2,
// TRANSPOSE(1, 2)) and one slice of `b` per batch; the expected db uses the transposed
// layout and both db and dbias are kept per batch.
TEST_CASE("backward gemm with transpose b batch 2, batched b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // ---- inputs ----
  float g_values[] = {
    // batch 0
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
    // batch 1
    10, 20, 30,
    40, 50, 60,
    70, 80, 90,
    100, 110, 120,
  };
  float a_values[] = {
    // batch 0
    13, 14,
    15, 16,
    17, 18,
    19, 20,
    // batch 1
    131, 141,
    151, 161,
    171, 181,
    191, 201,
  };
  float b_values[] = {
    // batch 0
    21, 24,
    22, 25,
    23, 26,
    // batch 1
    212, 242,
    222, 252,
    232, 262,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(g_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);

  // ---- expected gradients ----
  float dbias_expected[] = {
    22, 26, 30,
    220, 260, 300,
  };
  float h_expected[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
  };
  float db_expected[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbias_expected, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(h_expected, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(db_expected, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);

  // ---- host result buffers and device tensors ----
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);

  // ---- upload, backward on device tensors, download ----
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // ---- checks (bias, then h, then db) ----
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // ---- cleanup ----
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(gdb);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
}
5578
5579
// Batched (batch = 2) GEMM backward on MPS with both operands stored transposed:
// `a` is 2 x 2 x 4 (TRANSPOSE(1, 2)) and the single shared `b` is 3 x 2 (TRANSPOSE(0, 1)).
// The expected db and dbias are summed over the two batches.
TEST_CASE("backward gemm with transpose a and b batch 2, same b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // ---- inputs ----
  float g_values[] = {
    // batch 0
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
    // batch 1
    10, 20, 30,
    40, 50, 60,
    70, 80, 90,
    100, 110, 120,
  };
  float a_values[] = {
    // batch 0
    13, 15, 17, 19,
    14, 16, 18, 20,
    // batch 1
    131, 151, 171, 191,
    141, 161, 181, 201,
  };
  float b_values[] = {
    21, 24,
    22, 25,
    23, 26,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(g_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // ---- expected gradients ----
  float dbias_expected[] = {
    22 + 220, 26 + 260, 30 + 300,
  };
  float h_expected[] = {
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
  };
  float db_expected[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbias_expected, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(h_expected, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(db_expected, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // ---- host result buffers and device tensors ----
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // ---- upload, backward on device tensors, download ----
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // ---- checks (bias, then h, then db) ----
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // ---- cleanup ----
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(gdb);
  ccv_nnc_tensor_free(gdbias);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
}
5652
5653
// Batched (batch = 2) GEMM backward on MPS without transposes and with one 2x3 slice of
// `b` per batch. Only two outputs (h and db) are requested; no bias gradient is produced.
TEST_CASE("backward gemm with no transpose batch 2, batched b, no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // ---- inputs ----
  float g_values[] = {
    // batch 0
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
    // batch 1
    10, 20, 30,
    40, 50, 60,
    70, 80, 90,
    100, 110, 120,
  };
  float a_values[] = {
    // batch 0
    13, 14,
    15, 16,
    17, 18,
    19, 20,
    // batch 1
    131, 141,
    151, 161,
    171, 181,
    191, 201,
  };
  float b_values[] = {
    // batch 0
    21, 22, 23,
    24, 25, 26,
    // batch 1
    212, 222, 232,
    242, 252, 262,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(g_values, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_values, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_values, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);

  // ---- expected gradients ----
  float h_expected[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
  };
  float db_expected[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(h_expected, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(db_expected, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);

  // ---- host result buffers and device tensors ----
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);

  // ---- upload, backward on device tensors, download ----
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);

  // ---- checks (h, then db) ----
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // ---- cleanup ----
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(gh);
  ccv_nnc_tensor_free(gdb);
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
}
5727
5728
TEST_CASE("backward gemm with transpose b batch 2, batched b, no bias")
5729
1
{
5730
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5731
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5732
0
  float gp[] = {
5733
0
    1, 2, 3,
5734
0
    4, 5, 6,
5735
0
    7, 8, 9,
5736
0
    10, 11, 12,
5737
0
    10, 20, 30,
5738
0
    40, 50, 60,
5739
0
    70, 80, 90,
5740
0
    100, 110, 120,
5741
0
  };
5742
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5743
0
  float ap[] = {
5744
0
    13, 14,
5745
0
    15, 16,
5746
0
    17, 18,
5747
0
    19, 20,
5748
0
    131, 141,
5749
0
    151, 161,
5750
0
    171, 181,
5751
0
    191, 201,
5752
0
  };
5753
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5754
0
  float bp[] = {
5755
0
    21, 24,
5756
0
    22, 25,
5757
0
    23, 26,
5758
0
    212, 242,
5759
0
    222, 252,
5760
0
    232, 262,
5761
0
  };
5762
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5763
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5764
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5765
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5766
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5767
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
5768
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5769
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
5770
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5771
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
5772
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
5773
0
  float htp[] = {
5774
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5775
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5776
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5777
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5778
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
5779
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
5780
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
5781
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
5782
0
  };
5783
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5784
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5785
0
  float dbtp[] = {
5786
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
5787
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
5788
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5789
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
5790
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
5791
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5792
0
  };
5793
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5794
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5795
0
  ccv_nnc_tensor_free(g);
5796
0
  ccv_nnc_tensor_free(a);
5797
0
  ccv_nnc_tensor_free(b);
5798
0
  ccv_nnc_tensor_free(h);
5799
0
  ccv_nnc_tensor_free(db);
5800
0
  ccv_nnc_tensor_free(gg);
5801
0
  ccv_nnc_tensor_free(ga);
5802
0
  ccv_nnc_tensor_free(gb);
5803
0
  ccv_nnc_tensor_free(gh);
5804
0
  ccv_nnc_tensor_free(gdb);
5805
0
}
5806
5807
TEST_CASE("backward gemm with transpose a and b batch 2, batch b, no bias")
5808
1
{
5809
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5810
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5811
0
  float gp[] = {
5812
0
    1, 2, 3,
5813
0
    4, 5, 6,
5814
0
    7, 8, 9,
5815
0
    10, 11, 12,
5816
0
    10, 20, 30,
5817
0
    40, 50, 60,
5818
0
    70, 80, 90,
5819
0
    100, 110, 120,
5820
0
  };
5821
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5822
0
  float ap[] = {
5823
0
    13, 15, 17, 19,
5824
0
    14, 16, 18, 20,
5825
0
    131, 151, 171, 191,
5826
0
    141, 161, 181, 201,
5827
0
  };
5828
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5829
0
  float bp[] = {
5830
0
    21, 24,
5831
0
    22, 25,
5832
0
    23, 26,
5833
0
    212, 242,
5834
0
    222, 252,
5835
0
    232, 262,
5836
0
  };
5837
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5838
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5839
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5840
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5841
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
5842
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
5843
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
5844
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
5845
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5846
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
5847
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
5848
0
  float htp[] = {
5849
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
5850
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
5851
0
    10 * 212 + 20 * 222 + 30 * 232, 40 * 212 + 50 * 222 + 60 * 232, 70 * 212 + 80 * 222 + 90 * 232, 100 * 212 + 110 * 222 + 120 * 232,
5852
0
    10 * 242 + 20 * 252 + 30 * 262, 40 * 242 + 50 * 252 + 60 * 262, 70 * 242 + 80 * 252 + 90 * 262, 100 * 242 + 110 * 252 + 120 * 262,
5853
0
  };
5854
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5855
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5856
0
  float dbtp[] = {
5857
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
5858
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
5859
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5860
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
5861
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
5862
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5863
0
  };
5864
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5865
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5866
0
  ccv_nnc_tensor_free(g);
5867
0
  ccv_nnc_tensor_free(a);
5868
0
  ccv_nnc_tensor_free(b);
5869
0
  ccv_nnc_tensor_free(h);
5870
0
  ccv_nnc_tensor_free(db);
5871
0
  ccv_nnc_tensor_free(gg);
5872
0
  ccv_nnc_tensor_free(ga);
5873
0
  ccv_nnc_tensor_free(gb);
5874
0
  ccv_nnc_tensor_free(gh);
5875
0
  ccv_nnc_tensor_free(gdb);
5876
0
}
5877
5878
TEST_CASE("mps segmented gemm")
5879
1
{
5880
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
5881
0
  dsfmt_t dsfmt;
5882
0
  dsfmt_init_gen_rand(&dsfmt, 11);
5883
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 256), 0);
5884
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
5885
0
  hindices->data.i32[0] = 1;
5886
0
  hindices->data.i32[1] = 0;
5887
0
  hindices->data.i32[2] = 2;
5888
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
5889
0
  hcounts->data.i32[0] = 129;
5890
0
  hcounts->data.i32[1] = 131;
5891
0
  hcounts->data.i32[2] = 124;
5892
0
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 128, 256), 0);
5893
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 128), 0);
5894
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 128), 0);
5895
0
  int i;
5896
0
  for (i = 0; i < 3 * 128 * 256; i++)
5897
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 256;
5898
0
  for (i = 0; i < 384 * 256; i++)
5899
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5900
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 384, 256), 0);
5901
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
5902
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
5903
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 128, 256), 0);
5904
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 384, 128), 0);
5905
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(a, indices, counts, w), 0);
5906
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
5907
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
5908
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
5909
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 384 * 128, 3e-4, "segmented GEMM result should match CPU reference");
5910
0
  ccv_nnc_tensor_free(a);
5911
0
  ccv_nnc_tensor_free(indices);
5912
0
  ccv_nnc_tensor_free(counts);
5913
0
  ccv_nnc_tensor_free(w);
5914
0
  ccv_nnc_tensor_free(b);
5915
0
  ccv_nnc_tensor_free(ha);
5916
0
  ccv_nnc_tensor_free(hindices);
5917
0
  ccv_nnc_tensor_free(hcounts);
5918
0
  ccv_nnc_tensor_free(hw);
5919
0
  ccv_nnc_tensor_free(hb);
5920
0
  ccv_nnc_tensor_free(bt);
5921
0
}
5922
5923
TEST_CASE("mps segmented gemm with bias in half precision, split-k")
5924
1
{
5925
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
5926
0
  dsfmt_t dsfmt;
5927
0
  dsfmt_init_gen_rand(&dsfmt, 13);
5928
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 4096), 0);
5929
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
5930
0
  hindices->data.i32[0] = 1;
5931
0
  hindices->data.i32[1] = 0;
5932
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
5933
0
  hcounts->data.i32[0] = 136;
5934
0
  hcounts->data.i32[1] = 136;
5935
0
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 128, 4096), 0);
5936
0
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 128), 0);
5937
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 128), 0);
5938
0
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 272, 128), 0);
5939
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 128), 0);
5940
0
  int i;
5941
0
  for (i = 0; i < 2 * 128 * 4096; i++)
5942
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 4096;
5943
0
  for (i = 0; i < 2 * 128; i++)
5944
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 128;
5945
0
  for (i = 0; i < 272 * 4096; i++)
5946
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5947
0
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 272, 4096), 0);
5948
0
  ccv_nnc_tensor_t* const hw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 128, 4096), 0);
5949
0
  ccv_nnc_tensor_t* const hbias16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 128), 0);
5950
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(ha16, hw16, hbias16), 0);
5951
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 272, 4096), 0);
5952
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2), 0);
5953
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2), 0);
5954
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 2, 128, 4096), 0);
5955
0
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 2, 128), 0);
5956
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 272, 128), 0);
5957
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hindices, hcounts, hw16, hbias16), TENSOR_LIST(a, indices, counts, w, bias), 0);
5958
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
5959
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb16), 0);
5960
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hb16), TENSOR_LIST(hb), 0);
5961
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw, hbias), TENSOR_LIST(bt), 0);
5962
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 272 * 128, 2e-2, "half-precision segmented GEMM result should match CPU reference");
5963
0
  ccv_nnc_tensor_free(a);
5964
0
  ccv_nnc_tensor_free(indices);
5965
0
  ccv_nnc_tensor_free(counts);
5966
0
  ccv_nnc_tensor_free(w);
5967
0
  ccv_nnc_tensor_free(bias);
5968
0
  ccv_nnc_tensor_free(b);
5969
0
  ccv_nnc_tensor_free(ha);
5970
0
  ccv_nnc_tensor_free(hindices);
5971
0
  ccv_nnc_tensor_free(hcounts);
5972
0
  ccv_nnc_tensor_free(hw);
5973
0
  ccv_nnc_tensor_free(hbias);
5974
0
  ccv_nnc_tensor_free(hb);
5975
0
  ccv_nnc_tensor_free(hb16);
5976
0
  ccv_nnc_tensor_free(bt);
5977
0
  ccv_nnc_tensor_free(ha16);
5978
0
  ccv_nnc_tensor_free(hw16);
5979
0
  ccv_nnc_tensor_free(hbias16);
5980
0
}
5981
5982
// Derived from shapes.txt NA lines, assuming the call shape is C = A @ B^T.
5983
1
NA_GEMM_SHAPE_TEST(306, 2048, 3840)
5984
1
NA_GEMM_SHAPE_TEST(306, 4096, 3840)
5985
1
NA_GEMM_SHAPE_TEST(306, 3840, 4096)
5986
1
NA_GEMM_SHAPE_TEST(306, 15360, 3840)
5987
1
NA_GEMM_SHAPE_TEST(306, 3840, 15360)
5988
1
NA_GEMM_SHAPE_TEST(1024, 4096, 4096)
5989
1
NA_GEMM_SHAPE_TEST(1024, 32, 4096)
5990
1
NA_GEMM_SHAPE_TEST(1024, 16384, 4096)
5991
1
NA_GEMM_SHAPE_TEST(1024, 4096, 16384)
5992
1
NA_GEMM_SHAPE_TEST(1024, 2048, 2048)
5993
1
NA_GEMM_SHAPE_TEST(1024, 32, 2048)
5994
1
NA_GEMM_SHAPE_TEST(1024, 8192, 2048)
5995
1
NA_GEMM_SHAPE_TEST(1024, 2048, 8192)
5996
1
NA_GEMM_SHAPE_TEST(1, 2048, 256)
5997
1
NA_GEMM_SHAPE_TEST(1, 2048, 2048)
5998
1
NA_GEMM_SHAPE_TEST(1, 4096, 256)
5999
1
NA_GEMM_SHAPE_TEST(1, 4096, 4096)
6000
1
NA_GEMM_SHAPE_TEST(1024, 4096, 128)
6001
1
NA_GEMM_SHAPE_TEST(257, 2048, 128)
6002
1
NA_GEMM_SHAPE_TEST(33792, 4096, 4096)
6003
1
NA_GEMM_SHAPE_TEST(33792, 32, 4096)
6004
1
NA_GEMM_SHAPE_TEST(257, 2048, 2048)
6005
1
NA_GEMM_SHAPE_TEST(257, 32, 2048)
6006
1
NA_GEMM_SHAPE_TEST(33792, 2048, 4096)
6007
1
NA_GEMM_SHAPE_TEST(33792, 4096, 2048)
6008
1
NA_GEMM_SHAPE_TEST(33792, 16384, 4096)
6009
1
NA_GEMM_SHAPE_TEST(33792, 4096, 16384)
6010
1
NA_GEMM_SHAPE_TEST(257, 8192, 2048)
6011
1
NA_GEMM_SHAPE_TEST(257, 2048, 8192)
6012
1
NA_GEMM_SHAPE_TEST(33792, 128, 4096)
6013
1
NA_GEMM_SHAPE_TEST(257, 128, 2048)
6014
1
NA_GEMM_BIAS_SHAPE_TEST(306, 2048, 3840)
6015
1
NA_GEMM_BIAS_SHAPE_TEST(306, 4096, 3840)
6016
1
NA_GEMM_BIAS_SHAPE_TEST(306, 3840, 4096)
6017
1
NA_GEMM_BIAS_SHAPE_TEST(306, 15360, 3840)
6018
1
NA_GEMM_BIAS_SHAPE_TEST(306, 3840, 15360)
6019
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 4096)
6020
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 32, 4096)
6021
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 16384, 4096)
6022
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 16384)
6023
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 2048, 2048)
6024
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 32, 2048)
6025
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 8192, 2048)
6026
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 2048, 8192)
6027
1
NA_GEMM_BIAS_SHAPE_TEST(1, 2048, 256)
6028
1
NA_GEMM_BIAS_SHAPE_TEST(1, 2048, 2048)
6029
1
NA_GEMM_BIAS_SHAPE_TEST(1, 4096, 256)
6030
1
NA_GEMM_BIAS_SHAPE_TEST(1, 4096, 4096)
6031
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 128)
6032
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 128)
6033
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 4096)
6034
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 32, 4096)
6035
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 2048)
6036
1
NA_GEMM_BIAS_SHAPE_TEST(257, 32, 2048)
6037
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 2048, 4096)
6038
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 2048)
6039
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 16384, 4096)
6040
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 16384)
6041
1
NA_GEMM_BIAS_SHAPE_TEST(257, 8192, 2048)
6042
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 8192)
6043
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 128, 4096)
6044
NA_GEMM_BIAS_SHAPE_TEST(257, 128, 2048)
6045
6046
#include "case_main.h"