Coverage Report

Created: 2026-04-14 19:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/test/int/nnc/mpsblas.tests.c
Line
Count
Source
1
#include "case.h"
2
#include "ccv_case.h"
3
#include "ccv_nnc_case.h"
4
#include <ccv.h>
5
#include <nnc/ccv_nnc.h>
6
#include <nnc/ccv_nnc_easy.h>
7
#include <nnc/mps/ccv_nnc_mps.h>
8
#include <3rdparty/dsfmt/dSFMT.h>
9
#include <math.h>
10
#include <stdlib.h>
11
12
/* Test-harness setup hook: its body calls ccv_nnc_init().
 * NOTE(review): presumably the harness runs this before the test cases in this file — confirm against case.h. */
TEST_SETUP()
13
{
14
  ccv_nnc_init();
15
}
16
17
/* Deterministic test value for the A operand at (row, k).
 * The result is one of 1/512, 2/512, ..., 23/512. */
static float _mps_forward_na_gemm_a_value(const int row, const int k)
{
  const int bucket = (row * 17 + k * 13) % 23;
  return (float)(bucket + 1) / 512.0f;
}
21
22
/* Deterministic test value for the weight operand at (col, k).
 * The result is one of 1/512, 2/512, ..., 29/512. */
static float _mps_forward_na_gemm_b_value(const int col, const int k)
{
  const int bucket = (col * 19 + k * 7) % 29;
  return (float)(bucket + 1) / 512.0f;
}
26
27
/* Deterministic test value for the bias at column col.
 * Unlike the A / weight generators this one is signed: it ranges over -8/256 .. 8/256
 * for non-negative col. */
static float _mps_forward_na_gemm_bias_value(const int col)
{
  const int centered = (col * 5) % 17 - 8;
  return (float)centered / 256.0f;
}
31
32
/* Fill a rows x cols half-precision matrix (row-major, densely packed) with the
 * deterministic A pattern (for_a != 0) or weight pattern (for_a == 0).
 * Values are generated one row at a time in a float scratch buffer and then
 * converted to half precision, so only `cols` floats are ever held at once. */
static void _mps_forward_na_gemm_fill_half(ccv_float16_t* const data, const int rows, const int cols, const int for_a)
{
  uint16_t* const half_out = (uint16_t*)data;
  float* const scratch = (float*)ccmalloc(sizeof(float) * cols);
  int r;
  for (r = 0; r < rows; r++)
  {
    int c;
    if (for_a)
    {
      for (c = 0; c < cols; c++)
        scratch[c] = _mps_forward_na_gemm_a_value(r, c);
    } else {
      for (c = 0; c < cols; c++)
        scratch[c] = _mps_forward_na_gemm_b_value(r, c);
    }
    ccv_float_to_half_precision(scratch, half_out + (size_t)r * cols, cols);
  }
  ccfree(scratch);
}
44
45
/* Fill a half-precision vector of `cols` entries with the deterministic bias pattern.
 * The values are produced as floats first and converted in a single call. */
static void _mps_forward_na_gemm_fill_bias_half(ccv_float16_t* const data, const int cols)
{
  float* const scratch = (float*)ccmalloc(sizeof(float) * cols);
  int c = 0;
  while (c < cols)
  {
    scratch[c] = _mps_forward_na_gemm_bias_value(c);
    c++;
  }
  ccv_float_to_half_precision(scratch, (uint16_t*)data, cols);
  ccfree(scratch);
}
54
55
/* Forward declaration: the definition appears further down in this file, but
 * _mps_forward_ane_rowwise_gemm_stream_sync_validate calls it before that point. */
static void _mps_forward_scaled_gemm_to_float(const int datatype, const void* const data, const int count, float* const values);
56
57
/* Float reference for one output element of the test GEMM:
 *   sum over k in [0, k_dim) of A(row, k) * W(col, k), plus bias(col) when use_bias != 0.
 * Operands come straight from the float generators (they are not rounded to half
 * precision first). */
static float _mps_forward_na_gemm_expected(const int row, const int col, const int k_dim, const int use_bias)
{
  float acc = 0;
  int kk = 0;
  while (kk < k_dim)
  {
    acc += _mps_forward_na_gemm_a_value(row, kk) * _mps_forward_na_gemm_b_value(col, kk);
    kk++;
  }
  if (use_bias)
    acc += _mps_forward_na_gemm_bias_value(col);
  return acc;
}
67
68
/* Choose up to 8 distinct indices in [0, dim) to spot-check.
 * Candidates, in order: 0, 1, boundary - 1, boundary, optionally 32767 and 32768
 * (when include_large_m_boundary != 0), dim / 2 and dim - 1. Out-of-range
 * candidates and repeats are dropped; survivors keep candidate order.
 * Writes the survivors to indices[] and returns how many were written. */
static int _mps_forward_na_gemm_sample_indices(const int dim, const int boundary, const int include_large_m_boundary, int indices[8])
{
  const int large_lo = include_large_m_boundary ? 32767 : -1; // -1 is always filtered out below
  const int large_hi = include_large_m_boundary ? 32768 : -1;
  const int candidates[8] = { 0, 1, boundary - 1, boundary, large_lo, large_hi, dim / 2, dim - 1 };
  int kept = 0;
  int c;
  for (c = 0; c < 8; c++)
  {
    const int value = candidates[c];
    int duplicate = 0;
    int p;
    if (value < 0 || value >= dim)
      continue;
    for (p = 0; p < kept && !duplicate; p++)
      duplicate = (indices[p] == value);
    if (!duplicate)
      indices[kept++] = value;
  }
  return kept;
}
91
92
/* Record of one sampled output element, filled in by the
 * _mps_forward_na_gemm_validate_shape* helpers: the (row, col) position, the value
 * read back from the device (actual) and the float reference (expected). */
typedef struct {
93
  int row;
94
  int col;
95
  float actual;
96
  float expected;
97
} _mps_forward_na_gemm_mismatch_t;
98
99
/* Shared body of the two half-precision GEMM shape checks.
 *
 * Runs B = A * W^T (+ bias when use_bias != 0) on the GPU with
 * A: m_dim x k_dim, W: n_dim x k_dim, bias: n_dim, B: m_dim x n_dim, all 16F,
 * then reads back a handful of sampled (row, col) elements one at a time and
 * compares each against the float reference with an absolute tolerance of 2e-1.
 *
 * mismatch is overwritten for every sample inspected; on failure it describes
 * the first offending element, on success the last sample checked.
 * Returns 1 when every sample is within tolerance, 0 otherwise. */
static int _mps_forward_na_gemm_validate_shape_impl(const int m_dim, const int n_dim, const int k_dim, const int use_bias, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, k_dim), 0);
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, n_dim, k_dim), 0);
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, n_dim), 0) : 0;
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, k_dim), 0);
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim, k_dim), 0);
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim), 0) : 0;

  // Stage the inputs on the host and upload them; the host copies are not needed afterwards.
  _mps_forward_na_gemm_fill_half(ha->data.f16, m_dim, k_dim, 1);
  _mps_forward_na_gemm_fill_half(hw->data.f16, n_dim, k_dim, 0);
  if (use_bias)
  {
    _mps_forward_na_gemm_fill_bias_half(hbias->data.f16, n_dim);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  } else
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  if (hbias)
    ccv_nnc_tensor_free(hbias);

  if (use_bias)
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  else
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);

  // Rows are sampled around 128 and 32767/32768, columns around 64, plus first / middle / last.
  int row_samples[8];
  int col_samples[8];
  const int row_sample_size = _mps_forward_na_gemm_sample_indices(m_dim, 128, 1, row_samples);
  const int col_sample_size = _mps_forward_na_gemm_sample_indices(n_dim, 64, 0, col_samples);
  ccv_nnc_tensor_t* const sample_h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 1), 0);
  ccv_nnc_tensor_t* const sample_f = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1), 0);
  int ok = 1;
  int i, j;
  // The `ok &&` guards stop both loops at the first out-of-tolerance sample.
  for (i = 0; ok && i < row_sample_size; i++)
    for (j = 0; ok && j < col_sample_size; j++)
    {
      const int row = row_samples[i];
      const int col = col_samples[j];
      // 1x1 view onto B at (row, col); copied to the host as 16F, then widened to 32F.
      ccv_nnc_tensor_view_t* const bv = ccv_nnc_tensor_view_new(b, GPU_TENSOR_NHWC(000, 16F, 1, 1), DIM_ALLOC(row, col), DIM_ALLOC(n_dim, 1));
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)bv), TENSOR_LIST(sample_h), 0);
      ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(sample_h), TENSOR_LIST(sample_f), 0);
      mismatch->row = row;
      mismatch->col = col;
      mismatch->actual = sample_f->data.f32[0];
      mismatch->expected = _mps_forward_na_gemm_expected(row, col, k_dim, use_bias);
      ccv_nnc_tensor_view_free(bv);
      if (fabsf(mismatch->actual - mismatch->expected) > 2e-1f)
        ok = 0;
    }

  ccv_nnc_tensor_free(sample_h);
  ccv_nnc_tensor_free(sample_f);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  if (bias)
    ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(b);
  return ok;
}

/* Spot-check the 16F GEMM B = A * W^T (no bias) for the given shape.
 * Returns 1 on success, 0 on the first sample outside tolerance; see
 * _mps_forward_na_gemm_validate_shape_impl for what is written to mismatch. */
static int _mps_forward_na_gemm_validate_shape(const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  return _mps_forward_na_gemm_validate_shape_impl(m_dim, n_dim, k_dim, 0, mismatch);
}

/* Spot-check the 16F GEMM B = A * W^T + bias for the given shape.
 * Returns 1 on success, 0 on the first sample outside tolerance; see
 * _mps_forward_na_gemm_validate_shape_impl for what is written to mismatch. */
static int _mps_forward_na_gemm_validate_shape_with_bias(const int m_dim, const int n_dim, const int k_dim, _mps_forward_na_gemm_mismatch_t* const mismatch)
{
  return _mps_forward_na_gemm_validate_shape_impl(m_dim, n_dim, k_dim, 1, mismatch);
}
201
202
/* Deterministic left-hand operand value at (row, k); `variant` shifts the pattern
 * so that two fills of the same shape produce different data. */
static float _mps_forward_ane_stream_lhs_value(const int row, const int k, const int variant)
{
  const int bucket = (row * 31 + k * 17 + variant * 19) % 97;
  return (float)(bucket - 48) / 64.0f;
}
206
207
/* Deterministic right-hand operand value at (row, k); `variant` shifts the pattern
 * so that two fills of the same shape produce different data. */
static float _mps_forward_ane_stream_rhs_value(const int row, const int k, const int variant)
{
  const int bucket = (row * 13 + k * 29 + variant * 23) % 89;
  return (float)(bucket - 44) / 64.0f;
}
211
212
/* Fill a rows x cols half-precision matrix (row-major, densely packed) with the
 * stream-test pattern for the given variant: the lhs generator when for_lhs != 0,
 * the rhs generator otherwise. Rows are generated as floats and converted one
 * row at a time. */
static void _mps_forward_ane_stream_fill_half(ccv_float16_t* const data, const int rows, const int cols, const int variant, const int for_lhs)
{
  uint16_t* const half_out = (uint16_t*)data;
  float* const scratch = (float*)ccmalloc(sizeof(float) * cols);
  int r;
  for (r = 0; r < rows; r++)
  {
    int c;
    if (for_lhs)
    {
      for (c = 0; c < cols; c++)
        scratch[c] = _mps_forward_ane_stream_lhs_value(r, c, variant);
    } else {
      for (c = 0; c < cols; c++)
        scratch[c] = _mps_forward_ane_stream_rhs_value(r, c, variant);
    }
    ccv_float_to_half_precision(scratch, half_out + (size_t)r * cols, cols);
  }
  ccfree(scratch);
}
224
225
static int _mps_forward_ane_rowwise_gemm_stream_sync_validate(double* const max_abs_ref, double* const max_rel_ref)
226
0
{
227
0
  const int m_dim = 512;
228
0
  const int n_dim = 768;
229
0
  const int k_dim = 1024;
230
0
  const int writer_k = 4096;
231
0
  ccv_nnc_tensor_t* const hlhs_old = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, writer_k), 0);
232
0
  ccv_nnc_tensor_t* const hrhs_old = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, k_dim, writer_k), 0);
233
0
  ccv_nnc_tensor_t* const hlhs_new = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, writer_k), 0);
234
0
  ccv_nnc_tensor_t* const hrhs_new = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, k_dim, writer_k), 0);
235
0
  ccv_nnc_tensor_t* const hw_dense = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, n_dim, k_dim), 0);
236
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16F, n_dim, k_dim)), 0);
237
0
  ccv_nnc_tensor_t* const lhs = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, writer_k), 0);
238
0
  ccv_nnc_tensor_t* const rhs = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, k_dim, writer_k), 0);
239
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, k_dim), 0);
240
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16F, n_dim, k_dim)), 0);
241
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
242
0
  ccv_nnc_tensor_t* const bref = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, m_dim, n_dim), 0);
243
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, n_dim), 0);
244
0
  ccv_nnc_tensor_t* const hbref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, m_dim, n_dim), 0);
245
0
  ccv_nnc_stream_context_t* const stream_context = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
246
0
  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
247
0
  float* const expected = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
248
0
  _mps_forward_ane_stream_fill_half(hlhs_old->data.f16, m_dim, writer_k, 0, 1);
249
0
  _mps_forward_ane_stream_fill_half(hrhs_old->data.f16, k_dim, writer_k, 0, 0);
250
0
  _mps_forward_ane_stream_fill_half(hlhs_new->data.f16, m_dim, writer_k, 1, 1);
251
0
  _mps_forward_ane_stream_fill_half(hrhs_new->data.f16, k_dim, writer_k, 1, 0);
252
0
  _mps_forward_na_gemm_fill_half(hw_dense->data.f16, n_dim, k_dim, 0);
253
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hw_dense->data.f16, CCV_16F, CCV_TENSOR_CPU_MEMORY, (size_t)n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
254
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
255
0
  {
256
0
    ccv_nnc_stream_context_free(stream_context);
257
0
    ccfree(expected);
258
0
    ccfree(actual);
259
0
    ccv_nnc_tensor_free(hbref);
260
0
    ccv_nnc_tensor_free(hb);
261
0
    ccv_nnc_tensor_free(bref);
262
0
    ccv_nnc_tensor_free(b);
263
0
    ccv_nnc_tensor_free(w);
264
0
    ccv_nnc_tensor_free(a);
265
0
    ccv_nnc_tensor_free(rhs);
266
0
    ccv_nnc_tensor_free(lhs);
267
0
    ccv_nnc_tensor_free(hwq);
268
0
    ccv_nnc_tensor_free(hw_dense);
269
0
    ccv_nnc_tensor_free(hrhs_new);
270
0
    ccv_nnc_tensor_free(hlhs_new);
271
0
    ccv_nnc_tensor_free(hrhs_old);
272
0
    ccv_nnc_tensor_free(hlhs_old);
273
0
    return -1;
274
0
  }
275
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hlhs_old, hrhs_old, hwq), TENSOR_LIST(lhs, rhs, w), stream_context);
276
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(a), stream_context);
277
0
  ccv_nnc_synchronize_stream_context(stream_context);
278
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), stream_context);
279
0
  ccv_nnc_synchronize_stream_context(stream_context);
280
281
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hlhs_new, hrhs_new), TENSOR_LIST(lhs, rhs), stream_context);
282
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(a), stream_context);
283
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), stream_context);
284
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), stream_context);
285
0
  ccv_nnc_synchronize_stream_context(stream_context);
286
287
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hlhs_new, hrhs_new), TENSOR_LIST(lhs, rhs), stream_context);
288
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(lhs, rhs), TENSOR_LIST(a), stream_context);
289
0
  ccv_nnc_synchronize_stream_context(stream_context);
290
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(bref), stream_context);
291
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bref), TENSOR_LIST(hbref), stream_context);
292
0
  ccv_nnc_synchronize_stream_context(stream_context);
293
294
0
  _mps_forward_scaled_gemm_to_float(CCV_16F, hb->data.f16, m_dim * n_dim, actual);
295
0
  _mps_forward_scaled_gemm_to_float(CCV_16F, hbref->data.f16, m_dim * n_dim, expected);
296
0
  double max_abs = 0;
297
0
  double max_rel = 0;
298
0
  int i;
299
0
  for (i = 0; i < m_dim * n_dim; i++)
300
0
  {
301
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
302
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
303
0
    max_abs = ccv_max(max_abs, diff);
304
0
    max_rel = ccv_max(max_rel, diff / denom);
305
0
  }
306
0
  if (max_abs_ref)
307
0
    *max_abs_ref = max_abs;
308
0
  if (max_rel_ref)
309
0
    *max_rel_ref = max_rel;
310
0
  ccfree(expected);
311
0
  ccfree(actual);
312
0
  ccv_nnc_stream_context_free(stream_context);
313
0
  ccv_nnc_tensor_free(hbref);
314
0
  ccv_nnc_tensor_free(hb);
315
0
  ccv_nnc_tensor_free(bref);
316
0
  ccv_nnc_tensor_free(b);
317
0
  ccv_nnc_tensor_free(w);
318
0
  ccv_nnc_tensor_free(a);
319
0
  ccv_nnc_tensor_free(rhs);
320
0
  ccv_nnc_tensor_free(lhs);
321
0
  ccv_nnc_tensor_free(hwq);
322
0
  ccv_nnc_tensor_free(hw_dense);
323
0
  ccv_nnc_tensor_free(hrhs_new);
324
0
  ccv_nnc_tensor_free(hlhs_new);
325
0
  ccv_nnc_tensor_free(hrhs_old);
326
0
  ccv_nnc_tensor_free(hlhs_old);
327
0
  return 0;
328
0
}
329
330
static void _mps_forward_scaled_gemm_fill_matrix(const int datatype, void* const data, const int rows, const int cols, const int for_a)
331
0
{
332
0
  float* const values = (float*)ccmalloc(sizeof(float) * rows * cols);
333
0
  int i, j;
334
0
  for (i = 0; i < rows; i++)
335
0
    for (j = 0; j < cols; j++)
336
0
      values[i * cols + j] = for_a ? _mps_forward_na_gemm_a_value(i, j) : _mps_forward_na_gemm_b_value(i, j);
337
0
  if (datatype == CCV_16F)
338
0
    ccv_float_to_half_precision(values, (uint16_t*)data, rows * cols);
339
0
  else if (datatype == CCV_16BF)
340
0
    ccv_float_to_bfloat(values, (uint16_t*)data, rows * cols);
341
0
  else
342
0
    memcpy(data, values, sizeof(float) * rows * cols);
343
0
  ccfree(values);
344
0
}
345
346
static void _mps_forward_scaled_gemm_fill_bias(const int datatype, void* const data, const int cols)
347
0
{
348
0
  float* const values = (float*)ccmalloc(sizeof(float) * cols);
349
0
  int j;
350
0
  for (j = 0; j < cols; j++)
351
0
    values[j] = _mps_forward_na_gemm_bias_value(j);
352
0
  if (datatype == CCV_16F)
353
0
    ccv_float_to_half_precision(values, (uint16_t*)data, cols);
354
0
  else if (datatype == CCV_16BF)
355
0
    ccv_float_to_bfloat(values, (uint16_t*)data, cols);
356
0
  else
357
0
    memcpy(data, values, sizeof(float) * cols);
358
0
  ccfree(values);
359
0
}
360
361
static void _mps_forward_scaled_gemm_to_float(const int datatype, const void* const data, const int count, float* const values)
362
0
{
363
0
  if (datatype == CCV_16F)
364
0
    ccv_half_precision_to_float((const uint16_t*)data, values, count);
365
0
  else if (datatype == CCV_16BF)
366
0
    ccv_bfloat_to_float((const uint16_t*)data, values, count);
367
0
  else
368
0
    memcpy(values, data, sizeof(float) * count);
369
0
}
370
371
/* Compare two densely packed rows x cols matrices of the same datatype, one row at a time.
 * Each row is widened to float before comparison, so only two `cols`-sized float
 * buffers are needed regardless of `rows`.
 * Reports (each output optional) the largest absolute difference and the largest
 * difference divided by max(1, |actual|, |expected|). */
static void _mps_forward_scaled_gemm_compare_rows(const int datatype, const void* const actual_data, const void* const expected_data, const int rows, const int cols, double* const max_abs_ref, double* const max_rel_ref)
{
  const size_t row_bytes = (size_t)cols * CCV_GET_DATA_TYPE_SIZE(datatype);
  const uint8_t* actual_cursor = (const uint8_t*)actual_data;
  const uint8_t* expected_cursor = (const uint8_t*)expected_data;
  float* const got_row = (float*)ccmalloc(sizeof(float) * cols);
  float* const want_row = (float*)ccmalloc(sizeof(float) * cols);
  double worst_abs = 0;
  double worst_rel = 0;
  int r;
  for (r = 0; r < rows; r++, actual_cursor += row_bytes, expected_cursor += row_bytes)
  {
    int c;
    _mps_forward_scaled_gemm_to_float(datatype, actual_cursor, cols, got_row);
    _mps_forward_scaled_gemm_to_float(datatype, expected_cursor, cols, want_row);
    for (c = 0; c < cols; c++)
    {
      const double got = (double)got_row[c];
      const double want = (double)want_row[c];
      const double diff = fabs(got - want);
      // Denominator is clamped at 1 so near-zero values do not inflate the relative error.
      const double denom = ccv_max(1.0, ccv_max(fabs(got), fabs(want)));
      worst_abs = ccv_max(worst_abs, diff);
      worst_rel = ccv_max(worst_rel, diff / denom);
    }
  }
  if (max_abs_ref)
    *max_abs_ref = worst_abs;
  if (max_rel_ref)
    *max_rel_ref = worst_rel;
  ccfree(want_row);
  ccfree(got_row);
}
400
401
static void _mps_forward_scaled_gemm_quantized_reference(const int datatype, const void* const data, const int rows, const int cols, float* const values)
402
0
{
403
0
  ccv_nnc_tensor_param_t params = {
404
0
    .type = CCV_TENSOR_CPU_MEMORY,
405
0
    .format = CCV_TENSOR_FORMAT_NHWC,
406
0
    .datatype = datatype,
407
0
    .dim = { rows, cols, 0 },
408
0
  };
409
0
  const ccv_nnc_tensor_param_t qparams = ccv_nnc_tensor_8i_rowwise(params);
410
0
  const size_t qsize = ccv_nnc_tensor_data_size_without_padding(qparams);
411
0
  uint8_t* const qdata = (uint8_t*)ccmalloc(qsize);
412
0
  const size_t encoded = ccv_nnc_quantize_8i_rowwise(data, datatype, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, qdata, qsize);
413
0
  void* dequantized = 0;
414
0
  if (datatype == CCV_16F || datatype == CCV_16BF)
415
0
    dequantized = ccmalloc(sizeof(uint16_t) * rows * cols);
416
0
  else
417
0
    dequantized = ccmalloc(sizeof(float) * rows * cols);
418
0
  ccv_nnc_dequantize_8i_rowwise(qdata, datatype, CCV_TENSOR_CPU_MEMORY, encoded, cols, dequantized, rows * cols);
419
0
  _mps_forward_scaled_gemm_to_float(datatype, dequantized, rows * cols, values);
420
0
  ccfree(dequantized);
421
0
  ccfree(qdata);
422
0
}
423
424
/* Plain float reference GEMM: out = a * w^T (+ bias).
 *   a    : m_dim x k_dim, row-major
 *   w    : n_dim x k_dim, row-major (each row is one output column's weights)
 *   bias : n_dim entries, or NULL for no bias
 *   out  : m_dim x n_dim, row-major
 * Each accumulator starts from the bias (or 0) and adds products in increasing k. */
static void _mps_forward_scaled_gemm_reference(const float* const a, const float* const w, const float* const bias, const int m_dim, const int n_dim, const int k_dim, float* const out)
{
  int row;
  for (row = 0; row < m_dim; row++)
  {
    const float* const a_row = a + row * k_dim;
    float* const out_row = out + row * n_dim;
    int col;
    for (col = 0; col < n_dim; col++)
    {
      const float* const w_row = w + col * k_dim;
      float acc = 0;
      int kk;
      if (bias)
        acc = bias[col];
      for (kk = 0; kk < k_dim; kk++)
        acc += a_row[kk] * w_row[kk];
      out_row[col] = acc;
    }
  }
}
436
437
/* Deterministic A-operand value at (batch, row, k) for the batched tests; the batch
 * index is mixed into the pattern so the batches differ. */
static float _mps_forward_scaled_gemm_a_batched_value(const int batch, const int row, const int k)
{
  const int bucket = (batch * 11 + row * 17 + k * 13) % 41;
  return (float)(bucket - 20) / 256.0f;
}
441
442
/* Deterministic weight value at (batch, col, k) for the batched tests; the batch
 * index is mixed into the pattern so the batches differ. */
static float _mps_forward_scaled_gemm_w_batched_value(const int batch, const int col, const int k)
{
  const int bucket = (batch * 7 + col * 19 + k * 5) % 43;
  return (float)(bucket - 21) / 256.0f;
}
446
447
/* Deterministic bias value at (batch, col) for the batched tests; the batch index
 * is mixed into the pattern so the batches differ. */
static float _mps_forward_scaled_gemm_bias_batched_value(const int batch, const int col)
{
  const int bucket = (batch * 3 + col * 5) % 23;
  return (float)(bucket - 11) / 256.0f;
}
451
452
static void _mps_forward_scaled_gemm_fill_matrix_batched(const int datatype, void* const data, const int batch_dim, const int rows, const int cols, const int for_a)
453
0
{
454
0
  float* const values = (float*)ccmalloc(sizeof(float) * batch_dim * rows * cols);
455
0
  int b, i, j;
456
0
  for (b = 0; b < batch_dim; b++)
457
0
    for (i = 0; i < rows; i++)
458
0
      for (j = 0; j < cols; j++)
459
0
        values[((b * rows) + i) * cols + j] = for_a ? _mps_forward_scaled_gemm_a_batched_value(b, i, j) : _mps_forward_scaled_gemm_w_batched_value(b, i, j);
460
0
  if (datatype == CCV_16F)
461
0
    ccv_float_to_half_precision(values, (uint16_t*)data, batch_dim * rows * cols);
462
0
  else if (datatype == CCV_16BF)
463
0
    ccv_float_to_bfloat(values, (uint16_t*)data, batch_dim * rows * cols);
464
0
  else
465
0
    memcpy(data, values, sizeof(float) * batch_dim * rows * cols);
466
0
  ccfree(values);
467
0
}
468
469
static void _mps_forward_scaled_gemm_fill_bias_batched(const int datatype, void* const data, const int batch_dim, const int cols)
470
0
{
471
0
  float* const values = (float*)ccmalloc(sizeof(float) * batch_dim * cols);
472
0
  int b, j;
473
0
  for (b = 0; b < batch_dim; b++)
474
0
    for (j = 0; j < cols; j++)
475
0
      values[b * cols + j] = _mps_forward_scaled_gemm_bias_batched_value(b, j);
476
0
  if (datatype == CCV_16F)
477
0
    ccv_float_to_half_precision(values, (uint16_t*)data, batch_dim * cols);
478
0
  else if (datatype == CCV_16BF)
479
0
    ccv_float_to_bfloat(values, (uint16_t*)data, batch_dim * cols);
480
0
  else
481
0
    memcpy(data, values, sizeof(float) * batch_dim * cols);
482
0
  ccfree(values);
483
0
}
484
485
/* Plain float reference for the batched GEMM: out[b] = a[b] * w[wb]^T (+ bias[bb]).
 *   a    : batch_dim x m_dim x k_dim
 *   w    : n_dim x k_dim per batch; indexed by b when w_batch_dim > 1, otherwise batch 0 is shared
 *   bias : n_dim per batch, or NULL; indexed by b when bias_batch_dim > 1, otherwise batch 0 is shared
 *   out  : batch_dim x m_dim x n_dim
 * Each accumulator starts from the bias (or 0) and adds products in increasing k. */
static void _mps_forward_scaled_gemm_reference_batched(const float* const a, const float* const w, const float* const bias, const int batch_dim, const int w_batch_dim, const int bias_batch_dim, const int m_dim, const int n_dim, const int k_dim, float* const out)
{
  int n;
  for (n = 0; n < batch_dim; n++)
  {
    // Which batch slice of w / bias this output batch reads from.
    const int w_slice = (w_batch_dim > 1) ? n : 0;
    const int bias_slice = (bias_batch_dim > 1) ? n : 0;
    int row;
    for (row = 0; row < m_dim; row++)
    {
      const int a_base = ((n * m_dim) + row) * k_dim;
      const int out_base = ((n * m_dim) + row) * n_dim;
      int col;
      for (col = 0; col < n_dim; col++)
      {
        const int w_base = ((w_slice * n_dim) + col) * k_dim;
        float acc = 0;
        int kk;
        if (bias)
          acc = bias[bias_slice * n_dim + col];
        for (kk = 0; kk < k_dim; kk++)
          acc += a[a_base + kk] * w[w_base + kk];
        out[out_base + col] = acc;
      }
    }
  }
}
500
501
static int _mps_forward_scaled_gemm_validate_shape(const int datatype, const int use_bias, const int m_dim, const int n_dim, const int k_dim, double* const max_abs_ref, double* const max_rel_ref)
502
0
{
503
0
  ccv_nnc_tensor_param_t ga_params = {
504
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
505
0
    .format = CCV_TENSOR_FORMAT_NHWC,
506
0
    .datatype = datatype,
507
0
    .dim = { m_dim, k_dim, 0 },
508
0
  };
509
0
  ccv_nnc_tensor_param_t gw_params = {
510
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
511
0
    .format = CCV_TENSOR_FORMAT_NHWC,
512
0
    .datatype = datatype,
513
0
    .dim = { n_dim, k_dim, 0 },
514
0
  };
515
0
  ccv_nnc_tensor_param_t gb_params = {
516
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
517
0
    .format = CCV_TENSOR_FORMAT_NHWC,
518
0
    .datatype = datatype,
519
0
    .dim = { m_dim, n_dim, 0 },
520
0
  };
521
0
  ccv_nnc_tensor_param_t gbias_params = {
522
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
523
0
    .format = CCV_TENSOR_FORMAT_NHWC,
524
0
    .datatype = datatype,
525
0
    .dim = { n_dim, 0 },
526
0
  };
527
0
  ccv_nnc_tensor_param_t a_params = {
528
0
    .type = CCV_TENSOR_CPU_MEMORY,
529
0
    .format = CCV_TENSOR_FORMAT_NHWC,
530
0
    .datatype = datatype,
531
0
    .dim = { m_dim, k_dim, 0 },
532
0
  };
533
0
  ccv_nnc_tensor_param_t w_params = {
534
0
    .type = CCV_TENSOR_CPU_MEMORY,
535
0
    .format = CCV_TENSOR_FORMAT_NHWC,
536
0
    .datatype = datatype,
537
0
    .dim = { n_dim, k_dim, 0 },
538
0
  };
539
0
  ccv_nnc_tensor_param_t b_params = {
540
0
    .type = CCV_TENSOR_CPU_MEMORY,
541
0
    .format = CCV_TENSOR_FORMAT_NHWC,
542
0
    .datatype = datatype,
543
0
    .dim = { m_dim, n_dim, 0 },
544
0
  };
545
0
  ccv_nnc_tensor_param_t bias_params = {
546
0
    .type = CCV_TENSOR_CPU_MEMORY,
547
0
    .format = CCV_TENSOR_FORMAT_NHWC,
548
0
    .datatype = datatype,
549
0
    .dim = { n_dim, 0 },
550
0
  };
551
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
552
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(w_params), 0);
553
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
554
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
555
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
556
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
557
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
558
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);
559
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, ha->data.u8, m_dim, k_dim, 1);
560
0
  if (use_bias)
561
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
562
0
  void* const w_dense = ccmalloc(CCV_GET_DATA_TYPE_SIZE(datatype) * n_dim * k_dim);
563
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, w_dense, n_dim, k_dim, 0);
564
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w_dense, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
565
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
566
0
  {
567
0
    ccfree(w_dense);
568
0
    ccv_nnc_tensor_free(ha);
569
0
    ccv_nnc_tensor_free(hwq);
570
0
    if (hbias)
571
0
      ccv_nnc_tensor_free(hbias);
572
0
    ccv_nnc_tensor_free(a);
573
0
    ccv_nnc_tensor_free(w);
574
0
    if (bias)
575
0
      ccv_nnc_tensor_free(bias);
576
0
    ccv_nnc_tensor_free(b);
577
0
    ccv_nnc_tensor_free(hb);
578
0
    return -1;
579
0
  }
580
0
  if (use_bias)
581
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, w, bias), 0);
582
0
  else
583
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, w), 0);
584
0
  if (use_bias)
585
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
586
0
  else
587
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
588
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
589
590
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * m_dim * k_dim);
591
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * n_dim * k_dim);
592
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * n_dim) : 0;
593
0
  float* const actual = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
594
0
  float* const expected = (float*)ccmalloc(sizeof(float) * m_dim * n_dim);
595
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, m_dim, k_dim, a_ref);
596
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, w_dense, n_dim, k_dim, w_ref);
597
0
  if (use_bias)
598
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, n_dim, bias_ref);
599
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, m_dim * n_dim, actual);
600
0
  _mps_forward_scaled_gemm_reference(a_ref, w_ref, bias_ref, m_dim, n_dim, k_dim, expected);
601
0
  double max_abs = 0;
602
0
  double max_rel = 0;
603
0
  int i;
604
0
  for (i = 0; i < m_dim * n_dim; i++)
605
0
  {
606
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
607
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
608
0
    max_abs = ccv_max(max_abs, diff);
609
0
    max_rel = ccv_max(max_rel, diff / denom);
610
0
  }
611
0
  if (max_abs_ref)
612
0
    *max_abs_ref = max_abs;
613
0
  if (max_rel_ref)
614
0
    *max_rel_ref = max_rel;
615
616
0
  ccfree(expected);
617
0
  ccfree(actual);
618
0
  if (bias_ref)
619
0
    ccfree(bias_ref);
620
0
  ccfree(w_ref);
621
0
  ccfree(a_ref);
622
0
  ccfree(w_dense);
623
0
  ccv_nnc_tensor_free(ha);
624
0
  ccv_nnc_tensor_free(hwq);
625
0
  if (hbias)
626
0
    ccv_nnc_tensor_free(hbias);
627
0
  ccv_nnc_tensor_free(a);
628
0
  ccv_nnc_tensor_free(w);
629
0
  if (bias)
630
0
    ccv_nnc_tensor_free(bias);
631
0
  ccv_nnc_tensor_free(b);
632
0
  ccv_nnc_tensor_free(hb);
633
0
  return 0;
634
0
}
635
636
/* Run the quantized-weight GEMM check on the fixed shape M=257, N=384, K=128.
 * Returns whatever _mps_forward_scaled_gemm_validate_shape returns. */
static int _mps_forward_scaled_gemm_validate(const int datatype, const int use_bias, double* const max_abs_ref, double* const max_rel_ref)
{
  const int m_dim = 257;
  const int n_dim = 384;
  const int k_dim = 128;
  const int status = _mps_forward_scaled_gemm_validate_shape(datatype, use_bias, m_dim, n_dim, k_dim, max_abs_ref, max_rel_ref);
  return status;
}
640
641
/* Run the quantized-weight GEMM check on the fixed shape M=384, N=384, K=128
 * (same as _mps_forward_scaled_gemm_validate except for M).
 * Returns whatever _mps_forward_scaled_gemm_validate_shape returns. */
static int _mps_forward_scaled_gemm_validate_aligned_m(const int datatype, const int use_bias, double* const max_abs_ref, double* const max_rel_ref)
{
  const int m_dim = 384;
  const int n_dim = 384;
  const int k_dim = 128;
  const int status = _mps_forward_scaled_gemm_validate_shape(datatype, use_bias, m_dim, n_dim, k_dim, max_abs_ref, max_rel_ref);
  return status;
}
645
646
static int _mps_forward_scaled_gemm_validate_batched(const int datatype, const int use_bias, const int weight_batched, const int bias_batched, double* const max_abs_ref, double* const max_rel_ref)
647
0
{
648
0
  const int batch_dim = 2;
649
0
  const int m_dim = 129;
650
0
  const int n_dim = 384;
651
0
  const int k_dim = 128;
652
0
  ccv_nnc_tensor_param_t ga_params = {
653
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
654
0
    .format = CCV_TENSOR_FORMAT_NHWC,
655
0
    .datatype = datatype,
656
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
657
0
  };
658
0
  ccv_nnc_tensor_param_t gw_params = {
659
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
660
0
    .format = CCV_TENSOR_FORMAT_NHWC,
661
0
    .datatype = datatype,
662
0
    .dim = { weight_batched ? batch_dim : n_dim, weight_batched ? n_dim : k_dim, weight_batched ? k_dim : 0, 0 },
663
0
  };
664
0
  ccv_nnc_tensor_param_t gb_params = {
665
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
666
0
    .format = CCV_TENSOR_FORMAT_NHWC,
667
0
    .datatype = datatype,
668
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
669
0
  };
670
0
  ccv_nnc_tensor_param_t gbias_params = {
671
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
672
0
    .format = CCV_TENSOR_FORMAT_NHWC,
673
0
    .datatype = datatype,
674
0
    .dim = { bias_batched ? batch_dim : n_dim, bias_batched ? n_dim : 0, 0, 0 },
675
0
  };
676
0
  ccv_nnc_tensor_param_t a_params = {
677
0
    .type = CCV_TENSOR_CPU_MEMORY,
678
0
    .format = CCV_TENSOR_FORMAT_NHWC,
679
0
    .datatype = datatype,
680
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
681
0
  };
682
0
  ccv_nnc_tensor_param_t w_params = {
683
0
    .type = CCV_TENSOR_CPU_MEMORY,
684
0
    .format = CCV_TENSOR_FORMAT_NHWC,
685
0
    .datatype = datatype,
686
0
    .dim = { weight_batched ? batch_dim : n_dim, weight_batched ? n_dim : k_dim, weight_batched ? k_dim : 0, 0 },
687
0
  };
688
0
  ccv_nnc_tensor_param_t b_params = {
689
0
    .type = CCV_TENSOR_CPU_MEMORY,
690
0
    .format = CCV_TENSOR_FORMAT_NHWC,
691
0
    .datatype = datatype,
692
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
693
0
  };
694
0
  ccv_nnc_tensor_param_t bias_params = {
695
0
    .type = CCV_TENSOR_CPU_MEMORY,
696
0
    .format = CCV_TENSOR_FORMAT_NHWC,
697
0
    .datatype = datatype,
698
0
    .dim = { bias_batched ? batch_dim : n_dim, bias_batched ? n_dim : 0, 0, 0 },
699
0
  };
700
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
701
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(w_params), 0);
702
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
703
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
704
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
705
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
706
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
707
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, b_params, 0);
708
0
  _mps_forward_scaled_gemm_fill_matrix_batched(datatype, ha->data.u8, batch_dim, m_dim, k_dim, 1);
709
0
  if (use_bias)
710
0
  {
711
0
    if (bias_batched)
712
0
      _mps_forward_scaled_gemm_fill_bias_batched(datatype, hbias->data.u8, batch_dim, n_dim);
713
0
    else
714
0
      _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
715
0
  }
716
0
  const int w_batch_dim = weight_batched ? batch_dim : 1;
717
0
  void* const w_dense = ccmalloc(CCV_GET_DATA_TYPE_SIZE(datatype) * w_batch_dim * n_dim * k_dim);
718
0
  if (weight_batched)
719
0
    _mps_forward_scaled_gemm_fill_matrix_batched(datatype, w_dense, batch_dim, n_dim, k_dim, 0);
720
0
  else
721
0
    _mps_forward_scaled_gemm_fill_matrix(datatype, w_dense, n_dim, k_dim, 0);
722
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(w_dense, datatype, CCV_TENSOR_CPU_MEMORY, w_batch_dim * n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
723
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
724
0
  {
725
0
    ccfree(w_dense);
726
0
    ccv_nnc_tensor_free(ha);
727
0
    ccv_nnc_tensor_free(hwq);
728
0
    if (hbias)
729
0
      ccv_nnc_tensor_free(hbias);
730
0
    ccv_nnc_tensor_free(a);
731
0
    ccv_nnc_tensor_free(w);
732
0
    if (bias)
733
0
      ccv_nnc_tensor_free(bias);
734
0
    ccv_nnc_tensor_free(b);
735
0
    ccv_nnc_tensor_free(hb);
736
0
    return -1;
737
0
  }
738
0
  if (use_bias)
739
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, w, bias), 0);
740
0
  else
741
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, w), 0);
742
0
  if (weight_batched)
743
0
  {
744
0
    if (use_bias)
745
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
746
0
    else
747
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
748
0
  } else {
749
0
    if (use_bias)
750
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
751
0
    else
752
0
      ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
753
0
  }
754
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
755
756
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * k_dim);
757
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * w_batch_dim * n_dim * k_dim);
758
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * (bias_batched ? batch_dim : 1) * n_dim) : 0;
759
0
  float* const actual = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
760
0
  float* const expected = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
761
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, batch_dim * m_dim, k_dim, a_ref);
762
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, w_dense, w_batch_dim * n_dim, k_dim, w_ref);
763
0
  if (use_bias)
764
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, (bias_batched ? batch_dim : 1) * n_dim, bias_ref);
765
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, batch_dim * m_dim * n_dim, actual);
766
0
  _mps_forward_scaled_gemm_reference_batched(a_ref, w_ref, bias_ref, batch_dim, w_batch_dim, bias_batched ? batch_dim : 1, m_dim, n_dim, k_dim, expected);
767
0
  double max_abs = 0;
768
0
  double max_rel = 0;
769
0
  int i;
770
0
  for (i = 0; i < batch_dim * m_dim * n_dim; i++)
771
0
  {
772
0
    const double diff = fabs((double)actual[i] - (double)expected[i]);
773
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)expected[i])));
774
0
    max_abs = ccv_max(max_abs, diff);
775
0
    max_rel = ccv_max(max_rel, diff / denom);
776
0
  }
777
0
  if (max_abs_ref)
778
0
    *max_abs_ref = max_abs;
779
0
  if (max_rel_ref)
780
0
    *max_rel_ref = max_rel;
781
782
0
  ccfree(expected);
783
0
  ccfree(actual);
784
0
  if (bias_ref)
785
0
    ccfree(bias_ref);
786
0
  ccfree(w_ref);
787
0
  ccfree(a_ref);
788
0
  ccfree(w_dense);
789
0
  ccv_nnc_tensor_free(ha);
790
0
  ccv_nnc_tensor_free(hwq);
791
0
  if (hbias)
792
0
    ccv_nnc_tensor_free(hbias);
793
0
  ccv_nnc_tensor_free(a);
794
0
  ccv_nnc_tensor_free(w);
795
0
  if (bias)
796
0
    ccv_nnc_tensor_free(bias);
797
0
  ccv_nnc_tensor_free(b);
798
0
  ccv_nnc_tensor_free(hb);
799
0
  return 0;
800
0
}
801
802
static int _mps_forward_scaled_gemm_compare_dense(const int datatype, const int use_bias, const int m_dim, const int n_dim, const int k_dim, double* const max_abs_ref, double* const max_rel_ref)
803
0
{
804
0
  ccv_nnc_tensor_param_t ga_params = {
805
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
806
0
    .format = CCV_TENSOR_FORMAT_NHWC,
807
0
    .datatype = datatype,
808
0
    .dim = { m_dim, k_dim, 0 },
809
0
  };
810
0
  ccv_nnc_tensor_param_t gwq_params = {
811
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
812
0
    .format = CCV_TENSOR_FORMAT_NHWC,
813
0
    .datatype = ((datatype >> 12) & 0xff) | CCV_QX | CCV_NNC_QX_8I_ROWWISE,
814
0
    .dim = { n_dim, k_dim, 0 },
815
0
  };
816
0
  ccv_nnc_tensor_param_t gwd_params = {
817
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
818
0
    .format = CCV_TENSOR_FORMAT_NHWC,
819
0
    .datatype = datatype,
820
0
    .dim = { n_dim, k_dim, 0 },
821
0
  };
822
0
  ccv_nnc_tensor_param_t gb_params = {
823
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
824
0
    .format = CCV_TENSOR_FORMAT_NHWC,
825
0
    .datatype = datatype,
826
0
    .dim = { m_dim, n_dim, 0 },
827
0
  };
828
0
  ccv_nnc_tensor_param_t gbias_params = {
829
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
830
0
    .format = CCV_TENSOR_FORMAT_NHWC,
831
0
    .datatype = datatype,
832
0
    .dim = { n_dim, 0 },
833
0
  };
834
0
  ccv_nnc_tensor_param_t a_params = {
835
0
    .type = CCV_TENSOR_CPU_MEMORY,
836
0
    .format = CCV_TENSOR_FORMAT_NHWC,
837
0
    .datatype = datatype,
838
0
    .dim = { m_dim, k_dim, 0 },
839
0
  };
840
0
  ccv_nnc_tensor_param_t wd_params = {
841
0
    .type = CCV_TENSOR_CPU_MEMORY,
842
0
    .format = CCV_TENSOR_FORMAT_NHWC,
843
0
    .datatype = datatype,
844
0
    .dim = { n_dim, k_dim, 0 },
845
0
  };
846
0
  ccv_nnc_tensor_param_t b_params = {
847
0
    .type = CCV_TENSOR_CPU_MEMORY,
848
0
    .format = CCV_TENSOR_FORMAT_NHWC,
849
0
    .datatype = datatype,
850
0
    .dim = { m_dim, n_dim, 0 },
851
0
  };
852
0
  ccv_nnc_tensor_param_t bias_params = {
853
0
    .type = CCV_TENSOR_CPU_MEMORY,
854
0
    .format = CCV_TENSOR_FORMAT_NHWC,
855
0
    .datatype = datatype,
856
0
    .dim = { n_dim, 0 },
857
0
  };
858
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, a_params, 0);
859
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, wd_params, 0);
860
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(wd_params), 0);
861
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
862
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
863
0
  ccv_nnc_tensor_t* const wq = ccv_nnc_tensor_new(0, gwq_params, 0);
864
0
  ccv_nnc_tensor_t* const wd = ccv_nnc_tensor_new(0, gwd_params, 0);
865
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
866
0
  ccv_nnc_tensor_t* const bq = ccv_nnc_tensor_new(0, gb_params, 0);
867
0
  ccv_nnc_tensor_t* const bd = ccv_nnc_tensor_new(0, gb_params, 0);
868
0
  ccv_nnc_tensor_t* const hbq = ccv_nnc_tensor_new(0, b_params, 0);
869
0
  ccv_nnc_tensor_t* const hbd = ccv_nnc_tensor_new(0, b_params, 0);
870
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, ha->data.u8, m_dim, k_dim, 1);
871
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, hwd->data.u8, n_dim, k_dim, 0);
872
0
  if (use_bias)
873
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
874
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
875
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
876
0
    return -1;
877
0
  if (use_bias)
878
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq, hbias), TENSOR_LIST(a, wq, bias), 0);
879
0
  else
880
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hwq), TENSOR_LIST(a, wq), 0);
881
0
  ccv_nnc_dequantize_8i_rowwise(wq->data.u8, datatype, CCV_TENSOR_GPU_MEMORY, qsize, k_dim, wd->data.u8, n_dim * k_dim);
882
0
  if (use_bias) {
883
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq, bias), TENSOR_LIST(bq), 0);
884
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wd, bias), TENSOR_LIST(bd), 0);
885
0
  } else {
886
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wq), TENSOR_LIST(bq), 0);
887
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, wd), TENSOR_LIST(bd), 0);
888
0
  }
889
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bq, bd), TENSOR_LIST(hbq, hbd), 0);
890
0
  _mps_forward_scaled_gemm_compare_rows(datatype, hbq->data.u8, hbd->data.u8, m_dim, n_dim, max_abs_ref, max_rel_ref);
891
0
  ccv_nnc_tensor_free(hbq);
892
0
  ccv_nnc_tensor_free(hbd);
893
0
  ccv_nnc_tensor_free(bq);
894
0
  ccv_nnc_tensor_free(bd);
895
0
  ccv_nnc_tensor_free(a);
896
0
  ccv_nnc_tensor_free(wq);
897
0
  ccv_nnc_tensor_free(wd);
898
0
  ccv_nnc_tensor_free(ha);
899
0
  ccv_nnc_tensor_free(hwd);
900
0
  ccv_nnc_tensor_free(hwq);
901
0
  if (hbias)
902
0
    ccv_nnc_tensor_free(hbias);
903
0
  if (bias)
904
0
    ccv_nnc_tensor_free(bias);
905
0
  return 0;
906
0
}
907
908
static int _mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(const int datatype, const int use_bias, const int batch_dim, const int m_dim, const int n_dim, const int k_dim, const int padded_m_dim, double* const max_abs_ref, double* const max_rel_ref)
909
0
{
910
0
  ccv_nnc_tensor_param_t ga_storage_params = {
911
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
912
0
    .format = CCV_TENSOR_FORMAT_NHWC,
913
0
    .datatype = datatype,
914
0
    .dim = { batch_dim, padded_m_dim, k_dim, 0 },
915
0
  };
916
0
  ccv_nnc_tensor_param_t ga_view_params = {
917
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
918
0
    .format = CCV_TENSOR_FORMAT_NHWC,
919
0
    .datatype = datatype,
920
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
921
0
  };
922
0
  ccv_nnc_tensor_param_t gwq_params = {
923
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
924
0
    .format = CCV_TENSOR_FORMAT_NHWC,
925
0
    .datatype = ((datatype >> 12) & 0xff) | CCV_QX | CCV_NNC_QX_8I_ROWWISE,
926
0
    .dim = { n_dim, k_dim, 0 },
927
0
  };
928
0
  ccv_nnc_tensor_param_t gb_params = {
929
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
930
0
    .format = CCV_TENSOR_FORMAT_NHWC,
931
0
    .datatype = datatype,
932
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
933
0
  };
934
0
  ccv_nnc_tensor_param_t gbias_params = {
935
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
936
0
    .format = CCV_TENSOR_FORMAT_NHWC,
937
0
    .datatype = datatype,
938
0
    .dim = { n_dim, 0 },
939
0
  };
940
0
  ccv_nnc_tensor_param_t ha_storage_params = {
941
0
    .type = CCV_TENSOR_CPU_MEMORY,
942
0
    .format = CCV_TENSOR_FORMAT_NHWC,
943
0
    .datatype = datatype,
944
0
    .dim = { batch_dim, padded_m_dim, k_dim, 0 },
945
0
  };
946
0
  ccv_nnc_tensor_param_t ha_view_params = {
947
0
    .type = CCV_TENSOR_CPU_MEMORY,
948
0
    .format = CCV_TENSOR_FORMAT_NHWC,
949
0
    .datatype = datatype,
950
0
    .dim = { batch_dim, m_dim, k_dim, 0 },
951
0
  };
952
0
  ccv_nnc_tensor_param_t wd_params = {
953
0
    .type = CCV_TENSOR_CPU_MEMORY,
954
0
    .format = CCV_TENSOR_FORMAT_NHWC,
955
0
    .datatype = datatype,
956
0
    .dim = { n_dim, k_dim, 0 },
957
0
  };
958
0
  ccv_nnc_tensor_param_t b_params = {
959
0
    .type = CCV_TENSOR_CPU_MEMORY,
960
0
    .format = CCV_TENSOR_FORMAT_NHWC,
961
0
    .datatype = datatype,
962
0
    .dim = { batch_dim, m_dim, n_dim, 0 },
963
0
  };
964
0
  ccv_nnc_tensor_param_t bias_params = {
965
0
    .type = CCV_TENSOR_CPU_MEMORY,
966
0
    .format = CCV_TENSOR_FORMAT_NHWC,
967
0
    .datatype = datatype,
968
0
    .dim = { n_dim, 0 },
969
0
  };
970
0
  ccv_nnc_tensor_t* const ha_storage = ccv_nnc_tensor_new(0, ha_storage_params, 0);
971
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, wd_params, 0);
972
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(wd_params), 0);
973
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, bias_params, 0) : 0;
974
0
  ccv_nnc_tensor_t* const a_storage = ccv_nnc_tensor_new(0, ga_storage_params, 0);
975
0
  ccv_nnc_tensor_t* const wq = ccv_nnc_tensor_new(0, gwq_params, 0);
976
0
  ccv_nnc_tensor_t* const bq = ccv_nnc_tensor_new(0, gb_params, 0);
977
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
978
0
  ccv_nnc_tensor_t* const hbq = ccv_nnc_tensor_new(0, b_params, 0);
979
0
  ccv_nnc_tensor_view_t* const ha = ccv_nnc_tensor_view_new(ha_storage, ha_view_params, ccv_nnc_no_ofs, DIM_ALLOC(padded_m_dim * k_dim, k_dim, 1));
980
0
  ccv_nnc_tensor_view_t* const a = ccv_nnc_tensor_view_new(a_storage, ga_view_params, ccv_nnc_no_ofs, DIM_ALLOC(padded_m_dim * k_dim, k_dim, 1));
981
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * k_dim);
982
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * n_dim * k_dim);
983
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * n_dim) : 0;
984
0
  float* const out_ref = (float*)ccmalloc(sizeof(float) * batch_dim * m_dim * n_dim);
985
0
  int bch, i, j;
986
0
  for (bch = 0; bch < batch_dim; bch++)
987
0
    for (i = 0; i < padded_m_dim; i++)
988
0
      for (j = 0; j < k_dim; j++)
989
0
      {
990
0
        const int dst = ((bch * padded_m_dim) + i) * k_dim + j;
991
0
        float value = 0;
992
0
        if (i < m_dim)
993
0
        {
994
0
          value = _mps_forward_scaled_gemm_a_batched_value(bch, i, j);
995
0
          a_ref[((bch * m_dim) + i) * k_dim + j] = value;
996
0
        }
997
0
        if (datatype == CCV_16F)
998
0
          ccv_float_to_half_precision(&value, ((uint16_t*)ha_storage->data.u8) + dst, 1);
999
0
        else if (datatype == CCV_16BF)
1000
0
          ccv_float_to_bfloat(&value, ((uint16_t*)ha_storage->data.u8) + dst, 1);
1001
0
        else
1002
0
          ((float*)ha_storage->data.f32)[dst] = value;
1003
0
      }
1004
0
  _mps_forward_scaled_gemm_fill_matrix(datatype, hwd->data.u8, n_dim, k_dim, 0);
1005
0
  _mps_forward_scaled_gemm_to_float(datatype, hwd->data.u8, n_dim * k_dim, w_ref);
1006
0
  if (use_bias)
1007
0
  {
1008
0
    _mps_forward_scaled_gemm_fill_bias(datatype, hbias->data.u8, n_dim);
1009
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, n_dim, bias_ref);
1010
0
  }
1011
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
1012
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
1013
0
    return -1;
1014
0
  if (use_bias)
1015
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_storage, hwq, hbias), TENSOR_LIST(a_storage, wq, bias), 0);
1016
0
  else
1017
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_storage, hwq), TENSOR_LIST(a_storage, wq), 0);
1018
0
  if (use_bias)
1019
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)a, wq, bias), TENSOR_LIST(bq), 0);
1020
0
  else
1021
0
    ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)a, wq), TENSOR_LIST(bq), 0);
1022
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(bq), TENSOR_LIST(hbq), 0);
1023
0
  _mps_forward_scaled_gemm_reference_batched(a_ref, w_ref, bias_ref, batch_dim, 1, use_bias ? 1 : 0, m_dim, n_dim, k_dim, out_ref);
1024
0
  if (datatype == CCV_16F)
1025
0
    ccv_float_to_half_precision(out_ref, (uint16_t*)ha_storage->data.u8, batch_dim * m_dim * n_dim);
1026
0
  else if (datatype == CCV_16BF)
1027
0
    ccv_float_to_bfloat(out_ref, (uint16_t*)ha_storage->data.u8, batch_dim * m_dim * n_dim);
1028
0
  else
1029
0
    memcpy(ha_storage->data.f32, out_ref, sizeof(float) * batch_dim * m_dim * n_dim);
1030
0
  _mps_forward_scaled_gemm_compare_rows(datatype, hbq->data.u8, ha_storage->data.u8, batch_dim * m_dim, n_dim, max_abs_ref, max_rel_ref);
1031
0
  ccfree(out_ref);
1032
0
  if (bias_ref)
1033
0
    ccfree(bias_ref);
1034
0
  ccfree(w_ref);
1035
0
  ccfree(a_ref);
1036
0
  ccv_nnc_tensor_view_free(ha);
1037
0
  ccv_nnc_tensor_view_free(a);
1038
0
  ccv_nnc_tensor_free(ha_storage);
1039
0
  ccv_nnc_tensor_free(hwd);
1040
0
  ccv_nnc_tensor_free(hwq);
1041
0
  if (hbias)
1042
0
    ccv_nnc_tensor_free(hbias);
1043
0
  ccv_nnc_tensor_free(a_storage);
1044
0
  ccv_nnc_tensor_free(wq);
1045
0
  ccv_nnc_tensor_free(bq);
1046
0
  if (bias)
1047
0
    ccv_nnc_tensor_free(bias);
1048
0
  ccv_nnc_tensor_free(hbq);
1049
0
  return 0;
1050
0
}
1051
1052
static float _mps_segmented_scaled_gemm_a_value(const int row, const int k)
1053
0
{
1054
0
  return (float)(((row * 17 + k * 13) % 61) - 30) / 128.0f;
1055
0
}
1056
1057
/*
 * Deterministic weight value for (segment, col, k) of the segmented GEMM test:
 * (segment * 23 + col * 11 + k * 7) % 67, shifted down by 33 and divided by 256.
 */
static float _mps_segmented_scaled_gemm_w_value(const int segment, const int col, const int k)
{
  const int mixed = segment * 23 + col * 11 + k * 7;
  const int centered = (mixed % 67) - 33;
  return (float)centered / 256.0f;
}
1061
1062
/*
 * Deterministic bias value for (segment, col) of the segmented GEMM test:
 * (segment * 5 + col * 3) % 29, shifted down by 14 and divided by 256.
 */
static float _mps_segmented_scaled_gemm_bias_value(const int segment, const int col)
{
  const int mixed = segment * 5 + col * 3;
  const int centered = (mixed % 29) - 14;
  return (float)centered / 256.0f;
}
1066
1067
static int _mps_segmented_scaled_gemm_validate(const int datatype, const int use_bias, const int force_fallback, double* const max_abs_ref, double* const max_rel_ref)
1068
0
{
1069
0
  const int total_m = 384;
1070
0
  const int n_dim = 128;
1071
0
  const int k_dim = 256;
1072
0
  const int segments = 3;
1073
0
  const int counts_data[] = {129, 131, 124};
1074
0
  const int indices_data[] = {1, 0, 2};
1075
0
  const ccv_nnc_tensor_param_t ha_params = {
1076
0
    .type = CCV_TENSOR_CPU_MEMORY,
1077
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1078
0
    .datatype = datatype,
1079
0
    .dim = { total_m, k_dim, 0 },
1080
0
  };
1081
0
  const ccv_nnc_tensor_param_t hwd_params = {
1082
0
    .type = CCV_TENSOR_CPU_MEMORY,
1083
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1084
0
    .datatype = datatype,
1085
0
    .dim = { segments, n_dim, k_dim, 0 },
1086
0
  };
1087
0
  const ccv_nnc_tensor_param_t hbias_params = {
1088
0
    .type = CCV_TENSOR_CPU_MEMORY,
1089
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1090
0
    .datatype = datatype,
1091
0
    .dim = { segments, n_dim, 0 },
1092
0
  };
1093
0
  const ccv_nnc_tensor_param_t ga_params = {
1094
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1095
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1096
0
    .datatype = datatype,
1097
0
    .dim = { total_m, k_dim, 0 },
1098
0
  };
1099
0
  const ccv_nnc_tensor_param_t gw_params = {
1100
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1101
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1102
0
    .datatype = datatype,
1103
0
    .dim = { segments, n_dim, k_dim, 0 },
1104
0
  };
1105
0
  const ccv_nnc_tensor_param_t gbias_params = {
1106
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1107
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1108
0
    .datatype = datatype,
1109
0
    .dim = { segments, n_dim, 0 },
1110
0
  };
1111
0
  const ccv_nnc_tensor_param_t gb_params = {
1112
0
    .type = CCV_TENSOR_GPU_MEMORY | 000,
1113
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1114
0
    .datatype = datatype,
1115
0
    .dim = { total_m, n_dim, 0 },
1116
0
  };
1117
0
  const ccv_nnc_tensor_param_t hb_params = {
1118
0
    .type = CCV_TENSOR_CPU_MEMORY,
1119
0
    .format = CCV_TENSOR_FORMAT_NHWC,
1120
0
    .datatype = datatype,
1121
0
    .dim = { total_m, n_dim, 0 },
1122
0
  };
1123
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, ha_params, 0);
1124
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, segments), 0);
1125
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, segments), 0);
1126
0
  ccv_nnc_tensor_t* const hwd = ccv_nnc_tensor_new(0, hwd_params, 0);
1127
0
  ccv_nnc_tensor_t* const hwq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(hwd_params), 0);
1128
0
  ccv_nnc_tensor_t* const hbias = use_bias ? ccv_nnc_tensor_new(0, hbias_params, 0) : 0;
1129
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, ga_params, 0);
1130
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, segments), 0);
1131
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, segments), 0);
1132
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(gw_params), 0);
1133
0
  ccv_nnc_tensor_t* const bias = use_bias ? ccv_nnc_tensor_new(0, gbias_params, 0) : 0;
1134
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, gb_params, 0);
1135
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, hb_params, 0);
1136
0
  float* const a_values = (float*)ccmalloc(sizeof(float) * total_m * k_dim);
1137
0
  float* const w_values = (float*)ccmalloc(sizeof(float) * segments * n_dim * k_dim);
1138
0
  float* const bias_values = use_bias ? (float*)ccmalloc(sizeof(float) * segments * n_dim) : 0;
1139
0
  int i, j, k;
1140
0
  for (i = 0; i < total_m; i++)
1141
0
    for (k = 0; k < k_dim; k++)
1142
0
      a_values[i * k_dim + k] = _mps_segmented_scaled_gemm_a_value(i, k);
1143
0
  for (i = 0; i < segments; i++)
1144
0
    for (j = 0; j < n_dim; j++)
1145
0
      for (k = 0; k < k_dim; k++)
1146
0
        w_values[((i * n_dim) + j) * k_dim + k] = _mps_segmented_scaled_gemm_w_value(i, j, k);
1147
0
  if (use_bias)
1148
0
    for (i = 0; i < segments; i++)
1149
0
      for (j = 0; j < n_dim; j++)
1150
0
        bias_values[i * n_dim + j] = _mps_segmented_scaled_gemm_bias_value(i, j);
1151
0
  if (datatype == CCV_16F)
1152
0
  {
1153
0
    ccv_float_to_half_precision(a_values, (uint16_t*)ha->data.u8, total_m * k_dim);
1154
0
    ccv_float_to_half_precision(w_values, (uint16_t*)hwd->data.u8, segments * n_dim * k_dim);
1155
0
    if (use_bias)
1156
0
      ccv_float_to_half_precision(bias_values, (uint16_t*)hbias->data.u8, segments * n_dim);
1157
0
  } else if (datatype == CCV_16BF) {
1158
0
    ccv_float_to_bfloat(a_values, (uint16_t*)ha->data.u8, total_m * k_dim);
1159
0
    ccv_float_to_bfloat(w_values, (uint16_t*)hwd->data.u8, segments * n_dim * k_dim);
1160
0
    if (use_bias)
1161
0
      ccv_float_to_bfloat(bias_values, (uint16_t*)hbias->data.u8, segments * n_dim);
1162
0
  } else {
1163
0
    memcpy(ha->data.f32, a_values, sizeof(float) * total_m * k_dim);
1164
0
    memcpy(hwd->data.f32, w_values, sizeof(float) * segments * n_dim * k_dim);
1165
0
    if (use_bias)
1166
0
      memcpy(hbias->data.f32, bias_values, sizeof(float) * segments * n_dim);
1167
0
  }
1168
0
  memcpy(hindices->data.i32, indices_data, sizeof(indices_data));
1169
0
  memcpy(hcounts->data.i32, counts_data, sizeof(counts_data));
1170
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(hwd->data.u8, datatype, CCV_TENSOR_CPU_MEMORY, (size_t)segments * n_dim * k_dim, k_dim, hwq->data.u8, ccv_nnc_tensor_data_size_without_padding(hwq->info));
1171
0
  if (qsize != ccv_nnc_tensor_data_size_without_padding(hwq->info))
1172
0
    return -1;
1173
0
  if (use_bias)
1174
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hwq, hbias), TENSOR_LIST(a, indices, counts, w, bias), 0);
1175
0
  else
1176
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hwq), TENSOR_LIST(a, indices, counts, w), 0);
1177
0
  const uint64_t old_flags = ccv_nnc_flags();
1178
0
  if (force_fallback)
1179
0
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
1180
0
  if (use_bias)
1181
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
1182
0
  else
1183
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
1184
0
  if (force_fallback && !(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
1185
0
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
1186
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
1187
1188
0
  float* const a_ref = (float*)ccmalloc(sizeof(float) * total_m * k_dim);
1189
0
  float* const w_ref = (float*)ccmalloc(sizeof(float) * segments * n_dim * k_dim);
1190
0
  float* const bias_ref = use_bias ? (float*)ccmalloc(sizeof(float) * segments * n_dim) : 0;
1191
0
  float* const actual = (float*)ccmalloc(sizeof(float) * total_m * n_dim);
1192
0
  ccv_nnc_tensor_t* const ha_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, total_m, k_dim), 0);
1193
0
  ccv_nnc_tensor_t* const hw_ref = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, segments, n_dim, k_dim), 0);
1194
0
  ccv_nnc_tensor_t* const hbias_ref = use_bias ? ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, segments, n_dim), 0) : 0;
1195
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, total_m, n_dim), 0);
1196
0
  if (force_fallback)
1197
0
    _mps_forward_scaled_gemm_to_float(datatype, ha->data.u8, total_m * k_dim, a_ref);
1198
0
  else
1199
0
    _mps_forward_scaled_gemm_quantized_reference(datatype, ha->data.u8, total_m, k_dim, a_ref);
1200
0
  _mps_forward_scaled_gemm_quantized_reference(datatype, hwd->data.u8, segments * n_dim, k_dim, w_ref);
1201
0
  if (use_bias)
1202
0
    _mps_forward_scaled_gemm_to_float(datatype, hbias->data.u8, segments * n_dim, bias_ref);
1203
0
  _mps_forward_scaled_gemm_to_float(datatype, hb->data.u8, total_m * n_dim, actual);
1204
0
  memcpy(ha_ref->data.f32, a_ref, sizeof(float) * total_m * k_dim);
1205
0
  memcpy(hw_ref->data.f32, w_ref, sizeof(float) * segments * n_dim * k_dim);
1206
0
  if (use_bias)
1207
0
    memcpy(hbias_ref->data.f32, bias_ref, sizeof(float) * segments * n_dim);
1208
0
  if (use_bias)
1209
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_ref, hindices, hcounts, hw_ref, hbias_ref), TENSOR_LIST(bt), 0);
1210
0
  else
1211
0
    ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha_ref, hindices, hcounts, hw_ref), TENSOR_LIST(bt), 0);
1212
0
  double max_abs = 0;
1213
0
  double max_rel = 0;
1214
0
  for (i = 0; i < total_m * n_dim; i++)
1215
0
  {
1216
0
    const double diff = fabs((double)actual[i] - (double)bt->data.f32[i]);
1217
0
    const double denom = ccv_max(1.0, ccv_max(fabs((double)actual[i]), fabs((double)bt->data.f32[i])));
1218
0
    max_abs = ccv_max(max_abs, diff);
1219
0
    max_rel = ccv_max(max_rel, diff / denom);
1220
0
  }
1221
0
  if (max_abs_ref)
1222
0
    *max_abs_ref = max_abs;
1223
0
  if (max_rel_ref)
1224
0
    *max_rel_ref = max_rel;
1225
0
  ccv_nnc_tensor_free(bt);
1226
0
  if (hbias_ref)
1227
0
    ccv_nnc_tensor_free(hbias_ref);
1228
0
  ccv_nnc_tensor_free(hw_ref);
1229
0
  ccv_nnc_tensor_free(ha_ref);
1230
0
  ccfree(actual);
1231
0
  if (bias_ref)
1232
0
    ccfree(bias_ref);
1233
0
  ccfree(w_ref);
1234
0
  ccfree(a_ref);
1235
0
  ccfree(a_values);
1236
0
  ccfree(w_values);
1237
0
  if (bias_values)
1238
0
    ccfree(bias_values);
1239
0
  ccv_nnc_tensor_free(hb);
1240
0
  ccv_nnc_tensor_free(b);
1241
0
  if (bias)
1242
0
    ccv_nnc_tensor_free(bias);
1243
0
  ccv_nnc_tensor_free(w);
1244
0
  ccv_nnc_tensor_free(counts);
1245
0
  ccv_nnc_tensor_free(indices);
1246
0
  ccv_nnc_tensor_free(a);
1247
0
  if (hbias)
1248
0
    ccv_nnc_tensor_free(hbias);
1249
0
  ccv_nnc_tensor_free(hwq);
1250
0
  ccv_nnc_tensor_free(hwd);
1251
0
  ccv_nnc_tensor_free(hcounts);
1252
0
  ccv_nnc_tensor_free(hindices);
1253
0
  ccv_nnc_tensor_free(ha);
1254
0
  return 0;
1255
0
}
1256
1257
/*
 * Bias-free scaled GEMM validation for three datatypes. Each run must return 0
 * and keep the reported maximum relative error below its bound: 2e-3 for
 * CCV_16F and CCV_32F, 5e-3 for CCV_16BF. Returns early when the MPS GEMM
 * forward command is not available.
 */
TEST_CASE("mps forward gemm with row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs;
  double max_rel;

  // CCV_16F
  max_abs = 0;
  max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16F, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // CCV_32F
  max_abs = 0;
  max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_32F, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // CCV_16BF, with the looser bound
  max_abs = 0;
  max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16BF, 0, &max_abs, &max_rel), 0, "scaled GEMM validation should run");
  REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul should match row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1273
1274
// Same sweep as the no-bias row-wise 8i GEMM test, but the helper's second
// argument is 1 (presumably "with bias" -- confirm against the helper).
// Bounds on max_rel: 2e-3 for CCV_16F and CCV_32F, 5e-3 for CCV_16BF.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16F, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_32F, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate(CCV_16BF, 1, &max_abs, &max_rel), 0, "scaled GEMM validation with bias should run");
  REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul with bias should match row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1290
1291
// Aligned-M variant: drives _mps_forward_scaled_gemm_validate_aligned_m with
// second argument 0 for CCV_16F, CCV_32F and CCV_16BF, requiring a zero return
// and max_rel below 2e-3, 2e-3 and 5e-3 respectively.
TEST_CASE("mps forward gemm with row-wise 8i weight NA aligned M")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16F, 0, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match aligned-M row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_32F, 0, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul should match aligned-M row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16BF, 0, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation should run");
  REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul should match aligned-M row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1307
1308
// Aligned-M variant with the helper's second argument set to 1 (presumably
// "with bias" -- confirm against the helper). One pass per data type, each
// requiring a zero return; max_rel bounds are 2e-3, 2e-3 and 5e-3.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias NA aligned M")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16F, 1, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation with bias should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match aligned-M row-wise quantized fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_32F, 1, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation with bias should run");
  REQUIRE(max_rel < 2e-3, "quantized NAInt8MatMul with bias should match aligned-M row-wise quantized fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_aligned_m(CCV_16BF, 1, &max_abs, &max_rel), 0, "scaled GEMM aligned-M validation with bias should run");
  REQUIRE(max_rel < 5e-3, "quantized NAInt8MatMul with bias should match aligned-M row-wise quantized bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1324
1325
// Opt-in check of _mps_forward_ane_rowwise_gemm_stream_sync_validate: the case
// only runs when CCV_NNC_MFA_ANE_ROWWISE_GEMM is set to a non-zero integer, and
// then requires a zero return with max_rel below 2e-3.
TEST_CASE("mps forward gemm with row-wise 8i weight ANE stream ordering")
{
  // Look the variable up once; strtol (unlike atoi) has defined behavior for
  // values that do not fit the result type.
  const char* const ane_rowwise_gemm = getenv("CCV_NNC_MFA_ANE_ROWWISE_GEMM");
  if (!ane_rowwise_gemm || strtol(ane_rowwise_gemm, NULL, 10) == 0)
    return;
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0;
  double max_rel = 0;
  REQUIRE_EQ(_mps_forward_ane_rowwise_gemm_stream_sync_validate(&max_abs, &max_rel), 0, "ANE row-wise 8i stream-ordering validation should run");
  REQUIRE(max_rel < 2e-3, "ANE row-wise 8i GEMM should respect queued Metal writer work before quant/evaluate, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1335
1336
// Segmented GEMM with row-wise 8i weights: _mps_segmented_scaled_gemm_validate
// is called with (datatype, 0, 0, ...) for CCV_16F, CCV_32F and CCV_16BF. Each
// call must return 0; max_rel must stay below 3e-3, 3e-3 and 6e-3.
TEST_CASE("mps segmented gemm with row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16F, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
  REQUIRE(max_rel < 3e-3, "segmented row-wise 8i NA fp16 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_32F, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
  REQUIRE(max_rel < 3e-3, "segmented row-wise 8i NA fp32 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16BF, 0, 0, &max_abs, &max_rel), 0, "segmented row-wise 8i NA validation should run");
  REQUIRE(max_rel < 6e-3, "segmented row-wise 8i NA bf16 should match quantized reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1352
1353
// Segmented GEMM, fallback path: _mps_segmented_scaled_gemm_validate is called
// with (datatype, 1, 1, ...) for CCV_16F, CCV_32F and CCV_16BF. Each call must
// return 0; max_rel must stay below 3e-3, 3e-3 and 6e-3.
TEST_CASE("mps segmented gemm with row-wise 8i weight and bias fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16F, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
  REQUIRE(max_rel < 3e-3, "segmented fallback row-wise 8i fp16 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_32F, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
  REQUIRE(max_rel < 3e-3, "segmented fallback row-wise 8i fp32 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_segmented_scaled_gemm_validate(CCV_16BF, 1, 1, &max_abs, &max_rel), 0, "segmented fallback row-wise 8i validation should run");
  REQUIRE(max_rel < 6e-3, "segmented fallback row-wise 8i bf16 should match dense-A reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1369
1370
// Compares row-wise 8i GEMM against a dense reference (257 x 384 x 128, fourth
// to sixth helper arguments) with CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS
// enabled. For each data type the flag is switched on, the comparison runs,
// and the flag is put back to its prior state before anything is asserted.
TEST_CASE("mps forward gemm with row-wise 8i weight fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  // Only clear the flag afterwards if it was not already set on entry.
  const int flag_was_clear = !(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  double max_abs = 0, max_rel = 0;

  // fp16
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16f = _mps_forward_scaled_gemm_compare_dense(CCV_16F, 0, 257, 384, 128, &max_abs, &max_rel);
  if (flag_was_clear)
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status16f, 0, "fallback row-wise 8i GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM should match dense GPU fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status32f = _mps_forward_scaled_gemm_compare_dense(CCV_32F, 0, 257, 384, 128, &max_abs, &max_rel);
  if (flag_was_clear)
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status32f, 0, "fallback row-wise 8i GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM should match dense GPU fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16bf = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 0, 257, 384, 128, &max_abs, &max_rel);
  if (flag_was_clear)
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status16bf, 0, "fallback row-wise 8i GEMM validation should run");
  REQUIRE(max_rel < 5e-3, "fallback row-wise 8i GEMM should match dense GPU bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1404
1405
// Compares row-wise 8i GEMM with bias (second helper argument 1) against a
// dense reference for 257 x 384 x 128, with
// CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS enabled during each comparison.
//
// The status of each comparison is captured first and the flag is restored to
// its prior state BEFORE any REQUIRE runs, so a failing assertion that leaves
// the case early cannot leave the process-wide flag enabled for later tests.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias fallback dequantize")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  double max_abs = 0;
  double max_rel = 0;

  // fp16
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16f = _mps_forward_scaled_gemm_compare_dense(CCV_16F, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status16f, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM with bias should match dense GPU fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // fp32
  max_abs = 0;
  max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status32f = _mps_forward_scaled_gemm_compare_dense(CCV_32F, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status32f, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "fallback row-wise 8i GEMM with bias should match dense GPU fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);

  // bf16
  max_abs = 0;
  max_rel = 0;
  ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  const int status16bf = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 1, 257, 384, 128, &max_abs, &max_rel);
  if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
    ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
  REQUIRE_EQ(status16bf, 0, "fallback row-wise 8i GEMM with bias validation should run");
  REQUIRE(max_rel < 5e-3, "fallback row-wise 8i GEMM with bias should match dense GPU bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1434
1435
// Runs the CCV_16BF dense comparison with bias (second helper argument 1) over
// three larger shapes, with CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS enabled
// during each comparison. max_rel must stay below 5e-3 for every shape.
//
// The flag is enabled and restored around each individual comparison, and the
// assertions run only after the restore, so a failing REQUIRE that leaves the
// case early cannot leave the process-wide flag enabled for later tests.
TEST_CASE("mps forward gemm with row-wise 8i weight and bias fallback dequantize large shapes")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  const uint64_t old_flags = ccv_nnc_flags();
  // Each row is passed as the 3rd, 4th and 5th argument of the comparison helper.
  static const int shapes[][3] = {
    {32, 3840, 3840},
    {32, 10240, 3840},
    {32, 3840, 10240},
  };
  int i;
  for (i = 0; i < (int)(sizeof(shapes) / sizeof(shapes[0])); i++)
  {
    double max_abs = 0;
    double max_rel = 0;
    ccv_nnc_enable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    const int status = _mps_forward_scaled_gemm_compare_dense(CCV_16BF, 1, shapes[i][0], shapes[i][1], shapes[i][2], &max_abs, &max_rel);
    // Put the flag back before asserting; leave it set if it was set on entry.
    if (!(old_flags & CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS))
      ccv_nnc_disable_flag(CCV_NNC_DISABLE_MFA_NEURAL_ACCELERATORS);
    REQUIRE_EQ(status, 0, "large fallback row-wise 8i GEMM with bias validation should run");
    REQUIRE(max_rel < 5e-3, "large fallback row-wise 8i GEMM with bias should match dense GPU bf16 reference for shape %d x %d x %d, max_abs=%g max_rel=%g", shapes[i][0], shapes[i][1], shapes[i][2], max_abs, max_rel);
  }
}
1457
1458
// Batched GEMM sweep: _mps_forward_scaled_gemm_validate_batched is called with
// (datatype, 0, 0, 0, ...) for CCV_16F, CCV_32F and CCV_16BF. Each call must
// return 0; max_rel must stay below 2e-3, 2e-3 and 5e-3.
TEST_CASE("mps forward batched gemm with broadcast row-wise 8i weight NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16F, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match broadcast-weight fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_32F, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match broadcast-weight fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16BF, 0, 0, 0, &max_abs, &max_rel), 0, "batched scaled GEMM validation should run");
  REQUIRE(max_rel < 5e-3, "batched quantized NAInt8MatMul should match broadcast-weight bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1474
1475
// Batched GEMM sweep with the three switches of
// _mps_forward_scaled_gemm_validate_batched all set to 1 (per the messages:
// batched weight and bias -- confirm the exact meaning against the helper).
// Each call must return 0; max_rel bounds are 2e-3, 2e-3 and 5e-3.
TEST_CASE("mps forward batched gemm with batched row-wise 8i weight and bias NA")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // fp16
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16F, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match batched-weight fp16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // fp32
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_32F, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
  REQUIRE(max_rel < 2e-3, "batched quantized NAInt8MatMul should match batched-weight fp32 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // bf16
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_validate_batched(CCV_16BF, 1, 1, 1, &max_abs, &max_rel), 0, "batched scaled GEMM validation with batched weight and bias should run");
  REQUIRE(max_rel < 5e-3, "batched quantized NAInt8MatMul should match batched-weight bf16 reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1491
1492
// Opt-in test: does nothing unless CCV_NNC_RUN_PADDED_SCALED_GEMM_TEST is
// present in the environment. It then runs three CCV_16F comparisons through
// _mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape, each of which
// must return 0 and report max_rel below 2e-3.
TEST_CASE("mps forward batched gemm with padded A view and broadcast row-wise 8i weight NA")
{
  if (getenv("CCV_NNC_RUN_PADDED_SCALED_GEMM_TEST") == NULL)
    return;
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  double max_abs = 0, max_rel = 0;
  // Second argument 0, third argument 1.
  REQUIRE_EQ(_mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(CCV_16F, 0, 1, 512, 3072, 3072, 513, &max_abs, &max_rel), 0, "single-batch padded-A scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "single-batch padded-A scaled GEMM without bias should match dense reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // Second argument 0, third argument 2.
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(CCV_16F, 0, 2, 512, 3072, 3072, 513, &max_abs, &max_rel), 0, "batched padded-A scaled GEMM validation should run");
  REQUIRE(max_rel < 2e-3, "batched padded-A scaled GEMM without bias should match dense reference, max_abs=%g max_rel=%g", max_abs, max_rel);
  // Second argument 1, third argument 2.
  max_abs = max_rel = 0;
  REQUIRE_EQ(_mps_forward_scaled_gemm_compare_dense_batched_padded_a_shape(CCV_16F, 1, 2, 512, 3072, 3072, 513, &max_abs, &max_rel), 0, "batched padded-A scaled GEMM with bias validation should run");
  REQUIRE(max_rel < 2e-3, "batched padded-A scaled GEMM with bias should match dense reference, max_abs=%g max_rel=%g", max_abs, max_rel);
}
1510
1511
// Turns its argument into a string literal exactly as written (the # operator
// does not macro-expand the argument first).
#define _STRINGIFY(token) #token
1512
// Two-step stringification: the argument is macro-expanded on the way into
// _STRINGIFY, so the expanded text is what ends up in the string literal.
#define STRINGIFY(token) _STRINGIFY(token)
1513
// Defines a test case named "mps forward gemm no bias NA shape MxNxK". The case
// returns immediately unless CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS is present in the
// environment, is then guarded on ccv_nnc_cmd_ok for GEMM forward on MPS, and
// finally requires _mps_forward_na_gemm_validate_shape(M, N, K, &mismatch) to
// be non-zero. On failure the message prints the shape plus the row, col,
// actual and expected fields of mismatch.
// mismatch is zero-initialized with {0}: the empty initializer {} is not valid
// ISO C before C23.
#define NA_GEMM_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm no bias NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    if (!getenv("CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS")) \
      return; \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {0}; \
    REQUIRE(_mps_forward_na_gemm_validate_shape(M, N, K, &mismatch), "sampled GEMM result should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g", M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected); \
  }
1522
1523
// Defines a test case named "mps forward gemm with bias NA shape MxNxK". The
// case returns immediately unless CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS is present in
// the environment, is then guarded on ccv_nnc_cmd_ok for GEMM forward on MPS,
// and finally requires
// _mps_forward_na_gemm_validate_shape_with_bias(M, N, K, &mismatch) to be
// non-zero. On failure the message prints the shape plus the row, col, actual
// and expected fields of mismatch.
// mismatch is zero-initialized with {0}: the empty initializer {} is not valid
// ISO C before C23.
#define NA_GEMM_BIAS_SHAPE_TEST(M, N, K) \
  TEST_CASE("mps forward gemm with bias NA shape " STRINGIFY(M) "x" STRINGIFY(N) "x" STRINGIFY(K)) \
  { \
    if (!getenv("CCV_NNC_RUN_NA_GEMM_SHAPE_TESTS")) \
      return; \
    GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS)); \
    _mps_forward_na_gemm_mismatch_t mismatch = {0}; \
    REQUIRE(_mps_forward_na_gemm_validate_shape_with_bias(M, N, K, &mismatch), "sampled GEMM result with bias should match reference for shape (%d, %d, %d) at (%d, %d): %g vs %g", M, N, K, mismatch.row, mismatch.col, mismatch.actual, mismatch.expected); \
  }
1532
1533
// Plain A (4x2) * B (2x3): the host tensors are copied into tensors created
// with GPU_TENSOR_NHWC, CMD_GEMM_FORWARD() runs on those, and the product is
// copied back and compared with the hand-expanded 4x3 expectation.
TEST_CASE("gemm no transpose")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 2,
    3, 4,
    5, 6,
    7, 8,
  };
  float b_data[] = {
    7, 8, 9,
    10, 11, 12,
  };
  // Expected product, one term per (row of A) x (column of B).
  float expected[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  // Host-side tensors; a and b wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
}
1570
1571
// A is stored as 2x4 and multiplied with CMD_GEMM_FORWARD(TRANSPOSE(0, 1));
// B is 2x3. The 4x3 expectation is the same one used by the untransposed case.
TEST_CASE("gemm transpose a")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 3, 5, 7,
    2, 4, 6, 8,
  };
  float b_data[] = {
    7, 8, 9,
    10, 11, 12,
  };
  float expected[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  // Host-side tensors; a and b wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply with A transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
}
1606
1607
// A is 4x2; B is stored as 3x2 and multiplied with
// CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)). The 4x3 expectation is the
// same one used by the untransposed case.
TEST_CASE("gemm transpose b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 2,
    3, 4,
    5, 6,
    7, 8,
  };
  float b_data[] = {
    7, 10,
    8, 11,
    9, 12,
  };
  float expected[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  // Host-side tensors; a and b wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply with B transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
}
1645
1646
// A is stored as 2x4 and B as 3x2; both are multiplied with
// CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)). The 4x3 expectation is
// the same one used by the untransposed case.
TEST_CASE("gemm transpose a and b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 3, 5, 7,
    2, 4, 6, 8,
  };
  float b_data[] = {
    7, 10,
    8, 11,
    9, 12,
  };
  float expected[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
  };
  // Host-side tensors; a and b wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload, multiply with both operands transposed, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
}
1682
1683
// A (4x2) * B (2x3) with a third 4x3 input d passed to CMD_GEMM_FORWARD().
// Every row of d is {1, -1, 1}, and the expectation adds +1, -1, +1 to the
// three columns of the plain product.
TEST_CASE("gemm no transpose with bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 2,
    3, 4,
    5, 6,
    7, 8,
  };
  float b_data[] = {
    7, 8, 9,
    10, 11, 12,
  };
  float bias_data[] = {
    1, -1, 1,
    1, -1, 1,
    1, -1, 1,
    1, -1, 1,
  };
  float expected[] = {
    1 * 7 + 2 * 10 + 1, 1 * 8 + 2 * 11 - 1, 1 * 9 + 2 * 12 + 1,
    3 * 7 + 4 * 10 + 1, 3 * 8 + 4 * 11 - 1, 3 * 9 + 4 * 12 + 1,
    5 * 7 + 6 * 10 + 1, 5 * 8 + 6 * 11 - 1, 5 * 9 + 6 * 12 + 1,
    7 * 7 + 8 * 10 + 1, 7 * 8 + 8 * 11 - 1, 7 * 9 + 8 * 12 + 1,
  };
  // Host-side tensors; a, b and d wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(bias_data, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_d = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  // Upload all three inputs, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(gpu_a, gpu_b, gpu_d), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b, gpu_d), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(d);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
  ccv_nnc_tensor_free(gpu_d);
}
1730
1731
// A is 2x4x2 while B is a single 2x3 matrix. The expectation multiplies each
// 4x2 slice of A by that same B, giving a 2x4x3 result.
TEST_CASE("gemm no transpose batch 2, no batch b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 2,
    3, 4,
    5, 6,
    7, 8,
    2, 3,
    4, 5,
    6, 7,
    8, 9
  };
  float b_data[] = {
    7, 8, 9,
    10, 11, 12,
  };
  // First four rows: first slice of A; last four rows: second slice.
  float expected[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
    2 * 7 + 3 * 10, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12,
    4 * 7 + 5 * 10, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12,
    6 * 7 + 7 * 10, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12,
    8 * 7 + 9 * 10, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12,
  };
  // Host-side tensors; a and b wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  // Upload, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
}
1776
1777
// A is 2x4x2 and B is 2x2x3. The expectation pairs the first slice of A with
// the first slice of B and the second with the second, giving 2x4x3.
TEST_CASE("gemm no transpose batch 2")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  float a_data[] = {
    1, 2,
    3, 4,
    5, 6,
    7, 8,
    2, 3,
    4, 5,
    6, 7,
    8, 9
  };
  float b_data[] = {
    7, 8, 9,
    10, 11, 12,
    8, 9, 10,
    11, 12, 13,
  };
  // First four rows use the first slice of B, the last four the second slice.
  float expected[] = {
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
  };
  // Host-side tensors; a and b wrap the arrays above.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(a_data, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(b_data, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  // Device-side counterparts.
  ccv_nnc_tensor_t* gpu_a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
  ccv_nnc_tensor_t* gpu_b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
  ccv_nnc_tensor_t* gpu_c = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
  // Upload, multiply, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(gpu_a, gpu_b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_a, gpu_b), TENSOR_LIST(gpu_c), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_c), TENSOR_LIST(c), 0);
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(expected, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(gpu_a);
  ccv_nnc_tensor_free(gpu_b);
  ccv_nnc_tensor_free(gpu_c);
}
1824
1825
TEST_CASE("gemm transpose a batch 2, no batch b, with bias")
1826
1
{
1827
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1828
0
  float ap[] = {
1829
0
    1, 3, 5, 7,
1830
0
    2, 4, 6, 8,
1831
0
    2, 4, 6, 8,
1832
0
    3, 5, 7, 9,
1833
0
  };
1834
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1835
0
  float bp[] = {
1836
0
    7, 8, 9,
1837
0
    10, 11, 12,
1838
0
  };
1839
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
1840
0
  float dp[] = {
1841
0
    -1, 0, 1,
1842
0
  };
1843
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 3), 0);
1844
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1845
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1846
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
1847
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1848
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
1849
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
1850
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
1851
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1852
0
  float ctp[] = {
1853
0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
1854
0
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
1855
0
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
1856
0
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
1857
0
    2 * 7 + 3 * 10 - 1, 2 * 8 + 3 * 11, 2 * 9 + 3 * 12 + 1,
1858
0
    4 * 7 + 5 * 10 - 1, 4 * 8 + 5 * 11, 4 * 9 + 5 * 12 + 1,
1859
0
    6 * 7 + 7 * 10 - 1, 6 * 8 + 7 * 11, 6 * 9 + 7 * 12 + 1,
1860
0
    8 * 7 + 9 * 10 - 1, 8 * 8 + 9 * 11, 8 * 9 + 9 * 12 + 1,
1861
0
  };
1862
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1863
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1864
0
  ccv_nnc_tensor_free(a);
1865
0
  ccv_nnc_tensor_free(b);
1866
0
  ccv_nnc_tensor_free(c);
1867
0
  ccv_nnc_tensor_free(d);
1868
0
  ccv_nnc_tensor_free(ga);
1869
0
  ccv_nnc_tensor_free(gb);
1870
0
  ccv_nnc_tensor_free(gc);
1871
0
  ccv_nnc_tensor_free(gd);
1872
0
}
1873
1874
TEST_CASE("gemm transpose a batch 2, with bias")
1875
1
{
1876
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1877
0
  float ap[] = {
1878
0
    1, 3, 5, 7,
1879
0
    2, 4, 6, 8,
1880
0
    2, 4, 6, 8,
1881
0
    3, 5, 7, 9,
1882
0
  };
1883
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
1884
0
  float bp[] = {
1885
0
    7, 8, 9,
1886
0
    10, 11, 12,
1887
0
    8, 9, 10,
1888
0
    11, 12, 13,
1889
0
  };
1890
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
1891
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1892
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
1893
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
1894
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1895
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
1896
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
1897
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1898
0
  float ctp[] = {
1899
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
1900
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
1901
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
1902
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
1903
0
    2 * 8 + 3 * 11, 2 * 9 + 3 * 12, 2 * 10 + 3 * 13,
1904
0
    4 * 8 + 5 * 11, 4 * 9 + 5 * 12, 4 * 10 + 5 * 13,
1905
0
    6 * 8 + 7 * 11, 6 * 9 + 7 * 12, 6 * 10 + 7 * 13,
1906
0
    8 * 8 + 9 * 11, 8 * 9 + 9 * 12, 8 * 10 + 9 * 13,
1907
0
  };
1908
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1909
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1910
0
  ccv_nnc_tensor_free(a);
1911
0
  ccv_nnc_tensor_free(b);
1912
0
  ccv_nnc_tensor_free(c);
1913
0
  ccv_nnc_tensor_free(ga);
1914
0
  ccv_nnc_tensor_free(gb);
1915
0
  ccv_nnc_tensor_free(gc);
1916
0
}
1917
1918
TEST_CASE("gemm transpose b batch 2, with bias")
1919
1
{
1920
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1921
0
  float ap[] = {
1922
0
    1, 2,
1923
0
    3, 4,
1924
0
    5, 6,
1925
0
    7, 8,
1926
0
    2, 3,
1927
0
    4, 5,
1928
0
    6, 7,
1929
0
    8, 9
1930
0
  };
1931
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1932
0
  float bp[] = {
1933
0
    7, 10,
1934
0
    8, 11,
1935
0
    9, 12,
1936
0
    80, 110,
1937
0
    90, 120,
1938
0
    10, 13,
1939
0
  };
1940
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1941
0
  float dp[] = {
1942
0
    -1, 0, 1,
1943
0
    2, 3, -4,
1944
0
  };
1945
0
  ccv_nnc_tensor_t* const d = ccv_nnc_tensor_new(dp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
1946
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1947
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
1948
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
1949
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
1950
0
  ccv_nnc_tensor_t* gd = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
1951
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b, d), TENSOR_LIST(ga, gb, gd), 0);
1952
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb, gd), TENSOR_LIST(gc), 0);
1953
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
1954
0
  float ctp[] = {
1955
0
    1 * 7 + 2 * 10 - 1, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12 + 1,
1956
0
    3 * 7 + 4 * 10 - 1, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12 + 1,
1957
0
    5 * 7 + 6 * 10 - 1, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12 + 1,
1958
0
    7 * 7 + 8 * 10 - 1, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12 + 1,
1959
0
    2 * 80 + 3 * 110 + 2, 2 * 90 + 3 * 120 + 3, 2 * 10 + 3 * 13 - 4,
1960
0
    4 * 80 + 5 * 110 + 2, 4 * 90 + 5 * 120 + 3, 4 * 10 + 5 * 13 - 4,
1961
0
    6 * 80 + 7 * 110 + 2, 6 * 90 + 7 * 120 + 3, 6 * 10 + 7 * 13 - 4,
1962
0
    8 * 80 + 9 * 110 + 2, 8 * 90 + 9 * 120 + 3, 8 * 10 + 9 * 13 - 4,
1963
0
  };
1964
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
1965
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
1966
0
  ccv_nnc_tensor_free(a);
1967
0
  ccv_nnc_tensor_free(b);
1968
0
  ccv_nnc_tensor_free(c);
1969
0
  ccv_nnc_tensor_free(d);
1970
0
  ccv_nnc_tensor_free(ga);
1971
0
  ccv_nnc_tensor_free(gb);
1972
0
  ccv_nnc_tensor_free(gc);
1973
0
  ccv_nnc_tensor_free(gd);
1974
0
}
1975
1976
TEST_CASE("gemm transpose b batch 2")
1977
1
{
1978
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
1979
0
  float ap[] = {
1980
0
    1, 2,
1981
0
    3, 4,
1982
0
    5, 6,
1983
0
    7, 8,
1984
0
    2, 3,
1985
0
    4, 5,
1986
0
    6, 7,
1987
0
    8, 9
1988
0
  };
1989
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
1990
0
  float bp[] = {
1991
0
    7, 10,
1992
0
    8, 11,
1993
0
    9, 12,
1994
0
    80, 110,
1995
0
    90, 120,
1996
0
    10, 13,
1997
0
  };
1998
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
1999
0
  ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2000
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
2001
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
2002
0
  ccv_nnc_tensor_t* gc = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
2003
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a, b), TENSOR_LIST(ga, gb), 0);
2004
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ga, gb), TENSOR_LIST(gc), 0);
2005
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gc), TENSOR_LIST(c), 0);
2006
0
  float ctp[] = {
2007
0
    1 * 7 + 2 * 10, 1 * 8 + 2 * 11, 1 * 9 + 2 * 12,
2008
0
    3 * 7 + 4 * 10, 3 * 8 + 4 * 11, 3 * 9 + 4 * 12,
2009
0
    5 * 7 + 6 * 10, 5 * 8 + 6 * 11, 5 * 9 + 6 * 12,
2010
0
    7 * 7 + 8 * 10, 7 * 8 + 8 * 11, 7 * 9 + 8 * 12,
2011
0
    2 * 80 + 3 * 110, 2 * 90 + 3 * 120, 2 * 10 + 3 * 13,
2012
0
    4 * 80 + 5 * 110, 4 * 90 + 5 * 120, 4 * 10 + 5 * 13,
2013
0
    6 * 80 + 7 * 110, 6 * 90 + 7 * 120, 6 * 10 + 7 * 13,
2014
0
    8 * 80 + 9 * 110, 8 * 90 + 9 * 120, 8 * 10 + 9 * 13,
2015
0
  };
2016
0
  ccv_nnc_tensor_t ct = ccv_nnc_tensor(ctp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
2017
0
  REQUIRE_TENSOR_EQ(c, &ct, "result should be equal");
2018
0
  ccv_nnc_tensor_free(a);
2019
0
  ccv_nnc_tensor_free(b);
2020
0
  ccv_nnc_tensor_free(c);
2021
0
  ccv_nnc_tensor_free(ga);
2022
0
  ccv_nnc_tensor_free(gb);
2023
0
  ccv_nnc_tensor_free(gc);
2024
0
}
2025
2026
TEST_CASE("mps forward gemm")
2027
1
{
2028
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2029
0
  dsfmt_t dsfmt;
2030
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2031
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2032
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2033
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
2034
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2035
2036
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2037
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2038
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2039
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2040
0
  int i;
2041
0
  for (i = 0; i < 64 * 128; i++)
2042
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2043
0
  for (i = 0; i < 64; i++)
2044
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2045
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2046
0
  for (i = 0; i < 10 * 128; i++)
2047
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2048
0
  for (i = 0; i < 128; i++)
2049
0
    ha->data.f32[i] = ha1->data.f32[i];
2050
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(a, w, bias), 0);
2051
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2052
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2053
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2054
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2055
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2056
0
  for (i = 0; i < 64; i++)
2057
0
    tb1->data.f32[i] = tb->data.f32[i];
2058
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-6, "GPU computed output should be numerically close to CPU computed ones");
2059
0
  ccv_nnc_tensor_free(a);
2060
0
  ccv_nnc_tensor_free(w);
2061
0
  ccv_nnc_tensor_free(bias);
2062
0
  ccv_nnc_tensor_free(tb);
2063
0
  ccv_nnc_tensor_free(b);
2064
0
  ccv_nnc_tensor_free(ha);
2065
0
  ccv_nnc_tensor_free(ha1);
2066
0
  ccv_nnc_tensor_free(tb1);
2067
0
  ccv_nnc_tensor_free(hw);
2068
0
  ccv_nnc_tensor_free(hbias);
2069
0
  ccv_nnc_tensor_free(hb);
2070
0
}
2071
2072
TEST_CASE("mps forward gemm in half precision")
2073
1
{
2074
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2075
0
  dsfmt_t dsfmt;
2076
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2077
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
2078
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2079
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
2080
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
2081
2082
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2083
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2084
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2085
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2086
0
  int i;
2087
0
  for (i = 0; i < 64 * 128; i++)
2088
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2089
0
  for (i = 0; i < 64; i++)
2090
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2091
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2092
0
  for (i = 0; i < 10 * 128; i++)
2093
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2094
0
  for (i = 0; i < 128; i++)
2095
0
    ha->data.f32[i] = ha1->data.f32[i];
2096
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
2097
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2098
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
2099
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
2100
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
2101
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2102
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2103
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
2104
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2105
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2106
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2107
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-3, "GPU computed output should be the same as CPU computed ones");
2108
0
  ccv_nnc_tensor_free(a);
2109
0
  ccv_nnc_tensor_free(w);
2110
0
  ccv_nnc_tensor_free(bias);
2111
0
  ccv_nnc_tensor_free(b);
2112
0
  ccv_nnc_tensor_free(tb);
2113
0
  ccv_nnc_tensor_free(ha);
2114
0
  ccv_nnc_tensor_free(ha1);
2115
0
  ccv_nnc_tensor_free(tb1);
2116
0
  ccv_nnc_tensor_free(hw);
2117
0
  ccv_nnc_tensor_free(hbias);
2118
0
  ccv_nnc_tensor_free(hb);
2119
0
  ccv_nnc_tensor_free(ha2);
2120
0
  ccv_nnc_tensor_free(hw2);
2121
0
  ccv_nnc_tensor_free(hbias2);
2122
0
}
2123
2124
TEST_CASE("mps forward gemm in bfloat precision")
2125
1
{
2126
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2127
0
  dsfmt_t dsfmt;
2128
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2129
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
2130
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
2131
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
2132
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);
2133
2134
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2135
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2136
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2137
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2138
0
  int i;
2139
0
  for (i = 0; i < 64 * 128; i++)
2140
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2141
0
  for (i = 0; i < 64; i++)
2142
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2143
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2144
0
  for (i = 0; i < 10 * 128; i++)
2145
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2146
0
  for (i = 0; i < 128; i++)
2147
0
    ha->data.f32[i] = ha1->data.f32[i];
2148
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
2149
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
2150
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
2151
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
2152
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
2153
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2154
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2155
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
2156
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2157
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2158
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2159
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 8e-3, "GPU computed output should be the same as CPU computed ones");
2160
0
  ccv_nnc_tensor_free(a);
2161
0
  ccv_nnc_tensor_free(w);
2162
0
  ccv_nnc_tensor_free(bias);
2163
0
  ccv_nnc_tensor_free(b);
2164
0
  ccv_nnc_tensor_free(tb);
2165
0
  ccv_nnc_tensor_free(ha);
2166
0
  ccv_nnc_tensor_free(ha1);
2167
0
  ccv_nnc_tensor_free(tb1);
2168
0
  ccv_nnc_tensor_free(hw);
2169
0
  ccv_nnc_tensor_free(hbias);
2170
0
  ccv_nnc_tensor_free(hb);
2171
0
  ccv_nnc_tensor_free(ha2);
2172
0
  ccv_nnc_tensor_free(hw2);
2173
0
  ccv_nnc_tensor_free(hbias2);
2174
0
}
2175
2176
TEST_CASE("mps forward gemv in half precision, variant 1")
2177
1
{
2178
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2179
0
  dsfmt_t dsfmt;
2180
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2181
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
2182
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2183
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64), 0);
2184
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
2185
2186
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2187
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2188
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2189
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2190
0
  int i;
2191
0
  for (i = 0; i < 64 * 128; i++)
2192
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2193
0
  for (i = 0; i < 64; i++)
2194
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2195
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2196
0
  for (i = 0; i < 128; i++)
2197
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2198
0
  for (i = 0; i < 128; i++)
2199
0
    ha->data.f32[i] = ha1->data.f32[i];
2200
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
2201
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2202
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64), 0);
2203
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
2204
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
2205
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2206
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2207
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
2208
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2209
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2210
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2211
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2212
0
  ccv_nnc_tensor_free(a);
2213
0
  ccv_nnc_tensor_free(w);
2214
0
  ccv_nnc_tensor_free(bias);
2215
0
  ccv_nnc_tensor_free(b);
2216
0
  ccv_nnc_tensor_free(tb);
2217
0
  ccv_nnc_tensor_free(ha);
2218
0
  ccv_nnc_tensor_free(ha1);
2219
0
  ccv_nnc_tensor_free(tb1);
2220
0
  ccv_nnc_tensor_free(hw);
2221
0
  ccv_nnc_tensor_free(hbias);
2222
0
  ccv_nnc_tensor_free(hb);
2223
0
  ccv_nnc_tensor_free(ha2);
2224
0
  ccv_nnc_tensor_free(hw2);
2225
0
  ccv_nnc_tensor_free(hbias2);
2226
0
}
2227
2228
TEST_CASE("mps forward gemv in bfloat precision, variant 1")
2229
1
{
2230
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2231
0
  dsfmt_t dsfmt;
2232
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2233
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 128), 0);
2234
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
2235
0
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64), 0);
2236
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 1, 64), 0);
2237
2238
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2239
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2240
0
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
2241
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2242
0
  int i;
2243
0
  for (i = 0; i < 64 * 128; i++)
2244
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2245
0
  for (i = 0; i < 64; i++)
2246
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2247
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2248
0
  for (i = 0; i < 128; i++)
2249
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2250
0
  for (i = 0; i < 128; i++)
2251
0
    ha->data.f32[i] = ha1->data.f32[i];
2252
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 128), 0);
2253
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
2254
0
  ccv_nnc_tensor_t* hbias2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64), 0);
2255
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw, hbias), TENSOR_LIST(ha2, hw2, hbias2), 0);
2256
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2, hbias2), TENSOR_LIST(a, w, bias), 0);
2257
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
2258
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
2259
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 1, 64), 0);
2260
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2261
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2262
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2263
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 8e-3, "GPU computed output should be the same as CPU computed ones");
2264
0
  ccv_nnc_tensor_free(a);
2265
0
  ccv_nnc_tensor_free(w);
2266
0
  ccv_nnc_tensor_free(bias);
2267
0
  ccv_nnc_tensor_free(b);
2268
0
  ccv_nnc_tensor_free(tb);
2269
0
  ccv_nnc_tensor_free(ha);
2270
0
  ccv_nnc_tensor_free(ha1);
2271
0
  ccv_nnc_tensor_free(tb1);
2272
0
  ccv_nnc_tensor_free(hw);
2273
0
  ccv_nnc_tensor_free(hbias);
2274
0
  ccv_nnc_tensor_free(hb);
2275
0
  ccv_nnc_tensor_free(ha2);
2276
0
  ccv_nnc_tensor_free(hw2);
2277
0
  ccv_nnc_tensor_free(hbias2);
2278
0
}
2279
2280
TEST_CASE("mps depalettize 5-bit half precision")
2281
1
{
2282
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2283
0
  float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
2284
0
  uint16_t lut[32];
2285
0
  ccv_float_to_half_precision(lut_f32, lut, 32);
2286
0
  uint16_t* const values = ccmalloc(sizeof(uint16_t) * 3072);
2287
0
  int i;
2288
0
  for (i = 0; i < 3072; i++)
2289
0
    values[i] = lut[i % 32];
2290
0
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (2112 + 3) / 4), 0);
2291
0
  uint8_t* compressed = tensor->data.u8;
2292
0
  const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 3072, 5, 1024, compressed, 2112);
2293
0
  REQUIRE_EQ(output_size, 2112, "output size should match");
2294
0
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (2112 + 3) / 4), 0);
2295
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
2296
0
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 16F, 3072), 0);
2297
0
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, output_size, 5, 1024, gv_tensor->data.u8, 3072);
2298
0
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(16F, 3072), 0);
2299
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
2300
0
  REQUIRE_ARRAY_EQ(uint16_t, values, v_tensor->data.f16, 3072, "GPU computed output should match CPU depalettize");
2301
0
  ccfree(values);
2302
0
  ccv_nnc_tensor_free(tensor);
2303
0
  ccv_nnc_tensor_free(g_tensor);
2304
0
  ccv_nnc_tensor_free(gv_tensor);
2305
0
  ccv_nnc_tensor_free(v_tensor);
2306
0
}
2307
2308
TEST_CASE("mps depalettize 6-bit float precision")
2309
1
{
2310
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2311
0
  float lut[64];
2312
0
  int i;
2313
0
  for (i = 0; i < 64; i++)
2314
0
    lut[i] = (float)i;
2315
0
  float* const values = ccmalloc(sizeof(float) * 8192);
2316
0
  for (i = 0; i < 8192; i++)
2317
0
    values[i] = lut[i % 64];
2318
0
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 2 * 64 * 4 + 3) / 4), 0);
2319
0
  uint8_t* compressed = tensor->data.u8;
2320
0
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 8192, 6, 4096, compressed, 6144 + 2 * 64 * 4);
2321
0
  REQUIRE_EQ(output_size, 6144 + 2 * 64 * 4, "output size should match");
2322
0
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 2 * 64 * 4 + 3) / 4), 0);
2323
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
2324
0
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 8192), 0);
2325
0
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 6, 4096, gv_tensor->data.u8, 8192);
2326
0
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 8192), 0);
2327
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
2328
0
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 8192, "GPU computed output should match CPU depalettize");
2329
0
  ccfree(values);
2330
0
  ccv_nnc_tensor_free(tensor);
2331
0
  ccv_nnc_tensor_free(g_tensor);
2332
0
  ccv_nnc_tensor_free(gv_tensor);
2333
0
  ccv_nnc_tensor_free(v_tensor);
2334
0
}
2335
2336
TEST_CASE("mps depalettize 8-bit float precision with partial block")
2337
1
{
2338
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2339
0
  float lut[256];
2340
0
  int i;
2341
0
  for (i = 0; i < 256; i++)
2342
0
    lut[i] = (float)i;
2343
0
  float* const values = ccmalloc(sizeof(float) * 3072);
2344
0
  for (i = 0; i < 3072; i++)
2345
0
    values[i] = lut[i % 256];
2346
0
  ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, (6144 + 3) / 4), 0);
2347
0
  uint8_t* compressed = tensor->data.u8;
2348
0
  const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 3072, 8, 2048, compressed, 6144);
2349
0
  REQUIRE(output_size <= 6144, "output size should fit the allocated buffer");
2350
0
  ccv_nnc_tensor_t* g_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, (6144 + 3) / 4), 0);
2351
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(g_tensor), 0);
2352
0
  ccv_nnc_tensor_t* gv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 3072), 0);
2353
0
  ccv_nnc_depalettize(g_tensor->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, output_size, 8, 2048, gv_tensor->data.u8, 3072);
2354
0
  ccv_nnc_tensor_t* v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 3072), 0);
2355
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gv_tensor), TENSOR_LIST(v_tensor), 0);
2356
0
  REQUIRE_ARRAY_EQ(float, values, v_tensor->data.f32, 3072, "GPU computed output should match CPU depalettize");
2357
0
  ccfree(values);
2358
0
  ccv_nnc_tensor_free(tensor);
2359
0
  ccv_nnc_tensor_free(g_tensor);
2360
0
  ccv_nnc_tensor_free(gv_tensor);
2361
0
  ccv_nnc_tensor_free(v_tensor);
2362
0
}
2363
2364
TEST_CASE("mps dequantize row-wise 8i half precision")
2365
1
{
2366
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2367
0
  const int rows = 17;
2368
0
  const int cols = 64;
2369
0
  float* const values = ccmalloc(sizeof(float) * rows * cols);
2370
0
  int i;
2371
0
  for (i = 0; i < rows * cols; i++)
2372
0
    values[i] = ((i * 13) % 41 - 20) / 32.0f;
2373
0
  ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
2374
0
  ccv_float_to_half_precision(values, (uint16_t*)source->data.f16, rows * cols);
2375
0
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16F, rows, cols)), 0);
2376
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f16, CCV_16F, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2377
0
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2378
0
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
2379
0
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16F, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, rows * cols);
2380
0
  ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16F, rows, cols)), 0);
2381
0
  ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, rows, cols), 0);
2382
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2383
0
  ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_16F, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, rows * cols);
2384
0
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, rows, cols), 0);
2385
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2386
0
  float* const expected_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2387
0
  float* const actual_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2388
0
  ccv_half_precision_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
2389
0
  ccv_half_precision_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
2390
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 1e-3, "GPU row-wise 8i dequantize should match CPU dequantize");
2391
0
  ccfree(actual_f32);
2392
0
  ccfree(expected_f32);
2393
0
  ccv_nnc_tensor_free(actual);
2394
0
  ccv_nnc_tensor_free(gout);
2395
0
  ccv_nnc_tensor_free(gq);
2396
0
  ccv_nnc_tensor_free(expected);
2397
0
  ccv_nnc_tensor_free(q);
2398
0
  ccv_nnc_tensor_free(source);
2399
0
  ccfree(values);
2400
0
}
2401
2402
TEST_CASE("mps dequantize row-wise 8i float precision")
2403
1
{
2404
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2405
0
  const int rows = 11;
2406
0
  const int cols = 128;
2407
0
  ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
2408
0
  int i;
2409
0
  for (i = 0; i < rows * cols; i++)
2410
0
    source->data.f32[i] = ((i * 17) % 53 - 26) / 64.0f;
2411
0
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(32F, rows, cols)), 0);
2412
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f32, CCV_32F, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2413
0
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2414
0
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
2415
0
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_32F, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f32, rows * cols);
2416
0
  ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 32F, rows, cols)), 0);
2417
0
  ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, rows, cols), 0);
2418
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2419
0
  ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_32F, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, rows * cols);
2420
0
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, rows, cols), 0);
2421
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2422
0
  REQUIRE_ARRAY_EQ(float, expected->data.f32, actual->data.f32, rows * cols, "GPU row-wise 8i dequantize should match CPU dequantize");
2423
0
  ccv_nnc_tensor_free(actual);
2424
0
  ccv_nnc_tensor_free(gout);
2425
0
  ccv_nnc_tensor_free(gq);
2426
0
  ccv_nnc_tensor_free(expected);
2427
0
  ccv_nnc_tensor_free(q);
2428
0
  ccv_nnc_tensor_free(source);
2429
0
}
2430
2431
TEST_CASE("mps dequantize row-wise 8i bfloat precision")
2432
1
{
2433
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2434
0
  const int rows = 257;
2435
0
  const int cols = 130;
2436
0
  float* const values = ccmalloc(sizeof(float) * rows * cols);
2437
0
  int i;
2438
0
  for (i = 0; i < rows * cols; i++)
2439
0
    values[i] = ((i * 29) % 97 - 48) / 64.0f;
2440
0
  ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2441
0
  ccv_float_to_bfloat(values, (uint16_t*)source->data.f16, rows * cols);
2442
0
  ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, rows, cols)), 0);
2443
0
  const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2444
0
  REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2445
0
  ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2446
0
  ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, rows * cols);
2447
0
  ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16BF, rows, cols)), 0);
2448
0
  ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, rows, cols), 0);
2449
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2450
0
  ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_16BF, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, rows * cols);
2451
0
  ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2452
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2453
0
  float* const expected_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2454
0
  float* const actual_f32 = (float*)ccmalloc(sizeof(float) * rows * cols);
2455
0
  ccv_bfloat_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
2456
0
  ccv_bfloat_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
2457
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 5e-3, "GPU row-wise 8i bf16 dequantize should match CPU dequantize");
2458
0
  ccfree(actual_f32);
2459
0
  ccfree(expected_f32);
2460
0
  ccv_nnc_tensor_free(actual);
2461
0
  ccv_nnc_tensor_free(gout);
2462
0
  ccv_nnc_tensor_free(gq);
2463
0
  ccv_nnc_tensor_free(expected);
2464
0
  ccv_nnc_tensor_free(q);
2465
0
  ccv_nnc_tensor_free(source);
2466
0
  ccfree(values);
2467
0
}
2468
2469
TEST_CASE("mps dequantize row-wise 8i bfloat precision large shapes")
2470
1
{
2471
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_MPS));
2472
0
  static const int shapes[][2] = {
2473
0
    {3840, 3840},
2474
0
    {10240, 3840},
2475
0
    {3840, 10240},
2476
0
  };
2477
0
  int s;
2478
0
  for (s = 0; s < (int)(sizeof(shapes) / sizeof(shapes[0])); s++)
2479
0
  {
2480
0
    const int rows = shapes[s][0];
2481
0
    const int cols = shapes[s][1];
2482
0
    float* const values = ccmalloc(sizeof(float) * (size_t)rows * cols);
2483
0
    int i;
2484
0
    for (i = 0; i < rows * cols; i++)
2485
0
      values[i] = ((i * 29) % 97 - 48) / 64.0f;
2486
0
    ccv_nnc_tensor_t* const source = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2487
0
    ccv_float_to_bfloat(values, (uint16_t*)source->data.f16, rows * cols);
2488
0
    ccv_nnc_tensor_t* const q = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(CPU_TENSOR_NHWC(16BF, rows, cols)), 0);
2489
0
    const size_t qsize = ccv_nnc_quantize_8i_rowwise(source->data.f16, CCV_16BF, CCV_TENSOR_CPU_MEMORY, (size_t)rows * cols, cols, q->data.u8, ccv_nnc_tensor_data_size_without_padding(q->info));
2490
0
    REQUIRE_EQ(qsize, ccv_nnc_tensor_data_size_without_padding(q->info), "quantized row-wise 8i size should match");
2491
0
    ccv_nnc_tensor_t* const expected = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2492
0
    ccv_nnc_dequantize_8i_rowwise(q->data.u8, CCV_16BF, CCV_TENSOR_CPU_MEMORY, qsize, cols, expected->data.f16, (size_t)rows * cols);
2493
0
    ccv_nnc_tensor_t* const gq = ccv_nnc_tensor_new(0, ccv_nnc_tensor_8i_rowwise(GPU_TENSOR_NHWC(000, 16BF, rows, cols)), 0);
2494
0
    ccv_nnc_tensor_t* const gout = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, rows, cols), 0);
2495
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q), TENSOR_LIST(gq), 0);
2496
0
    ccv_nnc_dequantize_8i_rowwise(gq->data.u8, CCV_16BF, CCV_TENSOR_GPU_MEMORY, qsize, cols, gout->data.u8, (size_t)rows * cols);
2497
0
    ccv_nnc_tensor_t* const actual = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, rows, cols), 0);
2498
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gout), TENSOR_LIST(actual), 0);
2499
0
    float* const expected_f32 = (float*)ccmalloc(sizeof(float) * (size_t)rows * cols);
2500
0
    float* const actual_f32 = (float*)ccmalloc(sizeof(float) * (size_t)rows * cols);
2501
0
    ccv_bfloat_to_float((uint16_t*)expected->data.f16, expected_f32, rows * cols);
2502
0
    ccv_bfloat_to_float((uint16_t*)actual->data.f16, actual_f32, rows * cols);
2503
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, expected_f32, actual_f32, rows * cols, 5e-3, "GPU row-wise 8i bf16 dequantize should match CPU dequantize on large shape");
2504
0
    ccfree(actual_f32);
2505
0
    ccfree(expected_f32);
2506
0
    ccv_nnc_tensor_free(actual);
2507
0
    ccv_nnc_tensor_free(gout);
2508
0
    ccv_nnc_tensor_free(gq);
2509
0
    ccv_nnc_tensor_free(expected);
2510
0
    ccv_nnc_tensor_free(q);
2511
0
    ccv_nnc_tensor_free(source);
2512
0
    ccfree(values);
2513
0
  }
2514
0
}
2515
2516
TEST_CASE("mps forward gemm no bias")
2517
1
{
2518
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2519
0
  dsfmt_t dsfmt;
2520
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2521
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
2522
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
2523
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
2524
2525
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2526
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2527
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2528
0
  int i;
2529
0
  for (i = 0; i < 64 * 128; i++)
2530
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2531
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2532
0
  for (i = 0; i < 10 * 128; i++)
2533
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2534
0
  for (i = 0; i < 128; i++)
2535
0
    ha->data.f32[i] = ha1->data.f32[i];
2536
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(a, w), 0);
2537
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2538
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2539
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2540
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2541
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2542
0
  for (i = 0; i < 64; i++)
2543
0
    tb1->data.f32[i] = tb->data.f32[i];
2544
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 5e-6, "GPU computed output should be numerically close to CPU computed ones");
2545
0
  ccv_nnc_tensor_free(a);
2546
0
  ccv_nnc_tensor_free(w);
2547
0
  ccv_nnc_tensor_free(b);
2548
0
  ccv_nnc_tensor_free(tb);
2549
0
  ccv_nnc_tensor_free(ha);
2550
0
  ccv_nnc_tensor_free(ha1);
2551
0
  ccv_nnc_tensor_free(tb1);
2552
0
  ccv_nnc_tensor_free(hw);
2553
0
  ccv_nnc_tensor_free(hb);
2554
0
}
2555
2556
TEST_CASE("mps forward gemm no bias in half precision")
2557
1
{
2558
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2559
0
  dsfmt_t dsfmt;
2560
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2561
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 128), 0);
2562
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2563
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 10, 64), 0);
2564
2565
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2566
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2567
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2568
0
  int i;
2569
0
  for (i = 0; i < 64 * 128; i++)
2570
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2571
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2572
0
  for (i = 0; i < 10 * 128; i++)
2573
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2574
0
  for (i = 0; i < 128; i++)
2575
0
    ha->data.f32[i] = ha1->data.f32[i];
2576
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 128), 0);
2577
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2578
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2579
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2580
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2581
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2582
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 10, 64), 0);
2583
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2584
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2585
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2586
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2587
0
  ccv_nnc_tensor_free(a);
2588
0
  ccv_nnc_tensor_free(w);
2589
0
  ccv_nnc_tensor_free(b);
2590
0
  ccv_nnc_tensor_free(tb);
2591
0
  ccv_nnc_tensor_free(ha);
2592
0
  ccv_nnc_tensor_free(ha1);
2593
0
  ccv_nnc_tensor_free(tb1);
2594
0
  ccv_nnc_tensor_free(hw);
2595
0
  ccv_nnc_tensor_free(hb);
2596
0
  ccv_nnc_tensor_free(ha2);
2597
0
  ccv_nnc_tensor_free(hw2);
2598
0
}
2599
2600
TEST_CASE("mps forward gemm no bias in bfloat precision")
2601
1
{
2602
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2603
0
  dsfmt_t dsfmt;
2604
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2605
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 128), 0);
2606
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 64, 128), 0);
2607
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, 10, 64), 0);
2608
2609
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2610
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2611
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2612
0
  int i;
2613
0
  for (i = 0; i < 64 * 128; i++)
2614
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2615
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
2616
0
  for (i = 0; i < 10 * 128; i++)
2617
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2618
0
  for (i = 0; i < 128; i++)
2619
0
    ha->data.f32[i] = ha1->data.f32[i];
2620
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 128), 0);
2621
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 64, 128), 0);
2622
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2623
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2624
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2625
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2626
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 10, 64), 0);
2627
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2628
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
2629
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2630
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2631
0
  ccv_nnc_tensor_free(a);
2632
0
  ccv_nnc_tensor_free(w);
2633
0
  ccv_nnc_tensor_free(b);
2634
0
  ccv_nnc_tensor_free(tb);
2635
0
  ccv_nnc_tensor_free(ha);
2636
0
  ccv_nnc_tensor_free(ha1);
2637
0
  ccv_nnc_tensor_free(tb1);
2638
0
  ccv_nnc_tensor_free(hw);
2639
0
  ccv_nnc_tensor_free(hb);
2640
0
  ccv_nnc_tensor_free(ha2);
2641
0
  ccv_nnc_tensor_free(hw2);
2642
0
}
2643
2644
TEST_CASE("mps forward gemv in half precision no bias, variant 1")
2645
1
{
2646
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2647
0
  dsfmt_t dsfmt;
2648
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2649
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 128), 0);
2650
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2651
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 1, 64), 0);
2652
2653
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2654
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2655
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2656
0
  int i;
2657
0
  for (i = 0; i < 64 * 128; i++)
2658
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2659
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 128), 0);
2660
0
  for (i = 0; i < 128; i++)
2661
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2662
0
  for (i = 0; i < 128; i++)
2663
0
    ha->data.f32[i] = ha1->data.f32[i];
2664
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 128), 0);
2665
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2666
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2667
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2668
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
2669
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
2670
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 1, 64), 0);
2671
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2672
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
2673
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2674
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2675
0
  ccv_nnc_tensor_free(a);
2676
0
  ccv_nnc_tensor_free(w);
2677
0
  ccv_nnc_tensor_free(b);
2678
0
  ccv_nnc_tensor_free(tb);
2679
0
  ccv_nnc_tensor_free(ha);
2680
0
  ccv_nnc_tensor_free(ha1);
2681
0
  ccv_nnc_tensor_free(tb1);
2682
0
  ccv_nnc_tensor_free(hw);
2683
0
  ccv_nnc_tensor_free(hb);
2684
0
  ccv_nnc_tensor_free(ha2);
2685
0
  ccv_nnc_tensor_free(hw2);
2686
0
}
2687
2688
TEST_CASE("mps forward gemv in half precision no bias, variant 2")
2689
1
{
2690
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2691
0
  dsfmt_t dsfmt;
2692
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2693
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 128), 0);
2694
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 128, 1), 0);
2695
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 64, 1), 0);
2696
2697
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
2698
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
2699
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
2700
0
  int i;
2701
0
  for (i = 0; i < 64 * 128; i++)
2702
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2703
0
  ccv_nnc_tensor_t* ha1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128, 1), 0);
2704
0
  for (i = 0; i < 128; i++)
2705
0
    ha1->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2706
0
  for (i = 0; i < 128; i++)
2707
0
    ha->data.f32[i] = ha1->data.f32[i];
2708
0
  ccv_nnc_tensor_t* hw2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 128), 0);
2709
0
  ccv_nnc_tensor_t* ha2 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128, 1), 0);
2710
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha1, hw), TENSOR_LIST(ha2, hw2), 0);
2711
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha2, hw2), TENSOR_LIST(a, w), 0);
2712
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(hw, ha), TENSOR_LIST(hb), 0);
2713
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, NO_TRANSPOSE), ccv_nnc_no_hint, 0, TENSOR_LIST(w, a), TENSOR_LIST(b), 0);
2714
0
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 64, 1), 0);
2715
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(tb), 0);
2716
0
  ccv_nnc_tensor_t* tb1 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 1), 0);
2717
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tb), TENSOR_LIST(tb1), 0);
2718
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb1->data.f32, hb->data.f32, 64, 1e-3, "GPU computed output should be the same as CPU computed ones");
2719
0
  ccv_nnc_tensor_free(a);
2720
0
  ccv_nnc_tensor_free(w);
2721
0
  ccv_nnc_tensor_free(b);
2722
0
  ccv_nnc_tensor_free(tb);
2723
0
  ccv_nnc_tensor_free(ha);
2724
0
  ccv_nnc_tensor_free(ha1);
2725
0
  ccv_nnc_tensor_free(tb1);
2726
0
  ccv_nnc_tensor_free(hw);
2727
0
  ccv_nnc_tensor_free(hb);
2728
0
  ccv_nnc_tensor_free(ha2);
2729
0
  ccv_nnc_tensor_free(hw2);
2730
0
}
2731
2732
TEST_CASE("mps handle permute")
2733
1
{
2734
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2735
0
  dsfmt_t dsfmt;
2736
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2737
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 2, 128), 0);
2738
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 2, 128), 0);
2739
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 2, 128), 0);
2740
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 2, 128), 0);
2741
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
2742
2743
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), 0);
2744
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), 0);
2745
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 64), 0);
2746
0
  int i;
2747
0
  for (i = 0; i < 2 * 64 * 128; i++)
2748
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2749
0
  for (i = 0; i < 2 * 10 * 128; i++)
2750
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2751
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
2752
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(at), 0);
2753
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(0, 1), ccv_nnc_no_hint, 0, TENSOR_LIST(w), TENSOR_LIST(wt), 0);
2754
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
2755
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
2756
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(128, 2 * 128, 1));
2757
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
2758
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
2759
0
  ccv_nnc_tensor_t* hbt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 64), 0);
2760
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, bt), TENSOR_LIST(hb, hbt), 0);
2761
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, hbt->data.f32, 2 * 10 * 64, 1e-5, "permute computed output should be numerically close to non-permute computed ones");
2762
0
  ccv_nnc_tensor_free(ha);
2763
0
  ccv_nnc_tensor_free(hw);
2764
0
  ccv_nnc_tensor_free(a);
2765
0
  ccv_nnc_tensor_free(w);
2766
0
  ccv_nnc_tensor_free(b);
2767
0
  ccv_nnc_tensor_view_free(av);
2768
0
  ccv_nnc_tensor_view_free(wv);
2769
0
  ccv_nnc_tensor_free(at);
2770
0
  ccv_nnc_tensor_free(wt);
2771
0
  ccv_nnc_tensor_free(bt);
2772
0
  ccv_nnc_tensor_free(hb);
2773
0
  ccv_nnc_tensor_free(hbt);
2774
0
}
2775
2776
TEST_CASE("generalized batched gemm with batch (2, 4) compare mps")
2777
1
{
2778
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
2779
  // This is a particular batched gemm which treat every dimensions other than the last two as batching.
2780
0
  dsfmt_t dsfmt;
2781
0
  dsfmt_init_gen_rand(&dsfmt, 0);
2782
0
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
2783
0
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
2784
0
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2785
0
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
2786
0
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
2787
0
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
2788
2789
0
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
2790
0
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
2791
0
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
2792
0
  int i;
2793
0
  for (i = 0; i < 8 * 64 * 128; i++)
2794
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
2795
0
  for (i = 0; i < 8 * 10 * 128; i++)
2796
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
2797
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
2798
0
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
2799
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
2800
0
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
2801
0
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w, GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs, DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
2802
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv), TENSOR_LIST(b), 0);
2803
0
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0, TENSOR_LIST(at, wt), TENSOR_LIST(bt), 0);
2804
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
2805
0
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");
2806
0
  ccv_nnc_tensor_free(ha);
2807
0
  ccv_nnc_tensor_free(hw);
2808
0
  ccv_nnc_tensor_free(hb);
2809
0
  ccv_nnc_tensor_free(a);
2810
0
  ccv_nnc_tensor_free(w);
2811
0
  ccv_nnc_tensor_free(b);
2812
0
  ccv_nnc_tensor_view_free(av);
2813
0
  ccv_nnc_tensor_view_free(wv);
2814
0
  ccv_nnc_tensor_free(at);
2815
0
  ccv_nnc_tensor_free(wt);
2816
0
  ccv_nnc_tensor_free(bt);
2817
0
}
2818
2819
TEST_CASE("generalized batched gemm with batch (2, 4) and broadcast compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // Batched GEMM in which every dimension except the trailing two is a batch dimension.
  // In this variant the weight is a single 64x128 matrix paired with a 4-D input.

  // Device-side operands and result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  // Host-side sources (ha, hw) and the buffer that receives the device result (hb).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Host reference path: explicitly transposed input and the GEMM output computed from it.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Fill the weight first and the input second from one generator seeded with 0.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 64 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (int idx = 0; idx < 2 * 10 * 4 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Reference input: swap axes 1 and 2 of ha into a separate tensor.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw), TENSOR_LIST(a, w), 0);
  // View over a declared as (2, 4, 10, 128); the last argument has the two middle
  // entries swapped relative to the contiguous layout of a (2, 10, 4, 128) tensor.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));

  // Same GEMM twice: once on the device through the view, once on the host reference.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST((ccv_nnc_tensor_t*)av, w), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(at, hw), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Release the view, then the host tensors, then the device tensors.
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(a);
}
2856
2857
TEST_CASE("generalized batched gemm with batch (2, 4) with bias compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // Batched GEMM in which every dimension except the trailing two is a batch dimension.
  // Both the input and the weight are 4-D here, and a 64-element bias is supplied.

  // Device-side operands and result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  // Host-side sources (ha, hw, hbias) and the buffer that receives the device result (hb).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Host reference path: explicitly transposed input / weight and the resulting output.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Draw order matters for reproducibility: weight, then bias, then input.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 2 * 64 * 4 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (int idx = 0; idx < 64; idx++)
    hbias->data.f32[idx] = dsfmt_genrand_open_close(&rng) / 64;
  for (int idx = 0; idx < 2 * 10 * 4 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Reference operands: swap axes 1 and 2 of ha and hw into separate tensors.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  // Views over a and w declared with axes 1 and 2 exchanged; the last argument has the
  // two middle entries swapped relative to the contiguous layout of the backing tensor.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));

  // Same GEMM twice: once on the device through the views, once on the host reference.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST((ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(at, wt, hbias), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Release the views, then the host tensors, then the device tensors.
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(wt);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(a);
}
2905
2906
TEST_CASE("generalized batched gemm with batch (2, 4) with bias and broadcast compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
  // Batched GEMM in which every dimension except the trailing two is a batch dimension.
  // The weight is a single 64x128 matrix paired with a 4-D input, plus a 64-element bias.

  // Device-side operands and result.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  // Host-side sources (ha, hw, hbias) and the buffer that receives the device result (hb).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Host reference path: explicitly transposed input and the GEMM output computed from it.
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);

  // Draw order matters for reproducibility: weight, then bias, then input.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 64 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (int idx = 0; idx < 64; idx++)
    hbias->data.f32[idx] = dsfmt_genrand_open_close(&rng) / 64;
  for (int idx = 0; idx < 2 * 10 * 4 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Reference input: swap axes 1 and 2 of ha into a separate tensor.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(a, w, bias), 0);
  // View over a declared as (2, 4, 10, 128); the last argument has the two middle
  // entries swapped relative to the contiguous layout of a (2, 10, 4, 128) tensor.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));

  // Same GEMM twice: once on the device through the view, once on the host reference.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST((ccv_nnc_tensor_t*)av, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(at, hw, hbias), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(hb, bt, "permute computed output should be the same as non-permute computed ones");

  // Release the view, then the host tensors, then the device tensors.
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hbias);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(bias);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(a);
}
2949
2950
TEST_CASE("generalized batched backward gemm with batch (2, 4) compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Backward pass of a batched GEMM in which every dimension except the trailing two
  // is a batch dimension. Both the input and the weight are 4-D here.

  // Device-side forward operands (a, w), incoming gradient (b) and gradient outputs (da, dw).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  // Host-side sources (ha, hw, hb) and buffers receiving the device gradients (hda, hdw).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Host reference path: transposed operands (at, wt), their gradients (dat, dwt), and
  // those gradients transposed back to the original layout (tda, tdw).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);

  // Draw order matters for reproducibility: weight, then input, then incoming gradient.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 2 * 64 * 4 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (int idx = 0; idx < 2 * 10 * 4 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (int idx = 0; idx < 2 * 4 * 10 * 64; idx++)
    hb->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Reference operands: swap axes 1 and 2 of ha and hw into separate tensors.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Views over the device tensors declared with axes 1 and 2 exchanged; the last argument
  // has the two middle entries swapped relative to the backing tensor's contiguous layout.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));

  // Same backward GEMM twice: device through the views, host on the transposed copies.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv),
    TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt), 0);
  // Bring the reference gradients back to the layout of da / dw before comparing.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");

  // Release the views, then the host tensors, then the device tensors.
  ccv_nnc_tensor_view_free(dwv);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(dwt);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(wt);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(a);
}
3015
3016
TEST_CASE("generalized batched backward gemm with batch (2, 4) and broadcast compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Backward pass of a batched GEMM in which every dimension except the trailing two
  // is a batch dimension. The weight is a single 64x128 matrix paired with a 4-D input.

  // Device-side forward operands (a, w), incoming gradient (b) and gradient outputs (da, dw).
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  // Host-side sources (ha, hw, hb) and buffers receiving the device gradients (hda, hdw).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Host reference path: transposed input (at), its gradient (dat), that gradient
  // transposed back (tda), and the reference weight gradient (tdw).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);

  // Draw order matters for reproducibility: weight, then input, then incoming gradient.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 64 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (int idx = 0; idx < 2 * 10 * 4 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (int idx = 0; idx < 2 * 4 * 10 * 64; idx++)
    hb->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Reference input: swap axes 1 and 2 of ha into a separate tensor.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Views over a and da declared as (2, 4, 10, 128); the last argument has the two middle
  // entries swapped relative to the contiguous layout of a (2, 10, 4, 128) tensor.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));

  // Same backward GEMM twice: device through the views, host on the transposed copy.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w),
    TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hb, at, hw), TENSOR_LIST(dat, tdw), 0);
  // Bring the reference input gradient back to the layout of da before comparing.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(da, dw), TENSOR_LIST(hda, hdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");

  // Release the views, then the host tensors, then the device tensors.
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(a);
}
3071
3072
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Backward pass of a batched GEMM in which every dimension except the trailing two
  // is a batch dimension. Both operands are 4-D and a 64-element bias gradient is produced.

  // Device-side forward operands (a, w), incoming gradient (b) and gradient outputs.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);
  // Host-side sources (ha, hw, hb) and buffers receiving the device gradients.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Host reference path: transposed operands (at, wt), their gradients (dat, dwt), those
  // gradients transposed back (tda, tdw), and the reference bias gradient (tdbias).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* wt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dwt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 64, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 64, 4, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);

  // Draw order matters for reproducibility: weight, then input, then incoming gradient.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 2 * 64 * 4 * 128; idx++)
    hw->data.f32[idx] = dsfmt_genrand_open_close(&rng) / (64 * 128);
  for (int idx = 0; idx < 2 * 10 * 4 * 128; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng);
  for (int idx = 0; idx < 2 * 4 * 10 * 64; idx++)
    hb->data.f32[idx] = dsfmt_genrand_open_close(&rng);

  // Reference operands: swap axes 1 and 2 of ha and hw into separate tensors.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hw), TENSOR_LIST(wt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Views over the device tensors declared with axes 1 and 2 exchanged; the last argument
  // has the two middle entries swapped relative to the backing tensor's contiguous layout.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* wv = ccv_nnc_tensor_view_new(w,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dwv = ccv_nnc_tensor_view_new(dw,
    GPU_TENSOR_NHWC(000, 32F, 2, 4, 64, 128), ccv_nnc_no_ofs,
    DIM_ALLOC(64 * 4 * 128, 128, 4 * 128, 1));

  // Same backward GEMM twice: device through the views, host on the transposed copies.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, (ccv_nnc_tensor_t*)wv),
    TENSOR_LIST((ccv_nnc_tensor_t*)dav, (ccv_nnc_tensor_t*)dwv, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(2, 3)), ccv_nnc_no_hint, 0,
    TENSOR_LIST(hb, at, wt), TENSOR_LIST(dat, dwt, tdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  // Bring the reference gradients back to the layout of da / dw before comparing.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0,
    TENSOR_LIST(dwt), TENSOR_LIST(tdw), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");

  // Release the views, then the host tensors, then the device tensors.
  ccv_nnc_tensor_view_free(dwv);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_view_free(wv);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_free(tdbias);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(dwt);
  ccv_nnc_tensor_free(dat);
  ccv_nnc_tensor_free(wt);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(a);
}
3144
3145
TEST_CASE("generalized batched backward gemm with batch (2, 4) with bias and broadcast compare mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  // Backward pass of a batched GEMM in which every dimension except the trailing two
  // is a batch dimension. The weight is a single 64x128 matrix paired with a 4-D input,
  // and a 64-element bias gradient is produced.
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);
  // Host-side sources (ha, hw, hb) and buffers receiving the device gradients.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 64), 0);
  // Device-side forward operands (a, w), incoming gradient (b) and gradient outputs.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* da = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 64), 0);

  // Host reference path: transposed input (at), its gradient (dat), that gradient
  // transposed back (tda), and the reference weight / bias gradients (tdw, tdbias).
  ccv_nnc_tensor_t* at = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* dat = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 10, 128), 0);
  ccv_nnc_tensor_t* tda = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 10, 4, 128), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  int i;
  // Draw order: weight, then input, then incoming gradient.
  for (i = 0; i < 64 * 128; i++)
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  for (i = 0; i < 8 * 10 * 128; i++)
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  for (i = 0; i < 2 * 4 * 10 * 64; i++)
    hb->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  // Reference input: swap axes 1 and 2 of ha into a separate tensor.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(ha), TENSOR_LIST(at), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hb), TENSOR_LIST(a, w, b), 0);
  // Views over a and da declared as (2, 4, 10, 128); the last argument has the two middle
  // entries swapped relative to the contiguous layout of a (2, 10, 4, 128) tensor.
  ccv_nnc_tensor_view_t* av = ccv_nnc_tensor_view_new(a, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  ccv_nnc_tensor_view_t* dav = ccv_nnc_tensor_view_new(da, GPU_TENSOR_NHWC(000, 32F, 2, 4, 10, 128), ccv_nnc_no_ofs, DIM_ALLOC(10 * 4 * 128, 128, 4 * 128, 1));
  // NOTE(review): dbias / hdbias are also passed as a fourth input although they are never
  // initialized in this test; confirm the backward kernels ignore that input slot.
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(b, (ccv_nnc_tensor_t*)av, w, dbias), TENSOR_LIST((ccv_nnc_tensor_t*)dav, dw, dbias), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hb, at, hw, hdbias), TENSOR_LIST(dat, tdw, tdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(da, dw, dbias), TENSOR_LIST(hda, hdw, hdbias), 0);
  // Bring the reference input gradient back to the layout of da before comparing.
  ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(dat), TENSOR_LIST(tda), 0);
  REQUIRE_TENSOR_EQ(hda, tda, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdw, tdw, "permute computed output should be the same as non-permute computed ones");
  REQUIRE_TENSOR_EQ(hdbias, tdbias, "permute computed output should be the same as non-permute computed ones");
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(hw);
  ccv_nnc_tensor_free(hda);
  ccv_nnc_tensor_free(hdw);
  ccv_nnc_tensor_free(hdbias);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(w);
  ccv_nnc_tensor_free(da);
  ccv_nnc_tensor_free(dw);
  ccv_nnc_tensor_free(dbias);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_view_free(av);
  ccv_nnc_tensor_view_free(dav);
  ccv_nnc_tensor_free(at);
  ccv_nnc_tensor_free(dat);
  // tda was previously never released, unlike in the sibling backward tests.
  ccv_nnc_tensor_free(tda);
  ccv_nnc_tensor_free(tdw);
  ccv_nnc_tensor_free(tdbias);
}
3206
3207
TEST_CASE("ewdiv forward with reciprocal")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the source (ha), the device result copied back (hb) and the
  // host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device input and output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  // Fill the 10 x 100 source from a generator seeded with 0, scaled by 0.01.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 1000; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng) * 0.01;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(a), 0);

  // The first input slot is left as 0 (no tensor); only the second operand is supplied.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(0, a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(0, ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3231
3232
TEST_CASE("ewdiv forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the two sources (ha, hb), the device result copied back (hc) and the
  // host-computed reference (ct).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device inputs and output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  // Fill ha completely before hb so the sequence of random draws stays the same.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 1000; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng) * 0.01;
  for (int idx = 0; idx < 1000; idx++)
    hb->data.f32[idx] = dsfmt_genrand_open_close(&rng) * 0.01;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb), TENSOR_LIST(a, b), 0);

  // Run the same command on the device and on the host, then compare.
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a, b), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWDIV_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha, hb), TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(ct);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3262
3263
TEST_CASE("exp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the source (ha), the device result copied back (hb) and the
  // host-computed reference (bt).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device input and output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  // Fill the source from a generator seeded with 0; each draw is scaled by 10 and shifted by -1.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 1000; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng) * 10 - 1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(a), 0);

  // Run the same command on the device and on the host, then compare.
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_EWEXP_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(bt), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(b), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(bt);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(a);
}
3287
3288
TEST_CASE("ewpow forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWPOW_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the source (ha), the device result copied back (hc) and the
  // host-computed reference (ct).
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hc = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* ct = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device input and output.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);

  // Fill the source from a generator seeded with 0; each draw is scaled by 2 and shifted by 0.1.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  for (int idx = 0; idx < 1000; idx++)
    ha->data.f32[idx] = dsfmt_genrand_open_close(&rng) * 2 + 0.1;
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(a), 0);

  // Run the same command (parameter 3) on the device and on the host, then compare.
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0,
    TENSOR_LIST(a), TENSOR_LIST(c), 0);
  ccv_nnc_cmd_exec(CMD_EWPOW_FORWARD(3), ccv_nnc_no_hint, 0,
    TENSOR_LIST(ha), TENSOR_LIST(ct), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
    TENSOR_LIST(c), TENSOR_LIST(hc), 0);
  REQUIRE_TENSOR_EQ(ct, hc, "GPU computed output should be the same as CPU computed ones");

  ccv_nnc_tensor_free(ct);
  ccv_nnc_tensor_free(hc);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(c);
  ccv_nnc_tensor_free(a);
}
3312
3313
TEST_CASE("ewsin forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSIN_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible between runs.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 - 5;
  // Reference result, executed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_EWSIN_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  // Compared element-wise with an absolute tolerance of 1e-3 rather than with REQUIRE_TENSOR_EQ.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3337
3338
TEST_CASE("ewcos forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWCOS_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible between runs.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 - 5;
  // Reference result, executed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_EWCOS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  // Compared element-wise with an absolute tolerance of 1e-3 rather than with REQUIRE_TENSOR_EQ.
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bt->data.f32, hb->data.f32, 10 * 100, 1e-3, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3362
3363
TEST_CASE("ewlog forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible; the 0.0001 offset is added to every sample.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 + 0.0001;
  // Reference result, executed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_EWLOG_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3387
3388
TEST_CASE("ewsqrt forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible; the 0.0001 offset is added to every sample.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 + 0.0001;
  // Reference result, executed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_EWSQRT_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3412
3413
TEST_CASE("ewabs forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible; samples are scaled by 10 and shifted by -5 + 0.0001.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 - 5 + 0.0001;
  // Reference result, executed directly on the host tensors.
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_EWABS_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3437
3438
TEST_CASE("clamp forward")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible between runs.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 - 1;
  // Both bounds are given: 0 and 6.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3462
3463
TEST_CASE("clamp forward with only max")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible between runs.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 - 1;
  // NAN is passed as the first (min) bound; per the test name only the max of 6 applies.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(NAN, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3487
3488
TEST_CASE("clamp forward with only min")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_MPS));
  // Host tensors: the input, the device result read back (hb), and the reference result (bt).
  ccv_nnc_tensor_t* host_in = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  ccv_nnc_tensor_t* bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 10, 100), 0);
  // Device tensors: the input and output of the command under test.
  ccv_nnc_tensor_t* dev_in = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  ccv_nnc_tensor_t* dev_out = ccv_nnc_tensor_new(0, GPU_TENSOR_NCHW(000, 32F, 10, 100), 0);
  // Fixed seed keeps the generated input reproducible between runs.
  dsfmt_t rng;
  dsfmt_init_gen_rand(&rng, 0);
  const int count = 10 * 100;
  for (int k = 0; k < count; k++)
    host_in->data.f32[k] = dsfmt_genrand_open_close(&rng) * 10 - 1;
  // NAN is passed as the second (max) bound; per the test name only the min of 0 applies.
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(bt), 0);
  // Upload, run the same command on the device tensors, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(host_in), TENSOR_LIST(dev_in), 0);
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, NAN), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_in), TENSOR_LIST(dev_out), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev_out), TENSOR_LIST(hb), 0);
  REQUIRE_TENSOR_EQ(bt, hb, "GPU computed output should be the same as CPU computed ones");
  ccv_nnc_tensor_free(dev_in);
  ccv_nnc_tensor_free(dev_out);
  ccv_nnc_tensor_free(host_in);
  ccv_nnc_tensor_free(hb);
  ccv_nnc_tensor_free(bt);
}
3512
3513
TEST_CASE("compare set with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_MPS));
  // ga is filled on the host and serves as the expected value; note it is declared NCHW
  // while the device tensor and its host copy (ha) are declared NHWC.
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 11, 10, 9, 8), 0);
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 11, 10, 9, 8), 0);
  ccv_nnc_tensor_t* const dev = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 11, 10, 9, 8), 0);
  // SET takes no inputs; it writes the constant 10 into its output tensor.
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(ga), 0);
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(10), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(dev), 0);
  // Bring the device result back to the host for comparison.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dev), TENSOR_LIST(ha), 0);
  REQUIRE_TENSOR_EQ(ha, ga, "format transform result should be the same");
  ccv_nnc_tensor_free(dev);
  ccv_nnc_tensor_free(ha);
  ccv_nnc_tensor_free(ga);
}
3527
3528
TEST_CASE("scaled dot product attention with mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  // The trial counts are macros rather than const ints to bypass the compiler error
  // "variable-sized object may not be initialized" on the candidate tables below.
#define num_long_trials 6
#define num_short_trials 2
#define num_trials (num_long_trials + num_short_trials)

  // Problem sizes, one column per trial. The tables are loop-invariant, so they are built once.
  const int B_candidates[num_trials] =         {  32, 1, 1, 1,  32,   3, 2, 1 };
  const int R_candidates[num_trials] =         { 128, 4128, 4098, 4162, 128,  61, 6, 2 };
  const int C_candidates[num_trials] =         { 128, 4128, 4098, 4162, 128,  49, 2, 1 };
  const int Hq_candidates[num_trials] =        {   8, 32, 32, 32,  32,  13, 3, 1 };
  const int Hk_candidates[num_trials] =        {   8, 8, 8, 8,   8,  13, 3, 1 };
  const int D_candidates[num_trials] =         {  64, 32, 32, 32, 128, 191, 4, 8 };
  const int is_causal_candidates[num_trials] = {   0, 0, 0, 0,   1,   0, 1, 0 };

  for (int trial = 0; trial < num_trials; ++trial) {
    const int B = B_candidates[trial];
    const int R = R_candidates[trial];
    const int C = C_candidates[trial];
    const int Hq = Hq_candidates[trial];
    const int Hk = Hk_candidates[trial];
    const int D = D_candidates[trial];
    const int is_causal = is_causal_candidates[trial];
    const float scale = 1.0 / sqrt((float)D);

    // The backend availability guard at the top of the test already covers every trial;
    // it is not repeated inside the loop.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);

    // Deterministic ramps: element i holds i / element_count.
    for (int i = 0; i < B * R * Hq * D; ++i) {
      q_tensor->data.f32[i] = (float)(i) / (float)(B * R * Hq * D);
    }
    for (int i = 0; i < B * C * Hk * D; ++i) {
      k_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
    }
    for (int i = 0; i < B * C * Hk * D; ++i) {
      v_tensor->data.f32[i] = (float)(i) / (float)(B * C * Hk * D);
    }

    // Reference result on the host tensors, using the command's own is_causal flag.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);

    // NOTE(review): why is there a 000 at the beginning of the argument list for GPU_TENSOR_NHWC?
    // Presumably it is the device id; confirm against the macro definition.
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, Hk, D), 0);
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);

    if (is_causal)
    {
      // On the device the causal case is run with is_causal = 0 plus an explicit
      // R x C mask tensor passed as a fourth input: 0 where allowed, -FLT_MAX where masked.
      ccv_nnc_tensor_t* const causal_mask = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 1, R, C), 0);
      ccv_nnc_tensor_t* const gpu_causal_mask = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 1, 1, R, C), 0);
      for (int i = 0; i < R; i++)
        for (int j = 0; j < C; j++)
          causal_mask->data.f32[i * C + j] = 0;
      for (int i = 0; i < R - 1; i++)
      {
        // Row i masks columns [i - R + C + 1, C). When R > C that start column is
        // negative for the early rows; clamp it to 0 so no write lands before the
        // start of the buffer (the whole row is masked in that case).
        int first_masked = i - R + C + 1;
        if (first_masked < 0)
          first_masked = 0;
        for (int j = first_masked; j < C; j++)
          causal_mask->data.f32[i * C + j] = -FLT_MAX;
      }
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(causal_mask), TENSOR_LIST(gpu_causal_mask), 0);
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_causal_mask), TENSOR_LIST(gpu_o_tensor), 0);
      ccv_nnc_tensor_free(gpu_causal_mask);
      ccv_nnc_tensor_free(causal_mask);
    } else {
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
    }

    // Read the device output back and compare element-wise with an absolute tolerance of 1e-3.
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");

    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(gpu_o_tensor);
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(gpu_q_tensor);
    ccv_nnc_tensor_free(gpu_k_tensor);
    ccv_nnc_tensor_free(gpu_v_tensor);
  }
#undef num_long_trials
#undef num_short_trials
#undef num_trials
}
3616
3617
TEST_CASE("scaled dot product attention with quantized NA mps")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int B = 1;
  const int R = 128;
  const int C = 128;
  const int H = 24;
  // Last-dimension sizes to sweep.
  const int Ds[] = { 64, 80, 128, 130, 160, 192, 224, 256 };
  // The three tables below are parallel: entry k describes the k-th device datatype.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
  const float tolerances[] = { 2e-2, 3e-2, 2e-2 };
  const char* datatype_names[] = { "16F", "16BF", "32F" };
  const int d_total = (int)(sizeof(Ds) / sizeof(Ds[0]));
  for (int d_idx = 0; d_idx < d_total; ++d_idx)
  {
    const int D = Ds[d_idx];
    const float scale = 1.0 / sqrt((float)D);
    const int q_count = B * R * H * D;
    const int kv_count = B * C * H * D;

    // 32-bit float inputs, used for the reference run and as the source for conversion.
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
    // 16-bit staging tensors. They are declared 16F but also receive the bfloat
    // conversion output below, i.e. they are reused as raw 16-bit storage.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);

    // A different seed per D; every sample is shifted by -0.5.
    dsfmt_t rng;
    dsfmt_init_gen_rand(&rng, 11 + d_idx);
    for (int n = 0; n < q_count; ++n)
      q_tensor->data.f32[n] = dsfmt_genrand_open_close(&rng) - 0.5;
    for (int n = 0; n < kv_count; ++n)
      k_tensor->data.f32[n] = dsfmt_genrand_open_close(&rng) - 0.5;
    for (int n = 0; n < kv_count; ++n)
      v_tensor->data.f32[n] = dsfmt_genrand_open_close(&rng) - 0.5;

    // Reference output, computed once per D on the 32-bit host tensors.
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
    ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
    ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);

    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
    {
      const int datatype = datatypes[datatype_idx];
      const int uses_16bit_staging = (datatype == CCV_16F || datatype == CCV_16BF);
      // Host-side sources for the upload: the staging tensors for 16-bit types, the originals otherwise.
      ccv_nnc_tensor_t* const q_input = uses_16bit_staging ? q_tensor_f16 : q_tensor;
      ccv_nnc_tensor_t* const k_input = uses_16bit_staging ? k_tensor_f16 : k_tensor;
      ccv_nnc_tensor_t* const v_input = uses_16bit_staging ? v_tensor_f16 : v_tensor;
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
      ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
      if (datatype == CCV_16F)
      {
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
      } else if (datatype == CCV_16BF) {
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
      } else {
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      }
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
      // Same command as the reference, with the GEMM_16F and GEMM_8I flags set
      // (presumably what selects the quantized path named in the test title; confirm).
      ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
      gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
      ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

      // Bring both outputs into plain float buffers so they can be compared uniformly.
      const int count = B * R * H * D;
      const size_t f32_bytes = sizeof(float) * count;
      float* const gpu_f32 = (float*)ccmalloc(f32_bytes);
      float* const cpu_f32 = (float*)ccmalloc(f32_bytes);
      if (datatype == CCV_16F)
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
      else if (datatype == CCV_16BF)
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
      else
        memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, f32_bytes);
      memcpy(cpu_f32, o_tensor->data.f32, f32_bytes);

      // Track the worst |cpu - gpu| / max(|cpu|, |gpu|, 1) and where it occurs.
      float max_relative_diff = 0;
      int max_diff_idx = 0;
      for (int n = 0; n < count; ++n)
      {
        const float expected = cpu_f32[n];
        const float actual = gpu_f32[n];
        const float denom = fmaxf(fmaxf(fabsf(expected), fabsf(actual)), 1.0f);
        const float relative_diff = fabsf(expected - actual) / denom;
        if (relative_diff > max_relative_diff)
        {
          max_relative_diff = relative_diff;
          max_diff_idx = n;
        }
      }
      REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized attention result should match CPU reference for dtype=%s D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);

      ccfree(cpu_f32);
      ccfree(gpu_f32);
      ccv_nnc_tensor_free(gpu_o_tensor);
      ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
      ccv_nnc_tensor_free(gpu_q_tensor);
      ccv_nnc_tensor_free(gpu_k_tensor);
      ccv_nnc_tensor_free(gpu_v_tensor);
    }
    ccv_nnc_tensor_free(o_tensor);
    ccv_nnc_tensor_free(q_tensor);
    ccv_nnc_tensor_free(k_tensor);
    ccv_nnc_tensor_free(v_tensor);
    ccv_nnc_tensor_free(q_tensor_f16);
    ccv_nnc_tensor_free(k_tensor_f16);
    ccv_nnc_tensor_free(v_tensor_f16);
  }
}
3741
3742
TEST_CASE("scaled dot product attention with quantized NA mps batched")
3743
1
{
3744
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
3745
0
  const int B = 3;
3746
0
  const int R = 128;
3747
0
  const int C = 128;
3748
0
  const int H = 8;
3749
0
  const int Ds[] = { 64, 128 };
3750
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
3751
0
  const float tolerances[] = { 2e-2, 3e-2, 2e-2 };
3752
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
3753
0
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
3754
0
  {
3755
0
    const int D = Ds[d_idx];
3756
0
    const float scale = 1.0 / sqrt((float)D);
3757
3758
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3759
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3760
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
3761
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3762
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3763
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
3764
0
    const int q_count = B * R * H * D;
3765
0
    const int kv_count = B * C * H * D;
3766
0
    dsfmt_t dsfmt;
3767
0
    dsfmt_init_gen_rand(&dsfmt, 101 + d_idx);
3768
0
    for (int i = 0; i < q_count; ++i)
3769
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3770
0
    for (int i = 0; i < kv_count; ++i)
3771
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3772
0
    for (int i = 0; i < kv_count; ++i)
3773
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
3774
3775
0
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3776
0
    ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
3777
0
    ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
3778
3779
0
    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
3780
0
    {
3781
0
      const int datatype = datatypes[datatype_idx];
3782
0
      ccv_nnc_tensor_t* q_input = q_tensor;
3783
0
      ccv_nnc_tensor_t* k_input = k_tensor;
3784
0
      ccv_nnc_tensor_t* v_input = v_tensor;
3785
0
      ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
3786
0
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
3787
0
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
3788
0
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
3789
0
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
3790
0
      if (datatype == CCV_16F)
3791
0
      {
3792
0
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3793
0
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3794
0
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3795
0
        q_input = q_tensor_f16;
3796
0
        k_input = k_tensor_f16;
3797
0
        v_input = v_tensor_f16;
3798
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3799
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3800
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
3801
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
3802
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
3803
0
      } else if (datatype == CCV_16BF) {
3804
0
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
3805
0
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
3806
0
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
3807
0
        q_input = q_tensor_f16;
3808
0
        k_input = k_tensor_f16;
3809
0
        v_input = v_tensor_f16;
3810
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3811
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3812
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
3813
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
3814
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
3815
0
      } else {
3816
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3817
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3818
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
3819
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
3820
0
        copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
3821
0
      }
3822
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
3823
0
      ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
3824
0
      gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
3825
0
      ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
3826
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
3827
3828
0
      const int count = B * R * H * D;
3829
0
      float* const cpu_f32 = (float*)ccmalloc(sizeof(float) * count);
3830
0
      float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * count);
3831
0
      memcpy(cpu_f32, o_tensor->data.f32, sizeof(float) * count);
3832
0
      if (datatype == CCV_16F)
3833
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
3834
0
      else if (datatype == CCV_16BF)
3835
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, count);
3836
0
      else
3837
0
        memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * count);
3838
0
      float max_relative_diff = 0;
3839
0
      int max_diff_idx = 0;
3840
0
      for (int i = 0; i < count; ++i)
3841
0
      {
3842
0
        const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
3843
0
        const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
3844
0
        if (relative_diff > max_relative_diff)
3845
0
          max_relative_diff = relative_diff, max_diff_idx = i;
3846
0
      }
3847
0
      REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized batched attention result should match CPU reference for dtype=%s D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);
3848
3849
0
      ccfree(cpu_f32);
3850
0
      ccfree(gpu_f32);
3851
0
      ccv_nnc_tensor_free(gpu_o_tensor);
3852
0
      ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
3853
0
      ccv_nnc_tensor_free(gpu_q_tensor);
3854
0
      ccv_nnc_tensor_free(gpu_k_tensor);
3855
0
      ccv_nnc_tensor_free(gpu_v_tensor);
3856
0
    }
3857
0
    ccv_nnc_tensor_free(o_tensor);
3858
0
    ccv_nnc_tensor_free(q_tensor);
3859
0
    ccv_nnc_tensor_free(k_tensor);
3860
0
    ccv_nnc_tensor_free(v_tensor);
3861
0
    ccv_nnc_tensor_free(q_tensor_f16);
3862
0
    ccv_nnc_tensor_free(k_tensor_f16);
3863
0
    ccv_nnc_tensor_free(v_tensor_f16);
3864
0
  }
3865
0
}
3866
3867
/**
 * Forward scaled-dot-product attention on MPS with the
 * CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I flags set, swept over the key/value
 * lengths in Cs[] (none of which is a multiple of 64) and the head dimensions
 * in Ds[].
 *
 * For every (C, D) pair the command is first executed on 32F CPU tensors to
 * obtain the reference output. The same inputs are then converted to each
 * entry of datatypes[], transferred to GPU tensors, run through the flagged
 * command and transferred back. The largest per-element difference, divided by
 * max(|cpu|, |gpu|, 1), must not exceed the tolerance listed for that datatype.
 */
TEST_CASE("scaled dot product attention with quantized NA mps for non-multiple-of-64 sequence")
{
  // Leave the test early when the MPS backend does not report the forward command as available.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
  const int B = 1;
  const int R = 128;
  const int H = 24;
  const int Cs[] = { 130, 224 };
  const int Ds[] = { 128, 130, 224 };
  // datatypes[], tolerances[] and datatype_names[] are parallel arrays indexed by datatype_idx.
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
  const float tolerances[] = { 4e-2, 5e-2, 4e-2 };
  const char* datatype_names[] = { "16F", "16BF", "32F" };
  // Derived from the array so the loop bound follows any change to datatypes[].
  const int datatype_count = (int)(sizeof(datatypes) / sizeof(datatypes[0]));
  for (int c_idx = 0; c_idx < (int)(sizeof(Cs) / sizeof(Cs[0])); ++c_idx)
  {
    const int C = Cs[c_idx];
    for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
    {
      const int D = Ds[d_idx];
      const float scale = 1.0 / sqrt((float)D);

      ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
      ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
      // 16-bit staging tensors. They are declared 16F, but the 16BF pass below
      // also writes its bfloat bits into them before the transfer to the GPU.
      ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
      ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
      ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
      const int q_count = B * R * H * D;
      const int kv_count = B * C * H * D;
      dsfmt_t dsfmt;
      // The seed differs for every (C, D) pair.
      dsfmt_init_gen_rand(&dsfmt, 211 + c_idx * 17 + d_idx);
      for (int i = 0; i < q_count; ++i)
        q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
      for (int i = 0; i < kv_count; ++i)
        k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
      for (int i = 0; i < kv_count; ++i)
        v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;

      // Reference output, computed once per (C, D) pair from the 32F CPU tensors.
      ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
      ccv_nnc_cmd_t cpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
      ccv_nnc_cmd_exec(cpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
      // Read the reference in place; it is never modified after this point.
      const float* const cpu_f32 = o_tensor->data.f32;

      for (int datatype_idx = 0; datatype_idx < datatype_count; ++datatype_idx)
      {
        const int datatype = datatypes[datatype_idx];
        // The 32F pass uploads the original tensors; the 16-bit passes swap in the staging tensors.
        ccv_nnc_tensor_t* q_input = q_tensor;
        ccv_nnc_tensor_t* k_input = k_tensor;
        ccv_nnc_tensor_t* v_input = v_tensor;
        ccv_nnc_tensor_t* copy_of_gpu_o_tensor = 0;
        ccv_nnc_tensor_t* gpu_q_tensor = 0;
        ccv_nnc_tensor_t* gpu_k_tensor = 0;
        ccv_nnc_tensor_t* gpu_v_tensor = 0;
        ccv_nnc_tensor_t* gpu_o_tensor = 0;
        if (datatype == CCV_16F)
        {
          ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
          ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
          ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
          q_input = q_tensor_f16;
          k_input = k_tensor_f16;
          v_input = v_tensor_f16;
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
        } else if (datatype == CCV_16BF) {
          ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
          ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
          ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
          q_input = q_tensor_f16;
          k_input = k_tensor_f16;
          v_input = v_tensor_f16;
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
        } else {
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
          copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
        }
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
        // Same command as the reference, plus the flags this test exercises.
        ccv_nnc_cmd_t gpu_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
        gpu_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
        ccv_nnc_cmd_exec(gpu_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), TENSOR_LIST(gpu_o_tensor), 0);
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor), 0);

        // Widen the GPU result to float so every datatype is compared the same way.
        // The output has the same element count as q (B * R * H * D).
        float* const gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
        if (datatype == CCV_16F)
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
        else if (datatype == CCV_16BF)
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_o_tensor->data.f16, gpu_f32, q_count);
        else
          memcpy(gpu_f32, copy_of_gpu_o_tensor->data.f32, sizeof(float) * q_count);
        float max_relative_diff = 0;
        int max_diff_idx = 0;
        for (int i = 0; i < q_count; ++i)
        {
          // The denominator is clamped to at least 1, so values below 1 in
          // magnitude are effectively compared by absolute difference.
          const float denom = fmaxf(fmaxf(fabsf(cpu_f32[i]), fabsf(gpu_f32[i])), 1.0f);
          const float relative_diff = fabsf(cpu_f32[i] - gpu_f32[i]) / denom;
          if (relative_diff > max_relative_diff)
            max_relative_diff = relative_diff, max_diff_idx = i;
        }
        REQUIRE(max_relative_diff <= tolerances[datatype_idx], "quantized attention result should match CPU reference for dtype=%s C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], C, D, max_relative_diff, max_diff_idx, cpu_f32[max_diff_idx], gpu_f32[max_diff_idx]);

        ccfree(gpu_f32);
        ccv_nnc_tensor_free(gpu_o_tensor);
        ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
        ccv_nnc_tensor_free(gpu_q_tensor);
        ccv_nnc_tensor_free(gpu_k_tensor);
        ccv_nnc_tensor_free(gpu_v_tensor);
      }
      ccv_nnc_tensor_free(o_tensor);
      ccv_nnc_tensor_free(q_tensor);
      ccv_nnc_tensor_free(k_tensor);
      ccv_nnc_tensor_free(v_tensor);
      ccv_nnc_tensor_free(q_tensor_f16);
      ccv_nnc_tensor_free(k_tensor_f16);
      ccv_nnc_tensor_free(v_tensor_f16);
    }
  }
}
3995
3996
TEST_CASE("scaled dot product attention gradient with quantized NA mps")
3997
1
{
3998
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
3999
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4000
0
  const int B = 2;
4001
0
  const int R = 128;
4002
0
  const int C = 128;
4003
0
  const int H = 8;
4004
0
  const int Ds[] = { 64, 80, 96, 128 };
4005
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
4006
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
4007
0
  const float dq_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4008
0
  const float dk_tolerances[] = { 1e-1, 1e-1, 1e-1 };
4009
0
  const float dv_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4010
0
  for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
4011
0
  {
4012
0
    const int D = Ds[d_idx];
4013
0
    const int q_count = B * R * H * D;
4014
0
    const int kv_count = B * C * H * D;
4015
0
    const float scale = 1.0 / sqrt((float)D);
4016
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4017
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4018
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4019
0
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4020
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4021
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4022
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4023
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4024
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4025
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4026
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4027
0
    dsfmt_t dsfmt;
4028
0
    dsfmt_init_gen_rand(&dsfmt, 181 + d_idx);
4029
0
    for (int i = 0; i < q_count; ++i)
4030
0
    {
4031
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4032
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4033
0
    }
4034
0
    for (int i = 0; i < kv_count; ++i)
4035
0
    {
4036
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4037
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4038
0
    }
4039
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4040
4041
0
    for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
4042
0
    {
4043
0
      const int datatype = datatypes[datatype_idx];
4044
0
      ccv_nnc_tensor_t* q_input = q_tensor;
4045
0
      ccv_nnc_tensor_t* k_input = k_tensor;
4046
0
      ccv_nnc_tensor_t* v_input = v_tensor;
4047
0
      ccv_nnc_tensor_t* do_input = do_tensor;
4048
0
      ccv_nnc_tensor_t* gpu_q_tensor = 0;
4049
0
      ccv_nnc_tensor_t* gpu_k_tensor = 0;
4050
0
      ccv_nnc_tensor_t* gpu_v_tensor = 0;
4051
0
      ccv_nnc_tensor_t* gpu_do_tensor = 0;
4052
0
      ccv_nnc_tensor_t* gpu_o_tensor = 0;
4053
0
      ccv_nnc_tensor_t* gpu_dq_tensor = 0;
4054
0
      ccv_nnc_tensor_t* gpu_dk_tensor = 0;
4055
0
      ccv_nnc_tensor_t* gpu_dv_tensor = 0;
4056
0
      ccv_nnc_tensor_t* copy_of_gpu_dq_tensor = 0;
4057
0
      ccv_nnc_tensor_t* copy_of_gpu_dk_tensor = 0;
4058
0
      ccv_nnc_tensor_t* copy_of_gpu_dv_tensor = 0;
4059
0
      if (datatype == CCV_16F)
4060
0
      {
4061
0
        ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4062
0
        ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4063
0
        ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4064
0
        ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4065
0
        q_input = q_tensor_f16;
4066
0
        k_input = k_tensor_f16;
4067
0
        v_input = v_tensor_f16;
4068
0
        do_input = do_tensor_f16;
4069
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4070
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4071
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4072
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4073
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4074
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4075
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4076
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4077
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4078
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4079
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4080
0
      } else if (datatype == CCV_16BF) {
4081
0
        ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4082
0
        ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4083
0
        ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4084
0
        ccv_float_to_bfloat(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4085
0
        q_input = q_tensor_f16;
4086
0
        k_input = k_tensor_f16;
4087
0
        v_input = v_tensor_f16;
4088
0
        do_input = do_tensor_f16;
4089
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4090
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4091
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4092
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4093
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4094
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4095
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4096
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4097
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
4098
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4099
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4100
0
      } else {
4101
0
        gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4102
0
        gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4103
0
        gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4104
0
        gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4105
0
        gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4106
0
        gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4107
0
        gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4108
0
        gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4109
0
        copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4110
0
        copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4111
0
        copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4112
0
      }
4113
0
      ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4114
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input, do_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4115
0
      ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4116
0
      gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4117
0
      ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4118
0
      ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4119
0
      ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4120
0
      gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4121
0
      gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4122
0
      ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4123
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4124
4125
0
      float* const dq_cpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4126
0
      float* const dk_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4127
0
      float* const dv_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4128
0
      float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4129
0
      float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4130
0
      float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4131
0
      memcpy(dq_cpu_f32, dq_tensor->data.f32, sizeof(float) * q_count);
4132
0
      memcpy(dk_cpu_f32, dk_tensor->data.f32, sizeof(float) * kv_count);
4133
0
      memcpy(dv_cpu_f32, dv_tensor->data.f32, sizeof(float) * kv_count);
4134
0
      if (datatype == CCV_16F)
4135
0
      {
4136
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4137
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4138
0
        ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4139
0
      } else if (datatype == CCV_16BF) {
4140
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4141
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4142
0
        ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4143
0
      } else {
4144
0
        memcpy(dq_gpu_f32, copy_of_gpu_dq_tensor->data.f32, sizeof(float) * q_count);
4145
0
        memcpy(dk_gpu_f32, copy_of_gpu_dk_tensor->data.f32, sizeof(float) * kv_count);
4146
0
        memcpy(dv_gpu_f32, copy_of_gpu_dv_tensor->data.f32, sizeof(float) * kv_count);
4147
0
      }
4148
0
      float dq_max_relative_diff = 0;
4149
0
      float dk_max_relative_diff = 0;
4150
0
      float dv_max_relative_diff = 0;
4151
0
      int dq_max_diff_idx = 0;
4152
0
      int dk_max_diff_idx = 0;
4153
0
      int dv_max_diff_idx = 0;
4154
0
      for (int i = 0; i < q_count; ++i)
4155
0
      {
4156
0
        const float denom = fmaxf(fmaxf(fabsf(dq_cpu_f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4157
0
        const float relative_diff = fabsf(dq_cpu_f32[i] - dq_gpu_f32[i]) / denom;
4158
0
        if (relative_diff > dq_max_relative_diff)
4159
0
          dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4160
0
      }
4161
0
      for (int i = 0; i < kv_count; ++i)
4162
0
      {
4163
0
        float denom = fmaxf(fmaxf(fabsf(dk_cpu_f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4164
0
        float relative_diff = fabsf(dk_cpu_f32[i] - dk_gpu_f32[i]) / denom;
4165
0
        if (relative_diff > dk_max_relative_diff)
4166
0
          dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4167
0
        denom = fmaxf(fmaxf(fabsf(dv_cpu_f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4168
0
        relative_diff = fabsf(dv_cpu_f32[i] - dv_gpu_f32[i]) / denom;
4169
0
        if (relative_diff > dv_max_relative_diff)
4170
0
          dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4171
0
      }
4172
0
      REQUIRE(dq_max_relative_diff <= dq_tolerances[datatype_idx], "quantized attention dQ should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dq_max_relative_diff, dq_max_diff_idx, dq_cpu_f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4173
0
      REQUIRE(dk_max_relative_diff <= dk_tolerances[datatype_idx], "quantized attention dK should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dk_max_relative_diff, dk_max_diff_idx, dk_cpu_f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4174
0
      REQUIRE(dv_max_relative_diff <= dv_tolerances[datatype_idx], "quantized attention dV should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dv_max_relative_diff, dv_max_diff_idx, dv_cpu_f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4175
4176
0
      ccfree(dq_cpu_f32);
4177
0
      ccfree(dk_cpu_f32);
4178
0
      ccfree(dv_cpu_f32);
4179
0
      ccfree(dq_gpu_f32);
4180
0
      ccfree(dk_gpu_f32);
4181
0
      ccfree(dv_gpu_f32);
4182
0
      ccv_nnc_tensor_free(gpu_q_tensor);
4183
0
      ccv_nnc_tensor_free(gpu_k_tensor);
4184
0
      ccv_nnc_tensor_free(gpu_v_tensor);
4185
0
      ccv_nnc_tensor_free(gpu_do_tensor);
4186
0
      ccv_nnc_tensor_free(gpu_o_tensor);
4187
0
      ccv_nnc_tensor_free(gpu_dq_tensor);
4188
0
      ccv_nnc_tensor_free(gpu_dk_tensor);
4189
0
      ccv_nnc_tensor_free(gpu_dv_tensor);
4190
0
      ccv_nnc_tensor_free(gpu_softmax_lse);
4191
0
      ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4192
0
      ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4193
0
      ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4194
0
    }
4195
4196
0
    ccv_nnc_tensor_free(q_tensor);
4197
0
    ccv_nnc_tensor_free(k_tensor);
4198
0
    ccv_nnc_tensor_free(v_tensor);
4199
0
    ccv_nnc_tensor_free(do_tensor);
4200
0
    ccv_nnc_tensor_free(dq_tensor);
4201
0
    ccv_nnc_tensor_free(dk_tensor);
4202
0
    ccv_nnc_tensor_free(dv_tensor);
4203
0
    ccv_nnc_tensor_free(q_tensor_f16);
4204
0
    ccv_nnc_tensor_free(k_tensor_f16);
4205
0
    ccv_nnc_tensor_free(v_tensor_f16);
4206
0
    ccv_nnc_tensor_free(do_tensor_f16);
4207
0
  }
4208
0
}
4209
4210
TEST_CASE("scaled dot product attention gradient with quantized NA mps for rectangular and edge sequence lengths")
4211
1
{
4212
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4213
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4214
0
  typedef struct {
4215
0
    int R;
4216
0
    int C;
4217
0
  } qna_backward_shape_t;
4218
0
  const int B = 1;
4219
0
  const int H = 8;
4220
0
  const int Ds[] = { 64, 128 };
4221
0
  const qna_backward_shape_t shapes[] = {
4222
0
    { .R = 32, .C = 64 },
4223
0
    { .R = 40, .C = 72 },
4224
0
    { .R = 80, .C = 64 },
4225
0
    { .R = 96, .C = 88 },
4226
0
    { .R = 64, .C = 192 },
4227
0
    { .R = 144, .C = 64 },
4228
0
  };
4229
0
  const int datatypes[] = { CCV_16F, CCV_16BF, CCV_32F };
4230
0
  const char* datatype_names[] = { "16F", "16BF", "32F" };
4231
0
  const float dq_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4232
0
  const float dk_tolerances[] = { 1e-1, 1e-1, 1e-1 };
4233
0
  const float dv_tolerances[] = { 8e-2, 8e-2, 8e-2 };
4234
0
  for (int shape_idx = 0; shape_idx < (int)(sizeof(shapes) / sizeof(shapes[0])); ++shape_idx)
4235
0
  {
4236
0
    const int R = shapes[shape_idx].R;
4237
0
    const int C = shapes[shape_idx].C;
4238
0
    for (int d_idx = 0; d_idx < (int)(sizeof(Ds) / sizeof(Ds[0])); ++d_idx)
4239
0
    {
4240
0
      const int D = Ds[d_idx];
4241
0
      const int q_count = B * R * H * D;
4242
0
      const int kv_count = B * C * H * D;
4243
0
      const float scale = 1.0 / sqrt((float)D);
4244
0
      ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4245
0
      ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4246
0
      ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4247
0
      ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4248
0
      ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4249
0
      ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4250
0
      ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4251
0
      ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4252
0
      ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4253
0
      ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4254
0
      ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4255
0
      dsfmt_t dsfmt;
4256
0
      dsfmt_init_gen_rand(&dsfmt, 281 + shape_idx * 17 + d_idx);
4257
0
      for (int i = 0; i < q_count; ++i)
4258
0
      {
4259
0
        q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4260
0
        do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4261
0
      }
4262
0
      for (int i = 0; i < kv_count; ++i)
4263
0
      {
4264
0
        k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4265
0
        v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4266
0
      }
4267
0
      ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4268
4269
0
      for (int datatype_idx = 0; datatype_idx < 3; ++datatype_idx)
4270
0
      {
4271
0
        const int datatype = datatypes[datatype_idx];
4272
0
        ccv_nnc_tensor_t* q_input = q_tensor;
4273
0
        ccv_nnc_tensor_t* k_input = k_tensor;
4274
0
        ccv_nnc_tensor_t* v_input = v_tensor;
4275
0
        ccv_nnc_tensor_t* do_input = do_tensor;
4276
0
        ccv_nnc_tensor_t* gpu_q_tensor = 0;
4277
0
        ccv_nnc_tensor_t* gpu_k_tensor = 0;
4278
0
        ccv_nnc_tensor_t* gpu_v_tensor = 0;
4279
0
        ccv_nnc_tensor_t* gpu_do_tensor = 0;
4280
0
        ccv_nnc_tensor_t* gpu_o_tensor = 0;
4281
0
        ccv_nnc_tensor_t* gpu_dq_tensor = 0;
4282
0
        ccv_nnc_tensor_t* gpu_dk_tensor = 0;
4283
0
        ccv_nnc_tensor_t* gpu_dv_tensor = 0;
4284
0
        ccv_nnc_tensor_t* copy_of_gpu_dq_tensor = 0;
4285
0
        ccv_nnc_tensor_t* copy_of_gpu_dk_tensor = 0;
4286
0
        ccv_nnc_tensor_t* copy_of_gpu_dv_tensor = 0;
4287
0
        if (datatype == CCV_16F)
4288
0
        {
4289
0
          ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4290
0
          ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4291
0
          ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4292
0
          ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4293
0
          q_input = q_tensor_f16;
4294
0
          k_input = k_tensor_f16;
4295
0
          v_input = v_tensor_f16;
4296
0
          do_input = do_tensor_f16;
4297
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4298
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4299
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4300
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4301
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4302
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4303
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4304
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4305
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4306
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4307
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4308
0
        } else if (datatype == CCV_16BF) {
4309
0
          ccv_float_to_bfloat(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4310
0
          ccv_float_to_bfloat(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4311
0
          ccv_float_to_bfloat(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4312
0
          ccv_float_to_bfloat(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4313
0
          q_input = q_tensor_f16;
4314
0
          k_input = k_tensor_f16;
4315
0
          v_input = v_tensor_f16;
4316
0
          do_input = do_tensor_f16;
4317
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4318
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4319
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4320
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4321
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4322
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, H, D), 0);
4323
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4324
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, H, D), 0);
4325
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, H, D), 0);
4326
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4327
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, H, D), 0);
4328
0
        } else {
4329
0
          gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4330
0
          gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4331
0
          gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4332
0
          gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4333
0
          gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4334
0
          gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4335
0
          gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4336
0
          gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4337
0
          copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4338
0
          copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4339
0
          copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4340
0
        }
4341
0
        ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4342
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_input, k_input, v_input, do_input), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4343
0
        ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4344
0
        gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4345
0
        ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4346
0
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4347
0
        ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4348
0
        gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4349
0
        gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4350
0
        ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4351
0
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4352
4353
0
        float* const dq_cpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4354
0
        float* const dk_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4355
0
        float* const dv_cpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4356
0
        float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4357
0
        float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4358
0
        float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4359
0
        memcpy(dq_cpu_f32, dq_tensor->data.f32, sizeof(float) * q_count);
4360
0
        memcpy(dk_cpu_f32, dk_tensor->data.f32, sizeof(float) * kv_count);
4361
0
        memcpy(dv_cpu_f32, dv_tensor->data.f32, sizeof(float) * kv_count);
4362
0
        if (datatype == CCV_16F)
4363
0
        {
4364
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4365
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4366
0
          ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4367
0
        } else if (datatype == CCV_16BF) {
4368
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4369
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4370
0
          ccv_bfloat_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4371
0
        } else {
4372
0
          memcpy(dq_gpu_f32, copy_of_gpu_dq_tensor->data.f32, sizeof(float) * q_count);
4373
0
          memcpy(dk_gpu_f32, copy_of_gpu_dk_tensor->data.f32, sizeof(float) * kv_count);
4374
0
          memcpy(dv_gpu_f32, copy_of_gpu_dv_tensor->data.f32, sizeof(float) * kv_count);
4375
0
        }
4376
0
        float dq_max_relative_diff = 0;
4377
0
        float dk_max_relative_diff = 0;
4378
0
        float dv_max_relative_diff = 0;
4379
0
        int dq_max_diff_idx = 0;
4380
0
        int dk_max_diff_idx = 0;
4381
0
        int dv_max_diff_idx = 0;
4382
0
        for (int i = 0; i < q_count; ++i)
4383
0
        {
4384
0
          const float denom = fmaxf(fmaxf(fabsf(dq_cpu_f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4385
0
          const float relative_diff = fabsf(dq_cpu_f32[i] - dq_gpu_f32[i]) / denom;
4386
0
          if (relative_diff > dq_max_relative_diff)
4387
0
            dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4388
0
        }
4389
0
        for (int i = 0; i < kv_count; ++i)
4390
0
        {
4391
0
          float denom = fmaxf(fmaxf(fabsf(dk_cpu_f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4392
0
          float relative_diff = fabsf(dk_cpu_f32[i] - dk_gpu_f32[i]) / denom;
4393
0
          if (relative_diff > dk_max_relative_diff)
4394
0
            dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4395
0
          denom = fmaxf(fmaxf(fabsf(dv_cpu_f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4396
0
          relative_diff = fabsf(dv_cpu_f32[i] - dv_gpu_f32[i]) / denom;
4397
0
          if (relative_diff > dv_max_relative_diff)
4398
0
            dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4399
0
        }
4400
0
        REQUIRE(dq_max_relative_diff <= dq_tolerances[datatype_idx], "quantized attention dQ should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dq_max_relative_diff, dq_max_diff_idx, dq_cpu_f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4401
0
        REQUIRE(dk_max_relative_diff <= dk_tolerances[datatype_idx], "quantized attention dK should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dk_max_relative_diff, dk_max_diff_idx, dk_cpu_f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4402
0
        REQUIRE(dv_max_relative_diff <= dv_tolerances[datatype_idx], "quantized attention dV should match CPU reference for dtype=%s R=%d C=%d D=%d (max relative diff %g at %d: %g vs %g)", datatype_names[datatype_idx], R, C, D, dv_max_relative_diff, dv_max_diff_idx, dv_cpu_f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4403
4404
0
        ccfree(dq_cpu_f32);
4405
0
        ccfree(dk_cpu_f32);
4406
0
        ccfree(dv_cpu_f32);
4407
0
        ccfree(dq_gpu_f32);
4408
0
        ccfree(dk_gpu_f32);
4409
0
        ccfree(dv_gpu_f32);
4410
0
        ccv_nnc_tensor_free(gpu_q_tensor);
4411
0
        ccv_nnc_tensor_free(gpu_k_tensor);
4412
0
        ccv_nnc_tensor_free(gpu_v_tensor);
4413
0
        ccv_nnc_tensor_free(gpu_do_tensor);
4414
0
        ccv_nnc_tensor_free(gpu_o_tensor);
4415
0
        ccv_nnc_tensor_free(gpu_dq_tensor);
4416
0
        ccv_nnc_tensor_free(gpu_dk_tensor);
4417
0
        ccv_nnc_tensor_free(gpu_dv_tensor);
4418
0
        ccv_nnc_tensor_free(gpu_softmax_lse);
4419
0
        ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4420
0
        ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4421
0
        ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4422
0
      }
4423
4424
0
      ccv_nnc_tensor_free(q_tensor);
4425
0
      ccv_nnc_tensor_free(k_tensor);
4426
0
      ccv_nnc_tensor_free(v_tensor);
4427
0
      ccv_nnc_tensor_free(do_tensor);
4428
0
      ccv_nnc_tensor_free(dq_tensor);
4429
0
      ccv_nnc_tensor_free(dk_tensor);
4430
0
      ccv_nnc_tensor_free(dv_tensor);
4431
0
      ccv_nnc_tensor_free(q_tensor_f16);
4432
0
      ccv_nnc_tensor_free(k_tensor_f16);
4433
0
      ccv_nnc_tensor_free(v_tensor_f16);
4434
0
      ccv_nnc_tensor_free(do_tensor_f16);
4435
0
    }
4436
0
  }
4437
0
}
4438
4439
TEST_CASE("scaled dot product attention gradient with quantized NA mps on 1536 square surface")
4440
1
{
4441
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4442
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4443
0
  const int B = 1;
4444
0
  const int R = 1536;
4445
0
  const int C = 1536;
4446
0
  const int H = 24;
4447
0
  const int D = 128;
4448
0
  const int q_count = B * R * H * D;
4449
0
  const int kv_count = B * C * H * D;
4450
0
  const float scale = 1.0 / sqrt((float)D);
4451
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4452
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4453
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4454
0
  ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4455
0
  ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4456
0
  ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4457
0
  ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4458
0
  ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4459
0
  ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4460
0
  ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4461
0
  ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4462
0
  dsfmt_t dsfmt;
4463
0
  dsfmt_init_gen_rand(&dsfmt, 4177);
4464
0
  for (int i = 0; i < q_count; ++i)
4465
0
  {
4466
    // Use a stronger shared Q / K signal on this surface so QK^T produces
4467
    // sharper rows than the fully diffuse random-input case.
4468
0
    const float q = 2.f * (dsfmt_genrand_open_close(&dsfmt) - 0.5f);
4469
0
    q_tensor->data.f32[i] = q;
4470
0
    k_tensor->data.f32[i] = q + 0.125f * (dsfmt_genrand_open_close(&dsfmt) - 0.5f);
4471
0
    do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4472
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) - 0.5;
4473
0
  }
4474
0
  ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4475
4476
0
  ccv_float_to_half_precision(q_tensor->data.f32, (uint16_t*)q_tensor_f16->data.f16, q_count);
4477
0
  ccv_float_to_half_precision(k_tensor->data.f32, (uint16_t*)k_tensor_f16->data.f16, kv_count);
4478
0
  ccv_float_to_half_precision(v_tensor->data.f32, (uint16_t*)v_tensor_f16->data.f16, kv_count);
4479
0
  ccv_float_to_half_precision(do_tensor->data.f32, (uint16_t*)do_tensor_f16->data.f16, q_count);
4480
4481
0
  ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4482
0
  ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4483
0
  ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4484
0
  ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4485
0
  ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4486
0
  ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, H, D), 0);
4487
0
  ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4488
0
  ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, H, D), 0);
4489
0
  ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4490
0
  ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, H, D), 0);
4491
0
  ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4492
0
  ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, H, D), 0);
4493
4494
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4495
0
  ccv_nnc_cmd_t gpu_forw_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0);
4496
0
  gpu_forw_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4497
0
  ccv_nnc_cmd_exec(gpu_forw_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4498
0
  ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, TENSOR_LIST(), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4499
0
  ccv_nnc_cmd_t gpu_back_cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0);
4500
0
  gpu_back_cmd.info.scaled_dot_product_attention.flags = CCV_NNC_GEMM_16F | CCV_NNC_GEMM_8I;
4501
0
  gpu_back_cmd.info.scaled_dot_product_attention.deterministic = 0;
4502
0
  ccv_nnc_cmd_exec(gpu_back_cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4503
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4504
4505
0
  float* const dq_gpu_f32 = (float*)ccmalloc(sizeof(float) * q_count);
4506
0
  float* const dk_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4507
0
  float* const dv_gpu_f32 = (float*)ccmalloc(sizeof(float) * kv_count);
4508
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dq_tensor->data.f16, dq_gpu_f32, q_count);
4509
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dk_tensor->data.f16, dk_gpu_f32, kv_count);
4510
0
  ccv_half_precision_to_float((uint16_t*)copy_of_gpu_dv_tensor->data.f16, dv_gpu_f32, kv_count);
4511
4512
0
  float dq_max_relative_diff = 0;
4513
0
  float dk_max_relative_diff = 0;
4514
0
  float dv_max_relative_diff = 0;
4515
0
  float dq_cpu_max_abs = 0;
4516
0
  float dq_gpu_max_abs = 0;
4517
0
  float dk_cpu_max_abs = 0;
4518
0
  float dk_gpu_max_abs = 0;
4519
0
  float dv_cpu_max_abs = 0;
4520
0
  float dv_gpu_max_abs = 0;
4521
0
  int dq_max_diff_idx = 0;
4522
0
  int dk_max_diff_idx = 0;
4523
0
  int dv_max_diff_idx = 0;
4524
0
  for (int i = 0; i < q_count; ++i)
4525
0
  {
4526
0
    dq_cpu_max_abs = fmaxf(dq_cpu_max_abs, fabsf(dq_tensor->data.f32[i]));
4527
0
    dq_gpu_max_abs = fmaxf(dq_gpu_max_abs, fabsf(dq_gpu_f32[i]));
4528
0
    const float denom = fmaxf(fmaxf(fabsf(dq_tensor->data.f32[i]), fabsf(dq_gpu_f32[i])), 1.0f);
4529
0
    const float relative_diff = fabsf(dq_tensor->data.f32[i] - dq_gpu_f32[i]) / denom;
4530
0
    if (relative_diff > dq_max_relative_diff)
4531
0
      dq_max_relative_diff = relative_diff, dq_max_diff_idx = i;
4532
0
  }
4533
0
  for (int i = 0; i < kv_count; ++i)
4534
0
  {
4535
0
    dk_cpu_max_abs = fmaxf(dk_cpu_max_abs, fabsf(dk_tensor->data.f32[i]));
4536
0
    dk_gpu_max_abs = fmaxf(dk_gpu_max_abs, fabsf(dk_gpu_f32[i]));
4537
0
    float denom = fmaxf(fmaxf(fabsf(dk_tensor->data.f32[i]), fabsf(dk_gpu_f32[i])), 1.0f);
4538
0
    float relative_diff = fabsf(dk_tensor->data.f32[i] - dk_gpu_f32[i]) / denom;
4539
0
    if (relative_diff > dk_max_relative_diff)
4540
0
      dk_max_relative_diff = relative_diff, dk_max_diff_idx = i;
4541
0
    dv_cpu_max_abs = fmaxf(dv_cpu_max_abs, fabsf(dv_tensor->data.f32[i]));
4542
0
    dv_gpu_max_abs = fmaxf(dv_gpu_max_abs, fabsf(dv_gpu_f32[i]));
4543
0
    denom = fmaxf(fmaxf(fabsf(dv_tensor->data.f32[i]), fabsf(dv_gpu_f32[i])), 1.0f);
4544
0
    relative_diff = fabsf(dv_tensor->data.f32[i] - dv_gpu_f32[i]) / denom;
4545
0
    if (relative_diff > dv_max_relative_diff)
4546
0
      dv_max_relative_diff = relative_diff, dv_max_diff_idx = i;
4547
0
  }
4548
0
  REQUIRE(dq_gpu_max_abs >= dq_cpu_max_abs * 0.5f && dq_gpu_max_abs <= dq_cpu_max_abs * 2.0f,
4549
0
    "quantized attention dQ magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4550
0
    dq_cpu_max_abs, dq_gpu_max_abs);
4551
0
  REQUIRE(dk_gpu_max_abs >= dk_cpu_max_abs * 0.5f && dk_gpu_max_abs <= dk_cpu_max_abs * 2.0f,
4552
0
    "quantized attention dK magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4553
0
    dk_cpu_max_abs, dk_gpu_max_abs);
4554
0
  REQUIRE(dv_gpu_max_abs >= dv_cpu_max_abs * 0.5f && dv_gpu_max_abs <= dv_cpu_max_abs * 2.0f,
4555
0
    "quantized attention dV magnitude should stay close to CPU reference on 1536 surface (cpu max abs %g gpu max abs %g)",
4556
0
    dv_cpu_max_abs, dv_gpu_max_abs);
4557
0
  REQUIRE(dq_max_relative_diff <= 8e-2, "quantized attention dQ should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dq_max_relative_diff, dq_max_diff_idx, dq_tensor->data.f32[dq_max_diff_idx], dq_gpu_f32[dq_max_diff_idx]);
4558
0
  REQUIRE(dk_max_relative_diff <= 1e-1, "quantized attention dK should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dk_max_relative_diff, dk_max_diff_idx, dk_tensor->data.f32[dk_max_diff_idx], dk_gpu_f32[dk_max_diff_idx]);
4559
0
  REQUIRE(dv_max_relative_diff <= 8e-2, "quantized attention dV should match CPU reference on 1536 surface (max relative diff %g at %d: %g vs %g)", dv_max_relative_diff, dv_max_diff_idx, dv_tensor->data.f32[dv_max_diff_idx], dv_gpu_f32[dv_max_diff_idx]);
4560
4561
0
  ccfree(dq_gpu_f32);
4562
0
  ccfree(dk_gpu_f32);
4563
0
  ccfree(dv_gpu_f32);
4564
0
  ccv_nnc_tensor_free(gpu_q_tensor);
4565
0
  ccv_nnc_tensor_free(gpu_k_tensor);
4566
0
  ccv_nnc_tensor_free(gpu_v_tensor);
4567
0
  ccv_nnc_tensor_free(gpu_do_tensor);
4568
0
  ccv_nnc_tensor_free(gpu_o_tensor);
4569
0
  ccv_nnc_tensor_free(gpu_dq_tensor);
4570
0
  ccv_nnc_tensor_free(gpu_dk_tensor);
4571
0
  ccv_nnc_tensor_free(gpu_dv_tensor);
4572
0
  ccv_nnc_tensor_free(gpu_softmax_lse);
4573
0
  ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4574
0
  ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4575
0
  ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4576
0
  ccv_nnc_tensor_free(q_tensor);
4577
0
  ccv_nnc_tensor_free(k_tensor);
4578
0
  ccv_nnc_tensor_free(v_tensor);
4579
0
  ccv_nnc_tensor_free(do_tensor);
4580
0
  ccv_nnc_tensor_free(dq_tensor);
4581
0
  ccv_nnc_tensor_free(dk_tensor);
4582
0
  ccv_nnc_tensor_free(dv_tensor);
4583
0
  ccv_nnc_tensor_free(q_tensor_f16);
4584
0
  ccv_nnc_tensor_free(k_tensor_f16);
4585
0
  ccv_nnc_tensor_free(v_tensor_f16);
4586
0
  ccv_nnc_tensor_free(do_tensor_f16);
4587
0
}
4588
4589
TEST_CASE("scaled dot product attention with mps in bfloat precision")
4590
1
{
4591
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4592
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4593
0
#define num_long_trials 8
4594
0
#define num_short_trials 4
4595
0
#define num_trials (num_long_trials + num_short_trials)
4596
4597
0
  dsfmt_t dsfmt;
4598
0
  dsfmt_init_gen_rand(&dsfmt, 10);
4599
0
  for (int trial = 0; trial < num_trials; ++trial) {
4600
0
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
4601
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
4602
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
4603
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
4604
0
    const int Hk_candidates[num_trials] = {   8,  8, 4, 2, 8, 32, 8,  8, 8, 8, 8, 32 };
4605
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };
4606
4607
0
    const int B = B_candidates[trial];
4608
0
    const int R = R_candidates[trial];
4609
0
    const int C = C_candidates[trial];
4610
0
    const int Hq = Hq_candidates[trial];
4611
0
    const int Hk = Hk_candidates[trial];
4612
0
    const int D = D_candidates[trial];
4613
0
    const int is_causal = 0;
4614
0
    const float scale = 1.0 / sqrt((float)D);
4615
4616
0
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4617
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4618
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4619
4620
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
4621
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4622
0
    }
4623
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
4624
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4625
0
    }
4626
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
4627
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4628
0
    }
4629
4630
0
    ccv_nnc_tensor_t* const o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4631
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(o_tensor), 0);
4632
0
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
4633
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
4634
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
4635
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), 0);
4636
4637
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
4638
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
4639
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
4640
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
4641
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor), 0);
4642
4643
0
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
4644
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4645
4646
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
4647
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_o_tensor), TENSOR_LIST(copy_of_gpu_o_tensor_f16), 0);
4648
4649
0
    ccv_nnc_tensor_t* const copy_of_gpu_o_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4650
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_o_tensor_f16), TENSOR_LIST(copy_of_gpu_o_tensor), 0);
4651
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_o_tensor->data.f32, o_tensor->data.f32, B * R * Hq * D, 8e-3, "scaled dot product attention result should be the same");
4652
4653
0
    ccv_nnc_tensor_free(o_tensor);
4654
0
    ccv_nnc_tensor_free(gpu_o_tensor);
4655
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor_f16);
4656
0
    ccv_nnc_tensor_free(copy_of_gpu_o_tensor);
4657
0
    ccv_nnc_tensor_free(q_tensor);
4658
0
    ccv_nnc_tensor_free(k_tensor);
4659
0
    ccv_nnc_tensor_free(v_tensor);
4660
0
    ccv_nnc_tensor_free(q_tensor_f16);
4661
0
    ccv_nnc_tensor_free(k_tensor_f16);
4662
0
    ccv_nnc_tensor_free(v_tensor_f16);
4663
0
    ccv_nnc_tensor_free(gpu_q_tensor);
4664
0
    ccv_nnc_tensor_free(gpu_k_tensor);
4665
0
    ccv_nnc_tensor_free(gpu_v_tensor);
4666
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
4667
0
  }
4668
0
#undef num_long_trials
4669
0
#undef num_short_trials
4670
0
#undef num_trials
4671
0
}
4672
4673
// Builds the same SCALED_DOT_PRODUCT_ATTENTION_FORWARD node (q, k, v plus the
// w / bias inputs) in two symbolic graphs, one declared with CPU tensors and one
// with GPU tensors, runs both on identical inputs, and requires the two `r`
// outputs to agree within a maximum relative difference of 2e-3.
TEST_CASE("scaled dot product attention + unify head with mps")
4674
1
{
4675
1
  // NOTE(review): GUARD_ELSE_RETURN presumably ends the test early when the MPS
  // backend does not provide this command -- confirm against the macro definition.
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS));
4676
0
  // Reference graph: every symbol below is declared with CPU_TENSOR_NHWC.
  ccv_nnc_symbolic_graph_t* const sdp_symbolic_graph = ccv_nnc_symbolic_graph_new();
4677
0
  ccv_nnc_tensor_symbol_t q = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "q");
4678
0
  ccv_nnc_tensor_symbol_t k = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "k");
4679
0
  ccv_nnc_tensor_symbol_t v = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "v");
4680
0
  ccv_nnc_tensor_symbol_t w = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512, 512), "w");
4681
0
  ccv_nnc_tensor_symbol_t bias = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 512), "bias");
4682
0
  ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 8, 64), "c");
4683
0
  ccv_nnc_tensor_symbol_t r = ccv_nnc_tensor_symbol_new(sdp_symbolic_graph, CPU_TENSOR_NHWC(32F, 32, 128, 512), "r");
4684
0
  // Inputs are (q, k, v, <none>, w, bias); outputs are (r, <none>, c). Only r is read back below.
  ccv_nnc_graph_exec_symbol_new(sdp_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(q, k, v, NO_TENSOR_SYMBOL, w, bias), TENSOR_SYMBOL_LIST(r, NO_TENSOR_SYMBOL, c), "scaled_dot_product_attention");
4685
0
  ccv_nnc_graph_exec_symbol_autogen(sdp_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4686
0
  ccv_nnc_graph_t* sdp_graph = 0;
4687
0
  ccv_nnc_tensor_arena_t* sdp_tensor_arena = 0;
4688
0
  ccv_nnc_graph_exec_arena_t* sdp_graph_exec_arena = 0;
4689
0
  ccv_nnc_symbolic_graph_compile(sdp_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(sdp_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(sdp_symbolic_graph), &sdp_graph, &sdp_tensor_arena, &sdp_graph_exec_arena);
4690
0
  ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, q);
4691
0
  ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, k);
4692
0
  ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, v);
4693
0
  ccv_nnc_tensor_t* const w_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, w);
4694
0
  ccv_nnc_tensor_t* const bias_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, bias);
4695
0
  dsfmt_t dsfmt;
4696
0
  int i;
4697
0
  // All inputs are drawn from a single dSFMT stream initialized with the fixed seed 1.
  dsfmt_init_gen_rand(&dsfmt, 1);
4698
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
4699
0
    q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4700
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
4701
0
    k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4702
0
  for (i = 0; i < 32 * 8 * 128 * 64; i++)
4703
0
    v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4704
0
  for (i = 0; i < 512 * 512; i++)
4705
0
    w_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4706
0
  for (i = 0; i < 512; i++)
4707
0
    bias_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4708
0
  // Second graph: same node and same shapes, with every symbol declared with GPU_TENSOR_NHWC.
  ccv_nnc_symbolic_graph_t* const g_symbolic_graph = ccv_nnc_symbolic_graph_new();
4709
0
  ccv_nnc_tensor_symbol_t gq = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "q");
4710
0
  ccv_nnc_tensor_symbol_t gk = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "k");
4711
0
  ccv_nnc_tensor_symbol_t gv = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "v");
4712
0
  ccv_nnc_tensor_symbol_t gw = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512, 512), "w");
4713
0
  ccv_nnc_tensor_symbol_t gbias = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 512), "bias");
4714
0
  ccv_nnc_tensor_symbol_t gc = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 8, 64), "c");
4715
0
  ccv_nnc_tensor_symbol_t gr = ccv_nnc_tensor_symbol_new(g_symbolic_graph, GPU_TENSOR_NHWC(000, 32F, 32, 128, 512), "r");
4716
0
  ccv_nnc_graph_exec_symbol_new(g_symbolic_graph, CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(1.0 / 8, 0), TENSOR_SYMBOL_LIST(gq, gk, gv, NO_TENSOR_SYMBOL, gw, gbias), TENSOR_SYMBOL_LIST(gr, NO_TENSOR_SYMBOL, gc), "scaled_dot_product_attention");
4717
0
  ccv_nnc_graph_exec_symbol_autogen(g_symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
4718
0
  ccv_nnc_graph_t* g_graph = 0;
4719
0
  ccv_nnc_tensor_arena_t* g_tensor_arena = 0;
4720
0
  ccv_nnc_graph_exec_arena_t* g_graph_exec_arena = 0;
4721
0
  ccv_nnc_symbolic_graph_compile(g_symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(g_symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(g_symbolic_graph), &g_graph, &g_tensor_arena, &g_graph_exec_arena);
4722
0
  ccv_nnc_tensor_t* const gq_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gq);
4723
0
  ccv_nnc_tensor_t* const gk_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gk);
4724
0
  ccv_nnc_tensor_t* const gv_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gv);
4725
0
  ccv_nnc_tensor_t* const gw_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gw);
4726
0
  ccv_nnc_tensor_t* const gbias_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gbias);
4727
0
  // Copy the CPU inputs into the matching tensors of the GPU graph so both graphs see identical data.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, w_tensor, bias_tensor), TENSOR_LIST(gq_tensor, gk_tensor, gv_tensor, gw_tensor, gbias_tensor), 0);
4728
0
  ccv_nnc_graph_run(sdp_graph, 0, TRAVERSE_FULL, 0, 0);
4729
0
  ccv_nnc_graph_run(g_graph, 0, TRAVERSE_FULL, 0, 0);
4730
0
  ccv_nnc_tensor_t* const r_tensor = ccv_nnc_tensor_from_symbol(sdp_tensor_arena, r);
4731
0
  ccv_nnc_tensor_t* const gr_tensor = ccv_nnc_tensor_from_symbol(g_tensor_arena, gr);
4732
0
  // hr receives a CPU copy of the GPU graph's r output for element-wise comparison.
  ccv_nnc_tensor_t* const hr = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 32, 128, 512), 0);
4733
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gr_tensor), TENSOR_LIST(hr), 0);
4734
0
  float max_relative_diff = 0;
4735
0
  int max_diff_idx = 0;
4736
0
  // Track the largest |r - hr| / max(|r|, |hr|, 1). The floor of 1 in the
  // denominator means small-magnitude values are compared by absolute difference.
  for (i = 0; i < 32 * 128 * 512; i++)
4737
0
  {
4738
0
    const float denom = fmaxf(fmaxf(fabsf(r_tensor->data.f32[i]), fabsf(hr->data.f32[i])), 1.0f);
4739
0
    const float relative_diff = fabsf(r_tensor->data.f32[i] - hr->data.f32[i]) / denom;
4740
0
    if (relative_diff > max_relative_diff)
4741
0
      max_relative_diff = relative_diff, max_diff_idx = i;
4742
0
  }
4743
0
  // On failure the message reports the worst index and both values at that index.
  REQUIRE(max_relative_diff <= 2e-3, "graph computed result should match scaled dot product attention op result (max relative diff %g at %d: %g vs %g)", max_relative_diff, max_diff_idx, r_tensor->data.f32[max_diff_idx], hr->data.f32[max_diff_idx]);
4744
0
  ccv_nnc_symbolic_graph_free(sdp_symbolic_graph);
4745
0
  ccv_nnc_tensor_arena_free(sdp_tensor_arena);
4746
0
  ccv_nnc_graph_exec_arena_free(sdp_graph_exec_arena);
4747
0
  ccv_nnc_graph_free(sdp_graph);
4748
0
  ccv_nnc_symbolic_graph_free(g_symbolic_graph);
4749
0
  ccv_nnc_tensor_arena_free(g_tensor_arena);
4750
0
  ccv_nnc_graph_exec_arena_free(g_graph_exec_arena);
4751
0
  ccv_nnc_graph_free(g_graph);
4752
0
  // hr is the only tensor created with ccv_nnc_tensor_new here; the rest were obtained from the arenas.
  ccv_nnc_tensor_free(hr);
4753
0
}
4754
4755
// Runs SCALED_DOT_PRODUCT_ATTENTION_BACKWARD on CPU tensors and on GPU tensors
// with the same 32F inputs, for four shape configurations, and requires dq, dk
// and dv from both runs to agree within 5e-3.
TEST_CASE("scaled dot product attention gradient with mps")
4756
1
{
4757
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4758
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4759
0
  // num_trials sizes the *_candidates arrays below; each of them lists exactly num_trials (4) values.
#define num_long_trials 2
4760
0
#define num_short_trials 2
4761
0
#define num_trials (num_long_trials + num_short_trials)
4762
4763
0
  dsfmt_t dsfmt;
4764
0
  // One generator with the fixed seed 10 is shared by all trials.
  dsfmt_init_gen_rand(&dsfmt, 10);
4765
0
  for (int trial = 0; trial < num_trials; ++trial) {
4766
0
    // Entry `trial` of each array gives that dimension for the current trial.
    int B_candidates[num_trials] = {  32,   3, 2, 1 };
4767
0
    int R_candidates[num_trials] = { 128,  61, 6, 2 };
4768
0
    int C_candidates[num_trials] = { 128,  49, 2, 1 };
4769
0
    int H_candidates[num_trials] = {   8,  13, 3, 1 };
4770
0
    int D_candidates[num_trials] = {  64, 191, 4, 8 };
4771
4772
0
    int B = B_candidates[trial];
4773
0
    int R = R_candidates[trial];
4774
0
    int C = C_candidates[trial];
4775
0
    int H = H_candidates[trial];
4776
0
    int D = D_candidates[trial];
4777
0
    float scale = 1.0 / sqrt((float)D);
4778
4779
0
    // q / dq use (B, R, H, D); k, v and their gradients use (B, C, H, D).
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4780
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4781
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4782
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4783
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4784
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4785
4786
0
    for (int i = 0; i < B * R * H * D; ++i) {
4787
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4788
0
    }
4789
0
    for (int i = 0; i < B * C * H * D; ++i) {
4790
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4791
0
    }
4792
0
    for (int i = 0; i < B * C * H * D; ++i) {
4793
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4794
0
    }
4795
4796
0
    // do_tensor is the incoming gradient with respect to the attention output; it is random as well.
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4797
0
    for (int i = 0; i < B * R * H * D; ++i) {
4798
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4799
0
    }
4800
0
    // Reference gradients on the CPU tensors. Only do, q, k and v are supplied here.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4801
4802
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4803
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4804
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4805
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4806
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4807
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4808
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, C, H, D), 0);
4809
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, R, H, D), 0);
4810
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4811
4812
0
    // The GPU forward pass produces gpu_o_tensor and gpu_softmax_lse (shape B, H, R),
    // both of which are passed to the GPU backward call below.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, H, R), 0);
4813
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4814
4815
0
    // Unlike the CPU call above, this call also passes the forward output and softmax_lse.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, 0), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4816
4817
0
    // Copy the GPU gradients into CPU tensors so they can be compared element-wise.
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, H, D), 0);
4818
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4819
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, H, D), 0);
4820
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4821
4822
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * H * D, 5e-3, "scaled dot product attention result should be the same");
4823
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * H * D, 5e-3, "scaled dot product attention result should be the same");
4824
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * H * D, 5e-3, "scaled dot product attention result should be the same");
4825
4826
0
    // Every tensor allocated in this iteration is released before the next trial.
    ccv_nnc_tensor_free(do_tensor);
4827
0
    ccv_nnc_tensor_free(gpu_do_tensor);
4828
0
    ccv_nnc_tensor_free(gpu_o_tensor);
4829
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4830
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4831
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4832
0
    ccv_nnc_tensor_free(q_tensor);
4833
0
    ccv_nnc_tensor_free(k_tensor);
4834
0
    ccv_nnc_tensor_free(v_tensor);
4835
0
    ccv_nnc_tensor_free(gpu_q_tensor);
4836
0
    ccv_nnc_tensor_free(gpu_k_tensor);
4837
0
    ccv_nnc_tensor_free(gpu_v_tensor);
4838
0
    ccv_nnc_tensor_free(dq_tensor);
4839
0
    ccv_nnc_tensor_free(dk_tensor);
4840
0
    ccv_nnc_tensor_free(dv_tensor);
4841
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
4842
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
4843
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
4844
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
4845
0
  }
4846
0
  // The trial-count macros are local to this test and are undefined before it ends.
#undef num_long_trials
4847
0
#undef num_short_trials
4848
0
#undef num_trials
4849
0
}
4850
4851
// Compares SCALED_DOT_PRODUCT_ATTENTION_BACKWARD computed on 32F CPU tensors
// against the same command run on 16F GPU tensors, across twelve shape
// configurations. Inputs are converted 32F -> 16F before upload and the GPU
// gradients are converted 16F -> 32F before comparison.
TEST_CASE("scaled dot product attention gradient with mps in half precision")
4852
1
{
4853
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4854
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4855
0
  // num_trials sizes the *_candidates arrays below; each of them lists exactly num_trials (12) values.
#define num_long_trials 8
4856
0
#define num_short_trials 4
4857
0
#define num_trials (num_long_trials + num_short_trials)
4858
4859
0
  dsfmt_t dsfmt;
4860
0
  // One generator with the fixed seed 10 is shared by all trials.
  dsfmt_init_gen_rand(&dsfmt, 10);
4861
0
  for (int trial = 0; trial < num_trials; ++trial) {
4862
0
    // Entry `trial` of each array gives that dimension for the current trial.
    // B, R, C and Hq repeat their first six values; D differs between the two halves.
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
4863
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
4864
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
4865
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
4866
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };
4867
4868
0
    const int B = B_candidates[trial];
4869
0
    const int R = R_candidates[trial];
4870
0
    const int C = C_candidates[trial];
4871
0
    const int Hq = Hq_candidates[trial];
4872
0
    // Hk is read from the same table as Hq, so q and k/v always have the same head count here.
    const int Hk = Hq_candidates[trial];
4873
0
    const int D = D_candidates[trial];
4874
0
    const int is_causal = 0;
4875
0
    const float scale = 1.0 / sqrt((float)D);
4876
4877
0
    // q / dq use (B, R, Hq, D); k, v and their gradients use (B, C, Hk, D).
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4878
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4879
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4880
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4881
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4882
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4883
4884
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
4885
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4886
0
    }
4887
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
4888
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4889
0
    }
4890
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
4891
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4892
0
    }
4893
4894
0
    // do_tensor is the incoming gradient with respect to the attention output; it is random as well.
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4895
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
4896
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
4897
0
    }
4898
0
    // Reference gradients on the 32F CPU tensors. Only do, q, k and v are supplied here.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
4899
0
    // Convert the 32F inputs to 16F on the CPU before transferring them to the GPU tensors.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
4900
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
4901
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
4902
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
4903
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);
4904
4905
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
4906
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
4907
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
4908
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
4909
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
4910
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, R, Hq, D), 0);
4911
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
4912
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, B, C, Hk, D), 0);
4913
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
4914
4915
0
    // softmax_lse is declared 32F with shape (B, Hq, R) even though q/k/v/o are 16F.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
4916
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
4917
4918
0
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
4919
0
    // NOTE(review): clears the command's `deterministic` flag before the GPU backward
    // call -- presumably selects a non-deterministic backward path; confirm against the backend.
    cmd.info.scaled_dot_product_attention.deterministic = 0;
4920
0
    // Unlike the CPU call above, this call also passes the forward output and softmax_lse.
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
4921
4922
0
    // Read the 16F gradients back into CPU tensors first ...
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, R, Hq, D), 0);
4923
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
4924
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, B, C, Hk, D), 0);
4925
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
4926
4927
0
    // ... then convert them to 32F so they can be compared with the reference gradients.
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4928
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4929
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4930
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
4931
4932
0
    // Each gradient has its own tolerance: 1e-3 for dq, 3e-3 for dk, 6e-3 for dv.
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 1e-3, "scaled dot product attention result should be the same");
4933
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 3e-3, "scaled dot product attention result should be the same");
4934
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 6e-3, "GPU computed output should be the same as CPU computed ones");
4935
4936
0
    // Every tensor allocated in this iteration is released before the next trial.
    ccv_nnc_tensor_free(do_tensor);
4937
0
    ccv_nnc_tensor_free(gpu_do_tensor);
4938
0
    ccv_nnc_tensor_free(gpu_o_tensor);
4939
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
4940
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
4941
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
4942
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
4943
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
4944
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
4945
0
    ccv_nnc_tensor_free(q_tensor);
4946
0
    ccv_nnc_tensor_free(k_tensor);
4947
0
    ccv_nnc_tensor_free(v_tensor);
4948
0
    ccv_nnc_tensor_free(q_tensor_f16);
4949
0
    ccv_nnc_tensor_free(k_tensor_f16);
4950
0
    ccv_nnc_tensor_free(v_tensor_f16);
4951
0
    ccv_nnc_tensor_free(do_tensor_f16);
4952
0
    ccv_nnc_tensor_free(gpu_q_tensor);
4953
0
    ccv_nnc_tensor_free(gpu_k_tensor);
4954
0
    ccv_nnc_tensor_free(gpu_v_tensor);
4955
0
    ccv_nnc_tensor_free(dq_tensor);
4956
0
    ccv_nnc_tensor_free(dk_tensor);
4957
0
    ccv_nnc_tensor_free(dv_tensor);
4958
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
4959
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
4960
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
4961
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
4962
0
  }
4963
0
  // The trial-count macros are local to this test and are undefined before it ends.
#undef num_long_trials
4964
0
#undef num_short_trials
4965
0
#undef num_trials
4966
0
}
4967
4968
// Compares SCALED_DOT_PRODUCT_ATTENTION_BACKWARD computed on 32F CPU tensors
// against the same command run on 16BF GPU tensors, across twelve shape
// configurations. Inputs are converted 32F -> 16BF before upload and the GPU
// gradients are converted 16BF -> 32F before comparison.
TEST_CASE("scaled dot product attention gradient with mps in bfloat precision")
4969
1
{
4970
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_MPS) &&
4971
1
    ccv_nnc_cmd_ok(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_MPS));
4972
0
  // num_trials sizes the *_candidates arrays below; each of them lists exactly num_trials (12) values.
#define num_long_trials 8
4973
0
#define num_short_trials 4
4974
0
#define num_trials (num_long_trials + num_short_trials)
4975
4976
0
  dsfmt_t dsfmt;
4977
0
  // One generator with the fixed seed 10 is shared by all trials.
  dsfmt_init_gen_rand(&dsfmt, 10);
4978
0
  for (int trial = 0; trial < num_trials; ++trial) {
4979
0
    // Entry `trial` of each array gives that dimension for the current trial.
    // B, R, C and Hq repeat their first six values; D differs between the two halves.
    const int B_candidates[num_trials] = {  32,   12, 16, 1, 2, 1, 32,   12, 16, 1, 2, 1 };
4980
0
    const int R_candidates[num_trials] = { 160,  256, 128, 77, 77, 5, 160,  256, 128, 77, 77, 5 };
4981
0
    const int C_candidates[num_trials] = { 128,  128, 128, 128, 128, 5, 128,  128, 128, 128, 128, 5 };
4982
0
    const int Hq_candidates[num_trials] = {   8,  8, 8, 8, 8, 32, 8,  8, 8, 8, 8, 32 };
4983
0
    const int D_candidates[num_trials] = {  64, 40, 160, 192, 256, 128, 48, 96, 160, 192, 256, 128 };
4984
4985
0
    const int B = B_candidates[trial];
4986
0
    const int R = R_candidates[trial];
4987
0
    const int C = C_candidates[trial];
4988
0
    const int Hq = Hq_candidates[trial];
4989
0
    // Hk is read from the same table as Hq, so q and k/v always have the same head count here.
    const int Hk = Hq_candidates[trial];
4990
0
    const int D = D_candidates[trial];
4991
0
    const int is_causal = 0;
4992
0
    const float scale = 1.0 / sqrt((float)D);
4993
4994
0
    // q / dq use (B, R, Hq, D); k, v and their gradients use (B, C, Hk, D).
    ccv_nnc_tensor_t* const q_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4995
0
    ccv_nnc_tensor_t* const k_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4996
0
    ccv_nnc_tensor_t* const v_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4997
0
    ccv_nnc_tensor_t* const dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
4998
0
    ccv_nnc_tensor_t* const dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
4999
0
    ccv_nnc_tensor_t* const dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5000
5001
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
5002
0
      q_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5003
0
    }
5004
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
5005
0
      k_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5006
0
    }
5007
0
    for (int i = 0; i < B * C * Hk * D; ++i) {
5008
0
      v_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5009
0
    }
5010
5011
0
    // do_tensor is the incoming gradient with respect to the attention output; it is random as well.
    ccv_nnc_tensor_t* const do_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5012
0
    for (int i = 0; i < B * R * Hq * D; ++i) {
5013
0
      do_tensor->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
5014
0
    }
5015
0
    // Reference gradients on the 32F CPU tensors. Only do, q, k and v are supplied here.
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(do_tensor, 0, 0, q_tensor, k_tensor, v_tensor), TENSOR_LIST(dq_tensor, dk_tensor, dv_tensor), 0);
5016
0
    // Despite the _f16 suffix, these tensors are declared 16BF. The 32F inputs are
    // converted on the CPU before being transferred to the GPU tensors.
    ccv_nnc_tensor_t* const q_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
5017
0
    ccv_nnc_tensor_t* const k_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5018
0
    ccv_nnc_tensor_t* const v_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5019
0
    ccv_nnc_tensor_t* const do_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
5020
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor, k_tensor, v_tensor, do_tensor), TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), 0);
5021
5022
0
    ccv_nnc_tensor_t* const gpu_q_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5023
0
    ccv_nnc_tensor_t* const gpu_k_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5024
0
    ccv_nnc_tensor_t* const gpu_v_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5025
0
    ccv_nnc_tensor_t* const gpu_o_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5026
0
    ccv_nnc_tensor_t* const gpu_do_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5027
0
    ccv_nnc_tensor_t* const gpu_dq_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, R, Hq, D), 0);
5028
0
    ccv_nnc_tensor_t* const gpu_dk_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5029
0
    ccv_nnc_tensor_t* const gpu_dv_tensor = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16BF, B, C, Hk, D), 0);
5030
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(q_tensor_f16, k_tensor_f16, v_tensor_f16, do_tensor_f16), TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, gpu_do_tensor), 0);
5031
5032
0
    // softmax_lse is declared 32F with shape (B, Hq, R) even though q/k/v/o are 16BF.
    ccv_nnc_tensor_t* const gpu_softmax_lse = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, B, Hq, R), 0);
5033
0
    ccv_nnc_cmd_exec(CMD_SCALED_DOT_PRODUCT_ATTENTION_FORWARD(scale, is_causal), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, NULL, NULL, NULL), TENSOR_LIST(gpu_o_tensor, gpu_softmax_lse), 0);
5034
5035
0
    ccv_nnc_cmd_t cmd = CMD_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD(scale, is_causal);
5036
0
    // NOTE(review): clears the command's `deterministic` flag before the GPU backward
    // call -- presumably selects a non-deterministic backward path; confirm against the backend.
    cmd.info.scaled_dot_product_attention.deterministic = 0;
5037
0
    // Unlike the CPU call above, this call also passes the forward output and softmax_lse.
    ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_do_tensor, 0, 0, gpu_q_tensor, gpu_k_tensor, gpu_v_tensor, 0, 0, 0, gpu_o_tensor, gpu_softmax_lse), TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), 0);
5038
5039
0
    // Read the 16BF gradients back into CPU tensors first ...
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, R, Hq, D), 0);
5040
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5041
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor_f16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, B, C, Hk, D), 0);
5042
0
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gpu_dq_tensor, gpu_dk_tensor, gpu_dv_tensor), TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), 0);
5043
5044
0
    // ... then convert them to 32F so they can be compared with the reference gradients.
    ccv_nnc_tensor_t* const copy_of_gpu_dq_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, R, Hq, D), 0);
5045
0
    ccv_nnc_tensor_t* const copy_of_gpu_dk_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5046
0
    ccv_nnc_tensor_t* const copy_of_gpu_dv_tensor = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, B, C, Hk, D), 0);
5047
0
    ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(copy_of_gpu_dq_tensor_f16, copy_of_gpu_dk_tensor_f16, copy_of_gpu_dv_tensor_f16), TENSOR_LIST(copy_of_gpu_dq_tensor, copy_of_gpu_dk_tensor, copy_of_gpu_dv_tensor), 0);
5048
5049
0
    // Each gradient has its own tolerance: 5e-3 for dq, 1e-2 for dk, 2e-2 for dv.
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dq_tensor->data.f32, dq_tensor->data.f32, B * R * Hq * D, 5e-3, "scaled dot product attention result should be the same");
5050
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dk_tensor->data.f32, dk_tensor->data.f32, B * C * Hk * D, 1e-2, "scaled dot product attention result should be the same");
5051
0
    REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, copy_of_gpu_dv_tensor->data.f32, dv_tensor->data.f32, B * C * Hk * D, 2e-2, "GPU computed output should be the same as CPU computed ones");
5052
5053
0
    // Every tensor allocated in this iteration is released before the next trial.
    ccv_nnc_tensor_free(do_tensor);
5054
0
    ccv_nnc_tensor_free(gpu_do_tensor);
5055
0
    ccv_nnc_tensor_free(gpu_o_tensor);
5056
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor_f16);
5057
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor_f16);
5058
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor_f16);
5059
0
    ccv_nnc_tensor_free(copy_of_gpu_dq_tensor);
5060
0
    ccv_nnc_tensor_free(copy_of_gpu_dk_tensor);
5061
0
    ccv_nnc_tensor_free(copy_of_gpu_dv_tensor);
5062
0
    ccv_nnc_tensor_free(q_tensor);
5063
0
    ccv_nnc_tensor_free(k_tensor);
5064
0
    ccv_nnc_tensor_free(v_tensor);
5065
0
    ccv_nnc_tensor_free(q_tensor_f16);
5066
0
    ccv_nnc_tensor_free(k_tensor_f16);
5067
0
    ccv_nnc_tensor_free(v_tensor_f16);
5068
0
    ccv_nnc_tensor_free(do_tensor_f16);
5069
0
    ccv_nnc_tensor_free(gpu_q_tensor);
5070
0
    ccv_nnc_tensor_free(gpu_k_tensor);
5071
0
    ccv_nnc_tensor_free(gpu_v_tensor);
5072
0
    ccv_nnc_tensor_free(dq_tensor);
5073
0
    ccv_nnc_tensor_free(dk_tensor);
5074
0
    ccv_nnc_tensor_free(dv_tensor);
5075
0
    ccv_nnc_tensor_free(gpu_dq_tensor);
5076
0
    ccv_nnc_tensor_free(gpu_dk_tensor);
5077
0
    ccv_nnc_tensor_free(gpu_dv_tensor);
5078
0
    ccv_nnc_tensor_free(gpu_softmax_lse);
5079
0
  }
5080
0
  // The trial-count macros are local to this test and are undefined before it ends.
#undef num_long_trials
5081
0
#undef num_short_trials
5082
0
#undef num_trials
5083
0
}
5084
5085
/* Exercises CMD_GEMM_BACKWARD with the backend forced to CCV_NNC_BACKEND_MPS and no
 * transposed operand. Inputs are the gradient g (4x3), a (4x2) and b (2x3); the expected
 * outputs spelled out below are h (4x2, rows of g dotted with rows of b), db (2x3,
 * columns of a dotted with columns of g) and dbias (3, column sums of g). */
TEST_CASE("backward gemm with no transpose")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);

  float ap[] = {
    13, 14,
    15, 16,
    17, 18,
    19, 20,
  };

  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);

  float bp[] = {
    21, 22, 23,
    24, 25, 26,
  };

  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // Device-side copies of the three inputs.
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);

  // Device-side outputs of the backward pass.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
  ccv_nnc_cmd_t cmd = CMD_GEMM_BACKWARD();
  cmd.backend = CCV_NNC_BACKEND_MPS;
  // NOTE(review): this line used to be commented "This is cblas.", which does not match the
  // MPS backend selected above; confirm what algorithm index 1 selects for MPS GEMM.
  cmd.algorithm = 1;

  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(h, db, dbias), 0);

  // Copy the results back to host tensors so they can be compared.
  ccv_nnc_tensor_t* const ch = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC( 32F, 4, 2), 0);
  ccv_nnc_tensor_t* const cdb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC( 32F, 2, 3), 0);
  ccv_nnc_tensor_t* const cdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC( 32F, 3), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(h, db, dbias), TENSOR_LIST(ch, cdb, cdbias), 0);

  float dbiastp[] = {
    22, 26, 30,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);

  REQUIRE_TENSOR_EQ(cdbias, &dbiast, "bias should be equal");
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);

  REQUIRE_TENSOR_EQ(ch, &ht, "h should be equal");
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  REQUIRE_TENSOR_EQ(cdb, &dbt, "db should be equal");
  ccv_nnc_tensor_free(g);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(h);
  ccv_nnc_tensor_free(db);
  ccv_nnc_tensor_free(dbias);
  // The device input copies and the host result copies were previously never released.
  ccv_nnc_tensor_free(gg);
  ccv_nnc_tensor_free(ga);
  ccv_nnc_tensor_free(gb);
  ccv_nnc_tensor_free(ch);
  ccv_nnc_tensor_free(cdb);
  ccv_nnc_tensor_free(cdbias);
}
5160
5161
/* GEMM backward with the first operand marked TRANSPOSE(0, 1): a is supplied as 2x4
 * and the gradient h is compared against a 2x4 expectation. db and dbias keep the
 * 2x3 / 3 shapes used when nothing is transposed. */
TEST_CASE("backward gemm with transpose a")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host inputs.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  float ap[] = {
    13, 15, 17, 19,
    14, 16, 18, 20,
  };
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  float bp[] = {
    21, 22, 23,
    24, 25, 26,
  };
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);

  // Host tensors that receive the results.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);

  // Device tensors: copies of the inputs, then the outputs.
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run the backward pass, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // Expected bias gradient.
  float dbiastp[] = {
    22, 26, 30,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");

  // Expected h, laid out 2x4.
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");

  // Expected db, laid out 2x3.
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // Release every tensor, host ones first, in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    g, a, b, h, db, dbias,
    gg, ga, gb, gh, gdb, gdbias,
  };
  int i;
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5224
5225
/* GEMM backward with the second operand marked TRANSPOSE(0, 1): b is supplied as 3x2
 * and its gradient db is compared against a 3x2 expectation. h keeps the 4x2 shape of a. */
TEST_CASE("backward gemm with transpose b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host inputs.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  float ap[] = {
    13, 14,
    15, 16,
    17, 18,
    19, 20,
  };
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  float bp[] = {
    21, 24,
    22, 25,
    23, 26,
  };
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // Host tensors that receive the results.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);

  // Device tensors: copies of the inputs, then the outputs.
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 2), 0);
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run the backward pass, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // Expected bias gradient.
  float dbiastp[] = {
    22, 26, 30,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");

  // Expected h, laid out 4x2.
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 4, 2), 0);
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");

  // Expected db, laid out 3x2 to match the stored layout of b.
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // Release every tensor, host ones first, in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    g, a, b, h, db, dbias,
    gg, ga, gb, gh, gdb, gdbias,
  };
  int i;
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5294
5295
/* GEMM backward with both operands marked TRANSPOSE(0, 1): a is supplied as 2x4 and
 * b as 3x2, and the gradients h and db are compared in those same layouts. */
TEST_CASE("backward gemm with transpose a and b")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));

  // Host inputs.
  float gp[] = {
    1, 2, 3,
    4, 5, 6,
    7, 8, 9,
    10, 11, 12,
  };
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 4, 3), 0);
  float ap[] = {
    13, 15, 17, 19,
    14, 16, 18, 20,
  };
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  float bp[] = {
    21, 24,
    22, 25,
    23, 26,
  };
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);

  // Host tensors that receive the results.
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);

  // Device tensors: copies of the inputs, then the outputs.
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 4, 3), 0);
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4), 0);
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);

  // Upload, run the backward pass, download.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(0, 1), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);

  // Expected bias gradient.
  float dbiastp[] = {
    22, 26, 30,
  };
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");

  // Expected h, laid out 2x4.
  float htp[] = {
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
  };
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4), 0);
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");

  // Expected db, laid out 3x2.
  float dbtp[] = {
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
  };
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");

  // Release every tensor, host ones first, in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    g, a, b, h, db, dbias,
    gg, ga, gb, gh, gdb, gdbias,
  };
  int i;
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5360
5361
5362
/* Runs GEMM forward and backward (second operand transposed) twice on the same random
 * data: once on CPU_TENSOR_NHWC tensors (h-prefixed) and once on GPU_TENSOR_NHWC tensors.
 * The GPU results are copied into t-prefixed host tensors and compared element-wise
 * against the host results within a tolerance. */
TEST_CASE("backward gemm large data set")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Device tensors.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // Host tensors with the same shapes.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Fill the host inputs. The draw order (hw, hbias, ha, hg) fixes which random values
  // each tensor receives, so it must not be rearranged.
  int i;
  for (i = 0; i < 64 * 128; i++)
  {
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (i = 0; i < 64; i++)
  {
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 128; i++)
  {
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 64; i++)
  {
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);

  // Reference pass on the host tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, hdbias), 0);

  // Same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, dbias), 0);

  // Bring the device results back for comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, h), TENSOR_LIST(tb, tdw, tdbias, th), 0);

  // The weight gradient is checked with a looser tolerance (5e-3) than the others (1e-5).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Release everything in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    a, w, bias, b, g, dw, dbias, h,
    ha, hw, hbias, hb, hg, hdw, hdbias, hh,
    tb, th, tdw, tdbias,
  };
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5429
5430
/* Same host-versus-device comparison as the large data set test, but without a bias:
 * the forward command gets only two inputs and the backward command is given 0 in the
 * dbias output slot. */
TEST_CASE("backward gemm no bias")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Device tensors.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // Host tensors with the same shapes.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Fill the host inputs. The draw order (hw, ha, hg) fixes which random values each
  // tensor receives, so it must not be rearranged.
  int i;
  for (i = 0; i < 64 * 128; i++)
  {
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (i = 0; i < 10 * 128; i++)
  {
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 64; i++)
  {
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hg), TENSOR_LIST(a, w, g), 0);

  // Reference pass on the host tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, hdw, 0), 0);

  // Same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, dw, 0), 0);

  // Bring the device results back for comparison.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, h), TENSOR_LIST(tb, tdw, th), 0);

  // The weight gradient is checked with a looser tolerance (5e-3) than the others (1e-5).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Release everything in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    a, w, b, g, dw, h,
    ha, hw, hb, hg, hdw, hh,
    tb, th, tdw,
  };
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5484
5485
/* Host-versus-device comparison of GEMM backward when the input gradient is not
 * requested: the backward command is given 0 in the h output slot, so only dw and
 * dbias (plus the forward output b) are compared. */
TEST_CASE("backward gemm no h")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Device tensors.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dw = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);

  // Host tensors with the same shapes.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);

  // Fill the host inputs. The draw order (hw, hbias, ha, hg) fixes which random values
  // each tensor receives, so it must not be rearranged.
  int i;
  for (i = 0; i < 64 * 128; i++)
  {
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (i = 0; i < 64; i++)
  {
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 128; i++)
  {
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 64; i++)
  {
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);

  // Reference pass on the host tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(0, hdw, hdbias), 0);

  // Same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(0, dw, dbias), 0);

  // Bring the device results back for comparison; the trailing 0 entries are kept as-is.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, dw, dbias, 0), TENSOR_LIST(tb, tdw, tdbias, 0), 0);

  // The weight gradient is checked with a looser tolerance (5e-3) than the others (1e-5).
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdw->data.f32, hdw->data.f32, 64 * 128, 5e-3, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Release everything in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    a, w, bias, b, g, dw, dbias,
    ha, hw, hbias, hb, hg, hdw, hdbias,
    tb, tdw, tdbias,
  };
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5545
5546
/* Host-versus-device comparison of GEMM backward when the weight gradient is not
 * requested: the backward command is given 0 in the dw output slot, so only h and
 * dbias (plus the forward output b) are compared. */
TEST_CASE("backward gemm no dw")
{
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
  dsfmt_t dsfmt;
  dsfmt_init_gen_rand(&dsfmt, 0);

  // Device tensors.
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);
  ccv_nnc_tensor_t* w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64, 128), 0);
  ccv_nnc_tensor_t* bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* g = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 64), 0);
  ccv_nnc_tensor_t* dbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 64), 0);
  ccv_nnc_tensor_t* h = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 10, 128), 0);

  // Host tensors with the same shapes.
  ccv_nnc_tensor_t* ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_tensor_t* hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64, 128), 0);
  ccv_nnc_tensor_t* hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hg = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* hdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* hh = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);

  // Fill the host inputs. The draw order (hw, hbias, ha, hg) fixes which random values
  // each tensor receives, so it must not be rearranged.
  int i;
  for (i = 0; i < 64 * 128; i++)
  {
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / (64 * 128);
  }
  for (i = 0; i < 64; i++)
  {
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 128; i++)
  {
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  for (i = 0; i < 10 * 64; i++)
  {
    hg->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
  }
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias, hg), TENSOR_LIST(a, w, bias, g), 0);

  // Reference pass on the host tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(hb), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(hg, ha, hw, 0), TENSOR_LIST(hh, 0, hdbias), 0);

  // Same two commands on the device tensors.
  ccv_nnc_cmd_exec(CMD_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, w, bias), TENSOR_LIST(b), 0);
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, w, 0), TENSOR_LIST(h, 0, dbias), 0);

  // Bring the device results back for comparison; the 0 placeholders are kept as-is.
  ccv_nnc_tensor_t* tb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 64), 0);
  ccv_nnc_tensor_t* tdbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 64), 0);
  ccv_nnc_tensor_t* th = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 128), 0);
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b, 0, dbias, h), TENSOR_LIST(tb, 0, tdbias, th), 0);

  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tb->data.f32, hb->data.f32, 10 * 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, tdbias->data.f32, hdbias->data.f32, 64, 1e-5, "GPU computed output should be numerically close to CPU computed ones");
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, th->data.f32, hh->data.f32, 10 * 128, 1e-5, "GPU computed output should be numerically close to CPU computed ones");

  // Release everything in the same order as before.
  ccv_nnc_tensor_t* const to_free[] = {
    a, w, bias, b, g, dbias, h,
    ha, hw, hbias, hb, hg, hdbias, hh,
    tb, th, tdbias,
  };
  for (i = 0; i < (int)(sizeof(to_free) / sizeof(to_free[0])); i++)
  {
    ccv_nnc_tensor_free(to_free[i]);
  }
}
5606
5607
TEST_CASE("backwar gemm with no transpose batch 2, same b")
5608
1
{
5609
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5610
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5611
0
  float gp[] = {
5612
0
    1, 2, 3,
5613
0
    4, 5, 6,
5614
0
    7, 8, 9,
5615
0
    10, 11, 12,
5616
0
    10, 20, 30,
5617
0
    40, 50, 60,
5618
0
    70, 80, 90,
5619
0
    100, 110, 120,
5620
0
  };
5621
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5622
0
  float ap[] = {
5623
0
    13, 14,
5624
0
    15, 16,
5625
0
    17, 18,
5626
0
    19, 20,
5627
0
    131, 141,
5628
0
    151, 161,
5629
0
    171, 181,
5630
0
    191, 201,
5631
0
  };
5632
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5633
0
  float bp[] = {
5634
0
    21, 22, 23,
5635
0
    24, 25, 26,
5636
0
  };
5637
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5638
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5639
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5640
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5641
0
  ccv_nnc_tensor_t* gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5642
0
  ccv_nnc_tensor_t* ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5643
0
  ccv_nnc_tensor_t* gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5644
0
  ccv_nnc_tensor_t* gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5645
0
  ccv_nnc_tensor_t* gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5646
0
  ccv_nnc_tensor_t* gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5647
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5648
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5649
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5650
0
  float dbiastp[] = {
5651
0
    22 + 220, 26 + 260, 30 + 300,
5652
0
  };
5653
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5654
  
5655
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5656
0
  float htp[] = {
5657
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5658
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5659
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5660
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5661
0
    10 * 21 + 20 * 22 + 30 * 23, 10 * 24 + 20 * 25 + 30 * 26,
5662
0
    40 * 21 + 50 * 22 + 60 * 23, 40 * 24 + 50 * 25 + 60 * 26,
5663
0
    70 * 21 + 80 * 22 + 90 * 23, 70 * 24 + 80 * 25 + 90 * 26,
5664
0
    100 * 21 + 110 * 22 + 120 * 23, 100 * 24 + 110 * 25 + 120 * 26,
5665
0
  };
5666
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5667
  
5668
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5669
0
  float dbtp[] = {
5670
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
5671
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5672
0
  };
5673
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5674
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5675
0
  ccv_nnc_tensor_free(g);
5676
0
  ccv_nnc_tensor_free(a);
5677
0
  ccv_nnc_tensor_free(b);
5678
0
  ccv_nnc_tensor_free(h);
5679
0
  ccv_nnc_tensor_free(db);
5680
0
  ccv_nnc_tensor_free(dbias);
5681
0
  ccv_nnc_tensor_free(gg);
5682
0
  ccv_nnc_tensor_free(ga);
5683
0
  ccv_nnc_tensor_free(gb);
5684
0
  ccv_nnc_tensor_free(gh);
5685
0
  ccv_nnc_tensor_free(gdb);
5686
0
  ccv_nnc_tensor_free(gdbias);
5687
0
}
5688
5689
TEST_CASE("backward gemm with no transpose batch 2, batched b")
5690
1
{
5691
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5692
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5693
0
  float gp[] = {
5694
0
    1, 2, 3,
5695
0
    4, 5, 6,
5696
0
    7, 8, 9,
5697
0
    10, 11, 12,
5698
0
    10, 20, 30,
5699
0
    40, 50, 60,
5700
0
    70, 80, 90,
5701
0
    100, 110, 120,
5702
0
  };
5703
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5704
0
  float ap[] = {
5705
0
    13, 14,
5706
0
    15, 16,
5707
0
    17, 18,
5708
0
    19, 20,
5709
0
    131, 141,
5710
0
    151, 161,
5711
0
    171, 181,
5712
0
    191, 201,
5713
0
  };
5714
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5715
0
  float bp[] = {
5716
0
    21, 22, 23,
5717
0
    24, 25, 26,
5718
0
    212, 222, 232,
5719
0
    242, 252, 262,
5720
0
  };
5721
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
5722
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5723
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
5724
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
5725
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5726
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5727
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
5728
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5729
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
5730
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
5731
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5732
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5733
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5734
0
  float dbiastp[] = {
5735
0
    22, 26, 30,
5736
0
    220, 260, 300,
5737
0
  };
5738
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
5739
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5740
0
  float htp[] = {
5741
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5742
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5743
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5744
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5745
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
5746
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
5747
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
5748
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
5749
0
  };
5750
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5751
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5752
0
  float dbtp[] = {
5753
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
5754
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5755
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
5756
0
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5757
0
  };
5758
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
5759
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5760
0
  ccv_nnc_tensor_free(g);
5761
0
  ccv_nnc_tensor_free(a);
5762
0
  ccv_nnc_tensor_free(b);
5763
0
  ccv_nnc_tensor_free(h);
5764
0
  ccv_nnc_tensor_free(db);
5765
0
  ccv_nnc_tensor_free(dbias);
5766
0
  ccv_nnc_tensor_free(gg);
5767
0
  ccv_nnc_tensor_free(ga);
5768
0
  ccv_nnc_tensor_free(gb);
5769
0
  ccv_nnc_tensor_free(gh);
5770
0
  ccv_nnc_tensor_free(gdb);
5771
0
  ccv_nnc_tensor_free(gdbias);
5772
0
}
5773
5774
TEST_CASE("backward gemm with transpose a batch 2, same b")
5775
1
{
5776
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5777
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5778
0
  float gp[] = {
5779
0
    1, 2, 3,
5780
0
    4, 5, 6,
5781
0
    7, 8, 9,
5782
0
    10, 11, 12,
5783
0
    10, 20, 30,
5784
0
    40, 50, 60,
5785
0
    70, 80, 90,
5786
0
    100, 110, 120,
5787
0
  };
5788
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5789
0
  float ap[] = {
5790
0
    13, 15, 17, 19,
5791
0
    14, 16, 18, 20,
5792
0
    131, 151, 171, 191,
5793
0
    141, 161, 181, 201,
5794
0
  };
5795
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5796
0
  float bp[] = {
5797
0
    21, 22, 23,
5798
0
    24, 25, 26,
5799
0
  };
5800
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5801
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5802
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5803
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5804
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5805
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
5806
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5807
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
5808
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3), 0);
5809
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5810
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5811
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5812
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5813
0
  float dbiastp[] = {
5814
0
    22 + 220, 26 + 260, 30 + 300,
5815
0
  };
5816
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5817
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5818
0
  float htp[] = {
5819
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
5820
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
5821
0
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
5822
0
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
5823
0
  };
5824
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5825
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5826
0
  float dbtp[] = {
5827
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
5828
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5829
0
  };
5830
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3), 0);
5831
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5832
0
  ccv_nnc_tensor_free(g);
5833
0
  ccv_nnc_tensor_free(a);
5834
0
  ccv_nnc_tensor_free(b);
5835
0
  ccv_nnc_tensor_free(h);
5836
0
  ccv_nnc_tensor_free(db);
5837
0
  ccv_nnc_tensor_free(dbias);
5838
0
  ccv_nnc_tensor_free(gg);
5839
0
  ccv_nnc_tensor_free(ga);
5840
0
  ccv_nnc_tensor_free(gb);
5841
0
  ccv_nnc_tensor_free(gh);
5842
0
  ccv_nnc_tensor_free(gdb);
5843
0
  ccv_nnc_tensor_free(gdbias);
5844
0
}
5845
5846
TEST_CASE("backward gemm with transpose b batch 2, batched b")
5847
1
{
5848
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5849
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5850
0
  float gp[] = {
5851
0
    1, 2, 3,
5852
0
    4, 5, 6,
5853
0
    7, 8, 9,
5854
0
    10, 11, 12,
5855
0
    10, 20, 30,
5856
0
    40, 50, 60,
5857
0
    70, 80, 90,
5858
0
    100, 110, 120,
5859
0
  };
5860
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5861
0
  float ap[] = {
5862
0
    13, 14,
5863
0
    15, 16,
5864
0
    17, 18,
5865
0
    19, 20,
5866
0
    131, 141,
5867
0
    151, 161,
5868
0
    171, 181,
5869
0
    191, 201,
5870
0
  };
5871
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5872
0
  float bp[] = {
5873
0
    21, 24,
5874
0
    22, 25,
5875
0
    23, 26,
5876
0
    212, 242,
5877
0
    222, 252,
5878
0
    232, 262,
5879
0
  };
5880
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5881
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5882
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5883
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
5884
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5885
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5886
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
5887
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
5888
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
5889
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 1, 3), 0);
5890
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5891
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5892
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5893
0
  float dbiastp[] = {
5894
0
    22, 26, 30,
5895
0
    220, 260, 300,
5896
0
  };
5897
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 2, 1, 3), 0);
5898
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5899
0
  float htp[] = {
5900
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
5901
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
5902
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
5903
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
5904
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
5905
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
5906
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
5907
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
5908
0
  };
5909
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
5910
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5911
0
  float dbtp[] = {
5912
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
5913
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
5914
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
5915
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
5916
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
5917
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5918
0
  };
5919
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
5920
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5921
0
  ccv_nnc_tensor_free(g);
5922
0
  ccv_nnc_tensor_free(a);
5923
0
  ccv_nnc_tensor_free(b);
5924
0
  ccv_nnc_tensor_free(h);
5925
0
  ccv_nnc_tensor_free(db);
5926
0
  ccv_nnc_tensor_free(dbias);
5927
0
  ccv_nnc_tensor_free(gg);
5928
0
  ccv_nnc_tensor_free(ga);
5929
0
  ccv_nnc_tensor_free(gb);
5930
0
  ccv_nnc_tensor_free(gh);
5931
0
  ccv_nnc_tensor_free(gdb);
5932
0
  ccv_nnc_tensor_free(gdbias);
5933
0
}
5934
5935
TEST_CASE("backward gemm with transpose a and b batch 2, same b")
5936
1
{
5937
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
5938
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
5939
0
  float gp[] = {
5940
0
    1, 2, 3,
5941
0
    4, 5, 6,
5942
0
    7, 8, 9,
5943
0
    10, 11, 12,
5944
0
    10, 20, 30,
5945
0
    40, 50, 60,
5946
0
    70, 80, 90,
5947
0
    100, 110, 120,
5948
0
  };
5949
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
5950
0
  float ap[] = {
5951
0
    13, 15, 17, 19,
5952
0
    14, 16, 18, 20,
5953
0
    131, 151, 171, 191,
5954
0
    141, 161, 181, 201,
5955
0
  };
5956
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5957
0
  float bp[] = {
5958
0
    21, 24,
5959
0
    22, 25,
5960
0
    23, 26,
5961
0
  };
5962
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5963
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5964
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5965
0
  ccv_nnc_tensor_t* const dbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3), 0);
5966
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
5967
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
5968
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
5969
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
5970
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 2), 0);
5971
0
  ccv_nnc_tensor_t* const gdbias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3), 0);
5972
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
5973
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(0, 1)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb, gdbias), 0);
5974
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb, gdbias), TENSOR_LIST(h, db, dbias), 0);
5975
0
  float dbiastp[] = {
5976
0
    22 + 220, 26 + 260, 30 + 300,
5977
0
  };
5978
0
  ccv_nnc_tensor_t dbiast = ccv_nnc_tensor(dbiastp, CPU_TENSOR_NHWC(32F, 3), 0);
5979
0
  REQUIRE_TENSOR_EQ(dbias, &dbiast, "bias should be equal");
5980
0
  float htp[] = {
5981
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
5982
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
5983
0
    10 * 21 + 20 * 22 + 30 * 23, 40 * 21 + 50 * 22 + 60 * 23, 70 * 21 + 80 * 22 + 90 * 23, 100 * 21 + 110 * 22 + 120 * 23,
5984
0
    10 * 24 + 20 * 25 + 30 * 26, 40 * 24 + 50 * 25 + 60 * 26, 70 * 24 + 80 * 25 + 90 * 26, 100 * 24 + 110 * 25 + 120 * 26,
5985
0
  };
5986
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
5987
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
5988
0
  float dbtp[] = {
5989
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19 + 10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20 + 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
5990
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19 + 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20 + 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
5991
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19 + 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20 + 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
5992
0
  };
5993
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 3, 2), 0);
5994
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
5995
0
  ccv_nnc_tensor_free(g);
5996
0
  ccv_nnc_tensor_free(a);
5997
0
  ccv_nnc_tensor_free(b);
5998
0
  ccv_nnc_tensor_free(h);
5999
0
  ccv_nnc_tensor_free(db);
6000
0
  ccv_nnc_tensor_free(dbias);
6001
0
  ccv_nnc_tensor_free(gg);
6002
0
  ccv_nnc_tensor_free(ga);
6003
0
  ccv_nnc_tensor_free(gb);
6004
0
  ccv_nnc_tensor_free(gh);
6005
0
  ccv_nnc_tensor_free(gdb);
6006
0
  ccv_nnc_tensor_free(gdbias);
6007
0
}
6008
6009
TEST_CASE("backward gemm with no transpose batch 2, batched b, no bias")
6010
1
{
6011
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6012
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6013
0
  float gp[] = {
6014
0
    1, 2, 3,
6015
0
    4, 5, 6,
6016
0
    7, 8, 9,
6017
0
    10, 11, 12,
6018
0
    10, 20, 30,
6019
0
    40, 50, 60,
6020
0
    70, 80, 90,
6021
0
    100, 110, 120,
6022
0
  };
6023
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6024
0
  float ap[] = {
6025
0
    13, 14,
6026
0
    15, 16,
6027
0
    17, 18,
6028
0
    19, 20,
6029
0
    131, 141,
6030
0
    151, 161,
6031
0
    171, 181,
6032
0
    191, 201,
6033
0
  };
6034
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6035
0
  float bp[] = {
6036
0
    21, 22, 23,
6037
0
    24, 25, 26,
6038
0
    212, 222, 232,
6039
0
    242, 252, 262,
6040
0
  };
6041
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6042
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6043
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6044
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6045
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6046
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
6047
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6048
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 3), 0);
6049
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6050
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
6051
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
6052
0
  float htp[] = {
6053
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
6054
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
6055
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
6056
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
6057
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
6058
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
6059
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
6060
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
6061
0
  };
6062
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6063
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6064
0
  float dbtp[] = {
6065
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 3 * 13 + 6 * 15 + 9 * 17 + 12 * 19,
6066
0
    1 * 14 + 4 * 16 + 7 * 18 + 10 * 20, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6067
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 30 * 131 + 60 * 151 + 90 * 171 + 120 * 191,
6068
0
    10 * 141 + 40 * 161 + 70 * 181 + 100 * 201, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6069
0
  };
6070
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 2, 3), 0);
6071
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6072
0
  ccv_nnc_tensor_free(g);
6073
0
  ccv_nnc_tensor_free(a);
6074
0
  ccv_nnc_tensor_free(b);
6075
0
  ccv_nnc_tensor_free(h);
6076
0
  ccv_nnc_tensor_free(db);
6077
0
  ccv_nnc_tensor_free(gg);
6078
0
  ccv_nnc_tensor_free(ga);
6079
0
  ccv_nnc_tensor_free(gb);
6080
0
  ccv_nnc_tensor_free(gh);
6081
0
  ccv_nnc_tensor_free(gdb);
6082
0
}
6083
6084
TEST_CASE("backward gemm with transpose b batch 2, batched b, no bias")
6085
1
{
6086
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6087
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6088
0
  float gp[] = {
6089
0
    1, 2, 3,
6090
0
    4, 5, 6,
6091
0
    7, 8, 9,
6092
0
    10, 11, 12,
6093
0
    10, 20, 30,
6094
0
    40, 50, 60,
6095
0
    70, 80, 90,
6096
0
    100, 110, 120,
6097
0
  };
6098
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6099
0
  float ap[] = {
6100
0
    13, 14,
6101
0
    15, 16,
6102
0
    17, 18,
6103
0
    19, 20,
6104
0
    131, 141,
6105
0
    151, 161,
6106
0
    171, 181,
6107
0
    191, 201,
6108
0
  };
6109
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6110
0
  float bp[] = {
6111
0
    21, 24,
6112
0
    22, 25,
6113
0
    23, 26,
6114
0
    212, 242,
6115
0
    222, 252,
6116
0
    232, 262,
6117
0
  };
6118
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6119
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6120
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6121
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6122
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6123
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6124
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 2), 0);
6125
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6126
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6127
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
6128
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
6129
0
  float htp[] = {
6130
0
    1 * 21 + 2 * 22 + 3 * 23, 1 * 24 + 2 * 25 + 3 * 26,
6131
0
    4 * 21 + 5 * 22 + 6 * 23, 4 * 24 + 5 * 25 + 6 * 26,
6132
0
    7 * 21 + 8 * 22 + 9 * 23, 7 * 24 + 8 * 25 + 9 * 26,
6133
0
    10 * 21 + 11 * 22 + 12 * 23, 10 * 24 + 11 * 25 + 12 * 26,
6134
0
    10 * 212 + 20 * 222 + 30 * 232, 10 * 242 + 20 * 252 + 30 * 262,
6135
0
    40 * 212 + 50 * 222 + 60 * 232, 40 * 242 + 50 * 252 + 60 * 262,
6136
0
    70 * 212 + 80 * 222 + 90 * 232, 70 * 242 + 80 * 252 + 90 * 262,
6137
0
    100 * 212 + 110 * 222 + 120 * 232, 100 * 242 + 110 * 252 + 120 * 262,
6138
0
  };
6139
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 4, 2), 0);
6140
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6141
0
  float dbtp[] = {
6142
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
6143
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
6144
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6145
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
6146
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
6147
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6148
0
  };
6149
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6150
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6151
0
  ccv_nnc_tensor_free(g);
6152
0
  ccv_nnc_tensor_free(a);
6153
0
  ccv_nnc_tensor_free(b);
6154
0
  ccv_nnc_tensor_free(h);
6155
0
  ccv_nnc_tensor_free(db);
6156
0
  ccv_nnc_tensor_free(gg);
6157
0
  ccv_nnc_tensor_free(ga);
6158
0
  ccv_nnc_tensor_free(gb);
6159
0
  ccv_nnc_tensor_free(gh);
6160
0
  ccv_nnc_tensor_free(gdb);
6161
0
}
6162
6163
TEST_CASE("backward gemm with transpose a and b batch 2, batched b, no bias")
6164
1
{
6165
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_MPS) &&
6166
1
    ccv_nnc_cmd_ok(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_MPS));
6167
0
  float gp[] = {
6168
0
    1, 2, 3,
6169
0
    4, 5, 6,
6170
0
    7, 8, 9,
6171
0
    10, 11, 12,
6172
0
    10, 20, 30,
6173
0
    40, 50, 60,
6174
0
    70, 80, 90,
6175
0
    100, 110, 120,
6176
0
  };
6177
0
  ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(gp, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
6178
0
  float ap[] = {
6179
0
    13, 15, 17, 19,
6180
0
    14, 16, 18, 20,
6181
0
    131, 151, 171, 191,
6182
0
    141, 161, 181, 201,
6183
0
  };
6184
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(ap, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6185
0
  float bp[] = {
6186
0
    21, 24,
6187
0
    22, 25,
6188
0
    23, 26,
6189
0
    212, 242,
6190
0
    222, 252,
6191
0
    232, 262,
6192
0
  };
6193
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(bp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6194
0
  ccv_nnc_tensor_t* const h = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6195
0
  ccv_nnc_tensor_t* const db = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6196
0
  ccv_nnc_tensor_t* const gg = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 4, 3), 0);
6197
0
  ccv_nnc_tensor_t* const ga = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6198
0
  ccv_nnc_tensor_t* const gb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6199
0
  ccv_nnc_tensor_t* const gh = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 2, 4), 0);
6200
0
  ccv_nnc_tensor_t* const gdb = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 2, 3, 2), 0);
6201
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(g, a, b), TENSOR_LIST(gg, ga, gb), 0);
6202
0
  ccv_nnc_cmd_exec(CMD_GEMM_BACKWARD(TRANSPOSE(1, 2), TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(gg, ga, gb), TENSOR_LIST(gh, gdb), 0);
6203
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(gh, gdb), TENSOR_LIST(h, db), 0);
6204
0
  float htp[] = {
6205
0
    1 * 21 + 2 * 22 + 3 * 23, 4 * 21 + 5 * 22 + 6 * 23, 7 * 21 + 8 * 22 + 9 * 23, 10 * 21 + 11 * 22 + 12 * 23,
6206
0
    1 * 24 + 2 * 25 + 3 * 26, 4 * 24 + 5 * 25 + 6 * 26, 7 * 24 + 8 * 25 + 9 * 26, 10 * 24 + 11 * 25 + 12 * 26,
6207
0
    10 * 212 + 20 * 222 + 30 * 232, 40 * 212 + 50 * 222 + 60 * 232, 70 * 212 + 80 * 222 + 90 * 232, 100 * 212 + 110 * 222 + 120 * 232,
6208
0
    10 * 242 + 20 * 252 + 30 * 262, 40 * 242 + 50 * 252 + 60 * 262, 70 * 242 + 80 * 252 + 90 * 262, 100 * 242 + 110 * 252 + 120 * 262,
6209
0
  };
6210
0
  ccv_nnc_tensor_t ht = ccv_nnc_tensor(htp, CPU_TENSOR_NHWC(32F, 2, 2, 4), 0);
6211
0
  REQUIRE_TENSOR_EQ(h, &ht, "h should be equal");
6212
0
  float dbtp[] = {
6213
0
    1 * 13 + 4 * 15 + 7 * 17 + 10 * 19, 1 * 14 + 4 * 16 + 7 * 18 + 10 * 20,
6214
0
    2 * 13 + 5 * 15 + 8 * 17 + 11 * 19, 2 * 14 + 5 * 16 + 8 * 18 + 11 * 20,
6215
0
    3 * 13 + 6 * 15 + 9 * 17 + 12 * 19, 3 * 14 + 6 * 16 + 9 * 18 + 12 * 20,
6216
0
    10 * 131 + 40 * 151 + 70 * 171 + 100 * 191, 10 * 141 + 40 * 161 + 70 * 181 + 100 * 201,
6217
0
    20 * 131 + 50 * 151 + 80 * 171 + 110 * 191, 20 * 141 + 50 * 161 + 80 * 181 + 110 * 201,
6218
0
    30 * 131 + 60 * 151 + 90 * 171 + 120 * 191, 30 * 141 + 60 * 161 + 90 * 181 + 120 * 201,
6219
0
  };
6220
0
  ccv_nnc_tensor_t dbt = ccv_nnc_tensor(dbtp, CPU_TENSOR_NHWC(32F, 2, 3, 2), 0);
6221
0
  REQUIRE_TENSOR_EQ(db, &dbt, "db should be equal");
6222
0
  ccv_nnc_tensor_free(g);
6223
0
  ccv_nnc_tensor_free(a);
6224
0
  ccv_nnc_tensor_free(b);
6225
0
  ccv_nnc_tensor_free(h);
6226
0
  ccv_nnc_tensor_free(db);
6227
0
  ccv_nnc_tensor_free(gg);
6228
0
  ccv_nnc_tensor_free(ga);
6229
0
  ccv_nnc_tensor_free(gb);
6230
0
  ccv_nnc_tensor_free(gh);
6231
0
  ccv_nnc_tensor_free(gdb);
6232
0
}
6233
6234
TEST_CASE("mps segmented gemm")
6235
1
{
6236
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
6237
0
  dsfmt_t dsfmt;
6238
0
  dsfmt_init_gen_rand(&dsfmt, 11);
6239
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 256), 0);
6240
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
6241
0
  hindices->data.i32[0] = 1;
6242
0
  hindices->data.i32[1] = 0;
6243
0
  hindices->data.i32[2] = 2;
6244
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 3), 0);
6245
0
  hcounts->data.i32[0] = 129;
6246
0
  hcounts->data.i32[1] = 131;
6247
0
  hcounts->data.i32[2] = 124;
6248
0
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 3, 128, 256), 0);
6249
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 128), 0);
6250
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 384, 128), 0);
6251
0
  int i;
6252
0
  for (i = 0; i < 3 * 128 * 256; i++)
6253
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 256;
6254
0
  for (i = 0; i < 384 * 256; i++)
6255
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
6256
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 384, 256), 0);
6257
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
6258
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 3), 0);
6259
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 3, 128, 256), 0);
6260
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32F, 384, 128), 0);
6261
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(a, indices, counts, w), 0);
6262
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w), TENSOR_LIST(b), 0);
6263
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb), 0);
6264
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw), TENSOR_LIST(bt), 0);
6265
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 384 * 128, 3e-4, "segmented GEMM result should match CPU reference");
6266
0
  ccv_nnc_tensor_free(a);
6267
0
  ccv_nnc_tensor_free(indices);
6268
0
  ccv_nnc_tensor_free(counts);
6269
0
  ccv_nnc_tensor_free(w);
6270
0
  ccv_nnc_tensor_free(b);
6271
0
  ccv_nnc_tensor_free(ha);
6272
0
  ccv_nnc_tensor_free(hindices);
6273
0
  ccv_nnc_tensor_free(hcounts);
6274
0
  ccv_nnc_tensor_free(hw);
6275
0
  ccv_nnc_tensor_free(hb);
6276
0
  ccv_nnc_tensor_free(bt);
6277
0
}
6278
6279
TEST_CASE("mps segmented gemm with bias in half precision, split-k")
6280
1
{
6281
1
  GUARD_ELSE_RETURN(ccv_nnc_cmd_ok(CCV_NNC_SEGMENTED_GEMM_FORWARD, CCV_NNC_BACKEND_MPS));
6282
0
  dsfmt_t dsfmt;
6283
0
  dsfmt_init_gen_rand(&dsfmt, 13);
6284
0
  ccv_nnc_tensor_t* const ha = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 4096), 0);
6285
0
  ccv_nnc_tensor_t* const hindices = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
6286
0
  hindices->data.i32[0] = 1;
6287
0
  hindices->data.i32[1] = 0;
6288
0
  ccv_nnc_tensor_t* const hcounts = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2), 0);
6289
0
  hcounts->data.i32[0] = 136;
6290
0
  hcounts->data.i32[1] = 136;
6291
0
  ccv_nnc_tensor_t* const hw = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 128, 4096), 0);
6292
0
  ccv_nnc_tensor_t* const hbias = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 128), 0);
6293
0
  ccv_nnc_tensor_t* const hb = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 128), 0);
6294
0
  ccv_nnc_tensor_t* const hb16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 272, 128), 0);
6295
0
  ccv_nnc_tensor_t* const bt = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 272, 128), 0);
6296
0
  int i;
6297
0
  for (i = 0; i < 2 * 128 * 4096; i++)
6298
0
    hw->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 4096;
6299
0
  for (i = 0; i < 2 * 128; i++)
6300
0
    hbias->data.f32[i] = dsfmt_genrand_open_close(&dsfmt) / 128;
6301
0
  for (i = 0; i < 272 * 4096; i++)
6302
0
    ha->data.f32[i] = dsfmt_genrand_open_close(&dsfmt);
6303
0
  ccv_nnc_tensor_t* const ha16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 272, 4096), 0);
6304
0
  ccv_nnc_tensor_t* const hw16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 128, 4096), 0);
6305
0
  ccv_nnc_tensor_t* const hbias16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 2, 128), 0);
6306
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hw, hbias), TENSOR_LIST(ha16, hw16, hbias16), 0);
6307
0
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 272, 4096), 0);
6308
0
  ccv_nnc_tensor_t* const indices = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2), 0);
6309
0
  ccv_nnc_tensor_t* const counts = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 32S, 2), 0);
6310
0
  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 2, 128, 4096), 0);
6311
0
  ccv_nnc_tensor_t* const bias = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 2, 128), 0);
6312
0
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, GPU_TENSOR_NHWC(000, 16F, 272, 128), 0);
6313
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(ha16, hindices, hcounts, hw16, hbias16), TENSOR_LIST(a, indices, counts, w, bias), 0);
6314
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(a, indices, counts, w, bias), TENSOR_LIST(b), 0);
6315
0
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(b), TENSOR_LIST(hb16), 0);
6316
0
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(hb16), TENSOR_LIST(hb), 0);
6317
0
  ccv_nnc_cmd_exec(CMD_SEGMENTED_GEMM_FORWARD(NO_TRANSPOSE, TRANSPOSE(1, 2)), ccv_nnc_no_hint, 0, TENSOR_LIST(ha, hindices, hcounts, hw, hbias), TENSOR_LIST(bt), 0);
6318
0
  REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, hb->data.f32, bt->data.f32, 272 * 128, 2e-2, "half-precision segmented GEMM result should match CPU reference");
6319
0
  ccv_nnc_tensor_free(a);
6320
0
  ccv_nnc_tensor_free(indices);
6321
0
  ccv_nnc_tensor_free(counts);
6322
0
  ccv_nnc_tensor_free(w);
6323
0
  ccv_nnc_tensor_free(bias);
6324
0
  ccv_nnc_tensor_free(b);
6325
0
  ccv_nnc_tensor_free(ha);
6326
0
  ccv_nnc_tensor_free(hindices);
6327
0
  ccv_nnc_tensor_free(hcounts);
6328
0
  ccv_nnc_tensor_free(hw);
6329
0
  ccv_nnc_tensor_free(hbias);
6330
0
  ccv_nnc_tensor_free(hb);
6331
0
  ccv_nnc_tensor_free(hb16);
6332
0
  ccv_nnc_tensor_free(bt);
6333
0
  ccv_nnc_tensor_free(ha16);
6334
0
  ccv_nnc_tensor_free(hw16);
6335
0
  ccv_nnc_tensor_free(hbias16);
6336
0
}
6337
6338
// Derived from shapes.txt NA lines, assuming the call shape is C = A @ B^T.
6339
1
NA_GEMM_SHAPE_TEST(306, 2048, 3840)
6340
1
NA_GEMM_SHAPE_TEST(306, 4096, 3840)
6341
1
NA_GEMM_SHAPE_TEST(306, 3840, 4096)
6342
1
NA_GEMM_SHAPE_TEST(306, 15360, 3840)
6343
1
NA_GEMM_SHAPE_TEST(306, 3840, 15360)
6344
1
NA_GEMM_SHAPE_TEST(1024, 4096, 4096)
6345
1
NA_GEMM_SHAPE_TEST(1024, 32, 4096)
6346
1
NA_GEMM_SHAPE_TEST(1024, 16384, 4096)
6347
1
NA_GEMM_SHAPE_TEST(1024, 4096, 16384)
6348
1
NA_GEMM_SHAPE_TEST(1024, 2048, 2048)
6349
1
NA_GEMM_SHAPE_TEST(1024, 32, 2048)
6350
1
NA_GEMM_SHAPE_TEST(1024, 8192, 2048)
6351
1
NA_GEMM_SHAPE_TEST(1024, 2048, 8192)
6352
1
NA_GEMM_SHAPE_TEST(1, 2048, 256)
6353
1
NA_GEMM_SHAPE_TEST(1, 2048, 2048)
6354
1
NA_GEMM_SHAPE_TEST(1, 4096, 256)
6355
1
NA_GEMM_SHAPE_TEST(1, 4096, 4096)
6356
1
NA_GEMM_SHAPE_TEST(1024, 4096, 128)
6357
1
NA_GEMM_SHAPE_TEST(257, 2048, 128)
6358
1
NA_GEMM_SHAPE_TEST(33792, 4096, 4096)
6359
1
NA_GEMM_SHAPE_TEST(33792, 32, 4096)
6360
1
NA_GEMM_SHAPE_TEST(257, 2048, 2048)
6361
1
NA_GEMM_SHAPE_TEST(257, 32, 2048)
6362
1
NA_GEMM_SHAPE_TEST(33792, 2048, 4096)
6363
1
NA_GEMM_SHAPE_TEST(33792, 4096, 2048)
6364
1
NA_GEMM_SHAPE_TEST(33792, 16384, 4096)
6365
1
NA_GEMM_SHAPE_TEST(33792, 4096, 16384)
6366
1
NA_GEMM_SHAPE_TEST(257, 8192, 2048)
6367
1
NA_GEMM_SHAPE_TEST(257, 2048, 8192)
6368
1
NA_GEMM_SHAPE_TEST(33792, 128, 4096)
6369
1
NA_GEMM_SHAPE_TEST(257, 128, 2048)
6370
1
NA_GEMM_BIAS_SHAPE_TEST(306, 2048, 3840)
6371
1
NA_GEMM_BIAS_SHAPE_TEST(306, 4096, 3840)
6372
1
NA_GEMM_BIAS_SHAPE_TEST(306, 3840, 4096)
6373
1
NA_GEMM_BIAS_SHAPE_TEST(306, 15360, 3840)
6374
1
NA_GEMM_BIAS_SHAPE_TEST(306, 3840, 15360)
6375
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 4096)
6376
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 32, 4096)
6377
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 16384, 4096)
6378
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 16384)
6379
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 2048, 2048)
6380
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 32, 2048)
6381
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 8192, 2048)
6382
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 2048, 8192)
6383
1
NA_GEMM_BIAS_SHAPE_TEST(1, 2048, 256)
6384
1
NA_GEMM_BIAS_SHAPE_TEST(1, 2048, 2048)
6385
1
NA_GEMM_BIAS_SHAPE_TEST(1, 4096, 256)
6386
1
NA_GEMM_BIAS_SHAPE_TEST(1, 4096, 4096)
6387
1
NA_GEMM_BIAS_SHAPE_TEST(1024, 4096, 128)
6388
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 128)
6389
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 4096)
6390
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 32, 4096)
6391
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 2048)
6392
1
NA_GEMM_BIAS_SHAPE_TEST(257, 32, 2048)
6393
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 2048, 4096)
6394
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 2048)
6395
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 16384, 4096)
6396
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 4096, 16384)
6397
1
NA_GEMM_BIAS_SHAPE_TEST(257, 8192, 2048)
6398
1
NA_GEMM_BIAS_SHAPE_TEST(257, 2048, 8192)
6399
1
NA_GEMM_BIAS_SHAPE_TEST(33792, 128, 4096)
6400
NA_GEMM_BIAS_SHAPE_TEST(257, 128, 2048)
6401
6402
#include "case_main.h"