scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu

Bug Summary

File:	nnc/cmd/scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c
Warning:	line 264, column 28: Array access (from variable 'amp2') results in a null pointer dereference
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_scaled_dot_product_attention_cpu_ref.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd -resource-dir /usr/local/lib/clang/19 -I ../../ -I .. 
-I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-28-211646-3113523-1 -x c scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c
1#include "ccv.h"
2#include "ccv_internal.h"
3#include "nnc/ccv_nnc.h"
4#include "nnc/ccv_nnc_easy.h"
5#include "nnc/ccv_nnc_internal.h"
6#ifdef USE_OPENMP
7#include <omp.h>
8#endif
9#ifdef USE_DISPATCH
10#include <dispatch/dispatch.h>
11#endif
12 
13// Shared methods.
14#include "../_ccv_nnc_cpu_ref.h"
15 
16static int _ccv_nnc_scaled_dot_product_attention_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
17{
18	assert(input_size >= 3)((void) sizeof ((input_size >= 3) ? 1 : 0), __extension__ (
{ if (input_size >= 3) ; else __assert_fail ("input_size >= 3"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 18, __extension__ __PRETTY_FUNCTION__); }));
1
Assuming 'input_size' is >= 3→
2
←
Taking true branch→
19	assert(output_size >= 1)((void) sizeof ((output_size >= 1) ? 1 : 0), __extension__
 ({ if (output_size >= 1) ; else __assert_fail ("output_size >= 1"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 19, __extension__ __PRETTY_FUNCTION__); }));
3
←
Assuming 'output_size' is >= 1→
4
←
Taking true branch→
20	const int is_varlen = cmd.info.scaled_dot_product_attention.is_varlen;
21	ccv_nnc_tensor_view_t* const q = (ccv_nnc_tensor_view_t*)inputs[0];
22	ccv_nnc_tensor_view_t* const k = (ccv_nnc_tensor_view_t*)inputs[1];
23	ccv_nnc_tensor_view_t* const v = (ccv_nnc_tensor_view_t*)inputs[2];
24	ccv_nnc_tensor_view_t* const attn_mask = input_size > 3 ? (ccv_nnc_tensor_view_t*)inputs[3] : 0;
5
←
Assuming 'input_size' is > 3→
6
←
'?' condition is true→
25	ccv_nnc_tensor_view_t* const w = input_size > 4 ? (ccv_nnc_tensor_view_t*)inputs[4] : 0;
7
←
Assuming 'input_size' is > 4→
8
←
'?' condition is true→
26	ccv_nnc_tensor_view_t* const bias = input_size > 5 ? (ccv_nnc_tensor_view_t*)inputs[5] : 0;
9
←
Assuming 'input_size' is > 5→
10
←
'?' condition is true→
27	ccv_nnc_tensor_view_t* const q_seq_offsets = is_varlen && input_size > 6 ? (ccv_nnc_tensor_view_t*)inputs[6] : 0;
11
←
Assuming 'is_varlen' is 0→
28	ccv_nnc_tensor_view_t* const kv_seq_offsets = is_varlen11.1
'is_varlen' is 0
 && input_size > 7 ? (ccv_nnc_tensor_view_t*)inputs[7] : 0;
29	const int attention_sinks = cmd.info.scaled_dot_product_attention.attention_sinks;
30	const int sliding_window = cmd.info.scaled_dot_product_attention.sliding_window;
31	ccv_nnc_tensor_view_t* const sinks = attention_sinks && input_size > 8 ? (ccv_nnc_tensor_view_t*)inputs[8] : 0;
12
←
Assuming 'attention_sinks' is not equal to 0→
13
←
Assuming 'input_size' is > 8→
14
←
'?' condition is true→
32	if (bias) // bias always requires a weight matrix.
15
←
Assuming 'bias' is null→
33		{ assert(w)((void) sizeof ((w) ? 1 : 0), __extension__ ({ if (w) ; else __assert_fail
 ("w", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 33, __extension__ __PRETTY_FUNCTION__); })); }
34	if (sliding_window < 0 || (sliding_window > 0 && (!cmd.info.scaled_dot_product_attention.is_causal || is_varlen)))
16
←
Assuming 'sliding_window' is >= 0→
17
←
Assuming 'sliding_window' is <= 0→
35		return CCV_NNC_EXEC_INVALID;
36	if (is_varlen17.1
'is_varlen' is 0
 && (attn_mask || w || bias || !q_seq_offsets || !kv_seq_offsets))
37		return CCV_NNC_EXEC_INVALID;
38	if (attention_sinks17.2
'attention_sinks' is not equal to 0
 && !sinks)
18
←
Assuming 'sinks' is non-null→
39		return CCV_NNC_EXEC_INVALID;
40	ccv_nnc_tensor_view_t* const c = (w) ? (ccv_nnc_tensor_view_t*)outputs[2] : (ccv_nnc_tensor_view_t*)outputs[0];
19
←
Taking false branch→
20
←
Assuming 'w' is non-null→
21
←
'?' condition is true→
41	const int q_nd = ccv_nnc_tensor_nd(q->info.dim);
42	assert(q_nd == 3 || q_nd == 4)((void) sizeof ((q_nd == 3 || q_nd == 4) ? 1 : 0), __extension__
 ({ if (q_nd == 3 || q_nd == 4) ; else __assert_fail ("q_nd == 3 || q_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 42, __extension__ __PRETTY_FUNCTION__); }));
22
←
Assuming 'q_nd' is not equal to 3→
23
←
Assuming 'q_nd' is equal to 4→
24
←
Taking true branch→
43	const int k_nd = ccv_nnc_tensor_nd(k->info.dim);
44	assert(k_nd == 3 || k_nd == 4)((void) sizeof ((k_nd == 3 || k_nd == 4) ? 1 : 0), __extension__
 ({ if (k_nd == 3 || k_nd == 4) ; else __assert_fail ("k_nd == 3 || k_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 44, __extension__ __PRETTY_FUNCTION__); }));
25
←
Assuming 'k_nd' is not equal to 3→
26
←
Assuming 'k_nd' is equal to 4→
27
←
Taking true branch→
45	const int v_nd = ccv_nnc_tensor_nd(v->info.dim);
46	assert(v_nd == 3 || v_nd == 4)((void) sizeof ((v_nd == 3 || v_nd == 4) ? 1 : 0), __extension__
 ({ if (v_nd == 3 || v_nd == 4) ; else __assert_fail ("v_nd == 3 || v_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 46, __extension__ __PRETTY_FUNCTION__); }));
28
←
Assuming 'v_nd' is not equal to 3→
29
←
Assuming 'v_nd' is equal to 4→
30
←
Taking true branch→
47	const int c_nd = ccv_nnc_tensor_nd(c->info.dim);
48	assert(c_nd == 3 || c_nd == 4)((void) sizeof ((c_nd == 3 || c_nd == 4) ? 1 : 0), __extension__
 ({ if (c_nd == 3 || c_nd == 4) ; else __assert_fail ("c_nd == 3 || c_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 48, __extension__ __PRETTY_FUNCTION__); }));
31
←
Assuming 'c_nd' is not equal to 3→
32
←
Assuming 'c_nd' is equal to 4→
33
←
Taking true branch→
49	assert(q_nd == k_nd && k_nd == v_nd && v_nd == c_nd)((void) sizeof ((q_nd == k_nd && k_nd == v_nd &&
 v_nd == c_nd) ? 1 : 0), __extension__ ({ if (q_nd == k_nd &&
 k_nd == v_nd && v_nd == c_nd) ; else __assert_fail (
"q_nd == k_nd && k_nd == v_nd && v_nd == c_nd"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 49, __extension__ __PRETTY_FUNCTION__); }));
34
←
Taking true branch→
50	if (is_varlen34.1
'is_varlen' is 0
 && q_nd != 4)
51		return CCV_NNC_EXEC_INVALID;
52	// Assuming this is float 32.
53	int qdim[CCV_NNC_MAX_DIM_ALLOC(12)];
54	int kdim[CCV_NNC_MAX_DIM_ALLOC(12)];
55	int vdim[CCV_NNC_MAX_DIM_ALLOC(12)];
56	int cdim[CCV_NNC_MAX_DIM_ALLOC(12)];
57	int amdim[CCV_NNC_MAX_DIM_ALLOC(12)];
58	ccv_nnc_tensor_view_get_dim(q, qdim);
59	ccv_nnc_tensor_view_get_dim(k, kdim);
60	ccv_nnc_tensor_view_get_dim(v, vdim);
61	ccv_nnc_tensor_view_get_dim(c, cdim);
62	if (is_varlen34.2
'is_varlen' is 0
)
35
←
Taking false branch→
63	{
64		assert(q_seq_offsets->info.datatype == CCV_32S)((void) sizeof ((q_seq_offsets->info.datatype == CCV_32S) ?
 1 : 0), __extension__ ({ if (q_seq_offsets->info.datatype
 == CCV_32S) ; else __assert_fail ("q_seq_offsets->info.datatype == CCV_32S"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 64, __extension__ __PRETTY_FUNCTION__); }));
65		assert(kv_seq_offsets->info.datatype == CCV_32S)((void) sizeof ((kv_seq_offsets->info.datatype == CCV_32S)
 ? 1 : 0), __extension__ ({ if (kv_seq_offsets->info.datatype
 == CCV_32S) ; else __assert_fail ("kv_seq_offsets->info.datatype == CCV_32S"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 65, __extension__ __PRETTY_FUNCTION__); }));
66		assert(CCV_IS_TENSOR_CONTIGUOUS(q_seq_offsets))((void) sizeof (((!((*(int*)(q_seq_offsets)) & CCV_TENSOR_VIEW
) || (((ccv_nnc_tensor_view_t*)q_seq_offsets)->contiguous ==
 1))) ? 1 : 0), __extension__ ({ if ((!((*(int*)(q_seq_offsets
)) & CCV_TENSOR_VIEW) || (((ccv_nnc_tensor_view_t*)q_seq_offsets
)->contiguous == 1))) ; else __assert_fail ("CCV_IS_TENSOR_CONTIGUOUS(q_seq_offsets)"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 66, __extension__ __PRETTY_FUNCTION__); }));
67		assert(CCV_IS_TENSOR_CONTIGUOUS(kv_seq_offsets))((void) sizeof (((!((*(int*)(kv_seq_offsets)) & CCV_TENSOR_VIEW
) || (((ccv_nnc_tensor_view_t*)kv_seq_offsets)->contiguous
 == 1))) ? 1 : 0), __extension__ ({ if ((!((*(int*)(kv_seq_offsets
)) & CCV_TENSOR_VIEW) || (((ccv_nnc_tensor_view_t*)kv_seq_offsets
)->contiguous == 1))) ; else __assert_fail ("CCV_IS_TENSOR_CONTIGUOUS(kv_seq_offsets)"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 67, __extension__ __PRETTY_FUNCTION__); }));
68	}
69	if (q_nd35.1
'q_nd' is not equal to 3
 == 3)
36
←
Taking false branch→
70	{
71		qdim[0] = qdim[1], qdim[1] = qdim[2], qdim[2] = 1;
72		kdim[0] = kdim[1], kdim[1] = kdim[2], kdim[2] = 1;
73		vdim[0] = vdim[1], vdim[1] = vdim[2], vdim[2] = 1;
74		cdim[0] = cdim[1], cdim[1] = cdim[2], cdim[2] = 1;
75	}
76	assert(qdim[0] == kdim[0] && kdim[0] == vdim[0] && vdim[0] == cdim[0])((void) sizeof ((qdim[0] == kdim[0] && kdim[0] == vdim
[0] && vdim[0] == cdim[0]) ? 1 : 0), __extension__ ({
 if (qdim[0] == kdim[0] && kdim[0] == vdim[0] &&
 vdim[0] == cdim[0]) ; else __assert_fail ("qdim[0] == kdim[0] && kdim[0] == vdim[0] && vdim[0] == cdim[0]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 76, __extension__ __PRETTY_FUNCTION__); }));
37
←
Assuming the condition is true→
38
←
Assuming the condition is true→
39
←
Assuming the condition is true→
40
←
Taking true branch→
77	assert(qdim[2] == cdim[2])((void) sizeof ((qdim[2] == cdim[2]) ? 1 : 0), __extension__ (
{ if (qdim[2] == cdim[2]) ; else __assert_fail ("qdim[2] == cdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 77, __extension__ __PRETTY_FUNCTION__); }));
41
←
Assuming the condition is true→
42
←
Taking true branch→
78	assert(kdim[2] == vdim[2])((void) sizeof ((kdim[2] == vdim[2]) ? 1 : 0), __extension__ (
{ if (kdim[2] == vdim[2]) ; else __assert_fail ("kdim[2] == vdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 78, __extension__ __PRETTY_FUNCTION__); }));
43
←
Assuming the condition is true→
44
←
Taking true branch→
79	assert(qdim[2] % kdim[2] == 0)((void) sizeof ((qdim[2] % kdim[2] == 0) ? 1 : 0), __extension__
 ({ if (qdim[2] % kdim[2] == 0) ; else __assert_fail ("qdim[2] % kdim[2] == 0"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 79, __extension__ __PRETTY_FUNCTION__); }));
45
←
Assuming the condition is true→
46
←
Taking true branch→
80	assert(qdim[2] >= kdim[2])((void) sizeof ((qdim[2] >= kdim[2]) ? 1 : 0), __extension__
 ({ if (qdim[2] >= kdim[2]) ; else __assert_fail ("qdim[2] >= kdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 80, __extension__ __PRETTY_FUNCTION__); }));
47
←
Assuming the condition is true→
48
←
Taking true branch→
81	assert(qdim[3] == kdim[3])((void) sizeof ((qdim[3] == kdim[3]) ? 1 : 0), __extension__ (
{ if (qdim[3] == kdim[3]) ; else __assert_fail ("qdim[3] == kdim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 81, __extension__ __PRETTY_FUNCTION__); }));
49
←
Assuming the condition is true→
50
←
Taking true branch→
82	assert(kdim[1] == vdim[1])((void) sizeof ((kdim[1] == vdim[1]) ? 1 : 0), __extension__ (
{ if (kdim[1] == vdim[1]) ; else __assert_fail ("kdim[1] == vdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 82, __extension__ __PRETTY_FUNCTION__); }));
51
←
Assuming the condition is true→
52
←
Taking true branch→
83	assert(cdim[1] == qdim[1])((void) sizeof ((cdim[1] == qdim[1]) ? 1 : 0), __extension__ (
{ if (cdim[1] == qdim[1]) ; else __assert_fail ("cdim[1] == qdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 83, __extension__ __PRETTY_FUNCTION__); }));
53
←
Assuming the condition is true→
54
←
Taking true branch→
84	assert(cdim[3] == vdim[3])((void) sizeof ((cdim[3] == vdim[3]) ? 1 : 0), __extension__ (
{ if (cdim[3] == vdim[3]) ; else __assert_fail ("cdim[3] == vdim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 84, __extension__ __PRETTY_FUNCTION__); }));
55
←
Assuming the condition is true→
56
←
Taking true branch→
85	const float* const sinkp = sinks56.1
'sinks' is non-null
 ? sinks->data.f32 : 0;
57
←
'?' condition is true→
86	int sink_head_stride = 0;
87	if (attention_sinks57.1
'attention_sinks' is not equal to 0
)
58
←
Taking true branch→
88	{
89		assert(CCV_IS_TENSOR_CONTIGUOUS(sinks))((void) sizeof (((!((*(int*)(sinks)) & CCV_TENSOR_VIEW) ||
 (((ccv_nnc_tensor_view_t*)sinks)->contiguous == 1))) ? 1 :
 0), __extension__ ({ if ((!((*(int*)(sinks)) & CCV_TENSOR_VIEW
) || (((ccv_nnc_tensor_view_t*)sinks)->contiguous == 1))) ;
 else __assert_fail ("CCV_IS_TENSOR_CONTIGUOUS(sinks)", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 89, __extension__ __PRETTY_FUNCTION__); }));
59
←
Assuming the condition is true→
90		const int sink_count = ccv_nnc_tensor_count(sinks->info);
91		assert(sink_count == 1 || sink_count == qdim[2])((void) sizeof ((sink_count == 1 || sink_count == qdim[2]) ? 1
 : 0), __extension__ ({ if (sink_count == 1 || sink_count == qdim
[2]) ; else __assert_fail ("sink_count == 1 || sink_count == qdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 91, __extension__ __PRETTY_FUNCTION__); }));
60
←
Assuming 'sink_count' is equal to 1→
92		sink_head_stride = (sink_count60.1
'sink_count' is equal to 1
 == 1) ? 0 : 1;
61
←
'?' condition is true→
93	}
94	assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 94, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
62
←
Taking true branch→
95	int qstride[CCV_NNC_MAX_DIM_ALLOC(12)];
96	int kstride[CCV_NNC_MAX_DIM_ALLOC(12)];
97	int vstride[CCV_NNC_MAX_DIM_ALLOC(12)];
98	int cstride[CCV_NNC_MAX_DIM_ALLOC(12)];
99	int amstride[CCV_NNC_MAX_DIM_ALLOC(12)];
100	ccv_nnc_tensor_view_get_stride(q, qstride);
101	ccv_nnc_tensor_view_get_stride(k, kstride);
102	ccv_nnc_tensor_view_get_stride(v, vstride);
103	ccv_nnc_tensor_view_get_stride(c, cstride);
104	if (q_nd62.1
'q_nd' is not equal to 3
 == 3)
63
←
Taking false branch→
105	{
106		qstride[0] = qstride[1], qstride[1] = qstride[2], qstride[2] = qstride[3];
107		kstride[0] = kstride[1], kstride[1] = kstride[2], kstride[2] = kstride[3];
108		vstride[0] = vstride[1], vstride[1] = vstride[2], vstride[2] = vstride[3];
109		cstride[0] = cstride[1], cstride[1] = cstride[2], cstride[2] = cstride[3];
110	}
111	if (attn_mask)
64
←
Assuming 'attn_mask' is non-null→
65
←
Taking true branch→
112	{
113		ccv_nnc_tensor_view_get_dim(attn_mask, amdim);
114		ccv_nnc_tensor_view_get_stride(attn_mask, amstride);
115		assert(amdim[0] == qdim[0] || amdim[0] == 1)((void) sizeof ((amdim[0] == qdim[0] || amdim[0] == 1) ? 1 : 0
), __extension__ ({ if (amdim[0] == qdim[0] || amdim[0] == 1)
 ; else __assert_fail ("amdim[0] == qdim[0] || amdim[0] == 1"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 115, __extension__ __PRETTY_FUNCTION__); }));
66
←
Assuming the condition is true→
116		assert(amdim[1] == qdim[2] || amdim[1] == 1)((void) sizeof ((amdim[1] == qdim[2] || amdim[1] == 1) ? 1 : 0
), __extension__ ({ if (amdim[1] == qdim[2] || amdim[1] == 1)
 ; else __assert_fail ("amdim[1] == qdim[2] || amdim[1] == 1"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 116, __extension__ __PRETTY_FUNCTION__); }));
67
←
Assuming the condition is true→
117		assert(amdim[2] == qdim[1])((void) sizeof ((amdim[2] == qdim[1]) ? 1 : 0), __extension__
 ({ if (amdim[2] == qdim[1]) ; else __assert_fail ("amdim[2] == qdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 117, __extension__ __PRETTY_FUNCTION__); }));
68
←
Assuming the condition is true→
69
←
Taking true branch→
118		assert(amdim[3] == kdim[1])((void) sizeof ((amdim[3] == kdim[1]) ? 1 : 0), __extension__
 ({ if (amdim[3] == kdim[1]) ; else __assert_fail ("amdim[3] == kdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 118, __extension__ __PRETTY_FUNCTION__); }));
70
←
Assuming the condition is true→
71
←
Taking true branch→
119	}
120	int i[CCV_NNC_MAX_DIM(2) + 2];
121	float* qk = ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * qdim[1] * kdim[1], CCV_TENSOR_CPU_MEMORY);
122	const float* const qp = q->data.f32;
123	const float* const kp = k->data.f32;
124	const float* const vp = v->data.f32;
125	const float* const amp = attn_mask71.1
'attn_mask' is non-null
 ? attn_mask->data.f32 : 0;
72
←
'?' condition is true→
126	float* const cp = c->data.f32;
127	const float scale = cmd.info.scaled_dot_product_attention.scale;
128	const int is_causal = cmd.info.scaled_dot_product_attention.is_causal;
129	const int h_h_k_ratio = qdim[2] / kdim[2];
130	assert(kdim[2] == vdim[2])((void) sizeof ((kdim[2] == vdim[2]) ? 1 : 0), __extension__ (
{ if (kdim[2] == vdim[2]) ; else __assert_fail ("kdim[2] == vdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 130, __extension__ __PRETTY_FUNCTION__); }));
73
←
Taking true branch→
131	assert(qdim[2] >= kdim[2])((void) sizeof ((qdim[2] >= kdim[2]) ? 1 : 0), __extension__
 ({ if (qdim[2] >= kdim[2]) ; else __assert_fail ("qdim[2] >= kdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 131, __extension__ __PRETTY_FUNCTION__); }));
74
←
Taking true branch→
132	assert(qdim[2] % kdim[2] == 0)((void) sizeof ((qdim[2] % kdim[2] == 0) ? 1 : 0), __extension__
 ({ if (qdim[2] % kdim[2] == 0) ; else __assert_fail ("qdim[2] % kdim[2] == 0"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 132, __extension__ __PRETTY_FUNCTION__); }));
75
←
Taking true branch→
133	if (is_varlen75.1
'is_varlen' is 0
)
76
←
Taking false branch→
134	{
135		const int batch_size = ccv_nnc_tensor_count(q_seq_offsets->info) - 1;
136		assert(batch_size > 0)((void) sizeof ((batch_size > 0) ? 1 : 0), __extension__ (
{ if (batch_size > 0) ; else __assert_fail ("batch_size > 0"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 136, __extension__ __PRETTY_FUNCTION__); }));
137		assert(ccv_nnc_tensor_count(kv_seq_offsets->info) == batch_size + 1)((void) sizeof ((ccv_nnc_tensor_count(kv_seq_offsets->info
) == batch_size + 1) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_count
(kv_seq_offsets->info) == batch_size + 1) ; else __assert_fail
 ("ccv_nnc_tensor_count(kv_seq_offsets->info) == batch_size + 1"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 137, __extension__ __PRETTY_FUNCTION__); }));
138		assert(qdim[0] == 1)((void) sizeof ((qdim[0] == 1) ? 1 : 0), __extension__ ({ if (
qdim[0] == 1) ; else __assert_fail ("qdim[0] == 1", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 138, __extension__ __PRETTY_FUNCTION__); }));
139		assert(kdim[0] == 1)((void) sizeof ((kdim[0] == 1) ? 1 : 0), __extension__ ({ if (
kdim[0] == 1) ; else __assert_fail ("kdim[0] == 1", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 139, __extension__ __PRETTY_FUNCTION__); }));
140		assert(vdim[0] == 1)((void) sizeof ((vdim[0] == 1) ? 1 : 0), __extension__ ({ if (
vdim[0] == 1) ; else __assert_fail ("vdim[0] == 1", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 140, __extension__ __PRETTY_FUNCTION__); }));
141		assert(cdim[0] == 1)((void) sizeof ((cdim[0] == 1) ? 1 : 0), __extension__ ({ if (
cdim[0] == 1) ; else __assert_fail ("cdim[0] == 1", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 141, __extension__ __PRETTY_FUNCTION__); }));
142		assert(cdim[1] == qdim[1])((void) sizeof ((cdim[1] == qdim[1]) ? 1 : 0), __extension__ (
{ if (cdim[1] == qdim[1]) ; else __assert_fail ("cdim[1] == qdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 142, __extension__ __PRETTY_FUNCTION__); }));
143		assert(cdim[2] == qdim[2])((void) sizeof ((cdim[2] == qdim[2]) ? 1 : 0), __extension__ (
{ if (cdim[2] == qdim[2]) ; else __assert_fail ("cdim[2] == qdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 143, __extension__ __PRETTY_FUNCTION__); }));
144		assert(cdim[3] == vdim[3])((void) sizeof ((cdim[3] == vdim[3]) ? 1 : 0), __extension__ (
{ if (cdim[3] == vdim[3]) ; else __assert_fail ("cdim[3] == vdim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 144, __extension__ __PRETTY_FUNCTION__); }));
145		assert(kdim[1] == vdim[1])((void) sizeof ((kdim[1] == vdim[1]) ? 1 : 0), __extension__ (
{ if (kdim[1] == vdim[1]) ; else __assert_fail ("kdim[1] == vdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 145, __extension__ __PRETTY_FUNCTION__); }));
146		const int* const q_offset = q_seq_offsets->data.i32;
147		const int* const kv_offset = kv_seq_offsets->data.i32;
148		assert(q_offset[0] == 0)((void) sizeof ((q_offset[0] == 0) ? 1 : 0), __extension__ ({
 if (q_offset[0] == 0) ; else __assert_fail ("q_offset[0] == 0"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 148, __extension__ __PRETTY_FUNCTION__); }));
149		assert(kv_offset[0] == 0)((void) sizeof ((kv_offset[0] == 0) ? 1 : 0), __extension__ (
{ if (kv_offset[0] == 0) ; else __assert_fail ("kv_offset[0] == 0"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 149, __extension__ __PRETTY_FUNCTION__); }));
150		assert(q_offset[batch_size] == qdim[1])((void) sizeof ((q_offset[batch_size] == qdim[1]) ? 1 : 0), __extension__
 ({ if (q_offset[batch_size] == qdim[1]) ; else __assert_fail
 ("q_offset[batch_size] == qdim[1]", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 150, __extension__ __PRETTY_FUNCTION__); }));
151		assert(kv_offset[batch_size] == kdim[1])((void) sizeof ((kv_offset[batch_size] == kdim[1]) ? 1 : 0), __extension__
 ({ if (kv_offset[batch_size] == kdim[1]) ; else __assert_fail
 ("kv_offset[batch_size] == kdim[1]", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 151, __extension__ __PRETTY_FUNCTION__); }));
152		for (i[0] = 0; i[0] < batch_size; i[0]++)
153		{
154			const int q_start = q_offset[i[0]];
155			const int q_end = q_offset[i[0] + 1];
156			const int k_start = kv_offset[i[0]];
157			const int k_end = kv_offset[i[0] + 1];
158			assert(q_start <= q_end)((void) sizeof ((q_start <= q_end) ? 1 : 0), __extension__
 ({ if (q_start <= q_end) ; else __assert_fail ("q_start <= q_end"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 158, __extension__ __PRETTY_FUNCTION__); }));
159			assert(k_start <= k_end)((void) sizeof ((k_start <= k_end) ? 1 : 0), __extension__
 ({ if (k_start <= k_end) ; else __assert_fail ("k_start <= k_end"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 159, __extension__ __PRETTY_FUNCTION__); }));
160			const int R = q_end - q_start;
161			const int K = k_end - k_start;
162			assert(R > 0)((void) sizeof ((R > 0) ? 1 : 0), __extension__ ({ if (R >
 0) ; else __assert_fail ("R > 0", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 162, __extension__ __PRETTY_FUNCTION__); }));
163			assert(K > 0)((void) sizeof ((K > 0) ? 1 : 0), __extension__ ({ if (K >
 0) ; else __assert_fail ("K > 0", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 163, __extension__ __PRETTY_FUNCTION__); }));
164			assert(R <= cmd.info.scaled_dot_product_attention.max_seqlen_q)((void) sizeof ((R <= cmd.info.scaled_dot_product_attention
.max_seqlen_q) ? 1 : 0), __extension__ ({ if (R <= cmd.info
.scaled_dot_product_attention.max_seqlen_q) ; else __assert_fail
 ("R <= cmd.info.scaled_dot_product_attention.max_seqlen_q"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 164, __extension__ __PRETTY_FUNCTION__); }));
165			assert(K <= cmd.info.scaled_dot_product_attention.max_seqlen_kv)((void) sizeof ((K <= cmd.info.scaled_dot_product_attention
.max_seqlen_kv) ? 1 : 0), __extension__ ({ if (K <= cmd.info
.scaled_dot_product_attention.max_seqlen_kv) ; else __assert_fail
 ("K <= cmd.info.scaled_dot_product_attention.max_seqlen_kv"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 165, __extension__ __PRETTY_FUNCTION__); }));
166			const float* const qp0 = qp + q_start * qstride[1];
167			const float* const kp0 = kp + k_start * kstride[1];
168			const float* const vp0 = vp + k_start * vstride[1];
169			float* const cp0 = cp + q_start * cstride[1];
170			for (i[1] = 0; i[1] < qdim[2]; i[1]++)
171			{
172				const float* const qp1 = qp0 + i[1] * qstride[2];
173				const float* const kp1 = kp0 + (i[1] / h_h_k_ratio) * kstride[2];
174				const float* const vp1 = vp0 + (i[1] / h_h_k_ratio) * vstride[2];
175				float* const cp1 = cp0 + i[1] * cstride[2];
176				const double sink = attention_sinks ? (double)sinkp[i[1] * sink_head_stride] : 0;
177				parallel_for(x, R){ int x; for ((x) = 0; (x) < (R); (x)++) { {
178					int y, k;
179					const float* const qp2 = qp1 + x * qstride[1];
180					float* const cp2 = cp1 + x * cstride[1];
181					float* const qk0 = qk + x * K;
182					for (y = 0; y < K; y++)
183					{
184						const float* const kp2 = kp1 + y * kstride[1];
185						float v = 0;
186						for (k = 0; k < qdim[3]; k++)
187							v += qp2[k * qstride[3]] * kp2[k * kstride[3]];
188						qk0[y] = scale * v;
189					}
190					if (is_causal)
191					{
192						const int x_end = ccv_max(x - R + K + 1, 0)({ typeof (x - R + K + 1) _a = (x - R + K + 1); typeof (0) _b
 = (0); (_a > _b) ? _a : _b; });
193						const int x_start = sliding_window > 0 ? ccv_max(x_end - sliding_window, 0)({ typeof (x_end - sliding_window) _a = (x_end - sliding_window
); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) : 0;
194						for (y = 0; y < x_start; y++)
195							qk0[y] = 0;
196						for (y = x_end; y < K; y++)
197							qk0[y] = 0;
198						double maxval = attention_sinks ? sink : qk0[x_start];
199						for (y = attention_sinks ? x_start : x_start + 1; y < x_end; y++)
200							if (qk0[y] > maxval)
201								maxval = qk0[y];
202						double sumval = attention_sinks ? expf(sink - maxval) : 0;
203						for (y = x_start; y < x_end; y++)
204							sumval += (qk0[y] = expf(qk0[y] - maxval));
205						sumval = 1.0 / sumval;
206						for (y = x_start; y < x_end; y++)
207							qk0[y] *= sumval;
208					} else {
209						double maxval = attention_sinks ? sink : qk0[0];
210						for (y = attention_sinks ? 0 : 1; y < K; y++)
211							if (qk0[y] > maxval)
212								maxval = qk0[y];
213						double sumval = attention_sinks ? expf(sink - maxval) : 0;
214						for (y = 0; y < K; y++)
215							sumval += (qk0[y] = expf(qk0[y] - maxval));
216						sumval = 1.0 / sumval;
217						for (y = 0; y < K; y++)
218							qk0[y] *= sumval;
219					}
220					for (k = 0; k < vdim[3]; k++)
221						cp2[k * cstride[3]] = 0;
222					for (y = 0; y < K; y++)
223					{
224						const float* const vp2 = vp1 + y * vstride[1];
225						const float v = qk0[y];
226						for (k = 0; k < vdim[3]; k++)
227							cp2[k * cstride[3]] += v * vp2[k * vstride[3]];
228					}
229				} parallel_endfor} }
230			}
231		}
232		return CCV_NNC_EXEC_SUCCESS;
233	}
234	for (i[0] = 0; i[0] < qdim[0]; i[0]++)
77
←
Assuming the condition is true→
78
←
Loop condition is true.  Entering loop body→
235	{
236		const float* const qp0 = qp + i[0] * qstride[0];
237		const float* const kp0 = kp + i[0] * kstride[0];
238		const float* const vp0 = vp + i[0] * vstride[0];
239		const float* const amp0 = amp && amdim[0] > 1 ? amp + i[0] * amstride[0] : amp;
79
←
Assuming 'amp' is null→
240		float* const cp0 = cp + i[0] * cstride[0];
241		for (i[1] = 0; i[1] < qdim[2]; i[1]++)
80
←
Assuming the condition is true→
81
←
Loop condition is true.  Entering loop body→
242		{
243			const float* const qp1 = qp0 + i[1] * qstride[2];
244			const float* const kp1 = kp0 + (i[1] / h_h_k_ratio) * kstride[2];
245			const float* const vp1 = vp0 + (i[1] / h_h_k_ratio) * vstride[2];
246			const float* const amp1 = amp81.1
'amp' is null
 && amdim[1] > 1 ? amp0 + i[1] * amstride[1] : amp0;
247			float* const cp1 = cp0 + i[1] * cstride[2];
248			const double sink = attention_sinks81.2
'attention_sinks' is not equal to 0
 ? (double)sinkp[i[1] * sink_head_stride] : 0;
82
←
'?' condition is true→
249			// Compute Q @ K^T
250			parallel_for(x, qdim[1]){ int x; for ((x) = 0; (x) < (qdim[1]); (x)++) { {
83
←
Assuming the condition is true→
84
←
Loop condition is true.  Entering loop body→
251				int y, k;
252				const float* const qp2 = qp1 + x * qstride[1];
253				float* const cp2 = cp1 + x * cstride[1];
254				float* const qk0 = qk + x * kdim[1];
255				const float* const amp2 = amp184.1
'amp1' is null
 ? amp1 + x * amstride[2] : 0;
85
←
'?' condition is false→
86
←
'amp2' initialized to a null pointer value→
256				if (attn_mask86.1
'attn_mask' is non-null
)
87
←
Taking true branch→
257				{
258					for (y = 0; y < kdim[1]; y++)
88
←
Assuming the condition is true→
89
←
Loop condition is true.  Entering loop body→
259					{
260						const float* const kp2 = kp1 + y * kstride[1];
261						float v = 0;
262						for (k = 0; k < qdim[3]; k++)
90
←
Assuming the condition is false→
91
←
Loop condition is false. Execution continues on line 264→
263							v += qp2[k * qstride[3]] * kp2[k * kstride[3]];
264						qk0[y] = scale * v + amp2[y * amstride[3]];
92
←
Array access (from variable 'amp2') results in a null pointer dereference
265					}
266				} else {
267					for (y = 0; y < kdim[1]; y++)
268					{
269						const float* const kp2 = kp1 + y * kstride[1];
270						float v = 0;
271						for (k = 0; k < qdim[3]; k++)
272							v += qp2[k * qstride[3]] * kp2[k * kstride[3]];
273						qk0[y] = scale * v;
274					}
275				}
276				// Compute softmax on qk.
277				if (is_causal)
278				{
279					const int x_end = ccv_max(x - qdim[1] + kdim[1] + 1, 0)({ typeof (x - qdim[1] + kdim[1] + 1) _a = (x - qdim[1] + kdim
[1] + 1); typeof (0) _b = (0); (_a > _b) ? _a : _b; });
280					const int x_start = sliding_window > 0 ? ccv_max(x_end - sliding_window, 0)({ typeof (x_end - sliding_window) _a = (x_end - sliding_window
); typeof (0) _b = (0); (_a > _b) ? _a : _b; }) : 0;
281					for (y = 0; y < x_start; y++)
282						qk0[y] = 0;
283					for (y = x_end; y < kdim[1]; y++)
284						qk0[y] = 0;
285					double maxval = attention_sinks ? sink : qk0[x_start];
286					for (y = attention_sinks ? x_start : x_start + 1; y < x_end; y++)
287						if (qk0[y] > maxval)
288							maxval = qk0[y];
289					double sumval = attention_sinks ? expf(sink - maxval) : 0;
290					for (y = x_start; y < x_end; y++)
291						sumval += (qk0[y] = expf(qk0[y] - maxval));
292					sumval = 1.0 / sumval;
293					for (y = x_start; y < x_end; y++)
294						qk0[y] *= sumval;
295				} else {
296					double maxval = attention_sinks ? sink : qk0[0];
297					for (y = attention_sinks ? 0 : 1; y < kdim[1]; y++)
298						if (qk0[y] > maxval)
299							maxval = qk0[y];
300					double sumval = attention_sinks ? expf(sink - maxval) : 0;
301					for (y = 0; y < kdim[1]; y++)
302						sumval += (qk0[y] = expf(qk0[y] - maxval));
303					sumval = 1.0 / sumval;
304					for (y = 0; y < kdim[1]; y++)
305						qk0[y] *= sumval;
306				}
307				for (k = 0; k < vdim[3]; k++)
308					cp2[k * cstride[3]] = 0;
309				for (y = 0; y < kdim[1]; y++)
310				{
311					const float* const vp2 = vp1 + y * vstride[1];
312					const float v = qk0[y];
313					for (k = 0; k < vdim[3]; k++)
314						cp2[k * cstride[3]] += v * vp2[k * vstride[3]];
315				}
316			} parallel_endfor} }
317		}
318	}
319	if (w)
320	{
321		const int num_heads = cdim[2];
322		ccv_nnc_tensor_view_t* const d = (ccv_nnc_tensor_view_t*)outputs[0];
323		const int w_nd = ccv_nnc_tensor_nd(w->info.dim);
324		assert(w_nd == 2)((void) sizeof ((w_nd == 2) ? 1 : 0), __extension__ ({ if (w_nd
 == 2) ; else __assert_fail ("w_nd == 2", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 324, __extension__ __PRETTY_FUNCTION__); }));
325		assert(CCV_IS_TENSOR_CONTIGUOUS(w))((void) sizeof (((!((*(int*)(w)) & CCV_TENSOR_VIEW) || ((
(ccv_nnc_tensor_view_t*)w)->contiguous == 1))) ? 1 : 0), __extension__
 ({ if ((!((*(int*)(w)) & CCV_TENSOR_VIEW) || (((ccv_nnc_tensor_view_t
*)w)->contiguous == 1))) ; else __assert_fail ("CCV_IS_TENSOR_CONTIGUOUS(w)"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 325, __extension__ __PRETTY_FUNCTION__); }));
326		const int d_nd = ccv_nnc_tensor_nd(d->info.dim);
327		assert(d_nd == 3)((void) sizeof ((d_nd == 3) ? 1 : 0), __extension__ ({ if (d_nd
 == 3) ; else __assert_fail ("d_nd == 3", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 327, __extension__ __PRETTY_FUNCTION__); }));
328		int ddim[CCV_NNC_MAX_DIM_ALLOC(12)];
329		int dstride[CCV_NNC_MAX_DIM_ALLOC(12)];
330		ccv_nnc_tensor_view_get_dim(d, ddim);
331		ccv_nnc_tensor_view_get_stride(d, dstride);
332		assert(ddim[2] == cdim[1])((void) sizeof ((ddim[2] == cdim[1]) ? 1 : 0), __extension__ (
{ if (ddim[2] == cdim[1]) ; else __assert_fail ("ddim[2] == cdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 332, __extension__ __PRETTY_FUNCTION__); }));
333		assert(ddim[3] == num_heads * cdim[3])((void) sizeof ((ddim[3] == num_heads * cdim[3]) ? 1 : 0), __extension__
 ({ if (ddim[3] == num_heads * cdim[3]) ; else __assert_fail (
"ddim[3] == num_heads * cdim[3]", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 333, __extension__ __PRETTY_FUNCTION__); }));
334		assert(w->info.dim[1] == ddim[3])((void) sizeof ((w->info.dim[1] == ddim[3]) ? 1 : 0), __extension__
 ({ if (w->info.dim[1] == ddim[3]) ; else __assert_fail ("w->info.dim[1] == ddim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 334, __extension__ __PRETTY_FUNCTION__); }));
335		assert(w->info.dim[0] == ddim[3])((void) sizeof ((w->info.dim[0] == ddim[3]) ? 1 : 0), __extension__
 ({ if (w->info.dim[0] == ddim[3]) ; else __assert_fail ("w->info.dim[0] == ddim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 335, __extension__ __PRETTY_FUNCTION__); }));
336		float* const dp = d->data.f32;
337		const float* const wp = w->data.f32;
338		const float* const cp = c->data.f32;
339		if (bias)
340		{
341			assert(ccv_nnc_tensor_count(bias->info) == ddim[3])((void) sizeof ((ccv_nnc_tensor_count(bias->info) == ddim[
3]) ? 1 : 0), __extension__ ({ if (ccv_nnc_tensor_count(bias->
info) == ddim[3]) ; else __assert_fail ("ccv_nnc_tensor_count(bias->info) == ddim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 341, __extension__ __PRETTY_FUNCTION__); }));
342			assert(CCV_IS_TENSOR_CONTIGUOUS(bias))((void) sizeof (((!((*(int*)(bias)) & CCV_TENSOR_VIEW) ||
 (((ccv_nnc_tensor_view_t*)bias)->contiguous == 1))) ? 1 :
 0), __extension__ ({ if ((!((*(int*)(bias)) & CCV_TENSOR_VIEW
) || (((ccv_nnc_tensor_view_t*)bias)->contiguous == 1))) ;
 else __assert_fail ("CCV_IS_TENSOR_CONTIGUOUS(bias)", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 342, __extension__ __PRETTY_FUNCTION__); }));
343			const float* const biasp = bias->data.f32;
344			for (i[0] = 0; i[0] < ddim[1]; i[0]++)
345			{
346				const float* const cp0 = cp + i[0] * cstride[0];
347				float* const dp0 = dp + i[0] * dstride[1];
348				parallel_for(y, ddim[2]){ int y; for ((y) = 0; (y) < (ddim[2]); (y)++) { {
349					int x, j, k;
350					const float* const cp1 = cp0 + y * cstride[1];
351					float* const dp1 = dp0 + y * dstride[2];
352					for (x = 0; x < ddim[3]; x++)
353					{
354						const float* const wp0 = wp + x * ddim[3];
355						float v = biasp[x];
356						for (j = 0; j < num_heads; j++)
357						{
358							const float* const cp2 = cp1 + j * cstride[2];
359							for (k = 0; k < cdim[3]; k++)
360								v += wp0[j * cdim[3] + k] * cp2[k * cstride[3]];
361						}
362						dp1[x * dstride[3]] = v;
363					}
364				} parallel_endfor} }
365			}
366		} else {
367			for (i[0] = 0; i[0] < ddim[1]; i[0]++)
368			{
369				const float* const cp0 = cp + i[0] * cstride[0];
370				float* const dp0 = dp + i[0] * dstride[1];
371				parallel_for(y, ddim[2]){ int y; for ((y) = 0; (y) < (ddim[2]); (y)++) { {
372					int x, j, k;
373					const float* const cp1 = cp0 + y * cstride[1];
374					float* const dp1 = dp0 + y * dstride[2];
375					for (x = 0; x < ddim[3]; x++)
376					{
377						const float* const wp0 = wp + x * ddim[3];
378						float v = 0;
379						for (j = 0; j < num_heads; j++)
380						{
381							const float* const cp2 = cp1 + j * cstride[2];
382							for (k = 0; k < cdim[3]; k++)
383								v += wp0[j * cdim[3] + k] * cp2[k * cstride[3]];
384						}
385						dp1[x * dstride[3]] = v;
386					}
387				} parallel_endfor} }
388			}
389		}
390	}
391	return CCV_NNC_EXEC_SUCCESS;
392}
393 
// Reference CPU backward pass for scaled dot-product attention.
// Reads the output gradient g from inputs[0] and q, k, v from inputs[3..5]; writes the
// gradients dq, dk, dv to outputs[0..2]. All tensors are accessed through data.f32.
// No saved softmax is consumed here: for every query row the attention probabilities are
// recomputed from q and k, then the gradients are accumulated from that row.
// 3-d tensors are handled by shifting dims / strides down by one so that index 2 has extent 1.
// Index-2 entries of q map onto index-2 entries of k / v in groups of qdim[2] / kdim[2]
// (NOTE(review): index 2 looks like the head axis, going by the h_h_k_ratio name - confirm).
// Returns CCV_NNC_EXEC_INVALID when is_varlen, attention_sinks or a non-zero sliding_window is
// requested, CCV_NNC_EXEC_SUCCESS otherwise. Shape mismatches between q, k, v and g trip assert().
// NOTE(review): dqdim / dkdim / dvdim are fetched but never compared against qdim / kdim / vdim,
// so the output shapes are trusted to match the inputs - confirm that callers guarantee this.
394static int _ccv_nnc_scaled_dot_product_attention_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
395{
396	// Assuming no saved_softmax, we need to recompute from q, k, v.
397	// We cannot do this with masks (yet).
398	assert(input_size >= 6)((void) sizeof ((input_size >= 6) ? 1 : 0), __extension__ (
{ if (input_size >= 6) ; else __assert_fail ("input_size >= 6"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 398, __extension__ __PRETTY_FUNCTION__); }));
399	if (cmd.info.scaled_dot_product_attention.is_varlen) // Variable-length mode is rejected by this backward.
400		return CCV_NNC_EXEC_INVALID;
401	if (cmd.info.scaled_dot_product_attention.attention_sinks) // Attention sinks are rejected as well.
402		return CCV_NNC_EXEC_INVALID;
403	if (cmd.info.scaled_dot_product_attention.sliding_window != 0) // Any sliding window is rejected too.
404		return CCV_NNC_EXEC_INVALID;
405	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; // Incoming gradient of the attention output.
406	ccv_nnc_tensor_view_t* const q = (ccv_nnc_tensor_view_t*)inputs[3];
407	ccv_nnc_tensor_view_t* const k = (ccv_nnc_tensor_view_t*)inputs[4];
408	ccv_nnc_tensor_view_t* const v = (ccv_nnc_tensor_view_t*)inputs[5];
409	ccv_nnc_tensor_view_t* const dq = (ccv_nnc_tensor_view_t*)outputs[0];
410	ccv_nnc_tensor_view_t* const dk = (ccv_nnc_tensor_view_t*)outputs[1];
411	ccv_nnc_tensor_view_t* const dv = (ccv_nnc_tensor_view_t*)outputs[2];
412	const int q_nd = ccv_nnc_tensor_nd(q->info.dim);
413	assert(q_nd == 3 || q_nd == 4)((void) sizeof ((q_nd == 3 || q_nd == 4) ? 1 : 0), __extension__
 ({ if (q_nd == 3 || q_nd == 4) ; else __assert_fail ("q_nd == 3 || q_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 413, __extension__ __PRETTY_FUNCTION__); }));
414	const int k_nd = ccv_nnc_tensor_nd(k->info.dim);
415	assert(k_nd == 3 || k_nd == 4)((void) sizeof ((k_nd == 3 || k_nd == 4) ? 1 : 0), __extension__
 ({ if (k_nd == 3 || k_nd == 4) ; else __assert_fail ("k_nd == 3 || k_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 415, __extension__ __PRETTY_FUNCTION__); }));
416	const int v_nd = ccv_nnc_tensor_nd(v->info.dim);
417	assert(v_nd == 3 || v_nd == 4)((void) sizeof ((v_nd == 3 || v_nd == 4) ? 1 : 0), __extension__
 ({ if (v_nd == 3 || v_nd == 4) ; else __assert_fail ("v_nd == 3 || v_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 417, __extension__ __PRETTY_FUNCTION__); }));
418	const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
419	assert(g_nd == 3 || g_nd == 4)((void) sizeof ((g_nd == 3 || g_nd == 4) ? 1 : 0), __extension__
 ({ if (g_nd == 3 || g_nd == 4) ; else __assert_fail ("g_nd == 3 || g_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 419, __extension__ __PRETTY_FUNCTION__); }));
420	const int dq_nd = ccv_nnc_tensor_nd(dq->info.dim);
421	assert(dq_nd == 3 || dq_nd == 4)((void) sizeof ((dq_nd == 3 || dq_nd == 4) ? 1 : 0), __extension__
 ({ if (dq_nd == 3 || dq_nd == 4) ; else __assert_fail ("dq_nd == 3 || dq_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 421, __extension__ __PRETTY_FUNCTION__); }));
422	assert(dq_nd == q_nd)((void) sizeof ((dq_nd == q_nd) ? 1 : 0), __extension__ ({ if
 (dq_nd == q_nd) ; else __assert_fail ("dq_nd == q_nd", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 422, __extension__ __PRETTY_FUNCTION__); }));
423	const int dk_nd = ccv_nnc_tensor_nd(dk->info.dim);
424	assert(dk_nd == 3 || dk_nd == 4)((void) sizeof ((dk_nd == 3 || dk_nd == 4) ? 1 : 0), __extension__
 ({ if (dk_nd == 3 || dk_nd == 4) ; else __assert_fail ("dk_nd == 3 || dk_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 424, __extension__ __PRETTY_FUNCTION__); }));
425	assert(dk_nd == k_nd)((void) sizeof ((dk_nd == k_nd) ? 1 : 0), __extension__ ({ if
 (dk_nd == k_nd) ; else __assert_fail ("dk_nd == k_nd", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 425, __extension__ __PRETTY_FUNCTION__); }));
426	const int dv_nd = ccv_nnc_tensor_nd(dv->info.dim);
427	assert(dv_nd == 3 || dv_nd == 4)((void) sizeof ((dv_nd == 3 || dv_nd == 4) ? 1 : 0), __extension__
 ({ if (dv_nd == 3 || dv_nd == 4) ; else __assert_fail ("dv_nd == 3 || dv_nd == 4"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 427, __extension__ __PRETTY_FUNCTION__); }));
428	assert(dv_nd == v_nd)((void) sizeof ((dv_nd == v_nd) ? 1 : 0), __extension__ ({ if
 (dv_nd == v_nd) ; else __assert_fail ("dv_nd == v_nd", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 428, __extension__ __PRETTY_FUNCTION__); }));
429	assert(q_nd == k_nd && k_nd == v_nd && v_nd == g_nd)((void) sizeof ((q_nd == k_nd && k_nd == v_nd &&
 v_nd == g_nd) ? 1 : 0), __extension__ ({ if (q_nd == k_nd &&
 k_nd == v_nd && v_nd == g_nd) ; else __assert_fail (
"q_nd == k_nd && k_nd == v_nd && v_nd == g_nd"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 429, __extension__ __PRETTY_FUNCTION__); }));
430	// Assuming this is float 32.
431	int qdim[CCV_NNC_MAX_DIM_ALLOC(12)];
432	int kdim[CCV_NNC_MAX_DIM_ALLOC(12)];
433	int vdim[CCV_NNC_MAX_DIM_ALLOC(12)];
434	int gdim[CCV_NNC_MAX_DIM_ALLOC(12)];
435	int dqdim[CCV_NNC_MAX_DIM_ALLOC(12)];
436	int dkdim[CCV_NNC_MAX_DIM_ALLOC(12)];
437	int dvdim[CCV_NNC_MAX_DIM_ALLOC(12)];
438	ccv_nnc_tensor_view_get_dim(q, qdim);
439	ccv_nnc_tensor_view_get_dim(k, kdim);
440	ccv_nnc_tensor_view_get_dim(v, vdim);
441	ccv_nnc_tensor_view_get_dim(g, gdim);
442	ccv_nnc_tensor_view_get_dim(dq, dqdim);
443	ccv_nnc_tensor_view_get_dim(dk, dkdim);
444	ccv_nnc_tensor_view_get_dim(dv, dvdim);
445	if (q_nd == 3) // 3-d case: move entries 1, 2 down to 0, 1 and give index 2 an extent of 1.
446	{
447		qdim[0] = qdim[1], qdim[1] = qdim[2], qdim[2] = 1;
448		kdim[0] = kdim[1], kdim[1] = kdim[2], kdim[2] = 1;
449		vdim[0] = vdim[1], vdim[1] = vdim[2], vdim[2] = 1;
450		gdim[0] = gdim[1], gdim[1] = gdim[2], gdim[2] = 1;
451		dqdim[0] = dqdim[1], dqdim[1] = dqdim[2], dqdim[2] = 1;
452		dkdim[0] = dkdim[1], dkdim[1] = dkdim[2], dkdim[2] = 1;
453		dvdim[0] = dvdim[1], dvdim[1] = dvdim[2], dvdim[2] = 1;
454	}
455	assert(qdim[0] == kdim[0] && kdim[0] == vdim[0] && vdim[0] == gdim[0])((void) sizeof ((qdim[0] == kdim[0] && kdim[0] == vdim
[0] && vdim[0] == gdim[0]) ? 1 : 0), __extension__ ({
 if (qdim[0] == kdim[0] && kdim[0] == vdim[0] &&
 vdim[0] == gdim[0]) ; else __assert_fail ("qdim[0] == kdim[0] && kdim[0] == vdim[0] && vdim[0] == gdim[0]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 455, __extension__ __PRETTY_FUNCTION__); }));
456	assert(qdim[2] == gdim[2])((void) sizeof ((qdim[2] == gdim[2]) ? 1 : 0), __extension__ (
{ if (qdim[2] == gdim[2]) ; else __assert_fail ("qdim[2] == gdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 456, __extension__ __PRETTY_FUNCTION__); }));
457	assert(kdim[2] == vdim[2])((void) sizeof ((kdim[2] == vdim[2]) ? 1 : 0), __extension__ (
{ if (kdim[2] == vdim[2]) ; else __assert_fail ("kdim[2] == vdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 457, __extension__ __PRETTY_FUNCTION__); }));
458	assert(qdim[2] % kdim[2] == 0)((void) sizeof ((qdim[2] % kdim[2] == 0) ? 1 : 0), __extension__
 ({ if (qdim[2] % kdim[2] == 0) ; else __assert_fail ("qdim[2] % kdim[2] == 0"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 458, __extension__ __PRETTY_FUNCTION__); }));
459	assert(qdim[2] >= kdim[2])((void) sizeof ((qdim[2] >= kdim[2]) ? 1 : 0), __extension__
 ({ if (qdim[2] >= kdim[2]) ; else __assert_fail ("qdim[2] >= kdim[2]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 459, __extension__ __PRETTY_FUNCTION__); }));
460	assert(qdim[3] == kdim[3])((void) sizeof ((qdim[3] == kdim[3]) ? 1 : 0), __extension__ (
{ if (qdim[3] == kdim[3]) ; else __assert_fail ("qdim[3] == kdim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 460, __extension__ __PRETTY_FUNCTION__); }));
461	assert(kdim[1] == vdim[1])((void) sizeof ((kdim[1] == vdim[1]) ? 1 : 0), __extension__ (
{ if (kdim[1] == vdim[1]) ; else __assert_fail ("kdim[1] == vdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 461, __extension__ __PRETTY_FUNCTION__); }));
462	assert(gdim[1] == qdim[1])((void) sizeof ((gdim[1] == qdim[1]) ? 1 : 0), __extension__ (
{ if (gdim[1] == qdim[1]) ; else __assert_fail ("gdim[1] == qdim[1]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 462, __extension__ __PRETTY_FUNCTION__); }));
463	assert(gdim[3] == vdim[3])((void) sizeof ((gdim[3] == vdim[3]) ? 1 : 0), __extension__ (
{ if (gdim[3] == vdim[3]) ; else __assert_fail ("gdim[3] == vdim[3]"
, "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 463, __extension__ __PRETTY_FUNCTION__); }));
464	assert(CCV_NNC_MAX_DIM == 2)((void) sizeof (((2) == 2) ? 1 : 0), __extension__ ({ if ((2)
 == 2) ; else __assert_fail ("CCV_NNC_MAX_DIM == 2", "scaled_dot_product_attention/ccv_nnc_scaled_dot_product_attention_cpu_ref.c"
, 464, __extension__ __PRETTY_FUNCTION__); })); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
465	int qstride[CCV_NNC_MAX_DIM_ALLOC(12)];
466	int kstride[CCV_NNC_MAX_DIM_ALLOC(12)];
467	int vstride[CCV_NNC_MAX_DIM_ALLOC(12)];
468	int gstride[CCV_NNC_MAX_DIM_ALLOC(12)];
469	int dqstride[CCV_NNC_MAX_DIM_ALLOC(12)];
470	int dkstride[CCV_NNC_MAX_DIM_ALLOC(12)];
471	int dvstride[CCV_NNC_MAX_DIM_ALLOC(12)];
472	ccv_nnc_tensor_view_get_stride(q, qstride);
473	ccv_nnc_tensor_view_get_stride(k, kstride);
474	ccv_nnc_tensor_view_get_stride(v, vstride);
475	ccv_nnc_tensor_view_get_stride(g, gstride);
476	ccv_nnc_tensor_view_get_stride(dq, dqstride);
477	ccv_nnc_tensor_view_get_stride(dk, dkstride);
478	ccv_nnc_tensor_view_get_stride(dv, dvstride);
479	if (q_nd == 3) // Same shift as for the dims; index 2 (extent 1) reuses the stride of index 3.
480	{
481		qstride[0] = qstride[1], qstride[1] = qstride[2], qstride[2] = qstride[3];
482		kstride[0] = kstride[1], kstride[1] = kstride[2], kstride[2] = kstride[3];
483		vstride[0] = vstride[1], vstride[1] = vstride[2], vstride[2] = vstride[3];
484		gstride[0] = gstride[1], gstride[1] = gstride[2], gstride[2] = gstride[3];
485		dqstride[0] = dqstride[1], dqstride[1] = dqstride[2], dqstride[2] = dqstride[3];
486		dkstride[0] = dkstride[1], dkstride[1] = dkstride[2], dkstride[2] = dkstride[3];
487		dvstride[0] = dvstride[1], dvstride[1] = dvstride[2], dvstride[2] = dvstride[3];
488	}
489	int i[CCV_NNC_MAX_DIM(2) + 2];
490	float* qk = ccv_nnc_stream_context_get_workspace(stream_context, sizeof(float) * 2 * kdim[1], CCV_TENSOR_CPU_MEMORY); // Scratch of 2 * kdim[1] floats, split into qk0 / qks0 below. NOTE(review): the result is used without a null check - confirm the workspace call cannot fail.
491	const float* const qp = q->data.f32;
492	const float* const kp = k->data.f32;
493	const float* const vp = v->data.f32;
494	const float* const gp = g->data.f32;
495	float* const dqp = dq->data.f32;
496	float* const dkp = dk->data.f32;
497	float* const dvp = dv->data.f32;
498	const float scale = cmd.info.scaled_dot_product_attention.scale;
499	const int is_causal = cmd.info.scaled_dot_product_attention.is_causal;
500	const int h_h_k_ratio = qdim[2] / kdim[2]; // Number of consecutive q index-2 entries that share one k / v entry.
501	for (i[0] = 0; i[0] < qdim[0]; i[0]++)
502	{
503		const float* const qp0 = qp + i[0] * qstride[0];
504		const float* const kp0 = kp + i[0] * kstride[0];
505		const float* const vp0 = vp + i[0] * vstride[0];
506		const float* const gp0 = gp + i[0] * gstride[0];
507		float* const dqp0 = dqp + i[0] * dqstride[0];
508		float* const dkp0 = dkp + i[0] * dkstride[0];
509		float* const dvp0 = dvp + i[0] * dvstride[0];
510		for (i[1] = 0; i[1] < qdim[2]; i[1]++)
511		{
512			const float* const qp1 = qp0 + i[1] * qstride[2];
513			const float* const kp1 = kp0 + (i[1] / h_h_k_ratio) * kstride[2];
514			const float* const vp1 = vp0 + (i[1] / h_h_k_ratio) * vstride[2];
515			const float* const gp1 = gp0 + i[1] * gstride[2];
516			float* const dqp1 = dqp0 + i[1] * dqstride[2];
517			float* const dkp1 = dkp0 + (i[1] / h_h_k_ratio) * dkstride[2];
518			float* const dvp1 = dvp0 + (i[1] / h_h_k_ratio) * dvstride[2];
519			// Zero dq for this q entry first; every query row accumulates into it below. (Q @ K^T itself is computed per row further down.)
520			int x, y, k;
521			for (x = 0; x < qdim[1]; x++)
522			{
523				float* const dqp2 = dqp1 + x * dqstride[1];
524				for (k = 0; k < qdim[3]; k++)
525					dqp2[k * dqstride[3]] = 0;
526			}
527			// dk is shared by h_h_k_ratio consecutive q entries (it is indexed by i[1] / h_h_k_ratio), so only zero it on the first entry of each group.
528			if (i[1] % h_h_k_ratio == 0)
529				for (y = 0; y < kdim[1]; y++)
530				{
531					float* const dkp2 = dkp1 + y * dkstride[1];
532					for (k = 0; k < qdim[3]; k++)
533						dkp2[k * dkstride[3]] = 0;
534				}
535			// dv is shared the same way as dk, so it is also zeroed only on the first entry of each group.
536			if (i[1] % h_h_k_ratio == 0)
537				for (y = 0; y < kdim[1]; y++)
538				{
539					float* const dvp2 = dvp1 + y * dvstride[1];
540					for (k = 0; k < vdim[3]; k++)
541						dvp2[k * dvstride[3]] = 0;
542				}
543			for (x = 0; x < qdim[1]; x++) // One query row at a time; the scratch buffer is reused for every row.
544			{
545				const float* const qp2 = qp1 + x * qstride[1];
546				const float* const gp2 = gp1 + x * gstride[1];
547				float* const qk0 = qk; // First half of the scratch: scores, then probabilities, then the score gradient.
548				float* const qks0 = qk + kdim[1]; // Second half of the scratch: dot products of g with each v row.
549				for (y = 0; y < kdim[1]; y++) // qk0[y] = scale * (q row x . k row y)
550				{
551					const float* const kp2 = kp1 + y * kstride[1];
552					float v = 0;
553					for (k = 0; k < qdim[3]; k++)
554						v += qp2[k * qstride[3]] * kp2[k * kstride[3]];
555					qk0[y] = scale * v;
556				}
557				// Compute softmax on qk. In the causal case only keys [0, x_end) take part; entries from x_end on are set to 0 and left out of the normalization.
558				if (is_causal)
559				{
560					const int x_end = ccv_max(x - qdim[1] + kdim[1] + 1, 0)({ typeof (x - qdim[1] + kdim[1] + 1) _a = (x - qdim[1] + kdim
[1] + 1); typeof (0) _b = (0); (_a > _b) ? _a : _b; });
561					for (y = x_end; y < kdim[1]; y++)
562						qk0[y] = 0;
563					double maxval = qk0[0]; // Subtract the row maximum before expf.
564					for (y = 1; y < x_end; y++)
565						if (qk0[y] > maxval)
566							maxval = qk0[y];
567					double sumval = 0;
568					for (y = 0; y < x_end; y++)
569						sumval += (qk0[y] = expf(qk0[y] - maxval));
570					sumval = 1.0 / sumval; // When x_end is 0 this divides by zero, but the result is not used because the loop below does not run.
571					for (y = 0; y < x_end; y++)
572						qk0[y] *= sumval;
573				} else {
574					double maxval = qk0[0]; // Subtract the row maximum before expf.
575					for (y = 1; y < kdim[1]; y++)
576						if (qk0[y] > maxval)
577							maxval = qk0[y];
578					double sumval = 0;
579					for (y = 0; y < kdim[1]; y++)
580						sumval += (qk0[y] = expf(qk0[y] - maxval));
581					sumval = 1.0 / sumval;
582					for (y = 0; y < kdim[1]; y++)
583						qk0[y] *= sumval;
584				}
585				for (y = 0; y < kdim[1]; y++) // dv row y += probability qk0[y] * g row x
586				{
587					float* const dvp2 = dvp1 + y * dvstride[1];
588					const float v = qk0[y];
589					for (k = 0; k < vdim[3]; k++)
590						dvp2[k * dvstride[3]] += v * gp2[k * gstride[3]];
591				}
592				double sumval = 0; // Accumulates sum over y of qks0[y] * qk0[y].
593				for (y = 0; y < kdim[1]; y++) // qks0[y] = g row x . v row y
594				{
595					const float* const vp2 = vp1 + y * vstride[1];
596					float v = 0;
597					for (k = 0; k < vdim[3]; k++)
598						v += gp2[k * gstride[3]] * vp2[k * vstride[3]];
599					qks0[y] = v;
600					sumval += v * qk0[y];
601				}
602				for (y = 0; y < kdim[1]; y++) // Softmax backward: overwrite the probability with (qks0[y] - sumval) * probability.
603					qk0[y] = (qks0[y] - sumval) * qk0[y];
604				float* const dqp2 = dqp1 + x * dqstride[1];
605				for (y = 0; y < kdim[1]; y++) // Propagate through scale * (q . k): dq row x += v * k row y, dk row y += v * q row x.
606				{
607					const float* const kp2 = kp1 + y * kstride[1];
608					float* const dkp2 = dkp1 + y * dkstride[1];
609					const float v = scale * qk0[y];
610					for (k = 0; k < qdim[3]; k++)
611					{
612						dqp2[k * dqstride[3]] += v * kp2[k * kstride[3]];
613						dkp2[k * dkstride[3]] += v * qp2[k * qstride[3]];
614					}
615				}
616			}
617		}
618	}
619	return CCV_NNC_EXEC_SUCCESS;
620}
621 
// Registration hook for the forward command on the CPU reference backend: fills in the
// registry entry and installs _ccv_nnc_scaled_dot_product_attention_forw as the exec callback.
// (The header line below shows the REGISTER_COMMAND_BACKEND invocation followed by its expansion.)
622REGISTER_COMMAND_BACKEND(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_FORWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
623{
624	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
625	registry->tensor_datatypes = CCV_32F | CCV_32S; // NOTE(review): CCV_32S is accepted here but not by the backward registration; presumably for integer auxiliary inputs - confirm.
626	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
627	registry->algorithms = 1; // presumably the number of algorithm variants offered - TODO confirm
628	registry->exec = _ccv_nnc_scaled_dot_product_attention_forw;
629}
630 
// Registration hook for the backward command on the CPU reference backend: fills in the
// registry entry and installs _ccv_nnc_scaled_dot_product_attention_back as the exec callback.
// (The header line below shows the REGISTER_COMMAND_BACKEND invocation followed by its expansion.)
631REGISTER_COMMAND_BACKEND(CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)void _register_command_CCV_NNC_SCALED_DOT_PRODUCT_ATTENTION_BACKWARD_backend_CCV_NNC_BACKEND_CPU_REF(ccv_nnc_cmd_backend_registry_t* const registry)
632{
633	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
634	registry->tensor_datatypes = CCV_32F; // Only 32-bit float is declared; the backward kernel reads and writes through data.f32.
635	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
636	registry->algorithms = 1; // presumably the number of algorithm variants offered - TODO confirm
637	registry->exec = _ccv_nnc_scaled_dot_product_attention_back;
638}