/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/scatter_add/ccv_nnc_scatter_add_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | static int _ccv_nnc_scatter_add_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
14 | 5 | { |
15 | 5 | assert(input_size == 2); |
16 | 5 | assert(output_size == 1); |
17 | 5 | const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0]; |
18 | 5 | const int a_nd = ccv_nnc_tensor_nd(a->info.dim); |
19 | 5 | assert(a_nd <= 2); |
20 | 5 | const ccv_nnc_tensor_view_t* const indices = (ccv_nnc_tensor_view_t*)inputs[1]; |
21 | 5 | assert(ccv_nnc_tensor_nd(indices->info.dim) == 1); |
22 | 5 | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0]; |
23 | 5 | const int b_nd = ccv_nnc_tensor_nd(b->info.dim); |
24 | 5 | assert(b_nd <= 2); |
25 | 5 | const int a_cols = a_nd < 2 ? 11 : a->info.dim[1]4 ; |
26 | 5 | const int a_cols_inc = CCV_IS_TENSOR_VIEW(a) ? (1 a_nd < 21 ? 10 : a->stride[0]1 ) : a_cols4 ; |
27 | 5 | const int a_rows = a->info.dim[0]; |
28 | 5 | const int b_cols = b_nd < 2 ? 11 : b->info.dim[1]4 ; |
29 | 5 | const int b_cols_inc = CCV_IS_TENSOR_VIEW(b) ? (1 b_nd < 21 ? 10 : b->stride[0]1 ) : b_cols4 ; |
30 | 5 | const int b_rows = b->info.dim[0]; |
31 | 5 | assert(a_rows == indices->info.dim[0]); |
32 | 5 | assert(indices->info.datatype == CCV_32S); |
33 | 5 | assert(a_cols == b_cols); |
34 | 5 | assert(a->info.datatype == b->info.datatype); |
35 | 5 | ccv_nnc_tensor_zero((ccv_nnc_tensor_t*)b); |
36 | 5 | int i; |
37 | 5 | if (a->info.datatype == CCV_32F) |
38 | 4 | { |
39 | 13 | for (i = 0; i < a_rows; i++9 ) |
40 | 9 | { |
41 | 9 | const int index = indices->data.i32[i]; |
42 | 9 | assert(index < b_rows); |
43 | 9 | float* const bp = b->data.f32 + b_cols_inc * index; |
44 | 9 | float* const ap = a->data.f32 + a_cols_inc * i; |
45 | 15 | parallel_for9 (j, a_cols) { |
46 | 15 | bp[j] += ap[j]; |
47 | 15 | } parallel_endfor |
48 | 9 | } |
49 | 4 | } else { |
50 | 11 | for (i = 0; i < a_rows; i++10 ) |
51 | 10 | { |
52 | 10 | const int index = indices->data.i32[i]; |
53 | 10 | assert(index < b_rows); |
54 | 10 | ccv_float16_t* const bp = b->data.f16 + b_cols_inc * index; |
55 | 10 | ccv_float16_t* const ap = a->data.f16 + a_cols_inc * i; |
56 | 100 | parallel_for10 (j, a_cols) { |
57 | 100 | float t, v; |
58 | 100 | ccv_half_precision_to_float((uint16_t*)ap + j, &t, 1); |
59 | 100 | ccv_half_precision_to_float((uint16_t*)bp + j, &v, 1); |
60 | 100 | v += t; |
61 | 100 | ccv_float_to_half_precision(&v, (uint16_t*)bp + j, 1); |
62 | 100 | } parallel_endfor |
63 | 10 | } |
64 | 1 | } |
65 | 5 | return CCV_NNC_EXEC_SUCCESS; |
66 | 5 | } |
67 | | |
68 | | static int _ccv_nnc_scatter_add_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
69 | 4 | { |
70 | 4 | assert(input_size >= 3); |
71 | 4 | assert(output_size <= 2); |
72 | 4 | const ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
73 | 4 | const int g_nd = ccv_nnc_tensor_nd(g->info.dim); |
74 | 4 | assert(g_nd <= 2); |
75 | 4 | const ccv_nnc_tensor_view_t* const indices = (ccv_nnc_tensor_view_t*)inputs[2]; |
76 | 4 | assert(ccv_nnc_tensor_nd(indices->info.dim) == 1); |
77 | 4 | ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0]; |
78 | 4 | const int h_nd = ccv_nnc_tensor_nd(h->info.dim); |
79 | 4 | assert(h_nd <= 2); |
80 | 4 | if (output_size >= 2 && outputs[1]0 ) |
81 | 0 | ccv_nnc_tensor_zero(outputs[1]); |
82 | 4 | const int g_cols = g_nd < 2 ? 11 : g->info.dim[1]3 ; |
83 | 4 | const int g_cols_inc = CCV_IS_TENSOR_VIEW(g) ? (1 g_nd < 21 ? 10 : g->stride[0]1 ) : g_cols3 ; |
84 | 4 | const int g_rows = g->info.dim[0]; |
85 | 4 | const int h_cols = h_nd < 2 ? 11 : h->info.dim[1]3 ; |
86 | 4 | const int h_cols_inc = CCV_IS_TENSOR_VIEW(h) ? (1 h_nd < 21 ? 10 : h->stride[0]1 ) : h_cols3 ; |
87 | 4 | const int h_rows = h->info.dim[0]; |
88 | 4 | assert(h_rows == indices->info.dim[0]); |
89 | 4 | assert(g_cols == h_cols); |
90 | 4 | assert(indices->info.datatype == CCV_32S); |
91 | 4 | assert(g->info.datatype == h->info.datatype); |
92 | 4 | assert(g->info.datatype == CCV_32F || g->info.datatype == CCV_16F); |
93 | 4 | const size_t data_size = CCV_GET_DATA_TYPE_SIZE(g->info.datatype); |
94 | 4 | assert(g->info.datatype == CCV_32F || g->info.datatype == CCV_16F); |
95 | 17 | parallel_for4 (i, h_rows) { |
96 | 17 | const int index = indices->data.i32[i]; |
97 | 17 | assert(index < g_rows); |
98 | 17 | uint8_t* const hp = h->data.u8 + data_size * h_cols_inc * i; |
99 | 17 | uint8_t* const gp = g->data.u8 + data_size * g_cols_inc * index; |
100 | 17 | memcpy(hp, gp, data_size * g_cols); |
101 | 17 | } parallel_endfor |
102 | 4 | return CCV_NNC_EXEC_SUCCESS; |
103 | 4 | } |
104 | | |
105 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SCATTER_ADD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
106 | 1 | { |
107 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
108 | 1 | registry->tensor_datatypes = CCV_32F | CCV_16F | CCV_32S; |
109 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
110 | 1 | registry->algorithms = 1; |
111 | 1 | registry->exec = _ccv_nnc_scatter_add_forw; |
112 | 1 | } |
113 | | |
114 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SCATTER_ADD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
115 | 1 | { |
116 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW; |
117 | 1 | registry->tensor_datatypes = CCV_32F | CCV_16F | CCV_32S; |
118 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
119 | 1 | registry->algorithms = 1; |
120 | 1 | registry->exec = _ccv_nnc_scatter_add_back; |
121 | 1 | } |