/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/reduce/ccv_nnc_reduce_sum_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | void _ccv_nnc_reduce_sum_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b) |
17 | 6.08k | { |
18 | 6.08k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
19 | 6.08k | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
20 | | // Assuming this is float 32. |
21 | 6.08k | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
22 | 6.08k | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
23 | 6.08k | ccv_nnc_tensor_view_get_dim(a, adim); |
24 | 6.08k | ccv_nnc_tensor_view_get_dim(b, bdim); |
25 | 6.08k | assert(ccv_nnc_tensor_view_check_broadcast_dim(b, adim)); |
26 | 6.08k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
27 | 6.08k | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
28 | 6.08k | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
29 | 6.08k | ccv_nnc_tensor_view_get_stride(a, astride); |
30 | 6.08k | ccv_nnc_tensor_view_get_stride(b, bstride); |
31 | 6.08k | int i[CCV_NNC_MAX_DIM + 2]; |
32 | 6.08k | int x; |
33 | 6.08k | ccv_nnc_tensor_zero(b); |
34 | 6.08k | float* const ap = a->data.f32; |
35 | 6.08k | float* const bp = b->data.f32; |
36 | | // Non-optimal case, need to do skip if needed. |
37 | 12.5k | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
38 | 6.44k | { |
39 | 6.44k | float* const ap0 = ap + i[0] * astride[0]; |
40 | 6.44k | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
41 | 14.1k | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
42 | 7.73k | { |
43 | 7.73k | float* ap1 = ap0 + i[1] * astride[1]; |
44 | 7.73k | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
45 | 26.4k | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
46 | 18.7k | { |
47 | 18.7k | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
48 | 18.7k | if (bdim[3] == 1) |
49 | 65.8k | for (x = 0; x < adim[3]; x++) |
50 | 51.1k | bp2[0] += ap1[x]; |
51 | 4.01k | else |
52 | 38.3k | for (x = 0; x < adim[3]; x++) |
53 | 34.3k | bp2[x] += ap1[x]; |
54 | 18.7k | ap1 += astride[2]; |
55 | 18.7k | } |
56 | 7.73k | } |
57 | 6.44k | } |
58 | 6.08k | } |
59 | | |
60 | | static int _ccv_nnc_reduce_sum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
61 | 6.03k | { |
62 | 6.03k | assert(input_size == 1); |
63 | 6.03k | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0]; |
64 | 6.03k | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0]; |
65 | 6.03k | _ccv_nnc_reduce_sum_forw_cpu_ref(a, b); |
66 | 6.03k | return CCV_NNC_EXEC_SUCCESS; |
67 | 6.03k | } |
68 | | |
69 | | static int _ccv_nnc_reduce_sum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
70 | 6.01k | { |
71 | 6.01k | if (inputs[0] == 0) |
72 | 0 | { |
73 | 0 | _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[0], 1); |
74 | 0 | return CCV_NNC_EXEC_SUCCESS; |
75 | 0 | } |
76 | 6.01k | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)outputs[0]; |
77 | 6.01k | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[0]; |
78 | 6.01k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
79 | 6.01k | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
80 | | // Assuming this is float 32. |
81 | 6.01k | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
82 | 6.01k | int bdim[CCV_NNC_MAX_DIM_ALLOC]; |
83 | 6.01k | ccv_nnc_tensor_view_get_dim(a, adim); |
84 | 6.01k | ccv_nnc_tensor_view_get_dim(b, bdim); |
85 | 6.01k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
86 | 6.01k | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
87 | 6.01k | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
88 | 6.01k | ccv_nnc_tensor_view_get_stride(a, astride); |
89 | 6.01k | ccv_nnc_tensor_view_get_stride(b, bstride); |
90 | 6.01k | int i[CCV_NNC_MAX_DIM + 2]; |
91 | 6.01k | int x; |
92 | 6.01k | float* const ap = a->data.f32; |
93 | 6.01k | float* const bp = b->data.f32; |
94 | | // Non-optimal case, need to do skip if needed. |
95 | 12.1k | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
96 | 6.09k | { |
97 | 6.09k | float* const ap0 = ap + i[0] * astride[0]; |
98 | 6.09k | float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0]; |
99 | 12.4k | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
100 | 6.36k | { |
101 | 6.36k | float* ap1 = ap0 + i[1] * astride[1]; |
102 | 6.36k | float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1]; |
103 | 19.7k | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
104 | 13.4k | { |
105 | 13.4k | float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2]; |
106 | 13.4k | if (bdim[3] == 1) |
107 | 45.9k | for (x = 0; x < adim[3]; x++) |
108 | 33.0k | ap1[x] = bp2[0]; |
109 | 530 | else |
110 | 3.25k | for (x = 0; x < adim[3]; x++) |
111 | 2.72k | ap1[x] = bp2[x]; |
112 | 13.4k | ap1 += astride[2]; |
113 | 13.4k | } |
114 | 6.36k | } |
115 | 6.09k | } |
116 | 6.01k | return CCV_NNC_EXEC_SUCCESS; |
117 | 6.01k | } |
118 | | |
119 | | REGISTER_COMMAND_BACKEND(CCV_NNC_REDUCE_SUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
120 | 1 | { |
121 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
122 | 1 | registry->tensor_datatypes = CCV_32F; |
123 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
124 | 1 | registry->algorithms = 1; |
125 | 1 | registry->exec = _ccv_nnc_reduce_sum_forw; |
126 | 1 | } |
127 | | |
128 | | REGISTER_COMMAND_BACKEND(CCV_NNC_REDUCE_SUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
129 | 1 | { |
130 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
131 | 1 | registry->tensor_datatypes = CCV_32F; |
132 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
133 | 1 | registry->algorithms = 1; |
134 | 1 | registry->exec = _ccv_nnc_reduce_sum_back; |
135 | 1 | } |
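
Note on the covered kernels: the forward path walks every (N, H, W, C) position of `a` and accumulates into `b`, reusing index 0 along any axis where `bdim[axis] == 1`; the backward path runs the same loops but assigns `bp2[...]` back into `ap1[x]`, i.e. it broadcasts the reduced gradient out to the input shape. A minimal standalone sketch of that reduce-with-broadcast pattern follows (plain C for illustration only, not the ccv_nnc API; the 2x3 shape and data are made up):

#include <stdio.h>

/* Sum a 2x3 matrix "a" into a 1x3 row "b", mirroring the reference kernel's
 * rule: when the output dimension is 1, keep indexing position 0 (accumulate);
 * otherwise index element-wise. */
int main(void)
{
	const int adim[2] = { 2, 3 };
	const int bdim[2] = { 1, 3 }; /* reduce over axis 0 */
	const float a[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
	float b[1][3] = { { 0, 0, 0 } }; /* zeroed first, as ccv_nnc_tensor_zero does */
	for (int i = 0; i < adim[0]; i++)
		for (int j = 0; j < adim[1]; j++)
			b[bdim[0] == 1 ? 0 : i][bdim[1] == 1 ? 0 : j] += a[i][j];
	printf("%g %g %g\n", b[0][0], b[0][1], b[0][2]); /* expected: 5 7 9 */
	return 0;
}

Running this prints the column sums (5 7 9), the same result the reference forward kernel produces when only the leading axis of `b` is collapsed to 1.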