/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/sgd/ccv_nnc_sgd_cpu_ref.c
Line | Count | Source |
1 | | #include "ccv.h" |
2 | | #include "ccv_internal.h" |
3 | | #include "nnc/ccv_nnc.h" |
4 | | #include "nnc/ccv_nnc_easy.h" |
5 | | #include "nnc/ccv_nnc_internal.h" |
6 | | #ifdef USE_OPENMP |
7 | | #include <omp.h> |
8 | | #endif |
9 | | #ifdef USE_DISPATCH |
10 | | #include <dispatch/dispatch.h> |
11 | | #endif |
12 | | |
13 | | // Shared methods. |
14 | | #include "../_ccv_nnc_cpu_ref.h" |
15 | | |
16 | | static int _ccv_nnc_sgd_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
17 | 18.3k | { |
18 | 18.3k | assert(input_size == 3); |
19 | 18.3k | assert(output_size == 2); |
20 | 18.3k | ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0]; |
21 | 18.3k | ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1]; |
22 | 18.3k | ccv_nnc_tensor_view_t* const m = (ccv_nnc_tensor_view_t*)inputs[2]; |
23 | 18.3k | ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0]; |
24 | 18.3k | ccv_nnc_tensor_view_t* const n = (ccv_nnc_tensor_view_t*)outputs[1]; |
25 | 18.3k | assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2); |
26 | 18.3k | assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2); |
27 | | // Assuming this is float 32. |
28 | 18.3k | int adim[CCV_NNC_MAX_DIM_ALLOC]; |
29 | 18.3k | ccv_nnc_tensor_view_get_dim(a, adim); |
30 | 18.3k | assert(ccv_nnc_tensor_view_check_dim(g, adim)); |
31 | 18.3k | assert(ccv_nnc_tensor_view_check_dim(m, adim)); |
32 | 18.3k | assert(ccv_nnc_tensor_view_check_dim(b, adim)); |
33 | 18.3k | assert(ccv_nnc_tensor_view_check_dim(n, adim)); |
34 | 18.3k | assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number. |
35 | 18.3k | int gstride[CCV_NNC_MAX_DIM_ALLOC]; |
36 | 18.3k | int astride[CCV_NNC_MAX_DIM_ALLOC]; |
37 | 18.3k | int mstride[CCV_NNC_MAX_DIM_ALLOC]; |
38 | 18.3k | int bstride[CCV_NNC_MAX_DIM_ALLOC]; |
39 | 18.3k | int nstride[CCV_NNC_MAX_DIM_ALLOC]; |
40 | 18.3k | ccv_nnc_tensor_view_get_stride(g, gstride); |
41 | 18.3k | ccv_nnc_tensor_view_get_stride(a, astride); |
42 | 18.3k | ccv_nnc_tensor_view_get_stride(m, mstride); |
43 | 18.3k | ccv_nnc_tensor_view_get_stride(b, bstride); |
44 | 18.3k | ccv_nnc_tensor_view_get_stride(n, nstride); |
45 | 18.3k | const float rate = cmd.info.sgd.rate; |
46 | 18.3k | const float scale = cmd.info.sgd.scale; |
47 | 18.3k | const float decay = cmd.info.sgd.decay; |
48 | 18.3k | const float momentum = cmd.info.sgd.momentum; |
49 | 18.3k | const float dampening = cmd.info.sgd.dampening; |
50 | 18.3k | const float inv_dampening = 1 - dampening; |
51 | 18.3k | const int nesterov = cmd.info.sgd.nesterov; |
52 | 18.3k | if (nesterov) |
53 | 1.65k | { assert(dampening == 0); } |
54 | 18.3k | int i[CCV_NNC_MAX_DIM + 1]; |
55 | 18.3k | int x; |
56 | 18.3k | float* const gp = g->data.f32; |
57 | 18.3k | float* const ap = a->data.f32; |
58 | 18.3k | float* const mp = m->data.f32; |
59 | 18.3k | float* const bp = b->data.f32; |
60 | 18.3k | float* const np = n->data.f32; |
61 | 18.3k | if (nesterov) |
62 | 1.65k | { |
63 | 15.8k | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
64 | 14.1k | { |
65 | 14.1k | float* const gp0 = gp + i[0] * gstride[0]; |
66 | 14.1k | float* const ap0 = ap + i[0] * astride[0]; |
67 | 14.1k | float* const mp0 = mp + i[0] * mstride[0]; |
68 | 14.1k | float* const bp0 = bp + i[0] * bstride[0]; |
69 | 14.1k | float* const np0 = np + i[0] * nstride[0]; |
70 | 79.5k | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
71 | 65.3k | { |
72 | 65.3k | float* gp1 = gp0 + i[1] * gstride[1]; |
73 | 65.3k | float* ap1 = ap0 + i[1] * astride[1]; |
74 | 65.3k | float* mp1 = mp0 + i[1] * mstride[1]; |
75 | 65.3k | float* bp1 = bp0 + i[1] * bstride[1]; |
76 | 65.3k | float* np1 = np0 + i[1] * nstride[1]; |
77 | 415k | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
78 | 350k | { |
79 | 23.3M | for (x = 0; x < adim[3]; x++) |
80 | 22.9M | { |
81 | 22.9M | float grad = scale * gp1[x]; |
82 | 22.9M | const float mom = np1[x] = momentum * mp1[x] + grad + decay * ap1[x]; |
83 | 22.9M | grad += momentum * mom; |
84 | 22.9M | bp1[x] = ap1[x] - rate * grad; |
85 | 22.9M | } |
86 | 350k | gp1 += gstride[2]; |
87 | 350k | ap1 += astride[2]; |
88 | 350k | mp1 += mstride[2]; |
89 | 350k | bp1 += bstride[2]; |
90 | 350k | np1 += nstride[2]; |
91 | 350k | } |
92 | 65.3k | } |
93 | 14.1k | } |
94 | 16.7k | } else { |
95 | 58.4k | for (i[0] = 0; i[0] < adim[0]; i[0]++) |
96 | 41.7k | { |
97 | 41.7k | float* const gp0 = gp + i[0] * gstride[0]; |
98 | 41.7k | float* const ap0 = ap + i[0] * astride[0]; |
99 | 41.7k | float* const mp0 = mp + i[0] * mstride[0]; |
100 | 41.7k | float* const bp0 = bp + i[0] * bstride[0]; |
101 | 41.7k | float* const np0 = np + i[0] * nstride[0]; |
102 | 185k | for (i[1] = 0; i[1] < adim[1]; i[1]++) |
103 | 144k | { |
104 | 144k | float* gp1 = gp0 + i[1] * gstride[1]; |
105 | 144k | float* ap1 = ap0 + i[1] * astride[1]; |
106 | 144k | float* mp1 = mp0 + i[1] * mstride[1]; |
107 | 144k | float* bp1 = bp0 + i[1] * bstride[1]; |
108 | 144k | float* np1 = np0 + i[1] * nstride[1]; |
109 | 856k | for (i[2] = 0; i[2] < adim[2]; i[2]++) |
110 | 711k | { |
111 | 46.6M | for (x = 0; x < adim[3]; x++) |
112 | 45.9M | { |
113 | 45.9M | const float mom = np1[x] = momentum * mp1[x] + inv_dampening * (scale * gp1[x] + decay * ap1[x]); |
114 | 45.9M | bp1[x] = ap1[x] - rate * mom; |
115 | 45.9M | } |
116 | 711k | gp1 += gstride[2]; |
117 | 711k | ap1 += astride[2]; |
118 | 711k | mp1 += mstride[2]; |
119 | 711k | bp1 += bstride[2]; |
120 | 711k | np1 += nstride[2]; |
121 | 711k | } |
122 | 144k | } |
123 | 41.7k | } |
124 | 16.7k | } |
125 | 18.3k | return CCV_NNC_EXEC_SUCCESS; |
126 | 18.3k | } |
127 | | |
128 | | static int _ccv_nnc_sgd_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
129 | 0 | { |
130 | 0 | return CCV_NNC_EXEC_INVALID; |
131 | 0 | } |
132 | | |
133 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SGD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
134 | 1 | { |
135 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
136 | 1 | registry->tensor_datatypes = CCV_32F; |
137 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
138 | 1 | registry->algorithms = 1; |
139 | 1 | registry->exec = _ccv_nnc_sgd_forw; |
140 | 1 | } |
141 | | |
142 | | REGISTER_COMMAND_BACKEND(CCV_NNC_SGD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry) |
143 | 1 | { |
144 | 1 | registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN; |
145 | 1 | registry->tensor_datatypes = CCV_32F; |
146 | 1 | registry->tensor_memory = CCV_TENSOR_CPU_MEMORY; |
147 | 1 | registry->algorithms = 1; |
148 | 1 | registry->exec = _ccv_nnc_sgd_back; |
149 | 1 | } |
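
The two branches of _ccv_nnc_sgd_forw differ only in the per-element update rule. As a readability aid, here is a minimal scalar sketch of both rules; these standalone helpers are illustrative only, not part of ccv's API, and simply restate the inner-loop arithmetic above with g the incoming gradient, a the current parameter, m the incoming momentum, *n receiving the updated momentum, and the return value the updated parameter.

/* Illustrative helpers only; not part of ccv's API. */
static float sgd_step_nesterov(const float g, const float a, const float m, float* const n,
	const float rate, const float scale, const float decay, const float momentum)
{
	const float grad = scale * g;
	/* New momentum: velocity plus scaled gradient plus weight decay.
	 * The kernel asserts dampening == 0 in this branch. */
	const float mom = *n = momentum * m + grad + decay * a;
	/* Nesterov look-ahead: step along the gradient plus the new velocity. */
	return a - rate * (grad + momentum * mom);
}

static float sgd_step_classical(const float g, const float a, const float m, float* const n,
	const float rate, const float scale, const float decay, const float momentum,
	const float dampening)
{
	/* Dampened momentum update, then a plain momentum step. */
	const float mom = *n = momentum * m + (1 - dampening) * (scale * g + decay * a);
	return a - rate * mom;
}

Because each iteration reads gp1[x], ap1[x] and mp1[x] before writing np1[x] and bp1[x] at the same index, the kernel appears safe for in-place use (b aliasing a, n aliasing m), which the usage sketch below relies on.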
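
For context, a hedged end-to-end sketch of driving this backend through the public API. The CMD_SGD_FORWARD parameter order (nesterov, rate, scale, decay, momentum, dampening) is an assumption read off cmd.info.sgd above, and the tensor contents are assumed to be filled in elsewhere; treat this as an illustration, not a verified invocation.

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
	ccv_nnc_init();
	/* Inputs: gradient g, parameters a, momentum m (all float 32,
	 * as the backend registration above requires). */
	ccv_nnc_tensor_t* const g = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128), 0);
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128), 0);
	ccv_nnc_tensor_t* const m = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128), 0);
	/* ... fill g, a, m here ... */
	/* Classical momentum (nesterov = 0), updating a and m in place. */
	ccv_nnc_cmd_exec(CMD_SGD_FORWARD(0, 0.001, 1, 0.0001, 0.9, 0.9), ccv_nnc_no_hint, 0,
		TENSOR_LIST(g, a, m), TENSOR_LIST(a, m), 0);
	ccv_nnc_tensor_free(g);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(m);
	return 0;
}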