// /home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_xpu_alloc.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_xpu_alloc.h"
#if defined(HAVE_CUDA) || defined(HAVE_MPS)
#ifdef HAVE_CUDA
#include "gpu/ccv_nnc_compat.h"
#else
#include "mps/ccv_nnc_mps.h"
#endif
#include <stdbool.h>

static int dy_alloc_tree_cmp(const dy_alloc_metadata_t* const a_node, const dy_alloc_metadata_t* const b_node)
{
	return (a_node->size > b_node->size) - (b_node->size > a_node->size);
}

rb_gen(, dy_alloc_tree_, dy_alloc_tree_t, dy_alloc_metadata_t, link, dy_alloc_tree_cmp)
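
// rb_gen() (the jemalloc-style red-black tree generator) expands to the
// dy_alloc_tree_insert/remove/search/nsearch/destroy functions used below,
// ordered on ->size by dy_alloc_tree_cmp. Because the key is the size alone,
// dy_alloc_tree_nsearch() acts as a best-fit lookup. An illustrative sketch
// (not code from this file):
//   dy_alloc_metadata_t key = { .size = 1000 };
//   dy_alloc_metadata_t* const fit = dy_alloc_tree_nsearch(tree, &key);
//   // fit is the smallest cached block with fit->size >= 1000, or 0 if none.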

static void _ccv_nnc_xpu_metadata_free(dy_alloc_metadata_t* node, void* arg)
{
	// Free a whole chain of blocks: a tree node can head a linked list
	// (via ->next) of blocks that share the same size.
	do {
		dy_alloc_metadata_t* const next = node->next;
#ifdef HAVE_CUDA
		cufree(node->device, node->ptr);
#elif defined(HAVE_MPS)
		mpobjfree(node->device, node->ptr);
#endif
		ccfree(node);
		node = next;
	} while (node);
}

static void _ccv_nnc_xpu_alloc_drain(const int device, khash_t(dy_dev)* const dev, const ccv_nnc_stream_context_t* const stream)
{
	// Wait until all work queued on the stream has completed, then free.
	if (stream)
		ccv_nnc_stream_context_wait(stream);
	khiter_t k;
	if (device >= 0)
	{
		k = kh_get(dy_dev, dev, device);
		if (k != kh_end(dev))
		{
			dy_alloc_tree_t* const tree = &kh_val(dev, k);
			dy_alloc_tree_destroy(tree, _ccv_nnc_xpu_metadata_free, 0);
			kh_del(dy_dev, dev, k);
		}
		return;
	}
	for (k = kh_begin(dev); k != kh_end(dev); ++k)
	{
		if (!kh_exist(dev, k))
			continue;
		dy_alloc_tree_t* const tree = &kh_val(dev, k);
		dy_alloc_tree_destroy(tree, _ccv_nnc_xpu_metadata_free, 0);
		kh_del(dy_dev, dev, k);
	}
}
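
// Illustrative call patterns for the helper above (a sketch, not code from
// this file):
//   _ccv_nnc_xpu_alloc_drain(2, dev, stream);  // hand device 2's cached blocks back to the driver
//   _ccv_nnc_xpu_alloc_drain(-1, dev, stream); // drain every device's cache for this stream
// The stream wait up front matters: a cached block may still be referenced by
// kernels in flight on the stream, so it can only be freed once they finish.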

static void _ccv_nnc_xpu_stream_destructor_hook(const ccv_nnc_stream_context_t* const stream, void* const context)
{
	// Called when a stream is destroyed: drain and remove this stream's
	// per-device free cache so no cached block outlives its stream.
	ccv_nnc_xpu_alloc_t* const xpu_alloc = (ccv_nnc_xpu_alloc_t*)context;
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	const int64_t str = (int64_t)(intptr_t)stream;
	khiter_t i = kh_get(dy_str, freed, str);
	assert(i != kh_end(freed));
	khash_t(dy_dev)* const dev = kh_val(freed, i).dev;
	_ccv_nnc_xpu_alloc_drain(-1, dev, stream);
	kh_destroy(dy_dev, dev);
	kh_del(dy_str, freed, i);
}
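
// This hook is what keeps ccv_nnc_xpu_free() correct after a stream dies:
// with the stream's dy_str entry gone, a later free of a block tagged with
// that stream no longer finds its cache and falls through to the synchronous
// free path in ccv_nnc_xpu_free() below.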

void* ccv_nnc_xpu_alloc(ccv_nnc_xpu_alloc_t* const xpu_alloc, const int device, ccv_nnc_stream_context_t* const stream, const size_t size)
{
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	const int64_t str = (int64_t)(intptr_t)stream;
	int ret;
	khiter_t i = kh_put(dy_str, freed, str, &ret);
	assert(ret >= 0);
	dy_alloc_metadata_t* node = 0;
	if (ret == 0)
	{
		// This stream already has freed allocations cached; try to find a
		// suitable one (best-fit by size).
		khash_t(dy_dev)* const dev = kh_val(freed, i).dev;
		assert(dev);
		khiter_t j = kh_get(dy_dev, dev, device);
		if (j != kh_end(dev))
		{
			dy_alloc_tree_t* const tree = &kh_val(dev, j);
			dy_alloc_metadata_t key = {
				.size = size
			};
			node = dy_alloc_tree_nsearch(tree, &key);
			if (node)
			{
				if (node->next) // Same-size blocks are chained; take one off the list.
				{
					dy_alloc_metadata_t* next_node = node->next;
					node->next = node->next->next;
					node = next_node;
				} else
					dy_alloc_tree_remove(tree, node);
			}
		}
	} else {
		// Otherwise, create the per-stream entry, with a destructor hook so
		// the cache is drained when the stream goes away.
		kh_val(freed, i).dev = kh_init(dy_dev);
		kh_val(freed, i).hook_id = stream ? ccv_nnc_stream_context_add_destructor_hook(stream, _ccv_nnc_xpu_stream_destructor_hook, xpu_alloc) : -1;
	}
	if (!node)
	{
		node = (dy_alloc_metadata_t*)ccmalloc(sizeof(dy_alloc_metadata_t));
#ifdef HAVE_CUDA
		if (xpu_alloc->mp_hdr < 0)
			xpu_alloc->mp_hdr = curegmp(device, (cump_f)ccv_nnc_xpu_gc, xpu_alloc);
		node->ptr = cumalloc(device, size);
#elif defined(HAVE_MPS)
		if (xpu_alloc->mp_hdr < 0)
			xpu_alloc->mp_hdr = mpregmp(device, (mpmp_f)ccv_nnc_xpu_gc, xpu_alloc);
		node->ptr = mpobjmalloc(device, size);
#endif
		if (!node->ptr) // Still cannot allocate (the registered ccv_nnc_xpu_gc callback handles draining under memory pressure); give up.
		{
			ccfree(node);
			return 0;
		}
		node->device = device;
		node->size = size;
		node->str = str;
	} else {
		assert(node->size >= size);
		assert(node->device == device);
		assert(node->str == str);
	}
	node->next = 0;
	khash_t(dy_alloc)* const allocd = xpu_alloc->allocd;
	i = kh_put(dy_alloc, allocd, (int64_t)(intptr_t)node->ptr, &ret);
	assert(ret > 0);
	kh_val(allocd, i) = node;
	return node->ptr;
}
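
// A minimal usage sketch (assumptions for illustration: xpu_alloc has its
// allocd/freed hash maps initialized and mp_hdr set to -1 by its owner, e.g.
// the dynamic graph; that setup lives outside this file):
//   void* const a = ccv_nnc_xpu_alloc(xpu_alloc, 0, stream, 1024); // fresh cumalloc on device 0
//   ccv_nnc_xpu_free(xpu_alloc, a); // cached under (stream, device 0), not freed
//   void* const b = ccv_nnc_xpu_alloc(xpu_alloc, 0, stream, 1000); // best-fit: in this isolated sequence, b == a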

void ccv_nnc_xpu_free(ccv_nnc_xpu_alloc_t* const xpu_alloc, void* const ptr)
{
	khash_t(dy_alloc)* const allocd = xpu_alloc->allocd;
	khiter_t i = kh_get(dy_alloc, allocd, (int64_t)(intptr_t)ptr);
	assert(i != kh_end(allocd));
	dy_alloc_metadata_t* const node = kh_val(allocd, i);
	kh_del(dy_alloc, allocd, i);
	assert(node->ptr == ptr);
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	i = kh_get(dy_str, freed, node->str);
	// If we cannot find the associated stream, that stream has already been
	// destroyed (and its cache drained), so this pointer has to be freed
	// synchronously.
	if (i == kh_end(freed))
	{
#ifdef HAVE_CUDA
		cufree(node->device, node->ptr);
#elif defined(HAVE_MPS)
		mpobjfree(node->device, node->ptr);
#endif
		ccfree(node);
		return;
	}
	khash_t(dy_dev)* const dev = kh_val(freed, i).dev;
	int ret;
	khiter_t j = kh_put(dy_dev, dev, node->device, &ret);
	assert(ret >= 0);
	dy_alloc_tree_t* const tree = &kh_val(dev, j);
	if (ret != 0)
		dy_alloc_tree_new(tree);
	dy_alloc_metadata_t* const canon_node = dy_alloc_tree_search(tree, node);
	if (!canon_node)
		dy_alloc_tree_insert(tree, node);
	else { // A block of this exact size is already cached; chain onto its list.
		node->next = canon_node->next;
		canon_node->next = node;
	}
}
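
// Note that freeing never returns memory to the driver while the stream is
// alive: blocks go back into the per-stream, per-device tree. Equal-size
// blocks share one tree node chained via ->next; e.g. freeing three distinct
// 4096-byte blocks leaves a single size-4096 node heading a list of three,
// which is why allocation and _ccv_nnc_xpu_metadata_free() both walk ->next.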

void ccv_nnc_xpu_alloc_destroy(ccv_nnc_xpu_alloc_t* const xpu_alloc)
{
	khash_t(dy_alloc)* const allocd = xpu_alloc->allocd;
	khiter_t k;
	for (k = kh_begin(allocd); k != kh_end(allocd); ++k)
	{
		if (!kh_exist(allocd, k))
			continue;
		_ccv_nnc_xpu_metadata_free(kh_val(allocd, k), 0);
	}
	kh_destroy(dy_alloc, allocd);
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	for (k = kh_begin(freed); k != kh_end(freed); ++k)
	{
		if (!kh_exist(freed, k))
			continue;
		khash_t(dy_dev)* const dev = kh_val(freed, k).dev;
		ccv_nnc_stream_context_t* const stream = (ccv_nnc_stream_context_t*)(intptr_t)kh_key(freed, k);
		_ccv_nnc_xpu_alloc_drain(-1, dev, stream);
		if (stream)
		{
			const int hook_id = kh_val(freed, k).hook_id;
			ccv_nnc_stream_context_remove_destructor_hook(stream, hook_id);
		}
		kh_destroy(dy_dev, dev);
	}
	kh_destroy(dy_str, freed);
#ifdef HAVE_CUDA
	if (xpu_alloc->mp_hdr >= 0)
		cuunregmp(xpu_alloc->mp_hdr);
#elif defined(HAVE_MPS)
	if (xpu_alloc->mp_hdr >= 0)
		mpunregmp(xpu_alloc->mp_hdr);
#endif
}

void ccv_nnc_xpu_gc(const int device, ccv_nnc_xpu_alloc_t* const xpu_alloc)
{
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	khiter_t k;
	for (k = kh_begin(freed); k != kh_end(freed); ++k)
	{
		if (!kh_exist(freed, k))
			continue;
		khash_t(dy_dev)* const dev = kh_val(freed, k).dev;
		ccv_nnc_stream_context_t* const stream = (ccv_nnc_stream_context_t*)(intptr_t)kh_key(freed, k);
		_ccv_nnc_xpu_alloc_drain(device, dev, stream);
	}
}
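
// ccv_nnc_xpu_gc() is the memory-pressure callback registered with
// curegmp()/mpregmp() in ccv_nnc_xpu_alloc(); presumably the driver shim
// invokes it when an allocation fails, so cached blocks are returned before
// retrying. A sketch of an explicit invocation (hypothetical; normally only
// the registered callback path calls this):
//   ccv_nnc_xpu_gc(-1, xpu_alloc); // drop every cached block on all devices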
#else
void* ccv_nnc_xpu_alloc(ccv_nnc_xpu_alloc_t* const xpu_alloc, const int device, ccv_nnc_stream_context_t* const stream, const size_t size)
{
	return 0;
}

void ccv_nnc_xpu_free(ccv_nnc_xpu_alloc_t* const xpu_alloc, void* const ptr)
{
}

void ccv_nnc_xpu_alloc_destroy(ccv_nnc_xpu_alloc_t* const xpu_alloc)
{
}

void ccv_nnc_xpu_gc(const int device, ccv_nnc_xpu_alloc_t* const dynamic_graph)
{
}
#endif