Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_xpu_alloc.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_nnc_easy.h"
5
#include "ccv_internal.h"
6
#include "_ccv_nnc_xpu_alloc.h"
7
#if defined(HAVE_CUDA) || defined(HAVE_MPS)
8
#ifdef HAVE_CUDA
9
#include "gpu/ccv_nnc_compat.h"
10
#else
11
#include "mps/ccv_nnc_mps.h"
12
#endif
13
#include <stdbool.h>
14
15
static int dy_alloc_tree_cmp(const dy_alloc_metadata_t* const a_node, const dy_alloc_metadata_t* const b_node)
16
1.50k
{
17
1.50k
  return (a_node->size > b_node->size) - (b_node->size > a_node->size);
18
1.50k
}
19
20
rb_gen(, dy_alloc_tree_, dy_alloc_tree_t, dy_alloc_metadata_t, link, dy_alloc_tree_cmp)
21
22
static void _ccv_nnc_xpu_metadata_free(dy_alloc_metadata_t* node, void* arg)
{
	// Free an entire chain of allocation records. Records of identical size
	// hang off a tree node as a singly-linked list via ->next, so release the
	// device buffer and the host-side metadata for every link.
	// Precondition: node is non-NULL (matches the rb-tree destroy callback).
	do {
		dy_alloc_metadata_t* const successor = node->next;
#ifdef HAVE_CUDA
		cufree(node->device, node->ptr); // hand the buffer back to CUDA
#elif defined(HAVE_MPS)
		mpobjfree(node->device, node->ptr); // hand the buffer back to Metal
#endif
		ccfree(node); // release the metadata record itself
		node = successor;
	} while (node);
}
35
36
static void _ccv_nnc_xpu_alloc_drain(const int device, khash_t(dy_dev)* const dev, const ccv_nnc_stream_context_t* const stream)
{
	// Release every cached free block held in this per-device map.
	// Block first until outstanding work on the stream completes, since the
	// cached buffers may still be referenced by in-flight kernels.
	if (stream)
		ccv_nnc_stream_context_wait(stream);
	if (device >= 0)
	{
		// Targeted drain: only the size-tree belonging to this device.
		const khiter_t it = kh_get(dy_dev, dev, device);
		if (it != kh_end(dev))
		{
			dy_alloc_tree_t* const tree = &kh_val(dev, it);
			dy_alloc_tree_destroy(tree, _ccv_nnc_xpu_metadata_free, 0);
			kh_del(dy_dev, dev, it);
		}
		return;
	}
	// device < 0 means drain all devices in the map.
	khiter_t it;
	for (it = kh_begin(dev); it != kh_end(dev); ++it)
	{
		if (!kh_exist(dev, it))
			continue;
		dy_alloc_tree_t* const tree = &kh_val(dev, it);
		dy_alloc_tree_destroy(tree, _ccv_nnc_xpu_metadata_free, 0);
		kh_del(dy_dev, dev, it);
	}
}
62
63
static void _ccv_nnc_xpu_stream_destructor_hook(const ccv_nnc_stream_context_t* const stream, void* const context)
64
2
{
65
2
  ccv_nnc_xpu_alloc_t* const xpu_alloc = (ccv_nnc_xpu_alloc_t*)context;
66
2
  khash_t(dy_str)* const freed = xpu_alloc->freed;
67
2
  const int64_t str = (int64_t)(intptr_t)stream;
68
2
  khiter_t i = kh_get(dy_str, freed, str);
69
2
  assert(i != kh_end(freed));
70
2
  khash_t(dy_dev)* const dev = kh_val(freed, i).dev;
71
2
  _ccv_nnc_xpu_alloc_drain(-1, dev, stream);
72
2
  kh_destroy(dy_dev, dev);
73
2
  kh_del(dy_str, freed, i);
74
2
}
75
76
void* ccv_nnc_xpu_alloc(ccv_nnc_xpu_alloc_t* const xpu_alloc, const int device, ccv_nnc_stream_context_t* const stream, const size_t size)
{
	// Allocate `size` bytes on `device`, preferring to recycle a block that
	// was previously freed on the same (stream, device) pair. Returns the
	// device pointer, or 0 when the backend cannot satisfy the request.
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	const int64_t str = (int64_t)(intptr_t)stream;
	int ret;
	khiter_t it = kh_put(dy_str, freed, str, &ret);
	assert(ret >= 0);
	dy_alloc_metadata_t* node = 0;
	if (ret != 0)
	{
		// First allocation on this stream: create its per-device pool and,
		// for a real stream, install a destructor hook so the pool is drained
		// when the stream is destroyed.
		kh_val(freed, it).dev = kh_init(dy_dev);
		if (stream)
			kh_val(freed, it).hook_id = ccv_nnc_stream_context_add_destructor_hook(stream, _ccv_nnc_xpu_stream_destructor_hook, xpu_alloc);
		else
			kh_val(freed, it).hook_id = -1;
	} else {
		// The stream already has cached free blocks; look for one on the same
		// device that is large enough.
		khash_t(dy_dev)* const dev = kh_val(freed, it).dev;
		assert(dev);
		const khiter_t dev_it = kh_get(dy_dev, dev, device);
		if (dev_it != kh_end(dev))
		{
			dy_alloc_tree_t* const tree = &kh_val(dev, dev_it);
			dy_alloc_metadata_t key = {
				.size = size
			};
			// nsearch: smallest cached block with block->size >= size.
			node = dy_alloc_tree_nsearch(tree, &key);
			if (node)
			{
				if (node->next)
				{
					// Same-size blocks chain off the tree node; pop one from
					// the chain and leave the tree node in place.
					dy_alloc_metadata_t* const chained = node->next;
					node->next = chained->next;
					node = chained;
				} else
					dy_alloc_tree_remove(tree, node);
			}
		}
	}
	if (!node)
	{
		// Nothing suitable to recycle: allocate fresh device memory. Register
		// the memory-pressure handler once per allocator so the runtime can
		// call ccv_nnc_xpu_gc (presumably under memory pressure — see
		// curegmp / mpregmp) to shed cached blocks.
		node = (dy_alloc_metadata_t*)ccmalloc(sizeof(dy_alloc_metadata_t));
#ifdef HAVE_CUDA
		if (xpu_alloc->mp_hdr < 0)
			xpu_alloc->mp_hdr = curegmp(device, (cump_f)ccv_nnc_xpu_gc, xpu_alloc);
		node->ptr = cumalloc(device, size);
#elif defined(HAVE_MPS)
		if (xpu_alloc->mp_hdr < 0)
			xpu_alloc->mp_hdr = mpregmp(device, (mpmp_f)ccv_nnc_xpu_gc, xpu_alloc);
		node->ptr = mpobjmalloc(device, size);
#endif
		if (!node->ptr)
		{
			// Backend allocation failed: report out-of-memory with 0.
			ccfree(node);
			return 0;
		}
		node->device = device;
		node->size = size;
		node->str = str;
	} else {
		// A recycled block must satisfy the request and belong to the same
		// device and stream it was freed on.
		assert(node->size >= size);
		assert(node->device == device);
		assert(node->str == str);
	}
	node->next = 0;
	// Track the live allocation by pointer so ccv_nnc_xpu_free can recover
	// its metadata later.
	khash_t(dy_alloc)* const allocd = xpu_alloc->allocd;
	it = kh_put(dy_alloc, allocd, (int64_t)(intptr_t)node->ptr, &ret);
	assert(ret > 0); // the pointer must not already be live
	kh_val(allocd, it) = node;
	return node->ptr;
}
147
148
void ccv_nnc_xpu_free(ccv_nnc_xpu_alloc_t* const xpu_alloc, void* const ptr)
{
	// Return an allocation obtained from ccv_nnc_xpu_alloc. The block is not
	// released to the device immediately; it is cached in the free pool of
	// the stream it was allocated on, keyed by device and indexed by size.
	khash_t(dy_alloc)* const allocd = xpu_alloc->allocd;
	khiter_t it = kh_get(dy_alloc, allocd, (int64_t)(intptr_t)ptr);
	assert(it != kh_end(allocd)); // ptr must be a live allocation
	dy_alloc_metadata_t* const node = kh_val(allocd, it);
	kh_del(dy_alloc, allocd, it);
	assert(node->ptr == ptr);
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	it = kh_get(dy_str, freed, node->str);
	if (it == kh_end(freed))
	{
		// The associated stream's pool is gone (the stream was destroyed), so
		// there is nowhere to cache this block: free it synchronously.
#ifdef HAVE_CUDA
		cufree(node->device, node->ptr);
#elif defined(HAVE_MPS)
		mpobjfree(node->device, node->ptr);
#endif
		ccfree(node);
		return;
	}
	khash_t(dy_dev)* const dev = kh_val(freed, it).dev;
	int ret;
	const khiter_t dev_it = kh_put(dy_dev, dev, node->device, &ret);
	assert(ret >= 0);
	dy_alloc_tree_t* const tree = &kh_val(dev, dev_it);
	if (ret != 0)
		dy_alloc_tree_new(tree); // first cached block for this device
	dy_alloc_metadata_t* const head = dy_alloc_tree_search(tree, node);
	if (!head)
		dy_alloc_tree_insert(tree, node);
	else {
		// A block of identical size is already in the tree: chain this one
		// onto its linked list instead of inserting a duplicate key.
		node->next = head->next;
		head->next = node;
	}
}
185
186
void ccv_nnc_xpu_alloc_destroy(ccv_nnc_xpu_alloc_t* const xpu_alloc)
{
	// Tear down the allocator: release every still-live allocation, drain
	// every stream's cached free pool, detach destructor hooks, and
	// unregister the memory-pressure handler.
	khash_t(dy_alloc)* const allocd = xpu_alloc->allocd;
	khiter_t it;
	// 1. Free allocations that were never returned via ccv_nnc_xpu_free.
	for (it = kh_begin(allocd); it != kh_end(allocd); ++it)
	{
		if (!kh_exist(allocd, it))
			continue;
		_ccv_nnc_xpu_metadata_free(kh_val(allocd, it), 0);
	}
	kh_destroy(dy_alloc, allocd);
	// 2. Drain the cached free pool of every stream.
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	for (it = kh_begin(freed); it != kh_end(freed); ++it)
	{
		if (!kh_exist(freed, it))
			continue;
		khash_t(dy_dev)* const dev = kh_val(freed, it).dev;
		ccv_nnc_stream_context_t* const stream = (ccv_nnc_stream_context_t*)(intptr_t)kh_key(freed, it);
		_ccv_nnc_xpu_alloc_drain(-1, dev, stream);
		if (stream)
		{
			// Remove the hook so the (outliving) stream won't call back into
			// this now-destroyed allocator.
			const int hook_id = kh_val(freed, it).hook_id;
			ccv_nnc_stream_context_remove_destructor_hook(stream, hook_id);
		}
		kh_destroy(dy_dev, dev);
	}
	kh_destroy(dy_str, freed);
	// 3. Unregister the memory-pressure callback if one was installed.
#ifdef HAVE_CUDA
	if (xpu_alloc->mp_hdr >= 0)
		cuunregmp(xpu_alloc->mp_hdr);
#elif defined(HAVE_MPS)
	if (xpu_alloc->mp_hdr >= 0)
		mpunregmp(xpu_alloc->mp_hdr);
#endif
}
221
222
void ccv_nnc_xpu_gc(const int device, ccv_nnc_xpu_alloc_t* const xpu_alloc)
{
	// Garbage-collect cached free blocks for `device` (or every device when
	// device < 0) across all streams. Registered with curegmp/mpregmp so the
	// runtime can invoke it to relieve memory pressure.
	khash_t(dy_str)* const freed = xpu_alloc->freed;
	khiter_t it;
	for (it = kh_begin(freed); it != kh_end(freed); ++it)
	{
		if (!kh_exist(freed, it))
			continue;
		khash_t(dy_dev)* const dev = kh_val(freed, it).dev;
		ccv_nnc_stream_context_t* const stream = (ccv_nnc_stream_context_t*)(intptr_t)kh_key(freed, it);
		_ccv_nnc_xpu_alloc_drain(device, dev, stream);
	}
}
235
#else
236
void* ccv_nnc_xpu_alloc(ccv_nnc_xpu_alloc_t* const xpu_alloc, const int device, ccv_nnc_stream_context_t* const stream, const size_t size)
{
	// Built without CUDA or MPS: device allocation is unavailable, so every
	// request reports out-of-memory.
	(void)xpu_alloc;
	(void)device;
	(void)stream;
	(void)size;
	return 0;
}
240
241
void ccv_nnc_xpu_free(ccv_nnc_xpu_alloc_t* const xpu_alloc, void* const ptr)
{
	// Built without CUDA or MPS: nothing was ever allocated, so nothing to free.
	(void)xpu_alloc;
	(void)ptr;
}
244
245
void ccv_nnc_xpu_alloc_destroy(ccv_nnc_xpu_alloc_t* const xpu_alloc)
{
	// Built without CUDA or MPS: the allocator holds no state to tear down.
	(void)xpu_alloc;
}
248
249
void ccv_nnc_xpu_gc(const int device, ccv_nnc_xpu_alloc_t* const dynamic_graph)
{
	// Built without CUDA or MPS: no cached device memory exists to collect.
	(void)device;
	(void)dynamic_graph;
}
252
#endif