//===-- GPU memory allocator implementation ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a parallel allocator intended for use on a GPU device.
// The core algorithm is a slab allocator that uses a random walk over a
// bitfield for maximum parallel progress. Slab handling is done by a wait-free
// reference counted guard. The first use of a slab will create it from system
// memory for re-use. The last use will invalidate it and free the memory.
//
//===----------------------------------------------------------------------===//

#include "allocator.h"

#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/new.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/threads/sleep.h"

namespace LIBC_NAMESPACE_DECL {

constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will try starting on a single index before skipping
// past it.
constexpr static uint32_t MAX_TRIES = 512;

static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
namespace impl {
// Allocates more memory from the system through the RPC interface. All
// allocations from the system MUST be aligned on a 2MiB boundary. The default
// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
// driver provides an alignment field for virtual memory allocations.
static void *rpc_allocate(uint64_t size) {
  void *ptr = nullptr;
  rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
  port.send_and_recv(
      [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; },
      [&](rpc::Buffer *buffer, uint32_t) {
        ptr = reinterpret_cast<void *>(buffer->data[0]);
      });
  port.close();
  return ptr;
}

// Deallocates the associated system memory.
static void rpc_free(void *ptr) {
  rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
  port.send([=](rpc::Buffer *buffer, uint32_t) {
    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
  });
  port.close();
}

// Convert a potentially disjoint bitmask into an increasing integer per-lane
// for use with indexing between gpu lanes.
static inline uint32_t lane_count(uint64_t lane_mask) {
  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
}
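
// Example for lane_count(): if lanes {0, 2, 5} are active in 'lane_mask', lane
// 0 maps to 0, lane 2 maps to 1, and lane 5 maps to 2, giving each active lane
// a dense, increasing index regardless of gaps in the mask.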

// Obtain an initial value to seed a random number generator. We use the rounded
// multiples of the golden ratio from xorshift* as additional spreading.
static inline uint32_t entropy() {
  return (static_cast<uint32_t>(gpu::processor_clock()) ^
          (gpu::get_thread_id_x() * 0x632be59b) ^
          (gpu::get_block_id_x() * 0x85157af5)) *
         0x9e3779bb;
}

// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bb;
}

// Final stage of murmurhash used to get a unique index for the global array
static inline uint32_t hash(uint32_t x) {
  x ^= x >> 16;
  x *= 0x85ebca6b;
  x ^= x >> 13;
  x *= 0xc2b2ae35;
  x ^= x >> 16;
  return x;
}

// Rounds the input value up to the nearest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes. This gives us roughly 87.5% average utilization, or
// about 12.5% internal fragmentation on average.
static inline uint32_t get_chunk_size(uint32_t x) {
  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

  uint32_t s0 = 0b0100 << (pow2 - 3);
  uint32_t s1 = 0b0110 << (pow2 - 3);
  uint32_t s2 = 0b0111 << (pow2 - 3);
  uint32_t s3 = 0b1000 << (pow2 - 3);

  if (s0 > y)
    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s1 > y)
    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s2 > y)
    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}
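
// Example for get_chunk_size(): a request of 100 bytes gives pow2 = 7, so the
// candidate sizes are s0 = 64, s1 = 96, s2 = 112, and s3 = 128. The first
// candidate strictly greater than the request is s2, so the chunk size is 112
// bytes, which is already a multiple of the 16 byte minimum alignment.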

// Rounds x up to the next multiple of N, where N must be a power of two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two");
  return (x + N) & ~(N - 1);
}
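
// Example for round_up(): round_up<SLAB_SIZE>(3 MiB) yields 4 MiB. Because the
// implementation adds N rather than N - 1, an exact multiple is also bumped,
// e.g. round_up<SLAB_SIZE>(2 MiB) yields 4 MiB.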

} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
/// Allocation is done through random walks of a bitfield until a free bit is
/// encountered. This reduces contention and is highly parallel on a GPU.
///
/// 0        4          8        16                 ...                  2 MiB
/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
///
/// The size of the bitfield is the slab size divided by the chunk size divided
/// by the number of bits per word. We pad the interface to ensure 16 byte
/// alignment and to indicate that if the pointer is not aligned by 2MiB it
/// belongs to a slab rather than the global allocator.
struct Slab {
  // Header metadata for the slab, aligned to the minimum alignment.
  struct alignas(MIN_SIZE) Header {
    uint32_t chunk_size;
    uint32_t global_index;
  };

  // Initialize the slab with its chunk size and index in the global table for
  // use when freeing.
  Slab(uint32_t chunk_size, uint32_t global_index) {
    Header *header = reinterpret_cast<Header *>(memory);
    header->chunk_size = chunk_size;
    header->global_index = global_index;

    // This memset is expensive and likely not necessary for the current 'kfd'
    // driver. Until zeroed pages are exposed by the API we must be careful.
    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
  }

  // Get the number of chunks that can theoretically fit inside this slab.
  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
    return SLAB_SIZE / chunk_size;
  }

  // Get the number of bytes needed to contain the bitfield bits.
  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
  }

  // The actual amount of memory available excluding the bitfield and metadata.
  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
  }

  // The number of chunks that can be stored in this slab.
  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // The number of usable bits in the bitfield, i.e. one bit per chunk that can
  // actually be handed out.
  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }
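
  // Example layout for a 16 byte chunk size: num_chunks() is 131072, so the
  // bitfield occupies 32768 bytes (4096 words with 8 bytes reserved per word),
  // the header occupies 16 bytes, and available_bytes() is 2064368, which
  // leaves room for 129023 usable chunks.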

  // Get the chunk size that is stored in the slab's header.
  uint32_t get_chunk_size() const {
    return reinterpret_cast<const Header *>(memory)->chunk_size;
  }

  // Get the global index that is stored in the slab's header.
  uint32_t get_global_index() const {
    return reinterpret_cast<const Header *>(memory)->global_index;
  }

  // Get a pointer to where the bitfield is located in the memory.
  uint32_t *get_bitfield() {
    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
  }

  // Get a pointer to where the actual memory to be allocated lives.
  uint8_t *get_memory(uint32_t chunk_size) {
    return reinterpret_cast<uint8_t *>(get_bitfield()) +
           bitfield_bytes(chunk_size);
  }

  // Get a pointer to the actual memory given an index into the bitfield.
  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
    return get_memory(chunk_size) + index * chunk_size;
  }

  // Convert a pointer back into its bitfield index using its offset.
  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
                                 get_memory(chunk_size)) /
           chunk_size;
  }

  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
  // to put lanes right next to each other for better caching and convergence.
  void *allocate(uint64_t lane_mask, uint64_t uniform) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t state = impl::entropy();

    // The uniform mask represents which lanes contain a uniform target pointer.
    // We attempt to place these next to each other.
    void *result = nullptr;
    for (uint64_t mask = lane_mask; mask;
         mask = gpu::ballot(lane_mask, !result)) {
      if (result)
        continue;

      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));

      uint32_t id = impl::lane_count(uniform & mask);
      uint32_t index = (start + id) % usable_bits(chunk_size);
      uint32_t slot = index / BITS_IN_WORD;
      uint32_t bit = index % BITS_IN_WORD;

      // Get the mask of bits destined for the same slot and coalesce it.
      uint64_t match = uniform & gpu::match_any(mask, slot);
      uint32_t length = cpp::popcount(match);
      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
                         << bit;

      uint32_t before = 0;
      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
        before = cpp::AtomicRef(get_bitfield()[slot])
                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
      if (~before & (1 << bit))
        result = ptr_from_index(index, chunk_size);
      else
        sleep_briefly();
    }

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return result;
  }
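
  // To illustrate the loop above: if four lanes of the same uniform group land
  // in the same bitfield word with the lowest lane starting at bit 12, that
  // lane performs a single fetch_or of 0b1111 << 12 on behalf of the group.
  // Each lane then checks its own bit in the returned value, and lanes whose
  // bit was already taken retry from a new random starting index.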

  // Deallocates memory by resetting its corresponding bit in the bitfield.
  void deallocate(void *ptr) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t index = index_from_ptr(ptr, chunk_size);
    uint32_t slot = index / BITS_IN_WORD;
    uint32_t bit = index % BITS_IN_WORD;

    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    cpp::AtomicRef(get_bitfield()[slot])
        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
  }

  // The actual memory the slab will manage. All offsets are calculated at
  // runtime with the chunk size to keep the interface convergent when a warp or
  // wavefront is handling multiple sizes at once.
  uint8_t memory[SLAB_SIZE];
};

/// A wait-free guard around a pointer resource to be created dynamically if
/// space is available and freed once there are no more users.
struct GuardPtr {
private:
  struct RefCounter {
    // Indicates that the object is in its deallocation phase and thus invalid.
    static constexpr uint64_t INVALID = uint64_t(1) << 63;

    // If a read preempts an unlock call we indicate this so the following
    // unlock call can swap out the helped bit and maintain exclusive ownership.
    static constexpr uint64_t HELPED = uint64_t(1) << 62;

    // Resets the reference counter; it cannot safely be reset to zero.
    void reset(uint32_t n, uint64_t &count) {
      counter.store(n, cpp::MemoryOrder::RELAXED);
      count = n;
    }

    // Acquire a slot in the reference counter if it is not invalid.
    bool acquire(uint32_t n, uint64_t &count) {
      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
      return (count & INVALID) == 0;
    }

    // Release a slot in the reference counter. This function should only be
    // called following a valid acquire call.
    bool release(uint32_t n) {
      // If this thread caused the counter to reach zero we try to invalidate it
      // and obtain exclusive rights to deconstruct it. If the CAS fails, either
      // another thread resurrected the counter and we quit, or a parallel read
      // helped us by invalidating it. For the latter, claim that flag and
      // return.
      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
        uint64_t expected = 0;
        if (counter.compare_exchange_strong(expected, INVALID,
                                            cpp::MemoryOrder::RELAXED,
                                            cpp::MemoryOrder::RELAXED))
          return true;
        else if ((expected & HELPED) &&
                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
                  HELPED))
          return true;
      }
      return false;
    }

    // Returns the current reference count, potentially helping a releasing
    // thread.
    uint64_t read() {
      auto val = counter.load(cpp::MemoryOrder::RELAXED);
      if (val == 0 && counter.compare_exchange_strong(
                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
        return 0;
      return (val & INVALID) ? 0 : val;
    }
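
    // For example, if the last holder calls release() while a concurrent
    // read() observes the counter at zero, the reader may install
    // INVALID | HELPED first. The releasing thread's CAS from zero then fails,
    // but it sees the HELPED bit, exchanges in INVALID, and still gains
    // exclusive ownership, so exactly one thread tears the slab down.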

    cpp::Atomic<uint64_t> counter{0};
  };

  cpp::Atomic<Slab *> ptr{nullptr};
  RefCounter ref{};

  // Should be called by a single lane for each different pointer.
  template <typename... Args>
  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
    if (!expected &&
        ptr.compare_exchange_strong(
            expected, reinterpret_cast<Slab *>(SENTINEL),
            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
      count = cpp::numeric_limits<uint64_t>::max();
      void *raw = impl::rpc_allocate(sizeof(Slab));
      if (!raw)
        return nullptr;
      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);

      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(mem, cpp::MemoryOrder::RELAXED);
      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
      if (!ref.acquire(n, count))
        ref.reset(n, count);
      return mem;
    }

    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
      return nullptr;

    if (!ref.acquire(n, count))
      return nullptr;

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return ptr.load(cpp::MemoryOrder::RELAXED);
  }

public:
  // Attempt to lock access to the pointer, potentially creating it if empty.
  // The uniform mask represents which lanes share the same pointer. For each
  // uniform value we elect a leader to handle it on behalf of the other lanes.
  template <typename... Args>
  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
                 Args &&...args) {
    count = 0;
    Slab *result = nullptr;
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
      result = try_lock_impl(cpp::popcount(uniform), count,
                             cpp::forward<Args>(args)...);
    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);

    if (!result)
      return nullptr;

    if (count != cpp::numeric_limits<uint64_t>::max())
      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;

    return result;
  }
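
  // For example, if the counter was 10 before the call and three lanes share
  // the same slab, the leader adds 3 and broadcasts count = 13. The per-lane
  // adjustment above then yields 11, 12, and 13, so every lane receives a
  // distinct reservation number that the caller can compare against
  // available_chunks().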

  // Release the associated lock on the pointer, potentially destroying it.
  void unlock(uint64_t lane_mask, uint64_t mask) {
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
        ref.release(cpp::popcount(mask))) {
      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
      p->~Slab();
      impl::rpc_free(p);
      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
    }
    gpu::sync_lane(lane_mask);
  }

  // Get the current value of the reference counter.
  uint64_t use_count() { return ref.read(); }
};

// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
  // We start at a hashed value to spread out different chunk sizes.
  uint32_t start = impl::hash(chunk_size);
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);

  Slab *result = nullptr;
  uint32_t nudge = 0;
  for (uint64_t mask = lane_mask; mask;
       mask = gpu::ballot(lane_mask, !result), ++nudge) {
    uint32_t index = cpp::numeric_limits<uint32_t>::max();
    for (uint32_t offset = nudge / MAX_TRIES;
         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
         offset += cpp::popcount(uniform & lane_mask)) {
      uint32_t candidate =
          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
      uint64_t available =
          gpu::ballot(lane_mask, slots[candidate].use_count() <
                                     Slab::available_chunks(chunk_size));
      uint32_t new_index = gpu::shuffle(
          lane_mask, cpp::countr_zero(available & uniform), candidate);

      // Each uniform group will use the first empty slot they find.
      if ((index == cpp::numeric_limits<uint32_t>::max() &&
           (available & uniform)))
        index = new_index;

      // Guarantees that this loop will eventually exit if there is no space.
      if (offset >= ARRAY_SIZE) {
        result = reinterpret_cast<Slab *>(SENTINEL);
        index = 0;
      }
    }

    // Try to claim a lock on the slot we found.
    if (!result) {
      uint64_t reserved = 0;
      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
                                         reserved, chunk_size, index);
      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
      // of out-of-memory we return a sentinel value.
      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
        result = slab;
      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
        if (slab->get_chunk_size() != chunk_size)
          start = index + 1;
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
        result = reinterpret_cast<Slab *>(SENTINEL);
      } else {
        sleep_briefly();
      }
    }
  }
  return result;
}
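
// To illustrate find_slab(): 32 byte and 48 byte requests typically hash to
// different starting indices, so they normally claim different slots and do
// not contend for the same slab. If a claimed slot turns out to hold a slab
// with another chunk size, the group releases the lock and resumes probing
// from the next index.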

// Release the lock associated with a given slab.
static void release_slab(Slab *slab) {
  uint32_t index = slab->get_global_index();
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, index);
  slots[index].unlock(lane_mask, uniform);
}

namespace gpu {

void *allocate(uint64_t size) {
  if (!size)
    return nullptr;

  // Allocations requiring a full slab or more go directly to memory.
  if (size >= SLAB_SIZE / 2)
    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));

  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
  Slab *slab = find_slab(chunk_size);
  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
    return nullptr;

  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
  void *ptr = slab->allocate(lane_mask, uniform);
  return ptr;
}
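
// For example, a request for 100 bytes is rounded up to a 112 byte chunk and
// served out of a slab found (or created) by find_slab(). The returned pointer
// is never 2 MiB aligned, which is how deallocate() below distinguishes it
// from a direct rpc_allocate() allocation.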

void deallocate(void *ptr) {
  if (!ptr)
    return;

  // All non-slab allocations will be aligned on a 2MiB boundary.
  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
    return impl::rpc_free(ptr);

  // The original slab pointer is the 2MiB boundary directly below the given
  // pointer.
  Slab *slab = reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
  slab->deallocate(ptr);
  release_slab(slab);
}

} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL