//===-- GPU memory allocator implementation ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a parallel allocator intended for use on a GPU device.
// The core algorithm is a slab allocator using a random walk over a bitfield
// for maximum parallel progress. Slab handling is done by a wait-free reference
// counted guard. The first use of a slab will create it from system memory for
// re-use. The last use will invalidate it and free the memory.
//
//===----------------------------------------------------------------------===//
16
17#include "allocator.h"
18
19#include "src/__support/CPP/atomic.h"
20#include "src/__support/CPP/bit.h"
21#include "src/__support/CPP/new.h"
22#include "src/__support/GPU/utils.h"
23#include "src/__support/RPC/rpc_client.h"
24#include "src/__support/threads/sleep.h"
25#include "src/string/memory_utils/inline_memcpy.h"
26
27namespace LIBC_NAMESPACE_DECL {
28
// The maximum amount of memory the allocator manages and the size of each
// slab taken from the system; their quotient is the size of the slot table.
constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
// Masks for the offset within a slab and within the minimum allocation unit.
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
// The smallest chunk size handed out; all chunks are multiples of this.
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will try starting on a single index before skipping
// past it.
constexpr static uint32_t MAX_TRIES = 512;

// The modular indexing into the slot table relies on this being a power of two.
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
45
46namespace impl {
47// Allocates more memory from the system through the RPC interface. All
48// allocations from the system MUST be aligned on a 2MiB barrier. The default
49// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
50// driver provides an alignment field for virtual memory allocations.
51static void *rpc_allocate(uint64_t size) {
52 void *ptr = nullptr;
53 rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
54 port.send_and_recv(
55 [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; },
56 [&](rpc::Buffer *buffer, uint32_t) {
57 ptr = reinterpret_cast<void *>(buffer->data[0]);
58 });
59 port.close();
60 return ptr;
61}
62
63// Deallocates the associated system memory.
64static void rpc_free(void *ptr) {
65 rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
66 port.send([=](rpc::Buffer *buffer, uint32_t) {
67 buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
68 });
69 port.close();
70}
71
72// Convert a potentially disjoint bitmask into an increasing integer per-lane
73// for use with indexing between gpu lanes.
74static inline uint32_t lane_count(uint64_t lane_mask) {
75 return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
76}
77
78// Obtain an initial value to seed a random number generator. We use the rounded
79// multiples of the golden ratio from xorshift* as additional spreading.
80static inline uint32_t entropy() {
81 return (static_cast<uint32_t>(gpu::processor_clock()) ^
82 (gpu::get_thread_id_x() * 0x632be59b) ^
83 (gpu::get_block_id_x() * 0x85157af5)) *
84 0x9e3779bb;
85}
86
// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  uint32_t x = state;
  x ^= x << 13;
  x ^= x >> 17;
  x ^= x << 5;
  state = x;
  return x * 0x9e3779bb;
}
94
// Final stage of murmurhash used to get a unique index for the global array
static inline uint32_t hash(uint32_t x) {
  x = (x ^ (x >> 16)) * 0x85ebca6b;
  x = (x ^ (x >> 13)) * 0xc2b2ae35;
  return x ^ (x >> 16);
}
104
105// Rounds the input value to the closest permitted chunk size. Here we accept
106// the sum of the closest three powers of two. For a 2MiB slab size this is 48
107// different chunk sizes. This gives us average internal fragmentation of 87.5%.
108static inline uint32_t get_chunk_size(uint32_t x) {
109 uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
110 uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);
111
112 uint32_t s0 = 0b0100 << (pow2 - 3);
113 uint32_t s1 = 0b0110 << (pow2 - 3);
114 uint32_t s2 = 0b0111 << (pow2 - 3);
115 uint32_t s3 = 0b1000 << (pow2 - 3);
116
117 if (s0 > y)
118 return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
119 if (s1 > y)
120 return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
121 if (s2 > y)
122 return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
123 return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
124}
125
// Rounds `x` up to the next multiple of the power-of-two `N`. Note that a
// value already a multiple of `N` is advanced to the following multiple, so
// the result is always strictly greater than `x`.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two");
  // Widen the mask to T before complementing. Computing `~(N - 1)` in 32 bits
  // and then zero-extending it would clear the upper bits of 64-bit inputs,
  // corrupting results for sizes of 4GiB or more.
  return (x + N) & ~static_cast<T>(N - 1);
}
132
133// Perform a lane parallel memset on a uint32_t pointer.
134void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
135 uint64_t mask = gpu::get_lane_mask();
136 uint32_t workers = cpp::popcount(uniform);
137 for (uint32_t i = impl::lane_count(lane_mask: mask & uniform); i < n; i += workers)
138 s[i] = c;
139}
140
// Indicates that the provided value is a power of two.
static inline constexpr bool is_pow2(uint64_t x) {
  return x != 0 && !(x & (x - 1));
}
145
146} // namespace impl
147
148/// A slab allocator used to hand out identically sized slabs of memory.
149/// Allocation is done through random walks of a bitfield until a free bit is
150/// encountered. This reduces contention and is highly parallel on a GPU.
151///
152/// 0 4 8 16 ... 2 MiB
153/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
154/// │ chunk │ index │ pad │ bitfield[] │ memory[] │
155/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
156///
157/// The size of the bitfield is the slab size divided by the chunk size divided
158/// by the number of bits per word. We pad the interface to ensure 16 byte
159/// alignment and to indicate that if the pointer is not aligned by 2MiB it
160/// belongs to a slab rather than the global allocator.
161struct Slab {
162 // Header metadata for the slab, aligned to the minimum alignment.
163 struct alignas(MIN_SIZE) Header {
164 uint32_t chunk_size;
165 uint32_t global_index;
166 };
167
168 // Initialize the slab with its chunk size and index in the global table for
169 // use when freeing.
170 Slab(uint32_t chunk_size, uint32_t global_index) {
171 Header *header = reinterpret_cast<Header *>(memory);
172 header->chunk_size = chunk_size;
173 header->global_index = global_index;
174 }
175
176 // Set the necessary bitfield bytes to zero in parallel using many lanes. This
177 // must be called before the bitfield can be accessed safely, memory is not
178 // guaranteed to be zero initialized in the current implementation.
179 void initialize(uint64_t uniform) {
180 uint32_t size = (bitfield_bytes(chunk_size: get_chunk_size()) + sizeof(uint32_t) - 1) /
181 sizeof(uint32_t);
182 impl::uniform_memset(s: get_bitfield(), c: 0, n: size, uniform);
183 }
184
185 // Get the number of chunks that can theoretically fit inside this slab.
186 constexpr static uint32_t num_chunks(uint32_t chunk_size) {
187 return SLAB_SIZE / chunk_size;
188 }
189
190 // Get the number of bytes needed to contain the bitfield bits.
191 constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
192 return __builtin_align_up(
193 ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8,
194 MIN_ALIGNMENT + 1);
195 }
196
197 // The actual amount of memory available excluding the bitfield and metadata.
198 constexpr static uint32_t available_bytes(uint32_t chunk_size) {
199 return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
200 }
201
202 // The number of chunks that can be stored in this slab.
203 constexpr static uint32_t available_chunks(uint32_t chunk_size) {
204 return available_bytes(chunk_size) / chunk_size;
205 }
206
207 // The length in bits of the bitfield.
208 constexpr static uint32_t usable_bits(uint32_t chunk_size) {
209 return available_bytes(chunk_size) / chunk_size;
210 }
211
212 // Get the location in the memory where we will store the chunk size.
213 uint32_t get_chunk_size() const {
214 return reinterpret_cast<const Header *>(memory)->chunk_size;
215 }
216
217 // Get the location in the memory where we will store the global index.
218 uint32_t get_global_index() const {
219 return reinterpret_cast<const Header *>(memory)->global_index;
220 }
221
222 // Get a pointer to where the bitfield is located in the memory.
223 uint32_t *get_bitfield() {
224 return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
225 }
226
227 // Get a pointer to where the actual memory to be allocated lives.
228 uint8_t *get_memory(uint32_t chunk_size) {
229 return reinterpret_cast<uint8_t *>(get_bitfield()) +
230 bitfield_bytes(chunk_size);
231 }
232
233 // Get a pointer to the actual memory given an index into the bitfield.
234 void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
235 return get_memory(chunk_size) + index * chunk_size;
236 }
237
238 // Convert a pointer back into its bitfield index using its offset.
239 uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
240 return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
241 get_memory(chunk_size)) /
242 chunk_size;
243 }
244
245 // Randomly walks the bitfield until it finds a free bit. Allocations attempt
246 // to put lanes right next to each other for better caching and convergence.
247 void *allocate(uint64_t lane_mask, uint64_t uniform) {
248 uint32_t chunk_size = get_chunk_size();
249 uint32_t state = impl::entropy();
250
251 // The uniform mask represents which lanes contain a uniform target pointer.
252 // We attempt to place these next to each other.
253 void *result = nullptr;
254 for (uint64_t mask = lane_mask; mask;
255 mask = gpu::ballot(lane_mask, !result)) {
256 if (result)
257 continue;
258
259 uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));
260
261 uint32_t id = impl::lane_count(lane_mask: uniform & mask);
262 uint32_t index = (start + id) % usable_bits(chunk_size);
263 uint32_t slot = index / BITS_IN_WORD;
264 uint32_t bit = index % BITS_IN_WORD;
265
266 // Get the mask of bits destined for the same slot and coalesce it.
267 uint64_t match = uniform & gpu::match_any(mask, slot);
268 uint32_t length = cpp::popcount(match);
269 uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
270 << bit;
271
272 uint32_t before = 0;
273 if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
274 before = cpp::AtomicRef(get_bitfield()[slot])
275 .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
276 before = gpu::shuffle(mask, cpp::countr_zero(match), before);
277 if (~before & (1 << bit))
278 result = ptr_from_index(index, chunk_size);
279 else
280 sleep_briefly();
281 }
282
283 cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
284 return result;
285 }
286
287 // Deallocates memory by resetting its corresponding bit in the bitfield.
288 void deallocate(void *ptr) {
289 uint32_t chunk_size = get_chunk_size();
290 uint32_t index = index_from_ptr(ptr, chunk_size);
291 uint32_t slot = index / BITS_IN_WORD;
292 uint32_t bit = index % BITS_IN_WORD;
293
294 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
295 cpp::AtomicRef(get_bitfield()[slot])
296 .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
297 }
298
299 // The actual memory the slab will manage. All offsets are calculated at
300 // runtime with the chunk size to keep the interface convergent when a warp or
301 // wavefront is handling multiple sizes at once.
302 uint8_t memory[SLAB_SIZE];
303};
304
305/// A wait-free guard around a pointer resource to be created dynamically if
306/// space is available and freed once there are no more users.
307struct GuardPtr {
308private:
309 struct RefCounter {
310 // Indicates that the object is in its deallocation phase and thus invalid.
311 static constexpr uint64_t INVALID = uint64_t(1) << 63;
312
313 // If a read preempts an unlock call we indicate this so the following
314 // unlock call can swap out the helped bit and maintain exclusive ownership.
315 static constexpr uint64_t HELPED = uint64_t(1) << 62;
316
317 // Resets the reference counter, cannot be reset to zero safely.
318 void reset(uint32_t n, uint64_t &count) {
319 counter.store(n, cpp::MemoryOrder::RELAXED);
320 count = n;
321 }
322
323 // Acquire a slot in the reference counter if it is not invalid.
324 bool acquire(uint32_t n, uint64_t &count) {
325 count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
326 return (count & INVALID) == 0;
327 }
328
329 // Release a slot in the reference counter. This function should only be
330 // called following a valid acquire call.
331 bool release(uint32_t n) {
332 // If this thread caused the counter to reach zero we try to invalidate it
333 // and obtain exclusive rights to deconstruct it. If the CAS failed either
334 // another thread resurrected the counter and we quit, or a parallel read
335 // helped us invalidating it. For the latter, claim that flag and return.
336 if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
337 uint64_t expected = 0;
338 if (counter.compare_exchange_strong(expected, INVALID,
339 cpp::MemoryOrder::RELAXED,
340 cpp::MemoryOrder::RELAXED))
341 return true;
342 else if ((expected & HELPED) &&
343 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
344 HELPED))
345 return true;
346 }
347 return false;
348 }
349
350 // Returns the current reference count, potentially helping a releasing
351 // thread.
352 uint64_t read() {
353 auto val = counter.load(cpp::MemoryOrder::RELAXED);
354 if (val == 0 && counter.compare_exchange_strong(
355 val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
356 return 0;
357 return (val & INVALID) ? 0 : val;
358 }
359
360 cpp::Atomic<uint64_t> counter{0};
361 };
362
363 cpp::Atomic<Slab *> ptr{nullptr};
364 RefCounter ref{};
365
366 // Should be called be a single lane for each different pointer.
367 template <typename... Args>
368 Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
369 Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
370 if (!expected &&
371 ptr.compare_exchange_strong(
372 expected, reinterpret_cast<Slab *>(SENTINEL),
373 cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
374 count = cpp::numeric_limits<uint64_t>::max();
375 void *raw = impl::rpc_allocate(size: sizeof(Slab));
376 if (!raw)
377 return nullptr;
378 return new (raw) Slab(cpp::forward<Args>(args)...);
379 }
380
381 if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
382 return nullptr;
383
384 if (!ref.acquire(n, count))
385 return nullptr;
386
387 cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
388 return ptr.load(cpp::MemoryOrder::RELAXED);
389 }
390
391 // Finalize the associated memory and signal that it is ready to use by
392 // resetting the counter.
393 void finalize(Slab *mem, uint32_t n, uint64_t &count) {
394 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
395 ptr.store(mem, cpp::MemoryOrder::RELAXED);
396 cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
397 if (!ref.acquire(n, count))
398 ref.reset(n, count);
399 }
400
401public:
402 // Attempt to lock access to the pointer, potentially creating it if empty.
403 // The uniform mask represents which lanes share the same pointer. For each
404 // uniform value we elect a leader to handle it on behalf of the other lanes.
405 template <typename... Args>
406 Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
407 Args &&...args) {
408 count = 0;
409 Slab *result = nullptr;
410 if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
411 result = try_lock_impl(cpp::popcount(uniform), count,
412 cpp::forward<Args>(args)...);
413 result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
414 count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);
415
416 if (!result)
417 return nullptr;
418
419 // We defer storing the newly allocated slab until now so that we can use
420 // multiple lanes to initialize it and release it for use.
421 if (count == cpp::numeric_limits<uint64_t>::max()) {
422 result->initialize(uniform);
423 if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
424 finalize(result, cpp::popcount(uniform), count);
425 }
426
427 if (count != cpp::numeric_limits<uint64_t>::max())
428 count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
429
430 return result;
431 }
432
433 // Release the associated lock on the pointer, potentially destroying it.
434 void unlock(uint64_t lane_mask, uint64_t mask) {
435 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
436 if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
437 ref.release(cpp::popcount(mask))) {
438 Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
439 p->~Slab();
440 impl::rpc_free(ptr: p);
441 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
442 ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
443 }
444 gpu::sync_lane(lane_mask);
445 }
446
447 // Get the current value of the reference counter.
448 uint64_t use_count() { return ref.read(); }
449};
450
// The global array used to search for a valid slab to allocate from. Each
// entry lazily guards one slab's lifetime and reference count.
static GuardPtr slots[ARRAY_SIZE] = {};
453
454// Tries to find a slab in the table that can support the given chunk size.
455static Slab *find_slab(uint32_t chunk_size) {
456 // We start at a hashed value to spread out different chunk sizes.
457 uint32_t start = impl::hash(x: chunk_size);
458 uint64_t lane_mask = gpu::get_lane_mask();
459 uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
460
461 Slab *result = nullptr;
462 uint32_t nudge = 0;
463 for (uint64_t mask = lane_mask; mask;
464 mask = gpu::ballot(lane_mask, !result), ++nudge) {
465 uint32_t index = cpp::numeric_limits<uint32_t>::max();
466 for (uint32_t offset = nudge / MAX_TRIES;
467 gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
468 offset += cpp::popcount(uniform & lane_mask)) {
469 uint32_t candidate =
470 (start + offset + impl::lane_count(lane_mask: uniform & lane_mask)) % ARRAY_SIZE;
471 uint64_t available =
472 gpu::ballot(lane_mask, slots[candidate].use_count() <
473 Slab::available_chunks(chunk_size));
474 uint32_t new_index = gpu::shuffle(
475 lane_mask, cpp::countr_zero(available & uniform), candidate);
476
477 // Each uniform group will use the first empty slot they find.
478 if ((index == cpp::numeric_limits<uint32_t>::max() &&
479 (available & uniform)))
480 index = new_index;
481
482 // Guaruntees that this loop will eventuall exit if there is no space.
483 if (offset >= ARRAY_SIZE) {
484 result = reinterpret_cast<Slab *>(SENTINEL);
485 index = 0;
486 }
487 }
488
489 // Try to claim a slot for the found slot.
490 if (!result) {
491 uint64_t reserved = 0;
492 Slab *slab = slots[index].try_lock(lane_mask: lane_mask & mask, uniform: uniform & mask,
493 count&: reserved, args&: chunk_size, args&: index);
494 // If we find a slab with a matching chunk size then we store the result.
495 // Otherwise, we need to free the claimed lock and continue. In the case
496 // of out-of-memory we return a sentinel value.
497 if (slab && reserved <= Slab::available_chunks(chunk_size) &&
498 slab->get_chunk_size() == chunk_size) {
499 result = slab;
500 } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
501 slab->get_chunk_size() != chunk_size)) {
502 if (slab->get_chunk_size() != chunk_size)
503 start = index + 1;
504 slots[index].unlock(gpu::get_lane_mask(),
505 gpu::get_lane_mask() & uniform);
506 } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
507 result = reinterpret_cast<Slab *>(SENTINEL);
508 } else {
509 sleep_briefly();
510 }
511 }
512 }
513 return result;
514}
515
516// Release the lock associated with a given slab.
517static void release_slab(Slab *slab) {
518 uint32_t index = slab->get_global_index();
519 uint64_t lane_mask = gpu::get_lane_mask();
520 uint64_t uniform = gpu::match_any(lane_mask, index);
521 slots[index].unlock(lane_mask, mask: uniform);
522}
523
524namespace gpu {
525
526void *allocate(uint64_t size) {
527 if (!size)
528 return nullptr;
529
530 // Allocations requiring a full slab or more go directly to memory.
531 if (size >= SLAB_SIZE / 2)
532 return impl::rpc_allocate(size: impl::round_up<SLAB_SIZE>(x: size));
533
534 // Try to find a slab for the rounded up chunk size and allocate from it.
535 uint32_t chunk_size = impl::get_chunk_size(x: static_cast<uint32_t>(size));
536 Slab *slab = find_slab(chunk_size);
537 if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
538 return nullptr;
539
540 uint64_t lane_mask = gpu::get_lane_mask();
541 uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
542 void *ptr = slab->allocate(lane_mask, uniform);
543 return ptr;
544}
545
546void deallocate(void *ptr) {
547 if (!ptr)
548 return;
549
550 // All non-slab allocations will be aligned on a 2MiB boundary.
551 if (__builtin_is_aligned(ptr, SLAB_ALIGNMENT + 1))
552 return impl::rpc_free(ptr);
553
554 // The original slab pointer is the 2MiB boundary using the given pointer.
555 Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
556 (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
557 slab->deallocate(ptr);
558 release_slab(slab);
559}
560
561void *reallocate(void *ptr, uint64_t size) {
562 if (ptr == nullptr)
563 return gpu::allocate(size);
564
565 // Non-slab allocations are considered foreign pointers so we fail.
566 if (__builtin_is_aligned(ptr, SLAB_ALIGNMENT + 1))
567 return nullptr;
568
569 // The original slab pointer is the 2MiB boundary using the given pointer.
570 Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
571 (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
572 if (slab->get_chunk_size() >= size)
573 return ptr;
574
575 // If we need a new chunk we reallocate and copy it over.
576 void *new_ptr = gpu::allocate(size);
577 inline_memcpy(new_ptr, ptr, slab->get_chunk_size());
578 gpu::deallocate(ptr);
579 return new_ptr;
580}
581
582void *aligned_allocate(uint32_t alignment, uint64_t size) {
583 // All alignment values must be a non-zero power of two.
584 if (!impl::is_pow2(x: alignment))
585 return nullptr;
586
587 // If the requested alignment is less than what we already provide this is
588 // just a normal allocation.
589 if (alignment <= MIN_ALIGNMENT + 1)
590 return gpu::allocate(size);
591
592 // We can't handle alignments greater than 2MiB so we simply fail.
593 if (alignment > SLAB_ALIGNMENT + 1)
594 return nullptr;
595
596 // Trying to handle allocation internally would break the assumption that each
597 // chunk is identical to eachother. Allocate enough memory with worst-case
598 // alignment and then round up. The index logic will round down properly.
599 uint64_t rounded = size + alignment - MIN_ALIGNMENT;
600 void *ptr = gpu::allocate(size: rounded);
601 return __builtin_align_up(ptr, alignment);
602}
603
604} // namespace gpu
605} // namespace LIBC_NAMESPACE_DECL
606

source code of libc/src/__support/GPU/allocator.cpp