//===-- GPU memory allocator implementation ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a parallel allocator intended for use on a GPU device.
// The core algorithm is a slab allocator that uses a random walk over a
// bitfield for maximum parallel progress. Slab handling is done by a wait-free
// reference counted guard. The first use of a slab will create it from system
// memory for re-use. The last use will invalidate it and free the memory.
//
//===----------------------------------------------------------------------===//

#include "allocator.h"

#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/new.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/threads/sleep.h"

namespace LIBC_NAMESPACE_DECL {

constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will try starting on a single index before skipping
// past it.
constexpr static uint32_t MAX_TRIES = 512;

static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");

namespace impl {
// Allocates more memory from the system through the RPC interface. All
// allocations from the system MUST be aligned on a 2MiB boundary. The default
// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
// driver provides an alignment field for virtual memory allocations.
static void *rpc_allocate(uint64_t size) {
  void *ptr = nullptr;
  rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
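  // The outgoing packet carries the requested size; the reply packet carries
  // back the pointer produced by the host-side allocator.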
  port.send_and_recv(
      [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; },
      [&](rpc::Buffer *buffer, uint32_t) {
        ptr = reinterpret_cast<void *>(buffer->data[0]);
      });
  port.close();
  return ptr;
}

// Deallocates the associated system memory.
static void rpc_free(void *ptr) {
  rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
  port.send([=](rpc::Buffer *buffer, uint32_t) {
    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
  });
  port.close();
}

// Convert a potentially disjoint bitmask into an increasing integer per-lane
// for use with indexing between gpu lanes.
static inline uint32_t lane_count(uint64_t lane_mask) {
  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
}

// Obtain an initial value to seed a random number generator. We use the rounded
// multiples of the golden ratio from xorshift* as additional spreading.
static inline uint32_t entropy() {
  return (static_cast<uint32_t>(gpu::processor_clock()) ^
          (gpu::get_thread_id_x() * 0x632be59b) ^
          (gpu::get_block_id_x() * 0x85157af5)) *
         0x9e3779bb;
}

// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bb;
}

// Final stage of murmurhash used to get a unique index for the global array
static inline uint32_t hash(uint32_t x) {
  x ^= x >> 16;
  x *= 0x85ebca6b;
  x ^= x >> 13;
  x *= 0xc2b2ae35;
  x ^= x >> 16;
  return x;
}

// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes, which keeps the average slab utilization around 87.5%
// (roughly 12.5% internal fragmentation).
static inline uint32_t get_chunk_size(uint32_t x) {
  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

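  // The candidates s0 through s3 are 4/8, 6/8, 7/8, and 8/8 of the power of
  // two that covers the request (1 << pow2). The smallest candidate strictly
  // greater than the request is selected and then rounded up to the 16-byte
  // minimum alignment.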
  uint32_t s0 = 0b0100 << (pow2 - 3);
  uint32_t s1 = 0b0110 << (pow2 - 3);
  uint32_t s2 = 0b0111 << (pow2 - 3);
  uint32_t s3 = 0b1000 << (pow2 - 3);

  if (s0 > y)
    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s1 > y)
    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s2 > y)
    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}

// Rounds the value up to the next multiple of N, where N must be a power of
// two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two");
  return (x + N) & ~(N - 1);
}

} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
/// Allocation is done through random walks of a bitfield until a free bit is
/// encountered. This reduces contention and is highly parallel on a GPU.
///
/// 0       4           8       16                 ...                     2 MiB
/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
///
/// The size of the bitfield is the slab size divided by the chunk size divided
/// by the number of bits per word. We pad the interface to ensure 16 byte
/// alignment and to indicate that if the pointer is not aligned by 2MiB it
/// belongs to a slab rather than the global allocator.
struct Slab {
  // Header metadata for the slab, aligned to the minimum alignment.
  struct alignas(MIN_SIZE) Header {
    uint32_t chunk_size;
    uint32_t global_index;
  };

  // Initialize the slab with its chunk size and index in the global table for
  // use when freeing.
  Slab(uint32_t chunk_size, uint32_t global_index) {
    Header *header = reinterpret_cast<Header *>(memory);
    header->chunk_size = chunk_size;
    header->global_index = global_index;

    // This memset is expensive and likely not necessary for the current 'kfd'
    // driver. Until zeroed pages are exposed by the API we must be careful.
    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
  }

  // Get the number of chunks that can theoretically fit inside this slab.
  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
    return SLAB_SIZE / chunk_size;
  }

  // Get the number of bytes needed to contain the bitfield bits.
  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
  }

  // The actual amount of memory available excluding the bitfield and metadata.
  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
  }

  // The number of chunks that can be stored in this slab.
  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // The number of bits in the bitfield that correspond to usable chunks.
  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // Get the chunk size that is stored in the slab's header.
  uint32_t get_chunk_size() const {
    return reinterpret_cast<const Header *>(memory)->chunk_size;
  }

  // Get the global index that is stored in the slab's header.
  uint32_t get_global_index() const {
    return reinterpret_cast<const Header *>(memory)->global_index;
  }

  // Get a pointer to where the bitfield is located in the memory.
  uint32_t *get_bitfield() {
    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
  }

  // Get a pointer to where the actual memory to be allocated lives.
  uint8_t *get_memory(uint32_t chunk_size) {
    return reinterpret_cast<uint8_t *>(get_bitfield()) +
           bitfield_bytes(chunk_size);
  }

  // Get a pointer to the actual memory given an index into the bitfield.
  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
    return get_memory(chunk_size) + index * chunk_size;
  }

  // Convert a pointer back into its bitfield index using its offset.
  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
                                 get_memory(chunk_size)) /
           chunk_size;
  }

  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
  // to put lanes right next to each other for better caching and convergence.
  void *allocate(uint64_t lane_mask, uint64_t uniform) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t state = impl::entropy();

    // The uniform mask represents which lanes contain a uniform target pointer.
    // We attempt to place these next to each other.
    void *result = nullptr;
    for (uint64_t mask = lane_mask; mask;
         mask = gpu::ballot(lane_mask, !result)) {
      if (result)
        continue;

      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));

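      // Every lane in the group shares the same random starting bit; each lane
      // then offsets by its rank among the remaining lanes that target this
      // slab so that the group claims adjacent chunks.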
      uint32_t id = impl::lane_count(uniform & mask);
      uint32_t index = (start + id) % usable_bits(chunk_size);
      uint32_t slot = index / BITS_IN_WORD;
      uint32_t bit = index % BITS_IN_WORD;

      // Get the mask of bits destined for the same slot and coalesce it.
      uint64_t match = uniform & gpu::match_any(mask, slot);
      uint32_t length = cpp::popcount(match);
      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
                         << bit;

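      // Only the lowest lane of each matching group issues a single fetch_or
      // covering all of the group's bits; the previous word value is then
      // broadcast so every lane can check whether its own bit was already set.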
      uint32_t before = 0;
      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
        before = cpp::AtomicRef(get_bitfield()[slot])
                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
      if (~before & (1 << bit))
        result = ptr_from_index(index, chunk_size);
      else
        sleep_briefly();
    }

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return result;
  }

  // Deallocates memory by resetting its corresponding bit in the bitfield.
  void deallocate(void *ptr) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t index = index_from_ptr(ptr, chunk_size);
    uint32_t slot = index / BITS_IN_WORD;
    uint32_t bit = index % BITS_IN_WORD;

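    // The release fence orders any writes made to the chunk before its bit is
    // cleared; the matching acquire fence at the end of allocate() makes those
    // writes visible to the chunk's next owner.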
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    cpp::AtomicRef(get_bitfield()[slot])
        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
  }

  // The actual memory the slab will manage. All offsets are calculated at
  // runtime with the chunk size to keep the interface convergent when a warp
  // or wavefront is handling multiple sizes at once.
  uint8_t memory[SLAB_SIZE];
};

/// A wait-free guard around a pointer resource to be created dynamically if
/// space is available and freed once there are no more users.
struct GuardPtr {
private:
  struct RefCounter {
    // Indicates that the object is in its deallocation phase and thus invalid.
    static constexpr uint64_t INVALID = uint64_t(1) << 63;

    // If a read preempts an unlock call we indicate this so the following
    // unlock call can swap out the helped bit and maintain exclusive ownership.
    static constexpr uint64_t HELPED = uint64_t(1) << 62;

    // Resets the reference counter; it cannot safely be reset to zero.
    void reset(uint32_t n, uint64_t &count) {
      counter.store(n, cpp::MemoryOrder::RELAXED);
      count = n;
    }

    // Acquire a slot in the reference counter if it is not invalid.
    bool acquire(uint32_t n, uint64_t &count) {
      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
      return (count & INVALID) == 0;
    }

    // Release a slot in the reference counter. This function should only be
    // called following a valid acquire call.
    bool release(uint32_t n) {
      // If this thread caused the counter to reach zero we try to invalidate
      // it and obtain exclusive rights to deconstruct it. If the CAS fails,
      // either another thread resurrected the counter and we quit, or a
      // parallel read helped us by invalidating it. In the latter case, claim
      // that flag and return.
      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
        uint64_t expected = 0;
        if (counter.compare_exchange_strong(expected, INVALID,
                                            cpp::MemoryOrder::RELAXED,
                                            cpp::MemoryOrder::RELAXED))
          return true;
        else if ((expected & HELPED) &&
                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
                  HELPED))
          return true;
      }
      return false;
    }

    // Returns the current reference count, potentially helping a releasing
    // thread.
    uint64_t read() {
      auto val = counter.load(cpp::MemoryOrder::RELAXED);
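      // A count of zero may mean a concurrent release() sits between its
      // fetch_sub and its CAS. Mark the counter INVALID | HELPED so readers
      // treat it as dead while the releasing thread can still claim exclusive
      // ownership through the HELPED flag.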
      if (val == 0 && counter.compare_exchange_strong(
                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
        return 0;
      return (val & INVALID) ? 0 : val;
    }

    cpp::Atomic<uint64_t> counter{0};
  };

  cpp::Atomic<Slab *> ptr{nullptr};
  RefCounter ref{};

  // Should only be called by a single lane for each distinct pointer.
  template <typename... Args>
  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
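    // Winning the CAS from nullptr to the SENTINEL value grants this lane the
    // exclusive right to allocate and construct the slab; anyone else who
    // observes the SENTINEL backs off until the real pointer is published.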
    if (!expected &&
        ptr.compare_exchange_strong(
            expected, reinterpret_cast<Slab *>(SENTINEL),
            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
      count = cpp::numeric_limits<uint64_t>::max();
      void *raw = impl::rpc_allocate(sizeof(Slab));
      if (!raw)
        return nullptr;
      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);

      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(mem, cpp::MemoryOrder::RELAXED);
      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
      if (!ref.acquire(n, count))
        ref.reset(n, count);
      return mem;
    }

    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
      return nullptr;

    if (!ref.acquire(n, count))
      return nullptr;

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return ptr.load(cpp::MemoryOrder::RELAXED);
  }

public:
  // Attempt to lock access to the pointer, potentially creating it if empty.
  // The uniform mask represents which lanes share the same pointer. For each
  // uniform value we elect a leader to handle it on behalf of the other lanes.
  template <typename... Args>
  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
                 Args &&...args) {
    count = 0;
    Slab *result = nullptr;
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
      result = try_lock_impl(cpp::popcount(uniform), count,
                             cpp::forward<Args>(args)...);
    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);

    if (!result)
      return nullptr;

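    // The leader bumped the reference count by popcount(uniform) in one step.
    // Unless the slab was freshly created (count is the maximum value), give
    // each lane its own position in that range by subtracting the group size
    // and adding the lane's rank within the uniform group plus one.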
    if (count != cpp::numeric_limits<uint64_t>::max())
      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;

    return result;
  }

  // Release the associated lock on the pointer, potentially destroying it.
  void unlock(uint64_t lane_mask, uint64_t mask) {
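    // Publish any outstanding writes to the slab before dropping this group's
    // references.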
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
        ref.release(cpp::popcount(mask))) {
      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
      p->~Slab();
      impl::rpc_free(p);
      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
    }
    gpu::sync_lane(lane_mask);
  }

  // Get the current value of the reference counter.
  uint64_t use_count() { return ref.read(); }
};

// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
  // We start at a hashed value to spread out different chunk sizes.
  uint32_t start = impl::hash(chunk_size);
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);

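  // Lanes that requested the same chunk size search the table cooperatively:
  // each lane probes a different candidate slot, the group ballots on which
  // probes have spare capacity, and the first usable slot is broadcast to
  // every lane in the group.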
  Slab *result = nullptr;
  uint32_t nudge = 0;
  for (uint64_t mask = lane_mask; mask;
       mask = gpu::ballot(lane_mask, !result), ++nudge) {
    uint32_t index = cpp::numeric_limits<uint32_t>::max();
    for (uint32_t offset = nudge / MAX_TRIES;
         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
         offset += cpp::popcount(uniform & lane_mask)) {
      uint32_t candidate =
          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
      uint64_t available =
          gpu::ballot(lane_mask, slots[candidate].use_count() <
                                     Slab::available_chunks(chunk_size));
      uint32_t new_index = gpu::shuffle(
          lane_mask, cpp::countr_zero(available & uniform), candidate);

      // Each uniform group uses the first slot it finds with spare capacity.
      if ((index == cpp::numeric_limits<uint32_t>::max() &&
           (available & uniform)))
        index = new_index;

      // Guarantees that this loop will eventually exit if there is no space.
      if (offset >= ARRAY_SIZE) {
        result = reinterpret_cast<Slab *>(SENTINEL);
        index = 0;
      }
    }

    // Try to claim the lock for the slot we found.
    if (!result) {
      uint64_t reserved = 0;
      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
                                         reserved, chunk_size, index);
      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
      // of out-of-memory we return a sentinel value.
      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
        result = slab;
      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
        if (slab->get_chunk_size() != chunk_size)
          start = index + 1;
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
        result = reinterpret_cast<Slab *>(SENTINEL);
      } else {
        sleep_briefly();
      }
    }
  }
  return result;
}

// Release the lock associated with a given slab.
static void release_slab(Slab *slab) {
  uint32_t index = slab->get_global_index();
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, index);
  slots[index].unlock(lane_mask, uniform);
}

namespace gpu {

void *allocate(uint64_t size) {
  if (!size)
    return nullptr;

  // Allocations of half a slab or more go directly to system memory.
  if (size >= SLAB_SIZE / 2)
    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));

  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
  Slab *slab = find_slab(chunk_size);
  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
    return nullptr;

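  // Lanes that landed on the same slab allocate together so the bitfield walk
  // can coalesce their atomic updates.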
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
  void *ptr = slab->allocate(lane_mask, uniform);
  return ptr;
}

void deallocate(void *ptr) {
  if (!ptr)
    return;

  // All non-slab allocations will be aligned on a 2MiB boundary.
  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
    return impl::rpc_free(ptr);

  // The original slab pointer is found by rounding the given pointer down to
  // its 2MiB boundary.
  Slab *slab = reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
  slab->deallocate(ptr);
  release_slab(slab);
}

} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL
