| 1 | //===-- GPU memory allocator implementation ---------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements a parallel allocator intended for use on a GPU device. |
| 10 | // The core algorithm is slab allocator using a random walk over a bitfield for |
| 11 | // maximum parallel progress. Slab handling is done by a wait-free reference |
| 12 | // counted guard. The first use of a slab will create it from system memory for |
| 13 | // re-use. The last use will invalidate it and free the memory. |
| 14 | // |
| 15 | //===----------------------------------------------------------------------===// |
| 16 | |
| 17 | #include "allocator.h" |
| 18 | |
| 19 | #include "src/__support/CPP/atomic.h" |
| 20 | #include "src/__support/CPP/bit.h" |
| 21 | #include "src/__support/CPP/new.h" |
| 22 | #include "src/__support/GPU/utils.h" |
| 23 | #include "src/__support/RPC/rpc_client.h" |
| 24 | #include "src/__support/threads/sleep.h" |
| 25 | #include "src/string/memory_utils/inline_memcpy.h" |
| 26 | |
| 27 | namespace LIBC_NAMESPACE_DECL { |
| 28 | |
// Total size of the device heap; the global slot array tracks one guarded
// slab per SLAB_SIZE-sized region of this range.
constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
// Granularity of slab creation and of direct system allocations.
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
// Number of entries in the global slab table.
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
// NOTE: stored as alignment-minus-one, i.e. a low-bit mask, not the alignment.
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
// Width in bits of one word of the slab occupancy bitfield.
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
// Smallest chunk size (and thus guaranteed alignment) a slab hands out.
constexpr static uint32_t MIN_SIZE = 16;
// Mask form of the minimum alignment, see SLAB_ALIGNMENT above.
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will try starting on a single index before skipping
// past it.
constexpr static uint32_t MAX_TRIES = 512;

// The random-walk indexing in find_slab relies on modulo wrap-around over a
// power-of-two table size.
static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two" );
| 45 | |
| 46 | namespace impl { |
| 47 | // Allocates more memory from the system through the RPC interface. All |
| 48 | // allocations from the system MUST be aligned on a 2MiB barrier. The default |
| 49 | // HSA allocator has this behavior for any allocation >= 2MiB and the CUDA |
| 50 | // driver provides an alignment field for virtual memory allocations. |
| 51 | static void *rpc_allocate(uint64_t size) { |
| 52 | void *ptr = nullptr; |
| 53 | rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>(); |
| 54 | port.send_and_recv( |
| 55 | [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; }, |
| 56 | [&](rpc::Buffer *buffer, uint32_t) { |
| 57 | ptr = reinterpret_cast<void *>(buffer->data[0]); |
| 58 | }); |
| 59 | port.close(); |
| 60 | return ptr; |
| 61 | } |
| 62 | |
| 63 | // Deallocates the associated system memory. |
| 64 | static void rpc_free(void *ptr) { |
| 65 | rpc::Client::Port port = rpc::client.open<LIBC_FREE>(); |
| 66 | port.send([=](rpc::Buffer *buffer, uint32_t) { |
| 67 | buffer->data[0] = reinterpret_cast<uintptr_t>(ptr); |
| 68 | }); |
| 69 | port.close(); |
| 70 | } |
| 71 | |
| 72 | // Convert a potentially disjoint bitmask into an increasing integer per-lane |
| 73 | // for use with indexing between gpu lanes. |
| 74 | static inline uint32_t lane_count(uint64_t lane_mask) { |
| 75 | return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1)); |
| 76 | } |
| 77 | |
| 78 | // Obtain an initial value to seed a random number generator. We use the rounded |
| 79 | // multiples of the golden ratio from xorshift* as additional spreading. |
| 80 | static inline uint32_t entropy() { |
| 81 | return (static_cast<uint32_t>(gpu::processor_clock()) ^ |
| 82 | (gpu::get_thread_id_x() * 0x632be59b) ^ |
| 83 | (gpu::get_block_id_x() * 0x85157af5)) * |
| 84 | 0x9e3779bb; |
| 85 | } |
| 86 | |
| 87 | // Generate a random number and update the state using the xorshift32* PRNG. |
// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  // Classic xorshift32 state update followed by a golden-ratio multiply for
  // output scrambling.
  uint32_t x = state;
  x ^= x << 13;
  x ^= x >> 17;
  x ^= x << 5;
  state = x;
  return x * 0x9e3779bb;
}
| 94 | |
| 95 | // Final stage of murmurhash used to get a unique index for the global array |
// Final stage of murmurhash used to get a unique index for the global array
static inline uint32_t hash(uint32_t x) {
  // Alternate xor-shifts with the two murmur3 finalizer multipliers.
  x = (x ^ (x >> 16)) * 0x85ebca6b;
  x = (x ^ (x >> 13)) * 0xc2b2ae35;
  return x ^ (x >> 16);
}
| 104 | |
| 105 | // Rounds the input value to the closest permitted chunk size. Here we accept |
| 106 | // the sum of the closest three powers of two. For a 2MiB slab size this is 48 |
| 107 | // different chunk sizes. This gives us average internal fragmentation of 87.5%. |
| 108 | static inline uint32_t get_chunk_size(uint32_t x) { |
| 109 | uint32_t y = x < MIN_SIZE ? MIN_SIZE : x; |
| 110 | uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1); |
| 111 | |
| 112 | uint32_t s0 = 0b0100 << (pow2 - 3); |
| 113 | uint32_t s1 = 0b0110 << (pow2 - 3); |
| 114 | uint32_t s2 = 0b0111 << (pow2 - 3); |
| 115 | uint32_t s3 = 0b1000 << (pow2 - 3); |
| 116 | |
| 117 | if (s0 > y) |
| 118 | return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| 119 | if (s1 > y) |
| 120 | return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| 121 | if (s2 > y) |
| 122 | return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| 123 | return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| 124 | } |
| 125 | |
| 126 | // Rounds to the nearest power of two. |
// Rounds the input up to the next multiple of the power-of-two N. A value that
// is already a multiple of N is advanced to the following multiple.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two" );
  // Build the mask in the operand's type: ~(N - 1) evaluated in uint32_t would
  // be zero-extended when T is 64-bit, clearing the high bits of the result
  // for inputs at or above 2^32.
  return (x + N) & ~(static_cast<T>(N) - 1);
}
| 132 | |
| 133 | // Perform a lane parallel memset on a uint32_t pointer. |
| 134 | void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) { |
| 135 | uint64_t mask = gpu::get_lane_mask(); |
| 136 | uint32_t workers = cpp::popcount(uniform); |
| 137 | for (uint32_t i = impl::lane_count(lane_mask: mask & uniform); i < n; i += workers) |
| 138 | s[i] = c; |
| 139 | } |
| 140 | |
| 141 | // Indicates that the provided value is a power of two. |
// Indicates that the provided value is a power of two.
static inline constexpr bool is_pow2(uint64_t x) {
  // Zero is not a power of two; otherwise a power of two has a single set bit.
  if (x == 0)
    return false;
  return (x & (x - 1)) == 0;
}
| 145 | |
| 146 | } // namespace impl |
| 147 | |
| 148 | /// A slab allocator used to hand out identically sized slabs of memory. |
| 149 | /// Allocation is done through random walks of a bitfield until a free bit is |
| 150 | /// encountered. This reduces contention and is highly parallel on a GPU. |
| 151 | /// |
| 152 | /// 0 4 8 16 ... 2 MiB |
| 153 | /// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐ |
| 154 | /// │ chunk │ index │ pad │ bitfield[] │ memory[] │ |
| 155 | /// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘ |
| 156 | /// |
| 157 | /// The size of the bitfield is the slab size divided by the chunk size divided |
| 158 | /// by the number of bits per word. We pad the interface to ensure 16 byte |
| 159 | /// alignment and to indicate that if the pointer is not aligned by 2MiB it |
| 160 | /// belongs to a slab rather than the global allocator. |
| 161 | struct Slab { |
| 162 | // Header metadata for the slab, aligned to the minimum alignment. |
| 163 | struct alignas(MIN_SIZE) { |
| 164 | uint32_t ; |
| 165 | uint32_t ; |
| 166 | }; |
| 167 | |
| 168 | // Initialize the slab with its chunk size and index in the global table for |
| 169 | // use when freeing. |
| 170 | Slab(uint32_t chunk_size, uint32_t global_index) { |
| 171 | Header * = reinterpret_cast<Header *>(memory); |
| 172 | header->chunk_size = chunk_size; |
| 173 | header->global_index = global_index; |
| 174 | } |
| 175 | |
| 176 | // Set the necessary bitfield bytes to zero in parallel using many lanes. This |
| 177 | // must be called before the bitfield can be accessed safely, memory is not |
| 178 | // guaranteed to be zero initialized in the current implementation. |
| 179 | void initialize(uint64_t uniform) { |
| 180 | uint32_t size = (bitfield_bytes(chunk_size: get_chunk_size()) + sizeof(uint32_t) - 1) / |
| 181 | sizeof(uint32_t); |
| 182 | impl::uniform_memset(s: get_bitfield(), c: 0, n: size, uniform); |
| 183 | } |
| 184 | |
| 185 | // Get the number of chunks that can theoretically fit inside this slab. |
| 186 | constexpr static uint32_t num_chunks(uint32_t chunk_size) { |
| 187 | return SLAB_SIZE / chunk_size; |
| 188 | } |
| 189 | |
| 190 | // Get the number of bytes needed to contain the bitfield bits. |
| 191 | constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) { |
| 192 | return __builtin_align_up( |
| 193 | ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8, |
| 194 | MIN_ALIGNMENT + 1); |
| 195 | } |
| 196 | |
| 197 | // The actual amount of memory available excluding the bitfield and metadata. |
| 198 | constexpr static uint32_t available_bytes(uint32_t chunk_size) { |
| 199 | return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header); |
| 200 | } |
| 201 | |
| 202 | // The number of chunks that can be stored in this slab. |
| 203 | constexpr static uint32_t available_chunks(uint32_t chunk_size) { |
| 204 | return available_bytes(chunk_size) / chunk_size; |
| 205 | } |
| 206 | |
| 207 | // The length in bits of the bitfield. |
| 208 | constexpr static uint32_t usable_bits(uint32_t chunk_size) { |
| 209 | return available_bytes(chunk_size) / chunk_size; |
| 210 | } |
| 211 | |
| 212 | // Get the location in the memory where we will store the chunk size. |
| 213 | uint32_t get_chunk_size() const { |
| 214 | return reinterpret_cast<const Header *>(memory)->chunk_size; |
| 215 | } |
| 216 | |
| 217 | // Get the location in the memory where we will store the global index. |
| 218 | uint32_t get_global_index() const { |
| 219 | return reinterpret_cast<const Header *>(memory)->global_index; |
| 220 | } |
| 221 | |
| 222 | // Get a pointer to where the bitfield is located in the memory. |
| 223 | uint32_t *get_bitfield() { |
| 224 | return reinterpret_cast<uint32_t *>(memory + sizeof(Header)); |
| 225 | } |
| 226 | |
| 227 | // Get a pointer to where the actual memory to be allocated lives. |
| 228 | uint8_t *get_memory(uint32_t chunk_size) { |
| 229 | return reinterpret_cast<uint8_t *>(get_bitfield()) + |
| 230 | bitfield_bytes(chunk_size); |
| 231 | } |
| 232 | |
| 233 | // Get a pointer to the actual memory given an index into the bitfield. |
| 234 | void *ptr_from_index(uint32_t index, uint32_t chunk_size) { |
| 235 | return get_memory(chunk_size) + index * chunk_size; |
| 236 | } |
| 237 | |
| 238 | // Convert a pointer back into its bitfield index using its offset. |
| 239 | uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) { |
| 240 | return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) - |
| 241 | get_memory(chunk_size)) / |
| 242 | chunk_size; |
| 243 | } |
| 244 | |
| 245 | // Randomly walks the bitfield until it finds a free bit. Allocations attempt |
| 246 | // to put lanes right next to each other for better caching and convergence. |
| 247 | void *allocate(uint64_t lane_mask, uint64_t uniform) { |
| 248 | uint32_t chunk_size = get_chunk_size(); |
| 249 | uint32_t state = impl::entropy(); |
| 250 | |
| 251 | // The uniform mask represents which lanes contain a uniform target pointer. |
| 252 | // We attempt to place these next to each other. |
| 253 | void *result = nullptr; |
| 254 | for (uint64_t mask = lane_mask; mask; |
| 255 | mask = gpu::ballot(lane_mask, !result)) { |
| 256 | if (result) |
| 257 | continue; |
| 258 | |
| 259 | uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state)); |
| 260 | |
| 261 | uint32_t id = impl::lane_count(lane_mask: uniform & mask); |
| 262 | uint32_t index = (start + id) % usable_bits(chunk_size); |
| 263 | uint32_t slot = index / BITS_IN_WORD; |
| 264 | uint32_t bit = index % BITS_IN_WORD; |
| 265 | |
| 266 | // Get the mask of bits destined for the same slot and coalesce it. |
| 267 | uint64_t match = uniform & gpu::match_any(mask, slot); |
| 268 | uint32_t length = cpp::popcount(match); |
| 269 | uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1) |
| 270 | << bit; |
| 271 | |
| 272 | uint32_t before = 0; |
| 273 | if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match))) |
| 274 | before = cpp::AtomicRef(get_bitfield()[slot]) |
| 275 | .fetch_or(bitmask, cpp::MemoryOrder::RELAXED); |
| 276 | before = gpu::shuffle(mask, cpp::countr_zero(match), before); |
| 277 | if (~before & (1 << bit)) |
| 278 | result = ptr_from_index(index, chunk_size); |
| 279 | else |
| 280 | sleep_briefly(); |
| 281 | } |
| 282 | |
| 283 | cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); |
| 284 | return result; |
| 285 | } |
| 286 | |
| 287 | // Deallocates memory by resetting its corresponding bit in the bitfield. |
| 288 | void deallocate(void *ptr) { |
| 289 | uint32_t chunk_size = get_chunk_size(); |
| 290 | uint32_t index = index_from_ptr(ptr, chunk_size); |
| 291 | uint32_t slot = index / BITS_IN_WORD; |
| 292 | uint32_t bit = index % BITS_IN_WORD; |
| 293 | |
| 294 | cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| 295 | cpp::AtomicRef(get_bitfield()[slot]) |
| 296 | .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED); |
| 297 | } |
| 298 | |
| 299 | // The actual memory the slab will manage. All offsets are calculated at |
| 300 | // runtime with the chunk size to keep the interface convergent when a warp or |
| 301 | // wavefront is handling multiple sizes at once. |
| 302 | uint8_t memory[SLAB_SIZE]; |
| 303 | }; |
| 304 | |
| 305 | /// A wait-free guard around a pointer resource to be created dynamically if |
| 306 | /// space is available and freed once there are no more users. |
| 307 | struct GuardPtr { |
| 308 | private: |
| 309 | struct RefCounter { |
| 310 | // Indicates that the object is in its deallocation phase and thus invalid. |
| 311 | static constexpr uint64_t INVALID = uint64_t(1) << 63; |
| 312 | |
| 313 | // If a read preempts an unlock call we indicate this so the following |
| 314 | // unlock call can swap out the helped bit and maintain exclusive ownership. |
| 315 | static constexpr uint64_t HELPED = uint64_t(1) << 62; |
| 316 | |
| 317 | // Resets the reference counter, cannot be reset to zero safely. |
| 318 | void reset(uint32_t n, uint64_t &count) { |
| 319 | counter.store(n, cpp::MemoryOrder::RELAXED); |
| 320 | count = n; |
| 321 | } |
| 322 | |
| 323 | // Acquire a slot in the reference counter if it is not invalid. |
| 324 | bool acquire(uint32_t n, uint64_t &count) { |
| 325 | count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n; |
| 326 | return (count & INVALID) == 0; |
| 327 | } |
| 328 | |
| 329 | // Release a slot in the reference counter. This function should only be |
| 330 | // called following a valid acquire call. |
| 331 | bool release(uint32_t n) { |
| 332 | // If this thread caused the counter to reach zero we try to invalidate it |
| 333 | // and obtain exclusive rights to deconstruct it. If the CAS failed either |
| 334 | // another thread resurrected the counter and we quit, or a parallel read |
| 335 | // helped us invalidating it. For the latter, claim that flag and return. |
| 336 | if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) { |
| 337 | uint64_t expected = 0; |
| 338 | if (counter.compare_exchange_strong(expected, INVALID, |
| 339 | cpp::MemoryOrder::RELAXED, |
| 340 | cpp::MemoryOrder::RELAXED)) |
| 341 | return true; |
| 342 | else if ((expected & HELPED) && |
| 343 | (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) & |
| 344 | HELPED)) |
| 345 | return true; |
| 346 | } |
| 347 | return false; |
| 348 | } |
| 349 | |
| 350 | // Returns the current reference count, potentially helping a releasing |
| 351 | // thread. |
| 352 | uint64_t read() { |
| 353 | auto val = counter.load(cpp::MemoryOrder::RELAXED); |
| 354 | if (val == 0 && counter.compare_exchange_strong( |
| 355 | val, INVALID | HELPED, cpp::MemoryOrder::RELAXED)) |
| 356 | return 0; |
| 357 | return (val & INVALID) ? 0 : val; |
| 358 | } |
| 359 | |
| 360 | cpp::Atomic<uint64_t> counter{0}; |
| 361 | }; |
| 362 | |
| 363 | cpp::Atomic<Slab *> ptr{nullptr}; |
| 364 | RefCounter ref{}; |
| 365 | |
| 366 | // Should be called be a single lane for each different pointer. |
| 367 | template <typename... Args> |
| 368 | Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) { |
| 369 | Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED); |
| 370 | if (!expected && |
| 371 | ptr.compare_exchange_strong( |
| 372 | expected, reinterpret_cast<Slab *>(SENTINEL), |
| 373 | cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) { |
| 374 | count = cpp::numeric_limits<uint64_t>::max(); |
| 375 | void *raw = impl::rpc_allocate(size: sizeof(Slab)); |
| 376 | if (!raw) |
| 377 | return nullptr; |
| 378 | return new (raw) Slab(cpp::forward<Args>(args)...); |
| 379 | } |
| 380 | |
| 381 | if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL)) |
| 382 | return nullptr; |
| 383 | |
| 384 | if (!ref.acquire(n, count)) |
| 385 | return nullptr; |
| 386 | |
| 387 | cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); |
| 388 | return ptr.load(cpp::MemoryOrder::RELAXED); |
| 389 | } |
| 390 | |
| 391 | // Finalize the associated memory and signal that it is ready to use by |
| 392 | // resetting the counter. |
| 393 | void finalize(Slab *mem, uint32_t n, uint64_t &count) { |
| 394 | cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| 395 | ptr.store(mem, cpp::MemoryOrder::RELAXED); |
| 396 | cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); |
| 397 | if (!ref.acquire(n, count)) |
| 398 | ref.reset(n, count); |
| 399 | } |
| 400 | |
| 401 | public: |
| 402 | // Attempt to lock access to the pointer, potentially creating it if empty. |
| 403 | // The uniform mask represents which lanes share the same pointer. For each |
| 404 | // uniform value we elect a leader to handle it on behalf of the other lanes. |
| 405 | template <typename... Args> |
| 406 | Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count, |
| 407 | Args &&...args) { |
| 408 | count = 0; |
| 409 | Slab *result = nullptr; |
| 410 | if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) |
| 411 | result = try_lock_impl(cpp::popcount(uniform), count, |
| 412 | cpp::forward<Args>(args)...); |
| 413 | result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result); |
| 414 | count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count); |
| 415 | |
| 416 | if (!result) |
| 417 | return nullptr; |
| 418 | |
| 419 | // We defer storing the newly allocated slab until now so that we can use |
| 420 | // multiple lanes to initialize it and release it for use. |
| 421 | if (count == cpp::numeric_limits<uint64_t>::max()) { |
| 422 | result->initialize(uniform); |
| 423 | if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) |
| 424 | finalize(result, cpp::popcount(uniform), count); |
| 425 | } |
| 426 | |
| 427 | if (count != cpp::numeric_limits<uint64_t>::max()) |
| 428 | count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1; |
| 429 | |
| 430 | return result; |
| 431 | } |
| 432 | |
| 433 | // Release the associated lock on the pointer, potentially destroying it. |
| 434 | void unlock(uint64_t lane_mask, uint64_t mask) { |
| 435 | cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| 436 | if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) && |
| 437 | ref.release(cpp::popcount(mask))) { |
| 438 | Slab *p = ptr.load(cpp::MemoryOrder::RELAXED); |
| 439 | p->~Slab(); |
| 440 | impl::rpc_free(ptr: p); |
| 441 | cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| 442 | ptr.store(nullptr, cpp::MemoryOrder::RELAXED); |
| 443 | } |
| 444 | gpu::sync_lane(lane_mask); |
| 445 | } |
| 446 | |
| 447 | // Get the current value of the reference counter. |
| 448 | uint64_t use_count() { return ref.read(); } |
| 449 | }; |
| 450 | |
// The global array used to search for a valid slab to allocate from. Each
// entry guards one slab with a wait-free reference count; zero-initialized so
// every slab starts out absent.
static GuardPtr slots[ARRAY_SIZE] = {};
| 453 | |
| 454 | // Tries to find a slab in the table that can support the given chunk size. |
| 455 | static Slab *find_slab(uint32_t chunk_size) { |
| 456 | // We start at a hashed value to spread out different chunk sizes. |
| 457 | uint32_t start = impl::hash(x: chunk_size); |
| 458 | uint64_t lane_mask = gpu::get_lane_mask(); |
| 459 | uint64_t uniform = gpu::match_any(lane_mask, chunk_size); |
| 460 | |
| 461 | Slab *result = nullptr; |
| 462 | uint32_t nudge = 0; |
| 463 | for (uint64_t mask = lane_mask; mask; |
| 464 | mask = gpu::ballot(lane_mask, !result), ++nudge) { |
| 465 | uint32_t index = cpp::numeric_limits<uint32_t>::max(); |
| 466 | for (uint32_t offset = nudge / MAX_TRIES; |
| 467 | gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max()); |
| 468 | offset += cpp::popcount(uniform & lane_mask)) { |
| 469 | uint32_t candidate = |
| 470 | (start + offset + impl::lane_count(lane_mask: uniform & lane_mask)) % ARRAY_SIZE; |
| 471 | uint64_t available = |
| 472 | gpu::ballot(lane_mask, slots[candidate].use_count() < |
| 473 | Slab::available_chunks(chunk_size)); |
| 474 | uint32_t new_index = gpu::shuffle( |
| 475 | lane_mask, cpp::countr_zero(available & uniform), candidate); |
| 476 | |
| 477 | // Each uniform group will use the first empty slot they find. |
| 478 | if ((index == cpp::numeric_limits<uint32_t>::max() && |
| 479 | (available & uniform))) |
| 480 | index = new_index; |
| 481 | |
| 482 | // Guaruntees that this loop will eventuall exit if there is no space. |
| 483 | if (offset >= ARRAY_SIZE) { |
| 484 | result = reinterpret_cast<Slab *>(SENTINEL); |
| 485 | index = 0; |
| 486 | } |
| 487 | } |
| 488 | |
| 489 | // Try to claim a slot for the found slot. |
| 490 | if (!result) { |
| 491 | uint64_t reserved = 0; |
| 492 | Slab *slab = slots[index].try_lock(lane_mask: lane_mask & mask, uniform: uniform & mask, |
| 493 | count&: reserved, args&: chunk_size, args&: index); |
| 494 | // If we find a slab with a matching chunk size then we store the result. |
| 495 | // Otherwise, we need to free the claimed lock and continue. In the case |
| 496 | // of out-of-memory we return a sentinel value. |
| 497 | if (slab && reserved <= Slab::available_chunks(chunk_size) && |
| 498 | slab->get_chunk_size() == chunk_size) { |
| 499 | result = slab; |
| 500 | } else if (slab && (reserved > Slab::available_chunks(chunk_size) || |
| 501 | slab->get_chunk_size() != chunk_size)) { |
| 502 | if (slab->get_chunk_size() != chunk_size) |
| 503 | start = index + 1; |
| 504 | slots[index].unlock(gpu::get_lane_mask(), |
| 505 | gpu::get_lane_mask() & uniform); |
| 506 | } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) { |
| 507 | result = reinterpret_cast<Slab *>(SENTINEL); |
| 508 | } else { |
| 509 | sleep_briefly(); |
| 510 | } |
| 511 | } |
| 512 | } |
| 513 | return result; |
| 514 | } |
| 515 | |
| 516 | // Release the lock associated with a given slab. |
| 517 | static void release_slab(Slab *slab) { |
| 518 | uint32_t index = slab->get_global_index(); |
| 519 | uint64_t lane_mask = gpu::get_lane_mask(); |
| 520 | uint64_t uniform = gpu::match_any(lane_mask, index); |
| 521 | slots[index].unlock(lane_mask, mask: uniform); |
| 522 | } |
| 523 | |
| 524 | namespace gpu { |
| 525 | |
| 526 | void *allocate(uint64_t size) { |
| 527 | if (!size) |
| 528 | return nullptr; |
| 529 | |
| 530 | // Allocations requiring a full slab or more go directly to memory. |
| 531 | if (size >= SLAB_SIZE / 2) |
| 532 | return impl::rpc_allocate(size: impl::round_up<SLAB_SIZE>(x: size)); |
| 533 | |
| 534 | // Try to find a slab for the rounded up chunk size and allocate from it. |
| 535 | uint32_t chunk_size = impl::get_chunk_size(x: static_cast<uint32_t>(size)); |
| 536 | Slab *slab = find_slab(chunk_size); |
| 537 | if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL)) |
| 538 | return nullptr; |
| 539 | |
| 540 | uint64_t lane_mask = gpu::get_lane_mask(); |
| 541 | uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index()); |
| 542 | void *ptr = slab->allocate(lane_mask, uniform); |
| 543 | return ptr; |
| 544 | } |
| 545 | |
| 546 | void deallocate(void *ptr) { |
| 547 | if (!ptr) |
| 548 | return; |
| 549 | |
| 550 | // All non-slab allocations will be aligned on a 2MiB boundary. |
| 551 | if (__builtin_is_aligned(ptr, SLAB_ALIGNMENT + 1)) |
| 552 | return impl::rpc_free(ptr); |
| 553 | |
| 554 | // The original slab pointer is the 2MiB boundary using the given pointer. |
| 555 | Slab *slab = cpp::launder(reinterpret_cast<Slab *>( |
| 556 | (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT))); |
| 557 | slab->deallocate(ptr); |
| 558 | release_slab(slab); |
| 559 | } |
| 560 | |
| 561 | void *reallocate(void *ptr, uint64_t size) { |
| 562 | if (ptr == nullptr) |
| 563 | return gpu::allocate(size); |
| 564 | |
| 565 | // Non-slab allocations are considered foreign pointers so we fail. |
| 566 | if (__builtin_is_aligned(ptr, SLAB_ALIGNMENT + 1)) |
| 567 | return nullptr; |
| 568 | |
| 569 | // The original slab pointer is the 2MiB boundary using the given pointer. |
| 570 | Slab *slab = cpp::launder(reinterpret_cast<Slab *>( |
| 571 | (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT))); |
| 572 | if (slab->get_chunk_size() >= size) |
| 573 | return ptr; |
| 574 | |
| 575 | // If we need a new chunk we reallocate and copy it over. |
| 576 | void *new_ptr = gpu::allocate(size); |
| 577 | inline_memcpy(new_ptr, ptr, slab->get_chunk_size()); |
| 578 | gpu::deallocate(ptr); |
| 579 | return new_ptr; |
| 580 | } |
| 581 | |
| 582 | void *aligned_allocate(uint32_t alignment, uint64_t size) { |
| 583 | // All alignment values must be a non-zero power of two. |
| 584 | if (!impl::is_pow2(x: alignment)) |
| 585 | return nullptr; |
| 586 | |
| 587 | // If the requested alignment is less than what we already provide this is |
| 588 | // just a normal allocation. |
| 589 | if (alignment <= MIN_ALIGNMENT + 1) |
| 590 | return gpu::allocate(size); |
| 591 | |
| 592 | // We can't handle alignments greater than 2MiB so we simply fail. |
| 593 | if (alignment > SLAB_ALIGNMENT + 1) |
| 594 | return nullptr; |
| 595 | |
| 596 | // Trying to handle allocation internally would break the assumption that each |
| 597 | // chunk is identical to eachother. Allocate enough memory with worst-case |
| 598 | // alignment and then round up. The index logic will round down properly. |
| 599 | uint64_t rounded = size + alignment - MIN_ALIGNMENT; |
| 600 | void *ptr = gpu::allocate(size: rounded); |
| 601 | return __builtin_align_up(ptr, alignment); |
| 602 | } |
| 603 | |
| 604 | } // namespace gpu |
| 605 | } // namespace LIBC_NAMESPACE_DECL |
| 606 | |