//===-- GPU memory allocator implementation ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a parallel allocator intended for use on a GPU device.
// The core algorithm is a slab allocator using a random walk over a bitfield
// for maximum parallel progress. Slab handling is done by a wait-free reference
// counted guard. The first use of a slab will create it from system memory for
// re-use. The last use will invalidate it and free the memory.
//
//===----------------------------------------------------------------------===//

#include "allocator.h"

#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/new.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/threads/sleep.h"

namespace LIBC_NAMESPACE_DECL {

constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will try starting on a single index before skipping
// past it.
constexpr static uint32_t MAX_TRIES = 512;

static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");
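// With the constants above ARRAY_SIZE is 64 GiB / 2 MiB = 32768 slab slots;
// keeping the count a power of two lets the hashed start indices used below
// wrap with a cheap '% ARRAY_SIZE'.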

namespace impl {
// Allocates more memory from the system through the RPC interface. All
// allocations from the system MUST be aligned on a 2MiB boundary. The default
// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
// driver provides an alignment field for virtual memory allocations.
static void *rpc_allocate(uint64_t size) {
  void *ptr = nullptr;
  rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
  port.send_and_recv(
      [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; },
      [&](rpc::Buffer *buffer, uint32_t) {
        ptr = reinterpret_cast<void *>(buffer->data[0]);
      });
  port.close();
  return ptr;
}

// Deallocates the associated system memory.
static void rpc_free(void *ptr) {
  rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
  port.send([=](rpc::Buffer *buffer, uint32_t) {
    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
  });
  port.close();
}

// Convert a potentially disjoint bitmask into an increasing integer per-lane
// for use with indexing between gpu lanes.
static inline uint32_t lane_count(uint64_t lane_mask) {
  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
}

// Obtain an initial value to seed a random number generator. We use the
// rounded multiples of the golden ratio from xorshift* as additional spreading.
static inline uint32_t entropy() {
  return (static_cast<uint32_t>(gpu::processor_clock()) ^
          (gpu::get_thread_id_x() * 0x632be59b) ^
          (gpu::get_block_id_x() * 0x85157af5)) *
         0x9e3779bb;
}

// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bb;
}

// Final stage of murmurhash used to get a unique index for the global array.
static inline uint32_t hash(uint32_t x) {
  x ^= x >> 16;
  x *= 0x85ebca6b;
  x ^= x >> 13;
  x *= 0xc2b2ae35;
  x ^= x >> 16;
  return x;
}

// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes. This gives us average internal fragmentation of 87.5%.
static inline uint32_t get_chunk_size(uint32_t x) {
  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

  uint32_t s0 = 0b0100 << (pow2 - 3);
  uint32_t s1 = 0b0110 << (pow2 - 3);
  uint32_t s2 = 0b0111 << (pow2 - 3);
  uint32_t s3 = 0b1000 << (pow2 - 3);

  if (s0 > y)
    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s1 > y)
    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s2 > y)
    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}
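// For example, a request of 100 bytes gives pow2 = 7, so the candidate sizes
// are s0 = 64, s1 = 96, s2 = 112, and s3 = 128; the first candidate greater
// than the request is s2, so the chunk size is 112 (already 16-byte aligned).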

// Rounds x up to the next multiple of N, which must be a power of two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two");
  return (x + N) & ~(N - 1);
}
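// For instance, round_up<SLAB_SIZE>(3 MiB) yields 4 MiB. Note that an exact
// multiple such as 2 MiB is bumped up to 4 MiB because the implementation
// always adds N before masking.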

} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
/// Allocation is done through random walks of a bitfield until a free bit is
/// encountered. This reduces contention and is highly parallel on a GPU.
///
/// 0       4           8       16                 ...                     2 MiB
/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
///
/// The size of the bitfield is the slab size divided by the chunk size divided
/// by the number of bits per word. We pad the interface to ensure 16 byte
/// alignment and to indicate that if the pointer is not aligned by 2MiB it
/// belongs to a slab rather than the global allocator.
struct Slab {
  // Header metadata for the slab, aligned to the minimum alignment.
  struct alignas(MIN_SIZE) Header {
    uint32_t chunk_size;
    uint32_t global_index;
  };

  // Initialize the slab with its chunk size and index in the global table for
  // use when freeing.
  Slab(uint32_t chunk_size, uint32_t global_index) {
    Header *header = reinterpret_cast<Header *>(memory);
    header->chunk_size = chunk_size;
    header->global_index = global_index;

    // This memset is expensive and likely not necessary for the current 'kfd'
    // driver. Until zeroed pages are exposed by the API we must be careful.
    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
  }

  // Get the number of chunks that can theoretically fit inside this slab.
  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
    return SLAB_SIZE / chunk_size;
  }

  // Get the number of bytes needed to contain the bitfield bits.
  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
  }

  // The actual amount of memory available excluding the bitfield and metadata.
  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
  }

  // The number of chunks that can be stored in this slab.
  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // The length in bits of the bitfield.
  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }
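
  // As a concrete example, for chunk_size == 16: num_chunks is 131072,
  // bitfield_bytes is (131072 / 32) * 8 = 32768, available_bytes is
  // 2097152 - 32768 - 16 = 2064368, and so available_chunks (and the usable
  // bit count) is 129023.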

  // Get the location in the memory where we will store the chunk size.
  uint32_t get_chunk_size() const {
    return reinterpret_cast<const Header *>(memory)->chunk_size;
  }

  // Get the location in the memory where we will store the global index.
  uint32_t get_global_index() const {
    return reinterpret_cast<const Header *>(memory)->global_index;
  }

  // Get a pointer to where the bitfield is located in the memory.
  uint32_t *get_bitfield() {
    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
  }

  // Get a pointer to where the actual memory to be allocated lives.
  uint8_t *get_memory(uint32_t chunk_size) {
    return reinterpret_cast<uint8_t *>(get_bitfield()) +
           bitfield_bytes(chunk_size);
  }

  // Get a pointer to the actual memory given an index into the bitfield.
  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
    return get_memory(chunk_size) + index * chunk_size;
  }

  // Convert a pointer back into its bitfield index using its offset.
  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
                                 get_memory(chunk_size)) /
           chunk_size;
  }

  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
  // to put lanes right next to each other for better caching and convergence.
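  // For illustration: if, say, four lanes in the same uniform group broadcast
  // a start of 100, they take indices 100..103, which all map to word 3 of the
  // bitfield, so the leader lane issues a single fetch_or of 0b1111 << 4 on
  // that word to claim all four chunks at once.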
  void *allocate(uint64_t lane_mask, uint64_t uniform) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t state = impl::entropy();

    // The uniform mask represents which lanes contain a uniform target pointer.
    // We attempt to place these next to each other.
    void *result = nullptr;
    for (uint64_t mask = lane_mask; mask;
         mask = gpu::ballot(lane_mask, !result)) {
      if (result)
        continue;

      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));

      uint32_t id = impl::lane_count(uniform & mask);
      uint32_t index = (start + id) % usable_bits(chunk_size);
      uint32_t slot = index / BITS_IN_WORD;
      uint32_t bit = index % BITS_IN_WORD;

      // Get the mask of bits destined for the same slot and coalesce it.
      uint64_t match = uniform & gpu::match_any(mask, slot);
      uint32_t length = cpp::popcount(match);
      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
                         << bit;

      uint32_t before = 0;
      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
        before = cpp::AtomicRef(get_bitfield()[slot])
                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
      if (~before & (1u << bit))
        result = ptr_from_index(index, chunk_size);
      else
        sleep_briefly();
    }

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return result;
  }

  // Deallocates memory by resetting its corresponding bit in the bitfield.
  void deallocate(void *ptr) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t index = index_from_ptr(ptr, chunk_size);
    uint32_t slot = index / BITS_IN_WORD;
    uint32_t bit = index % BITS_IN_WORD;

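    // Note: the release fence below pairs with the acquire fence at the end of
    // allocate(), so writes made through this chunk are visible to the lane
    // that next observes the bit as free and re-allocates it.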
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    cpp::AtomicRef(get_bitfield()[slot])
        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
  }

  // The actual memory the slab will manage. All offsets are calculated at
  // runtime with the chunk size to keep the interface convergent when a warp or
  // wavefront is handling multiple sizes at once.
  uint8_t memory[SLAB_SIZE];
};

/// A wait-free guard around a pointer resource to be created dynamically if
/// space is available and freed once there are no more users.
struct GuardPtr {
private:
  struct RefCounter {
    // Indicates that the object is in its deallocation phase and thus invalid.
    static constexpr uint64_t INVALID = uint64_t(1) << 63;

    // If a read preempts an unlock call we indicate this so the following
    // unlock call can swap out the helped bit and maintain exclusive ownership.
    static constexpr uint64_t HELPED = uint64_t(1) << 62;
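
    // Lifecycle sketch: users pair acquire()/release() calls, and the release
    // that drops the count to zero tries to CAS 0 -> INVALID to win exclusive
    // rights to tear the slab down; the next creator then re-seeds the counter
    // through acquire() or reset(). A concurrent read() that observes zero may
    // install INVALID | HELPED instead, in which case the releasing thread
    // claims the HELPED flag so teardown still happens exactly once.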

    // Resets the reference counter; it cannot safely be reset to zero.
    void reset(uint32_t n, uint64_t &count) {
      counter.store(n, cpp::MemoryOrder::RELAXED);
      count = n;
    }

    // Acquire a slot in the reference counter if it is not invalid.
    bool acquire(uint32_t n, uint64_t &count) {
      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
      return (count & INVALID) == 0;
    }

    // Release a slot in the reference counter. This function should only be
    // called following a valid acquire call.
    bool release(uint32_t n) {
      // If this thread caused the counter to reach zero we try to invalidate
      // it and obtain exclusive rights to deconstruct it. If the CAS fails,
      // either another thread resurrected the counter and we quit, or a
      // parallel read helped us by invalidating it. For the latter, claim that
      // flag and return.
      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
        uint64_t expected = 0;
        if (counter.compare_exchange_strong(expected, INVALID,
                                            cpp::MemoryOrder::RELAXED,
                                            cpp::MemoryOrder::RELAXED))
          return true;
        else if ((expected & HELPED) &&
                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
                  HELPED))
          return true;
      }
      return false;
    }

    // Returns the current reference count, potentially helping a releasing
    // thread.
    uint64_t read() {
      auto val = counter.load(cpp::MemoryOrder::RELAXED);
      if (val == 0 && counter.compare_exchange_strong(
                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
        return 0;
      return (val & INVALID) ? 0 : val;
    }
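
    // For example, if use_count() runs while a releasing thread sits between
    // its fetch_sub (which hit zero) and its CAS, read() swaps 0 for
    // INVALID | HELPED and reports zero. The releaser's CAS then fails with
    // expected == INVALID | HELPED, it sees the HELPED bit, exchanges the
    // value back to plain INVALID, and proceeds with the teardown.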

    cpp::Atomic<uint64_t> counter{0};
  };

  cpp::Atomic<Slab *> ptr{nullptr};
  RefCounter ref{};

  // Should be called by a single lane for each different pointer.
  template <typename... Args>
  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
    if (!expected &&
        ptr.compare_exchange_strong(
            expected, reinterpret_cast<Slab *>(SENTINEL),
            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
      count = cpp::numeric_limits<uint64_t>::max();
      void *raw = impl::rpc_allocate(sizeof(Slab));
      if (!raw)
        return nullptr;
      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);

      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(mem, cpp::MemoryOrder::RELAXED);
      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
      if (!ref.acquire(n, count))
        ref.reset(n, count);
      return mem;
    }

    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
      return nullptr;

    if (!ref.acquire(n, count))
      return nullptr;

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return ptr.load(cpp::MemoryOrder::RELAXED);
  }

public:
  // Attempt to lock access to the pointer, potentially creating it if empty.
  // The uniform mask represents which lanes share the same pointer. For each
  // uniform value we elect a leader to handle it on behalf of the other lanes.
  template <typename... Args>
  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
                 Args &&...args) {
    count = 0;
    Slab *result = nullptr;
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
      result = try_lock_impl(cpp::popcount(uniform), count,
                             cpp::forward<Args>(args)...);
    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);

    if (!result)
      return nullptr;

    if (count != cpp::numeric_limits<uint64_t>::max())
      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
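    // For example, if three lanes reserved together and the counter previously
    // held P, the leader's acquire returned P + 3 to every lane; the adjustment
    // above hands out P + 1, P + 2, and P + 3 so each lane observes a distinct
    // reservation count.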

    return result;
  }

  // Release the associated lock on the pointer, potentially destroying it.
  void unlock(uint64_t lane_mask, uint64_t mask) {
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
        ref.release(cpp::popcount(mask))) {
      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
      p->~Slab();
      impl::rpc_free(p);
      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
    }
    gpu::sync_lane(lane_mask);
  }

  // Get the current value of the reference counter.
  uint64_t use_count() { return ref.read(); }
};

// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
  // We start at a hashed value to spread out different chunk sizes.
  uint32_t start = impl::hash(chunk_size);
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
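  // For instance, if half the lanes request 32-byte chunks and the other half
  // request 128-byte chunks, match_any splits them into two uniform groups;
  // each group begins at hash(chunk_size) % ARRAY_SIZE and its lanes probe
  // consecutive slots of the table in parallel below.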

  Slab *result = nullptr;
  uint32_t nudge = 0;
  for (uint64_t mask = lane_mask; mask;
       mask = gpu::ballot(lane_mask, !result), ++nudge) {
    uint32_t index = cpp::numeric_limits<uint32_t>::max();
    for (uint32_t offset = nudge / MAX_TRIES;
         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
         offset += cpp::popcount(uniform & lane_mask)) {
      uint32_t candidate =
          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
      uint64_t available =
          gpu::ballot(lane_mask, slots[candidate].use_count() <
                                     Slab::available_chunks(chunk_size));
      uint32_t new_index = gpu::shuffle(
          lane_mask, cpp::countr_zero(available & uniform), candidate);

      // Each uniform group will use the first empty slot they find.
      if ((index == cpp::numeric_limits<uint32_t>::max() &&
           (available & uniform)))
        index = new_index;

      // Guarantees that this loop will eventually exit if there is no space.
      if (offset >= ARRAY_SIZE) {
        result = reinterpret_cast<Slab *>(SENTINEL);
        index = 0;
      }
    }

    // Try to claim the lock for the slot we found.
    if (!result) {
      uint64_t reserved = 0;
      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
                                         reserved, chunk_size, index);
      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
      // of out-of-memory we return a sentinel value.
      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
        result = slab;
      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
        if (slab->get_chunk_size() != chunk_size)
          start = index + 1;
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
        result = reinterpret_cast<Slab *>(SENTINEL);
      } else {
        sleep_briefly();
      }
    }
  }
  return result;
}

// Release the lock associated with a given slab.
static void release_slab(Slab *slab) {
  uint32_t index = slab->get_global_index();
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, index);
  slots[index].unlock(lane_mask, uniform);
}

namespace gpu {

void *allocate(uint64_t size) {
  if (!size)
    return nullptr;

  // Allocations requiring at least half a slab go directly to system memory.
  if (size >= SLAB_SIZE / 2)
    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));

  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
  Slab *slab = find_slab(chunk_size);
  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
    return nullptr;

  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
  void *ptr = slab->allocate(lane_mask, uniform);
  return ptr;
}

void deallocate(void *ptr) {
  if (!ptr)
    return;

  // All non-slab allocations will be aligned on a 2MiB boundary.
  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
    return impl::rpc_free(ptr);

  // The original slab pointer is found by rounding the given pointer down to
  // its 2MiB boundary.
  Slab *slab = reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
  slab->deallocate(ptr);
  release_slab(slab);
}

} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL