//===-- GPU memory allocator implementation ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a parallel allocator intended for use on a GPU device.
// The core algorithm is a slab allocator that uses a random walk over a
// bitfield for maximum parallel progress. Slab handling is done by a wait-free
// reference counted guard. The first use of a slab will create it from system
// memory for re-use. The last use will invalidate it and free the memory.
//
//===----------------------------------------------------------------------===//

#include "allocator.h"

#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/new.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/threads/sleep.h"

namespace LIBC_NAMESPACE_DECL {

constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024;
constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024;
constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE;
constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1;
constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8;
constexpr static uint32_t MIN_SIZE = 16;
constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

// A sentinel used to indicate an invalid but non-null pointer value.
constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max();

// The number of times we will try starting on a single index before skipping
// past it.
constexpr static uint32_t MAX_TRIES = 512;

static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two");

namespace impl {
// Allocates more memory from the system through the RPC interface. All
// allocations from the system MUST be aligned on a 2MiB boundary. The default
// HSA allocator has this behavior for any allocation >= 2MiB and the CUDA
// driver provides an alignment field for virtual memory allocations.
static void *rpc_allocate(uint64_t size) {
  void *ptr = nullptr;
  rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>();
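  // The outgoing packet carries the requested size; the reply packet carries
  // back the pointer produced by the host-side allocator.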
  port.send_and_recv(
      [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; },
      [&](rpc::Buffer *buffer, uint32_t) {
        ptr = reinterpret_cast<void *>(buffer->data[0]);
      });
  port.close();
  return ptr;
}

// Deallocates the associated system memory.
static void rpc_free(void *ptr) {
  rpc::Client::Port port = rpc::client.open<LIBC_FREE>();
  port.send([=](rpc::Buffer *buffer, uint32_t) {
    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
  });
  port.close();
}

// Convert a potentially disjoint bitmask into an increasing integer per-lane
// for use with indexing between gpu lanes.
static inline uint32_t lane_count(uint64_t lane_mask) {
  return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1));
}

// Obtain an initial value to seed a random number generator. We use the rounded
// multiples of the golden ratio from xorshift* as additional spreading.
static inline uint32_t entropy() {
  return (static_cast<uint32_t>(gpu::processor_clock()) ^
          (gpu::get_thread_id_x() * 0x632be59b) ^
          (gpu::get_block_id_x() * 0x85157af5)) *
         0x9e3779bb;
}

// Generate a random number and update the state using the xorshift32* PRNG.
static inline uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state * 0x9e3779bb;
}

// Final stage of murmurhash used to get a unique index for the global array
static inline uint32_t hash(uint32_t x) {
  x ^= x >> 16;
  x *= 0x85ebca6b;
  x ^= x >> 13;
  x *= 0xc2b2ae35;
  x ^= x >> 16;
  return x;
}

// Rounds the input value to the closest permitted chunk size. Here we accept
// the sum of the closest three powers of two. For a 2MiB slab size this is 48
// different chunk sizes, which keeps the average slab utilization around 87.5%
// (roughly 12.5% internal fragmentation).
static inline uint32_t get_chunk_size(uint32_t x) {
  uint32_t y = x < MIN_SIZE ? MIN_SIZE : x;
  uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1);

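  // The candidates s0 through s3 are 4/8, 6/8, 7/8, and 8/8 of the power of
  // two that covers the request (1 << pow2). The smallest candidate strictly
  // greater than the request is selected and then rounded up to the 16-byte
  // minimum alignment.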
  uint32_t s0 = 0b0100 << (pow2 - 3);
  uint32_t s1 = 0b0110 << (pow2 - 3);
  uint32_t s2 = 0b0111 << (pow2 - 3);
  uint32_t s3 = 0b1000 << (pow2 - 3);

  if (s0 > y)
    return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s1 > y)
    return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  if (s2 > y)
    return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
  return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT;
}

// Rounds the value up to the next multiple of N, where N must be a power of
// two.
template <uint32_t N, typename T>
static inline constexpr T round_up(const T x) {
  static_assert(((N - 1) & N) == 0, "N must be a power of two");
  return (x + N) & ~(N - 1);
}

} // namespace impl

/// A slab allocator used to hand out identically sized slabs of memory.
/// Allocation is done through random walks of a bitfield until a free bit is
/// encountered. This reduces contention and is highly parallel on a GPU.
///
/// 0       4           8       16                 ...                     2 MiB
/// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐
/// │ chunk  │  index   │  pad   │    bitfield[]    │         memory[]         │
/// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘
///
/// The size of the bitfield is the slab size divided by the chunk size divided
/// by the number of bits per word. We pad the interface to ensure 16 byte
/// alignment and to indicate that if the pointer is not aligned by 2MiB it
/// belongs to a slab rather than the global allocator.
struct Slab {
  // Header metadata for the slab, aligned to the minimum alignment.
  struct alignas(MIN_SIZE) Header {
    uint32_t chunk_size;
    uint32_t global_index;
  };

  // Initialize the slab with its chunk size and index in the global table for
  // use when freeing.
  Slab(uint32_t chunk_size, uint32_t global_index) {
    Header *header = reinterpret_cast<Header *>(memory);
    header->chunk_size = chunk_size;
    header->global_index = global_index;

    // This memset is expensive and likely not necessary for the current 'kfd'
    // driver. Until zeroed pages are exposed by the API we must be careful.
    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
  }

  // Get the number of chunks that can theoretically fit inside this slab.
  constexpr static uint32_t num_chunks(uint32_t chunk_size) {
    return SLAB_SIZE / chunk_size;
  }

  // Get the number of bytes needed to contain the bitfield bits.
  constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) {
    return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8;
  }

  // The actual amount of memory available excluding the bitfield and metadata.
  constexpr static uint32_t available_bytes(uint32_t chunk_size) {
    return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header);
  }

  // The number of chunks that can be stored in this slab.
  constexpr static uint32_t available_chunks(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // The number of bits in the bitfield that correspond to usable chunks.
  constexpr static uint32_t usable_bits(uint32_t chunk_size) {
    return available_bytes(chunk_size) / chunk_size;
  }

  // Get the chunk size that is stored in the slab's header.
  uint32_t get_chunk_size() const {
    return reinterpret_cast<const Header *>(memory)->chunk_size;
  }

  // Get the global index that is stored in the slab's header.
  uint32_t get_global_index() const {
    return reinterpret_cast<const Header *>(memory)->global_index;
  }

  // Get a pointer to where the bitfield is located in the memory.
  uint32_t *get_bitfield() {
    return reinterpret_cast<uint32_t *>(memory + sizeof(Header));
  }

  // Get a pointer to where the actual memory to be allocated lives.
  uint8_t *get_memory(uint32_t chunk_size) {
    return reinterpret_cast<uint8_t *>(get_bitfield()) +
           bitfield_bytes(chunk_size);
  }

  // Get a pointer to the actual memory given an index into the bitfield.
  void *ptr_from_index(uint32_t index, uint32_t chunk_size) {
    return get_memory(chunk_size) + index * chunk_size;
  }

  // Convert a pointer back into its bitfield index using its offset.
  uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) {
    return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) -
                                 get_memory(chunk_size)) /
           chunk_size;
  }

  // Randomly walks the bitfield until it finds a free bit. Allocations attempt
  // to put lanes right next to each other for better caching and convergence.
  void *allocate(uint64_t lane_mask, uint64_t uniform) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t state = impl::entropy();

    // The uniform mask represents which lanes contain a uniform target pointer.
    // We attempt to place these next to each other.
    void *result = nullptr;
    for (uint64_t mask = lane_mask; mask;
         mask = gpu::ballot(lane_mask, !result)) {
      if (result)
        continue;

      uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state));

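      // Every lane in the group shares the same random starting bit; each lane
      // then offsets by its rank among the remaining lanes that target this
      // slab so that the group claims adjacent chunks.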
      uint32_t id = impl::lane_count(uniform & mask);
      uint32_t index = (start + id) % usable_bits(chunk_size);
      uint32_t slot = index / BITS_IN_WORD;
      uint32_t bit = index % BITS_IN_WORD;

      // Get the mask of bits destined for the same slot and coalesce it.
      uint64_t match = uniform & gpu::match_any(mask, slot);
      uint32_t length = cpp::popcount(match);
      uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1)
                         << bit;

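      // Only the lowest lane of each matching group issues a single fetch_or
      // covering all of the group's bits; the previous word value is then
      // broadcast so every lane can check whether its own bit was already set.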
      uint32_t before = 0;
      if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match)))
        before = cpp::AtomicRef(get_bitfield()[slot])
                     .fetch_or(bitmask, cpp::MemoryOrder::RELAXED);
      before = gpu::shuffle(mask, cpp::countr_zero(match), before);
      if (~before & (1 << bit))
        result = ptr_from_index(index, chunk_size);
      else
        sleep_briefly();
    }

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return result;
  }

  // Deallocates memory by resetting its corresponding bit in the bitfield.
  void deallocate(void *ptr) {
    uint32_t chunk_size = get_chunk_size();
    uint32_t index = index_from_ptr(ptr, chunk_size);
    uint32_t slot = index / BITS_IN_WORD;
    uint32_t bit = index % BITS_IN_WORD;

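    // The release fence orders any writes made to the chunk before its bit is
    // cleared; the matching acquire fence at the end of allocate() makes those
    // writes visible to the chunk's next owner.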
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    cpp::AtomicRef(get_bitfield()[slot])
        .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED);
  }

  // The actual memory the slab will manage. All offsets are calculated at
  // runtime with the chunk size to keep the interface convergent when a warp
  // or wavefront is handling multiple sizes at once.
  uint8_t memory[SLAB_SIZE];
};

/// A wait-free guard around a pointer resource to be created dynamically if
/// space is available and freed once there are no more users.
struct GuardPtr {
private:
  struct RefCounter {
    // Indicates that the object is in its deallocation phase and thus invalid.
    static constexpr uint64_t INVALID = uint64_t(1) << 63;

    // If a read preempts an unlock call we indicate this so the following
    // unlock call can swap out the helped bit and maintain exclusive ownership.
    static constexpr uint64_t HELPED = uint64_t(1) << 62;

    // Resets the reference counter; it cannot safely be reset to zero.
    void reset(uint32_t n, uint64_t &count) {
      counter.store(n, cpp::MemoryOrder::RELAXED);
      count = n;
    }

    // Acquire a slot in the reference counter if it is not invalid.
    bool acquire(uint32_t n, uint64_t &count) {
      count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n;
      return (count & INVALID) == 0;
    }

    // Release a slot in the reference counter. This function should only be
    // called following a valid acquire call.
    bool release(uint32_t n) {
      // If this thread caused the counter to reach zero we try to invalidate
      // it and obtain exclusive rights to deconstruct it. If the CAS fails,
      // either another thread resurrected the counter and we quit, or a
      // parallel read helped us by invalidating it. In the latter case, claim
      // that flag and return.
      if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) {
        uint64_t expected = 0;
        if (counter.compare_exchange_strong(expected, INVALID,
                                            cpp::MemoryOrder::RELAXED,
                                            cpp::MemoryOrder::RELAXED))
          return true;
        else if ((expected & HELPED) &&
                 (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) &
                  HELPED))
          return true;
      }
      return false;
    }

    // Returns the current reference count, potentially helping a releasing
    // thread.
    uint64_t read() {
      auto val = counter.load(cpp::MemoryOrder::RELAXED);
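      // A count of zero may mean a concurrent release() sits between its
      // fetch_sub and its CAS. Mark the counter INVALID | HELPED so readers
      // treat it as dead while the releasing thread can still claim exclusive
      // ownership through the HELPED flag.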
      if (val == 0 && counter.compare_exchange_strong(
                          val, INVALID | HELPED, cpp::MemoryOrder::RELAXED))
        return 0;
      return (val & INVALID) ? 0 : val;
    }

    cpp::Atomic<uint64_t> counter{0};
  };

  cpp::Atomic<Slab *> ptr{nullptr};
  RefCounter ref{};

  // Should only be called by a single lane for each distinct pointer.
  template <typename... Args>
  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
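    // Winning the CAS from nullptr to the SENTINEL value grants this lane the
    // exclusive right to allocate and construct the slab; anyone else who
    // observes the SENTINEL backs off until the real pointer is published.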
    if (!expected &&
        ptr.compare_exchange_strong(
            expected, reinterpret_cast<Slab *>(SENTINEL),
            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
      count = cpp::numeric_limits<uint64_t>::max();
      void *raw = impl::rpc_allocate(sizeof(Slab));
      if (!raw)
        return nullptr;
      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);

      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(mem, cpp::MemoryOrder::RELAXED);
      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
      if (!ref.acquire(n, count))
        ref.reset(n, count);
      return mem;
    }

    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
      return nullptr;

    if (!ref.acquire(n, count))
      return nullptr;

    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
    return ptr.load(cpp::MemoryOrder::RELAXED);
  }

public:
  // Attempt to lock access to the pointer, potentially creating it if empty.
  // The uniform mask represents which lanes share the same pointer. For each
  // uniform value we elect a leader to handle it on behalf of the other lanes.
  template <typename... Args>
  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
                 Args &&...args) {
    count = 0;
    Slab *result = nullptr;
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
      result = try_lock_impl(cpp::popcount(uniform), count,
                             cpp::forward<Args>(args)...);
    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);

    if (!result)
      return nullptr;

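    // The leader bumped the reference count by popcount(uniform) in one step.
    // Unless the slab was freshly created (count is the maximum value), give
    // each lane its own position in that range by subtracting the group size
    // and adding the lane's rank within the uniform group plus one.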
    if (count != cpp::numeric_limits<uint64_t>::max())
      count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;

    return result;
  }

  // Release the associated lock on the pointer, potentially destroying it.
  void unlock(uint64_t lane_mask, uint64_t mask) {
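    // Publish any outstanding writes to the slab before dropping this group's
    // references.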
    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
    if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
        ref.release(cpp::popcount(mask))) {
      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
      p->~Slab();
      impl::rpc_free(p);
      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
      ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
    }
    gpu::sync_lane(lane_mask);
  }

  // Get the current value of the reference counter.
  uint64_t use_count() { return ref.read(); }
};

// The global array used to search for a valid slab to allocate from.
static GuardPtr slots[ARRAY_SIZE] = {};

// Tries to find a slab in the table that can support the given chunk size.
static Slab *find_slab(uint32_t chunk_size) {
  // We start at a hashed value to spread out different chunk sizes.
  uint32_t start = impl::hash(chunk_size);
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);

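  // Lanes that requested the same chunk size search the table cooperatively:
  // each lane probes a different candidate slot, the group ballots on which
  // probes have spare capacity, and the first usable slot is broadcast to
  // every lane in the group.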
  Slab *result = nullptr;
  uint32_t nudge = 0;
  for (uint64_t mask = lane_mask; mask;
       mask = gpu::ballot(lane_mask, !result), ++nudge) {
    uint32_t index = cpp::numeric_limits<uint32_t>::max();
    for (uint32_t offset = nudge / MAX_TRIES;
         gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max());
         offset += cpp::popcount(uniform & lane_mask)) {
      uint32_t candidate =
          (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE;
      uint64_t available =
          gpu::ballot(lane_mask, slots[candidate].use_count() <
                                     Slab::available_chunks(chunk_size));
      uint32_t new_index = gpu::shuffle(
          lane_mask, cpp::countr_zero(available & uniform), candidate);

      // Each uniform group uses the first slot it finds with spare capacity.
      if ((index == cpp::numeric_limits<uint32_t>::max() &&
           (available & uniform)))
        index = new_index;

      // Guarantees that this loop will eventually exit if there is no space.
      if (offset >= ARRAY_SIZE) {
        result = reinterpret_cast<Slab *>(SENTINEL);
        index = 0;
      }
    }

    // Try to claim the lock for the slot we found.
    if (!result) {
      uint64_t reserved = 0;
      Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask,
                                         reserved, chunk_size, index);
      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
      // of out-of-memory we return a sentinel value.
      if (slab && reserved <= Slab::available_chunks(chunk_size) &&
          slab->get_chunk_size() == chunk_size) {
        result = slab;
      } else if (slab && (reserved > Slab::available_chunks(chunk_size) ||
                          slab->get_chunk_size() != chunk_size)) {
        if (slab->get_chunk_size() != chunk_size)
          start = index + 1;
        slots[index].unlock(gpu::get_lane_mask(),
                            gpu::get_lane_mask() & uniform);
      } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) {
        result = reinterpret_cast<Slab *>(SENTINEL);
      } else {
        sleep_briefly();
      }
    }
  }
  return result;
}

// Release the lock associated with a given slab.
static void release_slab(Slab *slab) {
  uint32_t index = slab->get_global_index();
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, index);
  slots[index].unlock(lane_mask, uniform);
}

namespace gpu {

void *allocate(uint64_t size) {
  if (!size)
    return nullptr;

  // Allocations of half a slab or more go directly to system memory.
  if (size >= SLAB_SIZE / 2)
    return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size));

  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
  Slab *slab = find_slab(chunk_size);
  if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL))
    return nullptr;

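  // Lanes that landed on the same slab allocate together so the bitfield walk
  // can coalesce their atomic updates.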
  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index());
  void *ptr = slab->allocate(lane_mask, uniform);
  return ptr;
}

void deallocate(void *ptr) {
  if (!ptr)
    return;

  // All non-slab allocations will be aligned on a 2MiB boundary.
  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
    return impl::rpc_free(ptr);

  // The original slab pointer is found by rounding the given pointer down to
  // its 2MiB boundary.
  Slab *slab = reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT));
  slab->deallocate(ptr);
  release_slab(slab);
}

} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL
