use std::{alloc::*, fmt, ptr::NonNull};

use cfg_if::cfg_if;

use crate::{stats::StatsSet, util::sync::AtomicFlag};

#[cfg(target_os = "macos")]
use crate::util::{sync::CachePadded, thread::PThreadKey};

#[cfg(not(target_os = "macos"))]
use std::cell::UnsafeCell;

/// The `AllocProfiler` when running crate-internal tests.
///
/// This enables us to test it for:
/// - Undefined behavior with Miri
/// - Correctness when tallying
#[cfg(test)]
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();

/// Whether to ignore allocation info set during the benchmark.
pub(crate) static IGNORE_ALLOC: AtomicFlag = AtomicFlag::new(false);

/// Measures [`GlobalAlloc`] memory usage.
///
/// # Examples
///
/// The default usage is to create a
/// [`#[global_allocator]`](macro@global_allocator) that wraps the [`System`]
/// allocator with [`AllocProfiler::system()`]:
///
/// ```
/// use std::collections::*;
/// use divan::AllocProfiler;
///
/// #[global_allocator]
/// static ALLOC: AllocProfiler = AllocProfiler::system();
///
/// fn main() {
///     divan::main();
/// }
///
/// #[divan::bench(types = [
///     Vec<i32>,
///     LinkedList<i32>,
///     HashSet<i32>,
/// ])]
/// fn from_iter<T>() -> T
/// where
///     T: FromIterator<i32>,
/// {
///     (0..100).collect()
/// }
///
/// #[divan::bench(types = [
///     Vec<i32>,
///     LinkedList<i32>,
///     HashSet<i32>,
/// ])]
/// fn drop<T>(bencher: divan::Bencher)
/// where
///     T: FromIterator<i32>,
/// {
///     bencher
///         .with_inputs(|| (0..100).collect::<T>())
///         .bench_values(std::mem::drop);
/// }
/// ```
///
/// Wrap other [`GlobalAlloc`] implementations like
/// [`mimalloc`](https://docs.rs/mimalloc) with [`AllocProfiler::new()`]:
///
/// ```
/// use divan::AllocProfiler;
/// use mimalloc::MiMalloc;
///
/// # #[cfg(not(miri))]
/// #[global_allocator]
/// static ALLOC: AllocProfiler<MiMalloc> = AllocProfiler::new(MiMalloc);
/// ```
///
/// See [`string`](https://github.com/nvzqz/divan/blob/main/examples/benches/string.rs)
/// and [`collections`](https://github.com/nvzqz/divan/blob/main/examples/benches/collections.rs)
/// benchmarks for more examples.
///
/// # Implementation
///
/// Allocation information is collected while Divan is also measuring time. As
/// a result, counting allocations affects timing.
///
/// To reduce Divan's footprint during benchmarking:
/// - Allocation information is recorded in thread-local storage to prevent
///   contention when benchmarks involve multiple threads, either through
///   options like [`threads`](macro@crate::bench#threads) or by spawning
///   their own threads internally.
/// - Tallying does not check for overflow and assumes it will not happen.
///   This is subject to change in the future.
/// - Fast thread-local storage access is assembly-optimized on macOS.
///
/// Allocation information is the only data Divan records outside of timing,
/// so its bookkeeping is also the only non-timing code that affects timing.
/// Recording allocation information takes two steps:
///
/// 1. Load the thread-local slot for allocation information.
///
///    On macOS, this is done via the
///    [`gs`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L34)/[`tpidrro_el0`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L47)
///    registers for
///    [`pthread_getspecific`](https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_getspecific.html).
///    Although this is not guaranteed as stable ABI, in practice many programs
///    assume these registers store thread-local data. [`thread_local!`] is
///    used on all other platforms.
///
/// 2. Increment the allocation operation's invocation count and bytes count
///    (a.k.a. size).
///
/// Allocation information is recorded in thread-local storage to prevent
/// slowdowns from synchronized sharing when using multiple threads, through
/// options like [`threads`](macro@crate::bench#threads).
///
/// Note that allocations in threads not controlled by Divan are not currently
/// counted.
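///
/// As a rough sketch, each [`GlobalAlloc`] hook performs the two steps above
/// (illustrative shape only; see the actual implementation below):
///
/// ```ignore
/// if let Some(mut info) = ThreadAllocInfo::try_current() { // step 1
///     unsafe { info.as_mut() }.tally_alloc(layout.size()); // step 2
/// }
/// ```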
#[derive(Debug, Default)]
pub struct AllocProfiler<Alloc = System> {
    alloc: Alloc,
}

unsafe impl<A: GlobalAlloc> GlobalAlloc for AllocProfiler<A> {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // Tally allocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_alloc(layout.size());
        }

        self.alloc.alloc(layout)
    }

    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
        // Tally allocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_alloc(layout.size());
        }

        self.alloc.alloc_zeroed(layout)
    }

    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
        // Tally reallocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_realloc(layout.size(), new_size);
        }

        self.alloc.realloc(ptr, layout, new_size)
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        // Tally deallocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_dealloc(layout.size());
        }

        self.alloc.dealloc(ptr, layout)
    }
}

impl AllocProfiler {
    /// Profiles the [`System`] allocator.
    #[inline]
    pub const fn system() -> Self {
        Self::new(System)
    }
}

impl<A> AllocProfiler<A> {
    /// Profiles a [`GlobalAlloc`].
    #[inline]
    pub const fn new(alloc: A) -> Self {
        Self { alloc }
    }
}

/// Thread-local allocation information.
#[derive(Clone, Default)]
#[repr(C)]
pub(crate) struct ThreadAllocInfo {
    // NOTE: `tallies` should be ordered first so that `tally_realloc` can
    // directly index `&self` without an offset.
    pub tallies: ThreadAllocTallyMap,

    // NOTE: Max size and count are signed for convenience but can never be
    // negative since they are initialized to 0.
    //
    // PERF: Grouping current/max fields together by count and size makes
    // `tally_alloc` take the least time on M1 Mac.
    pub current_count: ThreadAllocCountSigned,
    pub max_count: ThreadAllocCountSigned,
    pub current_size: ThreadAllocCountSigned,
    pub max_size: ThreadAllocCountSigned,
}

#[cfg(not(target_os = "macos"))]
thread_local! {
    /// Instance specific to the current thread.
    ///
    /// On macOS, we use `ALLOC_PTHREAD_KEY` instead.
    static CURRENT_THREAD_INFO: UnsafeCell<ThreadAllocInfo> = const {
        UnsafeCell::new(ThreadAllocInfo::new())
    };
}

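// NOTE: `CachePadded` presumably keeps this key's storage from sharing a
// cache line with neighboring statics.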
#[cfg(target_os = "macos")]
static ALLOC_PTHREAD_KEY: CachePadded<PThreadKey<ThreadAllocInfo>> = CachePadded(PThreadKey::new());

impl ThreadAllocInfo {
    #[inline]
    pub const fn new() -> Self {
        Self {
            tallies: ThreadAllocTallyMap::new(),
            max_count: 0,
            current_count: 0,
            max_size: 0,
            current_size: 0,
        }
    }

    /// Returns the current thread's allocation information, initializing it on
    /// first access.
    ///
    /// Returns `None` if the thread is terminating and has thus deallocated
    /// its local instance.
    #[inline]
    pub fn current() -> Option<NonNull<Self>> {
        cfg_if! {
            if #[cfg(target_os = "macos")] {
                return Self::try_current().or_else(slow_impl);
            } else {
                Self::try_current()
            }
        }

        #[cfg(target_os = "macos")]
        #[cold]
        #[inline(never)]
        fn slow_impl() -> Option<NonNull<ThreadAllocInfo>> {
            unsafe {
                let layout = Layout::new::<ThreadAllocInfo>();

                let Some(info_alloc) = NonNull::new(System.alloc_zeroed(layout)) else {
                    handle_alloc_error(layout);
                };

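                // Register the info with the pthread key; the closure is run
                // as the key's thread-exit destructor to free the allocation.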
                let success = ALLOC_PTHREAD_KEY.0.set(info_alloc.as_ptr().cast(), |this| {
                    System.dealloc(this.as_ptr().cast(), Layout::new::<ThreadAllocInfo>());
                });

                if !success {
                    System.dealloc(info_alloc.as_ptr(), layout);
                    return None;
                }

                // When using a static thread-local key, write directly because
                // it is undefined behavior to call `pthread_setspecific` with
                // a key that didn't originate from `pthread_key_create`.
                #[cfg(all(not(miri), not(feature = "dyn_thread_local"), target_arch = "x86_64"))]
                crate::util::thread::fast::set_static_thread_local(info_alloc.as_ptr());

                Some(info_alloc.cast())
            }
        }
    }

    /// Returns the current thread's allocation information if initialized.
    ///
    /// Returns `None` if the instance has not yet been allocated or the thread
    /// is terminating and has thus deallocated its local instance.
    #[inline]
    pub fn try_current() -> Option<NonNull<Self>> {
        cfg_if! {
            if #[cfg(target_os = "macos")] {
                // Fast path: static thread local.
                #[cfg(all(
                    not(miri),
                    not(feature = "dyn_thread_local"),
                    target_arch = "x86_64",
                ))]
                return NonNull::new(unsafe {
                    crate::util::thread::fast::get_static_thread_local::<Self>().cast_mut()
                });

                #[allow(unreachable_code)]
                ALLOC_PTHREAD_KEY.0.get()
            } else {
                CURRENT_THREAD_INFO.try_with(|info| unsafe {
                    NonNull::new_unchecked(info.get())
                }).ok()
            }
        }
    }

    /// Resets all values to 0.
    pub fn clear(&mut self) {
        *self = Self::new();
    }

    /// Tallies the total count and size of the allocation operation.
    #[inline]
    pub fn tally_alloc(&mut self, size: usize) {
        self.tally_op(AllocOp::Alloc, size);

        self.current_count += 1;
        self.max_count = self.max_count.max(self.current_count);

        self.current_size += size as ThreadAllocCountSigned;
        self.max_size = self.max_size.max(self.current_size);
    }

    /// Tallies the total count and size of the deallocation operation.
    #[inline]
    pub fn tally_dealloc(&mut self, size: usize) {
        self.tally_op(AllocOp::Dealloc, size);

        self.current_count -= 1;
        self.current_size -= size as ThreadAllocCountSigned;
    }

    /// Tallies the total count and size of the reallocation operation.
    #[inline]
    pub fn tally_realloc(&mut self, old_size: usize, new_size: usize) {
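        // Illustrative example (values are ours, not from the source): for a
        // shrink from old_size = 96 to new_size = 32, `overflowing_sub` wraps,
        // so `is_shrink` is true and `wrapping_abs` recovers the 64-byte
        // delta; for a grow from 32 to 96, `is_shrink` is false and diff = 64.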
        let (diff, is_shrink) = new_size.overflowing_sub(old_size);
        let diff = diff as isize;
        let abs_diff = diff.wrapping_abs() as usize;

        self.tally_op(AllocOp::realloc(is_shrink), abs_diff);

        // NOTE: Realloc does not change allocation count.
        self.current_size += diff as ThreadAllocCountSigned;
        self.max_size = self.max_size.max(self.current_size);
    }

    /// Tallies the total count and size of the given operation.
    #[inline]
    fn tally_op(&mut self, op: AllocOp, size: usize) {
        let tally = self.tallies.get_mut(op);
        tally.count += 1;
        tally.size += size as ThreadAllocCount;
    }
}

/// Allocation numbers being accumulated.
///
/// # Memory Layout
///
/// Aligning to 16 nudges the compiler to emit aligned SIMD operations.
///
/// Placing `count` first generates less code on AArch64.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[repr(C, align(16))]
pub(crate) struct AllocTally<Count> {
    /// The number of times this operation was performed.
    pub count: Count,

    /// The amount of memory this operation changed.
    pub size: Count,
}

pub(crate) type ThreadAllocCount = condtype::num::Usize64;
pub(crate) type ThreadAllocCountSigned = condtype::num::Isize64;

pub(crate) type ThreadAllocTally = AllocTally<ThreadAllocCount>;

pub(crate) type TotalAllocTally = AllocTally<u128>;

impl AllocTally<StatsSet<f64>> {
    pub fn is_zero(&self) -> bool {
        self.count.is_zero() && self.size.is_zero()
    }
}

impl<C> AllocTally<C> {
    #[inline]
    pub fn as_array(&self) -> &[C; 2] {
        // SAFETY: This is `#[repr(C)]`, so we can treat it as a contiguous
        // sequence of items.
        unsafe { &*(self as *const _ as *const _) }
    }
}

/// Allocation number categories.
///
/// Note that grow/shrink are first to improve code generation for `realloc`.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) enum AllocOp {
    Grow,
    Shrink,
    Alloc,
    Dealloc,
}

impl AllocOp {
    pub const ALL: [Self; 4] = {
        use AllocOp::*;

        // Use the same order as declared so that it can be indexed as-is.
        [Grow, Shrink, Alloc, Dealloc]
    };

    #[inline]
    pub fn realloc(shrink: bool) -> Self {
        // This generates the same code as `std::mem::transmute`.
        if shrink {
            Self::Shrink
        } else {
            Self::Grow
        }
    }

    #[inline]
    pub fn name(self) -> &'static str {
        match self {
            Self::Grow => "grow",
            Self::Shrink => "shrink",
            Self::Alloc => "alloc",
            Self::Dealloc => "dealloc",
        }
    }

    #[inline]
    pub fn prefix(self) -> &'static str {
        match self {
            Self::Grow => "grow:",
            Self::Shrink => "shrink:",
            Self::Alloc => "alloc:",
            Self::Dealloc => "dealloc:",
        }
    }
}

/// Values keyed by `AllocOp`.
#[derive(Clone, Copy, Default, PartialEq, Eq)]
pub(crate) struct AllocOpMap<T> {
    pub values: [T; 4],
}

pub(crate) type ThreadAllocTallyMap = AllocOpMap<ThreadAllocTally>;

pub(crate) type TotalAllocTallyMap = AllocOpMap<TotalAllocTally>;

impl<T: fmt::Debug> fmt::Debug for AllocOpMap<T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_map()
            .entries(AllocOp::ALL.iter().map(|&op| (op.name(), self.get(op))))
            .finish()
    }
}

impl ThreadAllocTallyMap {
    #[inline]
    pub const fn new() -> Self {
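        // SAFETY: Every field is an integer, and the all-zero bit pattern is
        // a valid value for integers, so zeroed bytes form a valid `Self`.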
        unsafe { std::mem::transmute([0u8; size_of::<Self>()]) }
    }

    /// Returns `true` if all tallies are 0.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.values.iter().all(|tally| tally.count == 0 && tally.size == 0)
    }

    pub fn add_to_total(&self, total: &mut TotalAllocTallyMap) {
        for (i, value) in self.values.iter().enumerate() {
            total.values[i].count += value.count as u128;
            total.values[i].size += value.size as u128;
        }
    }
}

impl<T> AllocOpMap<T> {
    #[cfg(test)]
    pub fn from_fn<F>(f: F) -> Self
    where
        F: FnMut(AllocOp) -> T,
    {
        Self { values: AllocOp::ALL.map(f) }
    }

    #[inline]
    pub const fn get(&self, op: AllocOp) -> &T {
        &self.values[op as usize]
    }

    #[inline]
    pub fn get_mut(&mut self, op: AllocOp) -> &mut T {
        &mut self.values[op as usize]
    }
}

#[cfg(feature = "internal_benches")]
mod benches {
    use super::*;

    // We want the approach to scale well with thread count.
    const THREADS: &[usize] = &[0, 1, 2, 4, 16];

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_alloc(bencher: crate::Bencher) {
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_alloc(size);
            }
        })
    }

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_dealloc(bencher: crate::Bencher) {
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_dealloc(size);
            }
        })
    }

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_realloc(bencher: crate::Bencher) {
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let new_size = crate::black_box(0);
        let old_size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_realloc(old_size, new_size);
            }
        })
    }

    #[crate::bench_group(crate = crate, threads = THREADS)]
    mod current {
        use super::*;

        #[crate::bench(crate = crate)]
        fn init() -> Option<NonNull<ThreadAllocInfo>> {
            ThreadAllocInfo::current()
        }

        #[crate::bench(crate = crate)]
        fn r#try() -> Option<NonNull<ThreadAllocInfo>> {
            ThreadAllocInfo::try_current()
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tests that `AllocProfiler` is counting correctly.
    #[test]
    fn tally() {
        // Initialize the thread's alloc info.
        //
        // SAFETY: This cannot be kept as a reference and is instead a raw
        // pointer because a reference would cause undefined behavior when
        // `AllocProfiler` attempts to update tallies.
        let mut alloc_info = ThreadAllocInfo::current().unwrap();

        // Resets the allocation tallies and returns the previous tallies.
        let mut take_alloc_tallies = || std::mem::take(unsafe { &mut alloc_info.as_mut().tallies });

        // Start fresh.
        _ = take_alloc_tallies();

        // Helper to create `ThreadAllocTallyMap` since each operation only
        // changes `buf` by 1 `i32`.
        let item_tally = ThreadAllocTally { count: 1, size: size_of::<i32>() as _ };
        let make_tally_map = |op: AllocOp| {
            ThreadAllocTallyMap::from_fn(|other_op| {
                if other_op == op {
                    item_tally
                } else {
                    Default::default()
                }
            })
        };

        // Test zero.
        let mut buf: Vec<i32> = Vec::new();
        assert_eq!(take_alloc_tallies(), Default::default());

        // Test allocation.
        buf.reserve_exact(1);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Alloc));

        // Test grow.
        buf.reserve_exact(2);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Grow));

        // Test shrink.
        buf.shrink_to(1);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Shrink));

        // Test dealloc.
        drop(buf);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Dealloc));

        // Test all of the above together.
        let mut buf: Vec<i32> = Vec::new();
        buf.reserve_exact(1); // alloc
        buf.reserve_exact(2); // grow
        buf.shrink_to(1); // shrink
        drop(buf); // dealloc
        assert_eq!(take_alloc_tallies(), ThreadAllocTallyMap { values: [item_tally; 4] });
    }
}