use std::{alloc::*, fmt, ptr::NonNull};

use cfg_if::cfg_if;

use crate::{stats::StatsSet, util::sync::AtomicFlag};

#[cfg(target_os = "macos")]
use crate::util::{sync::CachePadded, thread::PThreadKey};

#[cfg(not(target_os = "macos"))]
use std::cell::UnsafeCell;

/// The `AllocProfiler` when running crate-internal tests.
///
/// This enables us to test it for:
/// - Undefined behavior with Miri
/// - Correctness when tallying
#[cfg(test)]
#[global_allocator]
static ALLOC: AllocProfiler = AllocProfiler::system();

/// Whether to ignore allocation info set during the benchmark.
pub(crate) static IGNORE_ALLOC: AtomicFlag = AtomicFlag::new(false);

/// Measures [`GlobalAlloc`] memory usage.
///
/// # Examples
///
/// The default usage is to create a
/// [`#[global_allocator]`](macro@global_allocator) that wraps the [`System`]
/// allocator with [`AllocProfiler::system()`]:
///
/// ```
/// use std::collections::*;
/// use divan::AllocProfiler;
///
/// #[global_allocator]
/// static ALLOC: AllocProfiler = AllocProfiler::system();
///
/// fn main() {
///     divan::main();
/// }
///
/// #[divan::bench(types = [
///     Vec<i32>,
///     LinkedList<i32>,
///     HashSet<i32>,
/// ])]
/// fn from_iter<T>() -> T
/// where
///     T: FromIterator<i32>,
/// {
///     (0..100).collect()
/// }
///
/// #[divan::bench(types = [
///     Vec<i32>,
///     LinkedList<i32>,
///     HashSet<i32>,
/// ])]
/// fn drop<T>(bencher: divan::Bencher)
/// where
///     T: FromIterator<i32>,
/// {
///     bencher
///         .with_inputs(|| (0..100).collect::<T>())
///         .bench_values(std::mem::drop);
/// }
/// ```
///
/// Wrap other [`GlobalAlloc`] implementations like
/// [`mimalloc`](https://docs.rs/mimalloc) with [`AllocProfiler::new()`]:
///
/// ```
/// use divan::AllocProfiler;
/// use mimalloc::MiMalloc;
///
/// # #[cfg(not(miri))]
/// #[global_allocator]
/// static ALLOC: AllocProfiler<MiMalloc> = AllocProfiler::new(MiMalloc);
/// ```
///
/// See [`string`](https://github.com/nvzqz/divan/blob/main/examples/benches/string.rs)
/// and [`collections`](https://github.com/nvzqz/divan/blob/main/examples/benches/collections.rs)
/// benchmarks for more examples.
///
/// # Implementation
///
/// Allocation information is collected while Divan is also measuring time. As
/// a result, counting allocations affects timing.
///
/// To reduce Divan's footprint during benchmarking:
/// - Allocation information is recorded in thread-local storage to prevent
///   contention when benchmarks involve multiple threads, either through
///   options like [`threads`](macro@crate::bench#threads) or by spawning
///   their own threads internally.
/// - Tallying does not check for overflow and assumes it will not happen.
///   This is subject to change in the future.
/// - Fast thread-local storage access is assembly-optimized on macOS.
///
/// Allocation information is the only data Divan records outside of timing,
/// so its bookkeeping is also the only non-timing code that affects timing.
/// Recording allocation information takes two steps:
///
/// 1. Load the thread-local slot for allocation information.
///
///    On macOS, this is done via the
///    [`gs`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L34)/[`tpidrro_el0`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L47)
///    registers for
///    [`pthread_getspecific`](https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_getspecific.html).
///    Although this is not guaranteed as stable ABI, in practice many programs
///    assume these registers store thread-local data. [`thread_local!`] is
///    used on all other platforms.
///
/// 2. Increment the allocation operation's invocation count and bytes count
///    (a.k.a. size).
///
/// Allocation information is recorded in thread-local storage to prevent
/// slowdowns from synchronized sharing when using multiple threads, through
/// options like [`threads`](macro@crate::bench#threads).
///
/// Note that allocations in threads not controlled by Divan are not currently
/// counted.
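///
/// As a rough sketch, each [`GlobalAlloc`] hook performs the two steps above
/// (illustrative shape only; see the actual implementation below):
///
/// ```ignore
/// if let Some(mut info) = ThreadAllocInfo::try_current() { // step 1
///     unsafe { info.as_mut() }.tally_alloc(layout.size()); // step 2
/// }
/// ```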
#[derive(Debug, Default)]
pub struct AllocProfiler<Alloc = System> {
    alloc: Alloc,
}

unsafe impl<A: GlobalAlloc> GlobalAlloc for AllocProfiler<A> {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // Tally allocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_alloc(layout.size());
        }

        self.alloc.alloc(layout)
    }

    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
        // Tally allocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_alloc(layout.size());
        }

        self.alloc.alloc_zeroed(layout)
    }

    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
        // Tally reallocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_realloc(layout.size(), new_size);
        }

        self.alloc.realloc(ptr, layout, new_size)
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        // Tally deallocation count.
        if let Some(mut info) = ThreadAllocInfo::try_current() {
            // SAFETY: We have exclusive access.
            let info = unsafe { info.as_mut() };

            info.tally_dealloc(layout.size());
        }

        self.alloc.dealloc(ptr, layout)
    }
}

impl AllocProfiler {
    /// Profiles the [`System`] allocator.
    #[inline]
    pub const fn system() -> Self {
        Self::new(System)
    }
}

impl<A> AllocProfiler<A> {
    /// Profiles a [`GlobalAlloc`].
    #[inline]
    pub const fn new(alloc: A) -> Self {
        Self { alloc }
    }
}

/// Thread-local allocation information.
#[derive(Clone, Default)]
#[repr(C)]
pub(crate) struct ThreadAllocInfo {
    // NOTE: `tallies` should be ordered first so that `tally_realloc` can
    // directly index `&self` without an offset.
    pub tallies: ThreadAllocTallyMap,

    // NOTE: Max size and count are signed for convenience but can never be
    // negative since they are initialized to 0.
    //
    // PERF: Grouping current/max fields together by count and size makes
    // `tally_alloc` take the least time on M1 Mac.
    pub current_count: ThreadAllocCountSigned,
    pub max_count: ThreadAllocCountSigned,
    pub current_size: ThreadAllocCountSigned,
    pub max_size: ThreadAllocCountSigned,
}

#[cfg(not(target_os = "macos"))]
thread_local! {
    /// Instance specific to the current thread.
    ///
    /// On macOS, we use `ALLOC_PTHREAD_KEY` instead.
    static CURRENT_THREAD_INFO: UnsafeCell<ThreadAllocInfo> = const {
        UnsafeCell::new(ThreadAllocInfo::new())
    };
}

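// NOTE: `CachePadded` presumably keeps this key's storage from sharing a
// cache line with neighboring statics.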
#[cfg(target_os = "macos")]
static ALLOC_PTHREAD_KEY: CachePadded<PThreadKey<ThreadAllocInfo>> = CachePadded(PThreadKey::new());

impl ThreadAllocInfo {
    #[inline]
    pub const fn new() -> Self {
        Self {
            tallies: ThreadAllocTallyMap::new(),
            max_count: 0,
            current_count: 0,
            max_size: 0,
            current_size: 0,
        }
    }

    /// Returns the current thread's allocation information, initializing it on
    /// first access.
    ///
    /// Returns `None` if the thread is terminating and has thus deallocated
    /// its local instance.
    #[inline]
    pub fn current() -> Option<NonNull<Self>> {
        cfg_if! {
            if #[cfg(target_os = "macos")] {
                return Self::try_current().or_else(slow_impl);
            } else {
                Self::try_current()
            }
        }

        #[cfg(target_os = "macos")]
        #[cold]
        #[inline(never)]
        fn slow_impl() -> Option<NonNull<ThreadAllocInfo>> {
            unsafe {
                let layout = Layout::new::<ThreadAllocInfo>();

                let Some(info_alloc) = NonNull::new(System.alloc_zeroed(layout)) else {
                    handle_alloc_error(layout);
                };

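                // Register the info with the pthread key; the closure is run
                // as the key's thread-exit destructor to free the allocation.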
                let success = ALLOC_PTHREAD_KEY.0.set(info_alloc.as_ptr().cast(), |this| {
                    System.dealloc(this.as_ptr().cast(), Layout::new::<ThreadAllocInfo>());
                });

                if !success {
                    System.dealloc(info_alloc.as_ptr(), layout);
                    return None;
                }

                // When using a static thread-local key, write directly because
                // it is undefined behavior to call `pthread_setspecific` with
                // a key that didn't originate from `pthread_key_create`.
                #[cfg(all(not(miri), not(feature = "dyn_thread_local"), target_arch = "x86_64"))]
                crate::util::thread::fast::set_static_thread_local(info_alloc.as_ptr());

                Some(info_alloc.cast())
            }
        }
    }

    /// Returns the current thread's allocation information if initialized.
    ///
    /// Returns `None` if the instance has not yet been allocated or the thread
    /// is terminating and has thus deallocated its local instance.
    #[inline]
    pub fn try_current() -> Option<NonNull<Self>> {
        cfg_if! {
            if #[cfg(target_os = "macos")] {
                // Fast path: static thread local.
                #[cfg(all(
                    not(miri),
                    not(feature = "dyn_thread_local"),
                    target_arch = "x86_64",
                ))]
                return NonNull::new(unsafe {
                    crate::util::thread::fast::get_static_thread_local::<Self>().cast_mut()
                });

                #[allow(unreachable_code)]
                ALLOC_PTHREAD_KEY.0.get()
            } else {
                CURRENT_THREAD_INFO.try_with(|info| unsafe {
                    NonNull::new_unchecked(info.get())
                }).ok()
            }
        }
    }

    /// Resets all values to 0.
    pub fn clear(&mut self) {
        *self = Self::new();
    }

    /// Tallies the total count and size of the allocation operation.
    #[inline]
    pub fn tally_alloc(&mut self, size: usize) {
        self.tally_op(AllocOp::Alloc, size);

        self.current_count += 1;
        self.max_count = self.max_count.max(self.current_count);

        self.current_size += size as ThreadAllocCountSigned;
        self.max_size = self.max_size.max(self.current_size);
    }

    /// Tallies the total count and size of the deallocation operation.
    #[inline]
    pub fn tally_dealloc(&mut self, size: usize) {
        self.tally_op(AllocOp::Dealloc, size);

        self.current_count -= 1;
        self.current_size -= size as ThreadAllocCountSigned;
    }

    /// Tallies the total count and size of the reallocation operation.
    #[inline]
    pub fn tally_realloc(&mut self, old_size: usize, new_size: usize) {
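        // Illustrative example (values are ours, not from the source): for a
        // shrink from old_size = 96 to new_size = 32, `overflowing_sub` wraps,
        // so `is_shrink` is true and `wrapping_abs` recovers the 64-byte
        // delta; for a grow from 32 to 96, `is_shrink` is false and diff = 64.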
        let (diff, is_shrink) = new_size.overflowing_sub(old_size);
        let diff = diff as isize;
        let abs_diff = diff.wrapping_abs() as usize;

        self.tally_op(AllocOp::realloc(is_shrink), abs_diff);

        // NOTE: Realloc does not change allocation count.
        self.current_size += diff as ThreadAllocCountSigned;
        self.max_size = self.max_size.max(self.current_size);
    }

    /// Tallies the total count and size of the given operation.
    #[inline]
    fn tally_op(&mut self, op: AllocOp, size: usize) {
        let tally = self.tallies.get_mut(op);
        tally.count += 1;
        tally.size += size as ThreadAllocCount;
    }
}

/// Allocation numbers being accumulated.
///
/// # Memory Layout
///
/// Aligning to 16 nudges the compiler to emit aligned SIMD operations.
///
/// Placing `count` first generates less code on AArch64.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[repr(C, align(16))]
pub(crate) struct AllocTally<Count> {
    /// The number of times this operation was performed.
    pub count: Count,

    /// The amount of memory this operation changed.
    pub size: Count,
}

pub(crate) type ThreadAllocCount = condtype::num::Usize64;
pub(crate) type ThreadAllocCountSigned = condtype::num::Isize64;

pub(crate) type ThreadAllocTally = AllocTally<ThreadAllocCount>;

pub(crate) type TotalAllocTally = AllocTally<u128>;

impl AllocTally<StatsSet<f64>> {
    pub fn is_zero(&self) -> bool {
        self.count.is_zero() && self.size.is_zero()
    }
}

impl<C> AllocTally<C> {
    #[inline]
    pub fn as_array(&self) -> &[C; 2] {
        // SAFETY: This is `#[repr(C)]`, so we can treat it as a contiguous
        // sequence of items.
        unsafe { &*(self as *const _ as *const _) }
    }
}

/// Allocation number categories.
///
/// Note that grow/shrink are first to improve code generation for `realloc`.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) enum AllocOp {
    Grow,
    Shrink,
    Alloc,
    Dealloc,
}

impl AllocOp {
    pub const ALL: [Self; 4] = {
        use AllocOp::*;

        // Use the same order as declared so that it can be indexed as-is.
        [Grow, Shrink, Alloc, Dealloc]
    };

    #[inline]
    pub fn realloc(shrink: bool) -> Self {
        // This generates the same code as `std::mem::transmute`.
        if shrink {
            Self::Shrink
        } else {
            Self::Grow
        }
    }

    #[inline]
    pub fn name(self) -> &'static str {
        match self {
            Self::Grow => "grow",
            Self::Shrink => "shrink",
            Self::Alloc => "alloc",
            Self::Dealloc => "dealloc",
        }
    }

    #[inline]
    pub fn prefix(self) -> &'static str {
        match self {
            Self::Grow => "grow:",
            Self::Shrink => "shrink:",
            Self::Alloc => "alloc:",
            Self::Dealloc => "dealloc:",
        }
    }
}

/// Values keyed by `AllocOp`.
#[derive(Clone, Copy, Default, PartialEq, Eq)]
pub(crate) struct AllocOpMap<T> {
    pub values: [T; 4],
}

pub(crate) type ThreadAllocTallyMap = AllocOpMap<ThreadAllocTally>;

pub(crate) type TotalAllocTallyMap = AllocOpMap<TotalAllocTally>;

impl<T: fmt::Debug> fmt::Debug for AllocOpMap<T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_map()
            .entries(AllocOp::ALL.iter().map(|&op| (op.name(), self.get(op))))
            .finish()
    }
}

impl ThreadAllocTallyMap {
    #[inline]
    pub const fn new() -> Self {
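        // SAFETY: Every field is an integer, and the all-zero bit pattern is
        // a valid value for integers, so zeroed bytes form a valid `Self`.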
        unsafe { std::mem::transmute([0u8; size_of::<Self>()]) }
    }

    /// Returns `true` if all tallies are 0.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.values.iter().all(|tally| tally.count == 0 && tally.size == 0)
    }

    pub fn add_to_total(&self, total: &mut TotalAllocTallyMap) {
        for (i, value) in self.values.iter().enumerate() {
            total.values[i].count += value.count as u128;
            total.values[i].size += value.size as u128;
        }
    }
}

impl<T> AllocOpMap<T> {
    #[cfg(test)]
    pub fn from_fn<F>(f: F) -> Self
    where
        F: FnMut(AllocOp) -> T,
    {
        Self { values: AllocOp::ALL.map(f) }
    }

    #[inline]
    pub const fn get(&self, op: AllocOp) -> &T {
        &self.values[op as usize]
    }

    #[inline]
    pub fn get_mut(&mut self, op: AllocOp) -> &mut T {
        &mut self.values[op as usize]
    }
}

#[cfg(feature = "internal_benches")]
mod benches {
    use super::*;

    // We want the approach to scale well with thread count.
    const THREADS: &[usize] = &[0, 1, 2, 4, 16];

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_alloc(bencher: crate::Bencher) {
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_alloc(size);
            }
        })
    }

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_dealloc(bencher: crate::Bencher) {
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_dealloc(size);
            }
        })
    }

    #[crate::bench(crate = crate, threads = THREADS)]
    fn tally_realloc(bencher: crate::Bencher) {
        IGNORE_ALLOC.set(true);

        // Using 0 simulates tallying without affecting benchmark reporting.
        let new_size = crate::black_box(0);
        let old_size = crate::black_box(0);

        bencher.bench(|| {
            if let Some(mut info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                let info = unsafe { info.as_mut() };

                info.tally_realloc(old_size, new_size);
            }
        })
    }

    #[crate::bench_group(crate = crate, threads = THREADS)]
    mod current {
        use super::*;

        #[crate::bench(crate = crate)]
        fn init() -> Option<NonNull<ThreadAllocInfo>> {
            ThreadAllocInfo::current()
        }

        #[crate::bench(crate = crate)]
        fn r#try() -> Option<NonNull<ThreadAllocInfo>> {
            ThreadAllocInfo::try_current()
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tests that `AllocProfiler` is counting correctly.
    #[test]
    fn tally() {
        // Initialize the thread's alloc info.
        //
        // SAFETY: This cannot be kept as a reference and is instead a raw
        // pointer because a reference would cause undefined behavior when
        // `AllocProfiler` attempts to update tallies.
        let mut alloc_info = ThreadAllocInfo::current().unwrap();

        // Resets the allocation tallies and returns the previous tallies.
        let mut take_alloc_tallies = || std::mem::take(unsafe { &mut alloc_info.as_mut().tallies });

        // Start fresh.
        _ = take_alloc_tallies();

        // Helper to create `ThreadAllocTallyMap` since each operation only
        // changes `buf` by 1 `i32`.
        let item_tally = ThreadAllocTally { count: 1, size: size_of::<i32>() as _ };
        let make_tally_map = |op: AllocOp| {
            ThreadAllocTallyMap::from_fn(|other_op| {
                if other_op == op {
                    item_tally
                } else {
                    Default::default()
                }
            })
        };

        // Test zero.
        let mut buf: Vec<i32> = Vec::new();
        assert_eq!(take_alloc_tallies(), Default::default());

        // Test allocation.
        buf.reserve_exact(1);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Alloc));

        // Test grow.
        buf.reserve_exact(2);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Grow));

        // Test shrink.
        buf.shrink_to(1);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Shrink));

        // Test dealloc.
        drop(buf);
        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Dealloc));

        // Test all of the above together.
        let mut buf: Vec<i32> = Vec::new();
        buf.reserve_exact(1); // alloc
        buf.reserve_exact(2); // grow
        buf.shrink_to(1); // shrink
        drop(buf); // dealloc
        assert_eq!(take_alloc_tallies(), ThreadAllocTallyMap { values: [item_tally; 4] });
    }
}