use std::{
    cell::UnsafeCell,
    fmt,
    mem::{self, MaybeUninit},
    num::NonZeroUsize,
    sync::Barrier,
};

use crate::{
    alloc::{
        AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
    },
    black_box, black_box_drop,
    counter::{
        AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
        IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
    },
    divan::SharedContext,
    stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
    thread_pool::BENCH_POOL,
    time::{FineDuration, Timestamp, UntaggedTimestamp},
    util::{self, sync::SyncWrap, Unit},
};

#[cfg(test)]
mod tests;

mod args;
mod defer;
mod options;

use defer::{DeferSlot, DeferStore};

pub use self::{
    args::{BenchArgs, BenchArgsRunner},
    options::BenchOptions,
};

pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;

/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    pub(crate) context: &'a mut BenchContext<'b>,
    pub(crate) config: C,
}

/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
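///
/// # Examples
///
/// Each configuring call returns a `Bencher` whose configuration is carried in
/// the `C` type parameter, so a chain like the following illustrative sketch
/// is resolved entirely at compile time:
///
/// ```
/// #[divan::bench]
/// fn bench(bencher: divan::Bencher) {
///     bencher
///         .with_inputs(|| String::from("..."))
///         .bench_values(|s| s + "123");
/// }
/// ```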
pub struct BencherConfig<GenI = Unit> {
    gen_input: GenI,
}

impl<C> fmt::Debug for Bencher<'_, '_, C> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Bencher").finish_non_exhaustive()
    }
}

impl<'a, 'b> Bencher<'a, 'b> {
    #[inline]
    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
        Self { context, config: BencherConfig { gen_input: Unit } }
    }
}

impl<'a, 'b> Bencher<'a, 'b> {
    /// Benchmarks a function.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench<O, B>(self, benched: B)
    where
        B: Fn() -> O + Sync,
    {
        // Reusing `bench_values` for a zero-sized non-drop input type should
        // have no overhead.
        self.with_inputs(|| ()).bench_values(|_: ()| benched());
    }

    /// Benchmarks a function on the current thread.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench_local(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench_local<O, B>(self, mut benched: B)
    where
        B: FnMut() -> O,
    {
        // Reusing `bench_local_values` for a zero-sized non-drop input type
        // should have no overhead.
        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
    }

    /// Generate inputs for the [benchmarked function](#input-bench).
    ///
    /// Time spent generating inputs does not affect benchmark timing.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// generator is called on the same thread as the sample loop that uses that
    /// input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
        Bencher { context: self.context, config: BencherConfig { gen_input } }
    }
}

impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
    /// Assign a [`Counter`] for all iterations of the benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter depends on [generated inputs](Self::with_inputs), use
    /// [`Bencher::input_counter`] instead.
    ///
    /// If context is not needed, the counter can instead be set via
    /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
    ///
    /// # Examples
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     let s: String = // ...
    ///     # String::new();
    ///
    ///     bencher
    ///         .counter(BytesCount::of_str(&s))
    ///         .bench(|| {
    ///             divan::black_box(&s).chars().count()
    ///         });
    /// }
    /// ```
    #[doc(alias = "throughput")]
    pub fn counter<C>(self, counter: C) -> Self
    where
        C: IntoCounter,
    {
        let counter = AnyCounter::new(counter);
        self.context.counters.set_counter(counter);
        self
    }
}

/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Calls a closure to create a [`Counter`] for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is gotten by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Creates a [`Counter`] from each input of the benchmarked function.
    ///
    /// This may be used if the input is a [`u8`] through [`u64`], [`usize`],
    /// or any nesting of references to those types.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of items processed when
    /// benchmarking [`FromIterator`] from
    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
    ///
    /// ```
    /// use divan::{Bencher, counter::ItemsCount};
    ///
    /// #[divan::bench]
    /// fn range_to_vec(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> usize {
    ///             // ...
    ///             # 0
    ///         })
    ///         .count_inputs_as::<ItemsCount>()
    ///         .bench_values(|n| -> Vec<usize> {
    ///             (0..n).collect()
    ///         });
    /// }
    /// ```
    #[inline]
    pub fn count_inputs_as<C>(self) -> Self
    where
        C: Counter,
        I: AsCountUInt,
    {
        match KnownCounterKind::of::<C>() {
            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
        }
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local_values`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn bench_values<O, B>(self, benched: B)
    where
        B: Fn(I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-value.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     let mut values = Vec::new();
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_values(|s| {
    ///             // Use input by-value:
    ///             values.push(s);
    ///         });
    /// }
    /// ```
    pub fn bench_local_values<O, B>(self, mut benched: B)
    where
        B: FnMut(I) -> O,
    {
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { input.get().read().assume_init() };

                benched(input)
            },
            // Input ownership is transferred to `benched`.
            |_input| {},
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_refs<O, B>(self, benched: B)
    where
        B: Fn(&mut I) -> O + Sync,
        GenI: Fn() -> I + Sync,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_threaded(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }

    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
    /// provided by-reference.
    ///
    /// Per-iteration means the benchmarked function is called exactly once for
    /// each generated input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_local_refs(|s| {
    ///             // Use input by-reference:
    ///             *s += "123";
    ///         });
    /// }
    /// ```
    pub fn bench_local_refs<O, B>(self, mut benched: B)
    where
        B: FnMut(&mut I) -> O,
    {
        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
        self.context.bench_loop_local(
            self.config.gen_input,
            |input| {
                // SAFETY: Input is guaranteed to be initialized and not
                // currently referenced by anything else.
                let input = unsafe { (*input.get()).assume_init_mut() };

                benched(input)
            },
            // Input ownership was not transferred to `benched`.
            |input| {
                // SAFETY: This function is called after `benched` outputs are
                // dropped, so we have exclusive access.
                unsafe { (*input.get()).assume_init_drop() }
            },
        );
    }
}

/// State machine for how the benchmark is being run.
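///
/// A run starts in the mode chosen by `BenchContext::initial_mode`. Tuning
/// doubles `sample_size` until a sample's duration exceeds 100x the timer's
/// precision, at which point the mode switches to `Collect` (see
/// `bench_loop_threaded`).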
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    Tune { sample_size: u32 },

    /// Simply collect samples.
    Collect { sample_size: u32 },
}

impl BenchMode {
    #[inline]
    pub fn is_test(self) -> bool {
        matches!(self, Self::Test)
    }

    #[inline]
    pub fn is_tune(self) -> bool {
        matches!(self, Self::Tune { .. })
    }

    #[inline]
    pub fn is_collect(self) -> bool {
        matches!(self, Self::Collect { .. })
    }

    #[inline]
    pub fn sample_size(self) -> u32 {
        match self {
            Self::Test => 1,
            Self::Tune { sample_size } | Self::Collect { sample_size } => sample_size,
        }
    }
}

/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions<'a>,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}

impl<'a> BenchContext<'a> {
    /// Creates a new benchmarking context.
    pub fn new(
        shared_context: &'a SharedContext,
        options: &'a BenchOptions,
        thread_count: NonZeroUsize,
    ) -> Self {
        Self {
            shared_context,
            options,
            thread_count,
            did_run: false,
            samples: SampleCollection::default(),
            counters: options.counters.to_collection(),
        }
    }

    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`.
        unsafe {
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }

    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.thread_count` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        let record_sample = self.sample_recorder(gen_input, benched, drop_input);

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;

        let is_single_thread = aux_thread_count == 0;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, this
        // only contains `thread_count` many elements at a time.
        let mut raw_samples = Vec::<Option<RawSample>>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        let bench_overheads = timer.bench_overheads();

        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper:
            let record_sample = || -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let ([start, end], alloc_info) =
                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);

                RawSample { start, end, timer, alloc_info, counter_totals }
            };

            // Sample loop:
            raw_samples.clear();
            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());
            // Convert `&[Option<RawSample>]` to `&[RawSample]`.
            let raw_samples: &[RawSample] = {
                if let Some(thread) = raw_samples
                    .iter()
                    .enumerate()
                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
                {
                    panic!("Divan benchmarking thread {thread} panicked");
                }

                unsafe {
                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
                }
            };

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Returns the sample's duration adjusted for overhead.
            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);

                FineDuration {
                    picos: raw_sample
                        .duration()
                        .clamp_to(timer_precision)
                        .picos
                        .saturating_sub(overhead.picos),
                }
                .clamp_to(timer_precision)
            };

            for raw_sample in raw_samples {
                let sample_index = self.samples.time_samples.len();

                self.samples
                    .time_samples
                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });

                if !raw_sample.alloc_info.tallies.is_empty() {
                    self.samples
                        .alloc_info_by_sample
                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
                }

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }

        // Reset flag for ignoring allocations.
        crate::alloc::IGNORE_ALLOC.set(false);
    }

    /// Returns a closure that takes the sample size and input counter, and then
    /// returns a newly recorded sample.
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - Drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer during
        //   the sample loop. The allocation is reused between samples to reduce
        //   time spent between samples.

        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
            let mut defer_store = DeferStore::<I, O>::default();

            let mut saved_alloc_info = ThreadAllocInfo::new();
            let mut save_alloc_info = || {
                if crate::alloc::IGNORE_ALLOC.get() {
                    return;
                }

                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
                    // SAFETY: We have exclusive access.
                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
                }
            };

            // Synchronize all threads to start timed section simultaneously and
            // clear every thread's memory profiling info.
            //
            // This ensures work external to the timed section does not affect
            // the timing of other threads.
            let sync_threads = |is_start: bool| {
                sync_impl(barrier, is_start);

                // Monomorphize implementation to reduce code size.
                #[inline(never)]
                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
                    // Ensure benchmarked section has a `ThreadAllocInfo`
                    // allocated for the current thread and clear previous info.
                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };

                    // Synchronize all threads.
                    //
                    // This is the final synchronization point for the end.
                    if let Some(barrier) = barrier {
                        barrier.wait();
                    }

                    if let Some(mut alloc_info) = alloc_info {
                        // SAFETY: We have exclusive access.
                        let alloc_info = unsafe { alloc_info.as_mut() };

                        alloc_info.clear();

                        // Synchronize all threads.
                        if let Some(barrier) = barrier {
                            barrier.wait();
                        }
                    }
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            if size_of::<I>() == 0 && (size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads(true);
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                sync_threads(false);
                save_alloc_info();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // Output only needs drop if ZST.
                    if size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // and outputs in the sample loop.
                        black_box(defer_slots_slice);

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            black_box_drop(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // in the sample loop.
                        black_box(defer_inputs_slice);

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let interval = unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            };

            (interval, saved_alloc_info)
        }
    }

    #[inline]
    fn initial_mode(&self) -> BenchMode {
        if self.shared_context.action.is_test() {
            BenchMode::Test
        } else if let Some(sample_size) = self.options.sample_size {
            BenchMode::Collect { sample_size }
        } else {
            BenchMode::Tune { sample_size: 1 }
        }
    }

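    /// Computes user-facing statistics from the recorded samples, counters,
    /// and allocation info.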
    pub fn compute_stats(&self) -> Stats {
        let time_samples = &self.samples.time_samples;
        let alloc_info_by_sample = &self.samples.alloc_info_by_sample;

        let sample_count = time_samples.len();
        let sample_size = self.samples.sample_size;

        let total_count = self.samples.iter_count();

        let total_duration = self.samples.total_duration();
        let mean_duration = FineDuration {
            picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
        };

        // Samples sorted by duration.
        let sorted_samples = self.samples.sorted_samples();
        let median_samples = util::slice_middle(&sorted_samples);

        let index_of_sample = |sample: &TimeSample| -> usize {
            util::slice_ptr_index(&self.samples.time_samples, sample)
        };

        let counter_count_for_sample =
            |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
                let counts = self.counters.counts(counter_kind);

                let index = if self.counters.uses_input_counts(counter_kind) {
                    index_of_sample(sample)
                } else {
                    0
                };

                counts.get(index).copied()
            };

        let min_duration =
            sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
        let max_duration =
            sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();

        let median_duration = if median_samples.is_empty() {
            FineDuration::default()
        } else {
            let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
            FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
        };

        let counts = KnownCounterKind::ALL.map(|counter_kind| {
            let median: MaxCountUInt = {
                let mut sum: u128 = 0;

                for sample in median_samples {
                    let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;

                    // Saturating add in case `MaxCountUInt` is wider than `u64`.
                    sum = sum.saturating_add(sample_count);
                }

                // Use `max(1)` to avoid dividing by zero when no samples exist.
                (sum / median_samples.len().max(1) as u128) as MaxCountUInt
            };

            Some(StatsSet {
                fastest: sorted_samples
                    .first()
                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
                slowest: sorted_samples
                    .last()
                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
                median,
                mean: self.counters.mean_count(counter_kind),
            })
        });

        let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
            sample
                .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
                .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
        };

        let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
            sample_alloc_info(sample)
                .map(|alloc_info| alloc_info.tallies.get(op))
                .copied()
                .unwrap_or_default()
        };

        let mut alloc_total_max_count = 0u128;
        let mut alloc_total_max_size = 0u128;
        let mut alloc_total_tallies = TotalAllocTallyMap::default();

        for alloc_info in alloc_info_by_sample.values() {
            alloc_total_max_count += alloc_info.max_count as u128;
            alloc_total_max_size += alloc_info.max_size as u128;
            alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
        }

        let sample_size = f64::from(sample_size);
        Stats {
            sample_count: sample_count as u32,
            iter_count: total_count,
            time: StatsSet {
                fastest: min_duration,
                slowest: max_duration,
                median: median_duration,
                mean: mean_duration,
            },
            max_alloc: StatsSet {
                fastest: {
                    let alloc_info = sample_alloc_info(sorted_samples.first().copied());

                    AllocTally {
                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
                            / sample_size,
                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
                            / sample_size,
                    }
                },
                slowest: {
                    let alloc_info = sample_alloc_info(sorted_samples.last().copied());

                    AllocTally {
                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
                            / sample_size,
                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
                            / sample_size,
                    }
                },
                // TODO: Switch to median of alloc info itself, rather than
                // basing off of median times.
                median: {
                    let alloc_info_for_median =
                        |index| sample_alloc_info(median_samples.get(index).copied());

                    let max_count_for_median = |index: usize| -> f64 {
                        alloc_info_for_median(index)
                            .map(|info| info.max_count as f64)
                            .unwrap_or_default()
                    };

                    let max_size_for_median = |index: usize| -> f64 {
                        alloc_info_for_median(index)
                            .map(|info| info.max_size as f64)
                            .unwrap_or_default()
                    };

                    let median_count = median_samples.len().max(1) as f64;

                    let median_max_count = max_count_for_median(0) + max_count_for_median(1);
                    let median_max_size = max_size_for_median(0) + max_size_for_median(1);

                    AllocTally {
                        count: median_max_count / median_count / sample_size,
                        size: median_max_size / median_count / sample_size,
                    }
                },
                mean: AllocTally {
                    count: alloc_total_max_count as f64 / total_count as f64,
                    size: alloc_total_max_size as f64 / total_count as f64,
                },
            }
            .transpose(),
            alloc_tallies: AllocOpMap {
                values: AllocOp::ALL
                    .map(|op| StatsSet {
                        fastest: {
                            let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);

                            AllocTally {
                                count: fastest.count as f64 / sample_size,
                                size: fastest.size as f64 / sample_size,
                            }
                        },
                        slowest: {
                            let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);

                            AllocTally {
                                count: slowest.count as f64 / sample_size,
                                size: slowest.size as f64 / sample_size,
                            }
                        },
                        median: {
                            let tally_for_median = |index: usize| -> ThreadAllocTally {
                                sample_alloc_tally(median_samples.get(index).copied(), op)
                            };

                            let a = tally_for_median(0);
                            let b = tally_for_median(1);

                            let median_count = median_samples.len().max(1) as f64;

                            let avg_count = (a.count as f64 + b.count as f64) / median_count;
                            let avg_size = (a.size as f64 + b.size as f64) / median_count;

                            AllocTally {
                                count: avg_count / sample_size,
                                size: avg_size / sample_size,
                            }
                        },
                        mean: {
                            let tally = alloc_total_tallies.get(op);
                            AllocTally {
                                count: tally.count as f64 / total_count as f64,
                                size: tally.size as f64 / total_count as f64,
                            }
                        },
                    })
                    .map(StatsSet::transpose),
            },
            counts,
        }
    }
}

impl<T> StatsSet<AllocTally<T>> {
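    /// Turns stats of `(count, size)` tallies into a `(count, size)` pair of
    /// stat sets, so each field can be reported separately.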
    #[inline]
    pub fn transpose(self) -> AllocTally<StatsSet<T>> {
        AllocTally {
            count: StatsSet {
                fastest: self.fastest.count,
                slowest: self.slowest.count,
                median: self.median.count,
                mean: self.mean.count,
            },
            size: StatsSet {
                fastest: self.fastest.size,
                slowest: self.slowest.size,
                median: self.median.size,
                mean: self.mean.size,
            },
        }
    }
}