use std::{
    cell::UnsafeCell,
    fmt,
    mem::{self, MaybeUninit},
    num::NonZeroUsize,
    sync::Barrier,
};

use crate::{
    alloc::{
        AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
    },
    black_box, black_box_drop,
    counter::{
        AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
        IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
    },
    divan::SharedContext,
    stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
    thread_pool::BENCH_POOL,
    time::{FineDuration, Timestamp, UntaggedTimestamp},
    util::{self, sync::SyncWrap, Unit},
};

#[cfg(test)]
mod tests;

mod args;
mod defer;
mod options;

use defer::{DeferSlot, DeferStore};

pub use self::{
    args::{BenchArgs, BenchArgsRunner},
    options::BenchOptions,
};

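/// The number of samples collected when the user does not set `sample_count`.
///
/// A "sample" is `sample_size` consecutive runs of the benchmarked function,
/// timed as a single measurement.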
pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;

/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
///
/// # Examples
///
/// ```
/// use divan::{Bencher, black_box};
///
/// #[divan::bench]
/// fn copy_from_slice(bencher: Bencher) {
///     // Input and output buffers get used in the closure.
///     let src = (0..100).collect::<Vec<i32>>();
///     let mut dst = vec![0; src.len()];
///
///     bencher.bench_local(|| {
///         black_box(&mut dst).copy_from_slice(black_box(&src));
///     });
/// }
/// ```
#[must_use = "a benchmark function must be registered"]
pub struct Bencher<'a, 'b, C = BencherConfig> {
    pub(crate) context: &'a mut BenchContext<'b>,
    pub(crate) config: C,
}

/// Public-in-private type for statically-typed `Bencher` configuration.
///
/// This enables configuring `Bencher` using the builder pattern with zero
/// runtime cost.
pub struct BencherConfig<GenI = Unit> {
    gen_input: GenI,
}

impl<C> fmt::Debug for Bencher<'_, '_, C> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Bencher").finish_non_exhaustive()
    }
}

impl<'a, 'b> Bencher<'a, 'b> {
    #[inline]
    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
        Self { context, config: BencherConfig { gen_input: Unit } }
    }
}

impl<'a, 'b> Bencher<'a, 'b> {
    /// Benchmarks a function.
    ///
    /// The function can be benchmarked in parallel using the [`threads`
    /// option](macro@crate::bench#threads). If the function is strictly
    /// single-threaded, use [`Bencher::bench_local`] instead.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench<O, B>(self, benched: B)
    where
        B: Fn() -> O + Sync,
    {
        // Reusing `bench_values` for a zero-sized non-drop input type should
        // have no overhead.
        self.with_inputs(|| ()).bench_values(|_: ()| benched());
    }

    /// Benchmarks a function on the current thread.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher.bench_local(|| {
    ///         // Benchmarked code...
    ///     });
    /// }
    /// ```
    pub fn bench_local<O, B>(self, mut benched: B)
    where
        B: FnMut() -> O,
    {
        // Reusing `bench_local_values` for a zero-sized non-drop input type
        // should have no overhead.
        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
    }

    /// Generates inputs for the [benchmarked function](#input-bench).
    ///
    /// Time spent generating inputs does not affect benchmark timing.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// generator is called on the same thread as the sample loop that uses that
    /// input.
    ///
    /// # Examples
    ///
    /// ```
    /// #[divan::bench]
    /// fn bench(bencher: divan::Bencher) {
    ///     bencher
    ///         .with_inputs(|| {
    ///             // Generate input:
    ///             String::from("...")
    ///         })
    ///         .bench_values(|s| {
    ///             // Use input by-value:
    ///             s + "123"
    ///         });
    /// }
    /// ```
    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
        Bencher { context: self.context, config: BencherConfig { gen_input } }
    }
}

impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
    /// Assigns a [`Counter`] for all iterations of the benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter depends on [generated inputs](Self::with_inputs), use
    /// [`Bencher::input_counter`] instead.
    ///
    /// If context is not needed, the counter can instead be set via
    /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
    ///
    /// # Examples
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     let s: String = // ...
    ///     # String::new();
    ///
    ///     bencher
    ///         .counter(BytesCount::of_str(&s))
    ///         .bench(|| {
    ///             divan::black_box(&s).chars().count()
    ///         });
    /// }
    /// ```
    #[doc(alias = "throughput")]
    pub fn counter<C>(self, counter: C) -> Self
    where
        C: IntoCounter,
    {
        let counter = AnyCounter::new(counter);
        self.context.counters.set_counter(counter);
        self
    }
}

/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
where
    GenI: FnMut() -> I,
{
    /// Calls a closure to create a [`Counter`] for each input of the
    /// benchmarked function.
    ///
    /// This will either:
    /// - Assign a new counter
    /// - Override an existing counter of the same type
    ///
    /// If the counter is constant, use [`Bencher::counter`] instead.
    ///
    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
    /// counter is called on the same thread as the sample loop that generates
    /// and uses that input.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of bytes processed when
    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
    /// is obtained by calling [`BytesCount::of_str`] on each iteration's input
    /// [`String`].
    ///
    /// ```
    /// use divan::{Bencher, counter::BytesCount};
    ///
    /// #[divan::bench]
    /// fn char_count(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> String {
    ///             // ...
    ///             # String::new()
    ///         })
    ///         .input_counter(BytesCount::of_str)
    ///         .bench_refs(|s| {
    ///             s.chars().count()
    ///         });
    /// }
    /// ```
    pub fn input_counter<C, F>(self, make_counter: F) -> Self
    where
        F: Fn(&I) -> C + Sync + 'static,
        C: IntoCounter,
    {
        self.context.counters.set_input_counter(make_counter);
        self
    }

    /// Creates a [`Counter`] from each input of the benchmarked function.
    ///
    /// This may be used if the [input generator](Self::with_inputs) returns
    /// [`u8`]–[`u64`], [`usize`], or any nesting of references to those types.
    ///
    /// # Examples
    ///
    /// The following example emits info for the number of items processed when
    /// benchmarking [`FromIterator`] from
    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
    ///
    /// ```
    /// use divan::{Bencher, counter::ItemsCount};
    ///
    /// #[divan::bench]
    /// fn range_to_vec(bencher: Bencher) {
    ///     bencher
    ///         .with_inputs(|| -> usize {
    ///             // ...
    ///             # 0
    ///         })
    ///         .count_inputs_as::<ItemsCount>()
    ///         .bench_values(|n| -> Vec<usize> {
    ///             (0..n).collect()
    ///         });
    /// }
    /// ```
    #[inline]
    pub fn count_inputs_as<C>(self) -> Self
    where
        C: Counter,
        I: AsCountUInt,
    {
        match KnownCounterKind::of::<C>() {
            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
        }
    }

| 293 | |
| 294 | /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs), |
| 295 | /// provided by-value. |
| 296 | /// |
| 297 | /// Per-iteration means the benchmarked function is called exactly once for |
| 298 | /// each generated input. |
| 299 | /// |
| 300 | /// The function can be benchmarked in parallel using the [`threads` |
| 301 | /// option](macro@crate::bench#threads). If the function is strictly |
| 302 | /// single-threaded, use [`Bencher::bench_local_values`] instead. |
| 303 | /// |
| 304 | /// # Examples |
| 305 | /// |
| 306 | /// ``` |
| 307 | /// #[divan::bench] |
| 308 | /// fn bench(bencher: divan::Bencher) { |
| 309 | /// bencher |
| 310 | /// .with_inputs(|| { |
| 311 | /// // Generate input: |
| 312 | /// String::from("..." ) |
| 313 | /// }) |
| 314 | /// .bench_values(|s| { |
| 315 | /// // Use input by-value: |
| 316 | /// s + "123" |
| 317 | /// }); |
| 318 | /// } |
| 319 | /// ``` |
| 320 | pub fn bench_values<O, B>(self, benched: B) |
| 321 | where |
| 322 | B: Fn(I) -> O + Sync, |
| 323 | GenI: Fn() -> I + Sync, |
| 324 | { |
| 325 | self.context.bench_loop_threaded( |
| 326 | self.config.gen_input, |
| 327 | |input| { |
| 328 | // SAFETY: Input is guaranteed to be initialized and not |
| 329 | // currently referenced by anything else. |
| 330 | let input = unsafe { input.get().read().assume_init() }; |
| 331 | |
| 332 | benched(input) |
| 333 | }, |
| 334 | // Input ownership is transferred to `benched`. |
| 335 | |_input| {}, |
| 336 | ); |
| 337 | } |
| 338 | |
| 339 | /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs), |
| 340 | /// provided by-value. |
| 341 | /// |
| 342 | /// Per-iteration means the benchmarked function is called exactly once for |
| 343 | /// each generated input. |
| 344 | /// |
| 345 | /// # Examples |
| 346 | /// |
| 347 | /// ``` |
| 348 | /// #[divan::bench] |
| 349 | /// fn bench(bencher: divan::Bencher) { |
| 350 | /// let mut values = Vec::new(); |
| 351 | /// bencher |
| 352 | /// .with_inputs(|| { |
| 353 | /// // Generate input: |
| 354 | /// String::from("..." ) |
| 355 | /// }) |
| 356 | /// .bench_local_values(|s| { |
| 357 | /// // Use input by-value: |
| 358 | /// values.push(s); |
| 359 | /// }); |
| 360 | /// } |
| 361 | /// ``` |
| 362 | pub fn bench_local_values<O, B>(self, mut benched: B) |
| 363 | where |
| 364 | B: FnMut(I) -> O, |
| 365 | { |
| 366 | self.context.bench_loop_local( |
| 367 | self.config.gen_input, |
| 368 | |input| { |
| 369 | // SAFETY: Input is guaranteed to be initialized and not |
| 370 | // currently referenced by anything else. |
| 371 | let input = unsafe { input.get().read().assume_init() }; |
| 372 | |
| 373 | benched(input) |
| 374 | }, |
| 375 | // Input ownership is transferred to `benched`. |
| 376 | |_input| {}, |
| 377 | ); |
| 378 | } |
| 379 | |
| 380 | /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs), |
| 381 | /// provided by-reference. |
| 382 | /// |
| 383 | /// Per-iteration means the benchmarked function is called exactly once for |
| 384 | /// each generated input. |
| 385 | /// |
| 386 | /// # Examples |
| 387 | /// |
| 388 | /// ``` |
| 389 | /// #[divan::bench] |
| 390 | /// fn bench(bencher: divan::Bencher) { |
| 391 | /// bencher |
| 392 | /// .with_inputs(|| { |
| 393 | /// // Generate input: |
| 394 | /// String::from("..." ) |
| 395 | /// }) |
| 396 | /// .bench_refs(|s| { |
| 397 | /// // Use input by-reference: |
| 398 | /// *s += "123" ; |
| 399 | /// }); |
| 400 | /// } |
| 401 | /// ``` |
| 402 | pub fn bench_refs<O, B>(self, benched: B) |
| 403 | where |
| 404 | B: Fn(&mut I) -> O + Sync, |
| 405 | GenI: Fn() -> I + Sync, |
| 406 | { |
| 407 | // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`. |
| 408 | self.context.bench_loop_threaded( |
| 409 | self.config.gen_input, |
| 410 | |input| { |
| 411 | // SAFETY: Input is guaranteed to be initialized and not |
| 412 | // currently referenced by anything else. |
| 413 | let input = unsafe { (*input.get()).assume_init_mut() }; |
| 414 | |
| 415 | benched(input) |
| 416 | }, |
| 417 | // Input ownership was not transferred to `benched`. |
| 418 | |input| { |
| 419 | // SAFETY: This function is called after `benched` outputs are |
| 420 | // dropped, so we have exclusive access. |
| 421 | unsafe { (*input.get()).assume_init_drop() } |
| 422 | }, |
| 423 | ); |
| 424 | } |
| 425 | |
| 426 | /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs), |
| 427 | /// provided by-reference. |
| 428 | /// |
| 429 | /// Per-iteration means the benchmarked function is called exactly once for |
| 430 | /// each generated input. |
| 431 | /// |
| 432 | /// # Examples |
| 433 | /// |
| 434 | /// ``` |
| 435 | /// #[divan::bench] |
| 436 | /// fn bench(bencher: divan::Bencher) { |
| 437 | /// bencher |
| 438 | /// .with_inputs(|| { |
| 439 | /// // Generate input: |
| 440 | /// String::from("..." ) |
| 441 | /// }) |
| 442 | /// .bench_local_refs(|s| { |
| 443 | /// // Use input by-reference: |
| 444 | /// *s += "123" ; |
| 445 | /// }); |
| 446 | /// } |
| 447 | /// ``` |
| 448 | pub fn bench_local_refs<O, B>(self, mut benched: B) |
| 449 | where |
| 450 | B: FnMut(&mut I) -> O, |
| 451 | { |
| 452 | // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`. |
| 453 | self.context.bench_loop_local( |
| 454 | self.config.gen_input, |
| 455 | |input| { |
| 456 | // SAFETY: Input is guaranteed to be initialized and not |
| 457 | // currently referenced by anything else. |
| 458 | let input = unsafe { (*input.get()).assume_init_mut() }; |
| 459 | |
| 460 | benched(input) |
| 461 | }, |
| 462 | // Input ownership was not transferred to `benched`. |
| 463 | |input| { |
| 464 | // SAFETY: This function is called after `benched` outputs are |
| 465 | // dropped, so we have exclusive access. |
| 466 | unsafe { (*input.get()).assume_init_drop() } |
| 467 | }, |
| 468 | ); |
| 469 | } |
| 470 | } |
| 471 | |
| 472 | /// State machine for how the benchmark is being run. |
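///
/// A benchmark typically starts in `Tune` with a sample size of 1, doubling
/// the size until a sample takes long enough to measure reliably (over 100x
/// the timer's precision), at which point it switches to `Collect`.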
#[derive(Clone, Copy)]
pub(crate) enum BenchMode {
    /// The benchmark is being run as `--test`.
    ///
    /// Don't collect samples and run exactly once.
    Test,

    /// Scale `sample_size` to determine the right size for collecting.
    Tune { sample_size: u32 },

    /// Simply collect samples.
    Collect { sample_size: u32 },
}

impl BenchMode {
    #[inline]
    pub fn is_test(self) -> bool {
        matches!(self, Self::Test)
    }

    #[inline]
    pub fn is_tune(self) -> bool {
        matches!(self, Self::Tune { .. })
    }

    #[inline]
    pub fn is_collect(self) -> bool {
        matches!(self, Self::Collect { .. })
    }

    #[inline]
    pub fn sample_size(self) -> u32 {
        match self {
            Self::Test => 1,
            Self::Tune { sample_size, .. } | Self::Collect { sample_size, .. } => sample_size,
        }
    }
}

/// `#[divan::bench]` loop context.
///
/// Functions called within the benchmark loop should be `#[inline(always)]` to
/// ensure instruction cache locality.
pub(crate) struct BenchContext<'a> {
    shared_context: &'a SharedContext,

    /// User-configured options.
    pub options: &'a BenchOptions<'a>,

    /// Whether the benchmark loop was started.
    pub did_run: bool,

    /// The number of threads used to run the benchmark. The default is 1.
    ///
    /// When set to 1, the benchmark loop is guaranteed to stay on the current
    /// thread and not spawn any threads.
    pub thread_count: NonZeroUsize,

    /// Recorded samples.
    samples: SampleCollection,

    /// Per-iteration counters grouped by sample.
    counters: CounterCollection,
}

impl<'a> BenchContext<'a> {
    /// Creates a new benchmarking context.
    pub fn new(
        shared_context: &'a SharedContext,
        options: &'a BenchOptions,
        thread_count: NonZeroUsize,
    ) -> Self {
        Self {
            shared_context,
            options,
            thread_count,
            did_run: false,
            samples: SampleCollection::default(),
            counters: options.counters.to_collection(),
        }
    }

    /// Runs the single-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// See `bench_loop_threaded`.
    pub fn bench_loop_local<I, O>(
        &mut self,
        gen_input: impl FnMut() -> I,
        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) {
        // SAFETY: Closures are guaranteed to run on the current thread, so they
        // can safely be mutable and non-`Sync`.
        unsafe {
            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
            let benched = SyncWrap::new(UnsafeCell::new(benched));
            let drop_input = SyncWrap::new(drop_input);

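            // Force the loop onto the current thread: the `SyncWrap`s above
            // assert `Sync`, which is only sound if nothing crosses threads.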
            self.thread_count = NonZeroUsize::MIN;
            self.bench_loop_threaded::<I, O>(
                || (*gen_input.get())(),
                |input| (*benched.get())(input),
                |input| drop_input(input),
            )
        }
    }

    /// Runs the multi-threaded loop for benchmarking `benched`.
    ///
    /// # Safety
    ///
    /// If `self.thread_count` is 1, the incoming closures will not escape the
    /// current thread. This guarantee ensures `bench_loop_local` can soundly
    /// reuse this method with mutable non-`Sync` closures.
    ///
    /// When `benched` is called:
    /// - `I` is guaranteed to be initialized.
    /// - No external `&I` or `&mut I` exists.
    ///
    /// When `drop_input` is called:
    /// - All instances of `O` returned from `benched` have been dropped.
    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
    ///   escaped references to `I`.
    fn bench_loop_threaded<I, O>(
        &mut self,
        gen_input: impl Fn() -> I + Sync,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
    ) {
        self.did_run = true;

        let mut current_mode = self.initial_mode();
        let is_test = current_mode.is_test();

        let record_sample = self.sample_recorder(gen_input, benched, drop_input);

        let thread_count = self.thread_count.get();
        let aux_thread_count = thread_count - 1;

        let is_single_thread = aux_thread_count == 0;

        // Per-thread sample info returned by `record_sample`. These are
        // processed locally to emit user-facing sample info. As a result, the
        // buffer only ever contains `thread_count` elements at a time.
        let mut raw_samples = Vec::<Option<RawSample>>::new();

        // The time spent benchmarking, in picoseconds.
        //
        // Unless `skip_ext_time` is set, this includes time external to
        // `benched`, such as time spent generating inputs and running drop.
        let mut elapsed_picos: u128 = 0;

        // The minimum time for benchmarking, in picoseconds.
        let min_picos = self.options.min_time().picos;

        // The remaining time left for benchmarking, in picoseconds.
        let max_picos = self.options.max_time().picos;

        // Don't bother running if user specifies 0 max time or 0 samples.
        if max_picos == 0 || !self.options.has_samples() {
            return;
        }

        let timer = self.shared_context.timer;
        let timer_kind = timer.kind();

        let mut rem_samples = if current_mode.is_collect() {
            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
        } else {
            None
        };

        // Only measure precision if we need to tune sample size.
        let timer_precision =
            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };

        if !is_test {
            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
        }

        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };

        let bench_overheads = timer.bench_overheads();

        while {
            // Conditions for when sampling is over:
            if elapsed_picos >= max_picos {
                // Depleted the benchmarking time budget. This is a strict
                // condition regardless of sample count and minimum time.
                false
            } else if rem_samples.unwrap_or(1) > 0 {
                // More samples expected.
                true
            } else {
                // Continue if we haven't reached the time floor.
                elapsed_picos < min_picos
            }
        } {
            let sample_size = current_mode.sample_size();
            self.samples.sample_size = sample_size;

            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };

            // Sample loop helper:
            let record_sample = || -> RawSample {
                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
                    [0; KnownCounterKind::COUNT];

                // Updates per-input counter info for this sample.
                let mut count_input = |input: &I| {
                    for counter_kind in KnownCounterKind::ALL {
                        // SAFETY: The `I` type cannot change since `with_inputs`
                        // cannot be called more than once on the same `Bencher`.
                        if let Some(count) =
                            unsafe { self.counters.get_input_count(counter_kind, input) }
                        {
                            let total = &mut counter_totals[counter_kind as usize];
                            *total = (*total).saturating_add(count as u128);
                        }
                    }
                };

                // Sample loop:
                let ([start, end], alloc_info) =
                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);

                RawSample { start, end, timer, alloc_info, counter_totals }
            };

            // Sample loop:
            raw_samples.clear();
            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());

            // Convert `&[Option<RawSample>]` to `&[RawSample]`.
            let raw_samples: &[RawSample] = {
                if let Some(thread) = raw_samples
                    .iter()
                    .enumerate()
                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
                {
                    panic!("Divan benchmarking thread {thread} panicked");
                }

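                // SAFETY: Every element was just checked to be `Some`, and the
                // size assertion below guards the layout assumption that
                // `Option<RawSample>` uses a niche and thus matches
                // `RawSample` exactly.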
                unsafe {
                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
                }
            };

            // If testing, exit the benchmarking loop immediately after timing a
            // single run.
            if is_test {
                break;
            }

            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
            let slowest_time = slowest_sample.duration();

            // TODO: Make tuning be less influenced by early runs. Currently if
            // early runs are very quick but later runs are slow, benchmarking
            // will take a very long time.
            //
            // TODO: Make `sample_size` consider time generating inputs and
            // dropping inputs/outputs. Currently benchmarks like
            // `Bencher::bench_refs(String::clear)` take a very long time.
            if current_mode.is_tune() {
                // Clear previous smaller samples.
                self.samples.clear();
                self.counters.clear_input_counts();

                // If within 100x timer precision, continue tuning.
                let precision_multiple = slowest_time.picos / timer_precision.picos;
                if precision_multiple <= 100 {
                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
                } else {
                    current_mode = BenchMode::Collect { sample_size };
                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
                }
            }

            // Returns the sample's duration adjusted for overhead.
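            //
            // The raw duration is clamped to the timer's precision both before
            // and after subtracting the per-sample overhead, so the subtraction
            // can never yield an implausibly small result.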
            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);

                FineDuration {
                    picos: raw_sample
                        .duration()
                        .clamp_to(timer_precision)
                        .picos
                        .saturating_sub(overhead.picos),
                }
                .clamp_to(timer_precision)
            };

            for raw_sample in raw_samples {
                let sample_index = self.samples.time_samples.len();

                self.samples
                    .time_samples
                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });

                if !raw_sample.alloc_info.tallies.is_empty() {
                    self.samples
                        .alloc_info_by_sample
                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
                }

                // Insert per-input counter information.
                for counter_kind in KnownCounterKind::ALL {
                    if !self.counters.uses_input_counts(counter_kind) {
                        continue;
                    }

                    let total_count = raw_sample.counter_totals[counter_kind as usize];

                    // Cannot overflow `MaxCountUInt` because `total_count`
                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;

                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
                }

                if let Some(rem_samples) = &mut rem_samples {
                    *rem_samples = rem_samples.saturating_sub(1);
                }
            }

            if let Some(initial_start) = initial_start {
                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
            } else {
                // Progress by at least 1ns to prevent extremely fast
                // functions from taking forever when `min_time` is set.
                let progress_picos = slowest_time.picos.max(1_000);
                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
            }
        }

        // Reset flag for ignoring allocations.
        crate::alloc::IGNORE_ALLOC.set(false);
    }

    /// Returns a closure that takes the sample size, thread barrier, and input
    /// counter, and then returns a newly recorded sample.
    fn sample_recorder<I, O>(
        &self,
        gen_input: impl Fn() -> I,
        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
    {
        // We defer:
        // - Usage of `gen_input` values.
        // - The drop destructor for `O`, preventing it from affecting sample
        //   measurements. Outputs are stored into a pre-allocated buffer
        //   during the sample loop, and the allocation is reused across
        //   samples to reduce the time spent between them.

        let timer_kind = self.shared_context.timer.kind();

        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
            let mut defer_store = DeferStore::<I, O>::default();

            let mut saved_alloc_info = ThreadAllocInfo::new();
            let mut save_alloc_info = || {
                if crate::alloc::IGNORE_ALLOC.get() {
                    return;
                }

                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
                    // SAFETY: We have exclusive access.
                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
                }
            };

            // Synchronize all threads to start timed section simultaneously and
            // clear every thread's memory profiling info.
            //
            // This ensures work external to the timed section does not affect
            // the timing of other threads.
            let sync_threads = |is_start: bool| {
                sync_impl(barrier, is_start);

                // Monomorphize implementation to reduce code size.
                #[inline(never)]
                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
                    // Ensure benchmarked section has a `ThreadAllocInfo`
                    // allocated for the current thread and clear previous info.
                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };

                    // Synchronize all threads.
                    //
                    // This is the final synchronization point for the end.
                    if let Some(barrier) = barrier {
                        barrier.wait();
                    }

                    if let Some(mut alloc_info) = alloc_info {
                        // SAFETY: We have exclusive access.
                        let alloc_info = unsafe { alloc_info.as_mut() };

                        alloc_info.clear();

                        // Synchronize all threads.
                        if let Some(barrier) = barrier {
                            barrier.wait();
                        }
                    }
                }
            };

            // The following logic chooses how to efficiently sample the
            // benchmark function once and assigns `sample_start`/`sample_end`
            // before/after the sample loop.
            //
            // NOTE: Testing and benchmarking should behave exactly the same
            // when getting the sample time span. We don't want to introduce
            // extra work that may worsen measurement quality for real
            // benchmarking.
            let sample_start: UntaggedTimestamp;
            let sample_end: UntaggedTimestamp;

            if mem::size_of::<I>() == 0 && (mem::size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
                // Use a range instead of `defer_store` to make the benchmarking
                // loop cheaper.

                // Run `gen_input` the expected number of times in case it
                // updates external state used by `benched`.
                for _ in 0..sample_size {
                    let input = gen_input();
                    count_input(&input);

                    // Inputs are consumed/dropped later.
                    mem::forget(input);
                }

                sync_threads(true);
                sample_start = UntaggedTimestamp::start(timer_kind);

                // Sample loop:
                for _ in 0..sample_size {
                    // SAFETY: Input is a ZST, so we can construct one out of
                    // thin air.
                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };

                    mem::forget(black_box(benched(&input)));
                }

                sample_end = UntaggedTimestamp::end(timer_kind);
                sync_threads(false);
                save_alloc_info();

                // Drop outputs and inputs.
                for _ in 0..sample_size {
                    // The output only needs dropping here if it is a ZST,
                    // since this branch otherwise requires `!needs_drop::<O>()`.
                    if mem::size_of::<O>() == 0 {
                        // SAFETY: Output is a ZST, so we can construct one out
                        // of thin air.
                        unsafe { _ = mem::zeroed::<O>() }
                    }

                    if mem::needs_drop::<I>() {
                        // SAFETY: Input is a ZST, so we can construct one out
                        // of thin air and not worry about aliasing.
                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
                    }
                }
            } else {
                defer_store.prepare(sample_size);

                match defer_store.slots() {
                    // Output needs to be dropped. We defer drop in the sample
                    // loop by inserting it into `defer_store`.
                    Ok(defer_slots_slice) => {
                        // Initialize and store inputs.
                        for DeferSlot { input, .. } in defer_slots_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_slots_iter = defer_slots_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for defer_slot in defer_slots_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized and we have exclusive access to the
                            // output slot.
                            unsafe {
                                let output = benched(&defer_slot.input);
                                *defer_slot.output.get() = MaybeUninit::new(output);
                            }
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // and outputs in the sample loop.
                        black_box(defer_slots_slice);

                        // Drop outputs and inputs.
                        for DeferSlot { input, output } in defer_slots_slice {
                            // SAFETY: All outputs were initialized in the
                            // sample loop and we have exclusive access.
                            unsafe { (*output.get()).assume_init_drop() }

                            if mem::needs_drop::<I>() {
                                // SAFETY: The output was dropped and thus we
                                // have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }

                    // Output does not need to be dropped.
                    Err(defer_inputs_slice) => {
                        // Initialize and store inputs.
                        for input in defer_inputs_slice {
                            // SAFETY: We have exclusive access to `input`.
                            let input = unsafe { &mut *input.get() };
                            let input = input.write(gen_input());
                            count_input(input);

                            // Make input opaque to benchmarked function.
                            black_box(input);
                        }

                        // Create iterator before the sample timing section to
                        // reduce benchmarking overhead.
                        let defer_inputs_iter = defer_inputs_slice.iter();

                        sync_threads(true);
                        sample_start = UntaggedTimestamp::start(timer_kind);

                        // Sample loop:
                        for input in defer_inputs_iter {
                            // SAFETY: All inputs in `defer_store` were
                            // initialized.
                            black_box_drop(unsafe { benched(input) });
                        }

                        sample_end = UntaggedTimestamp::end(timer_kind);
                        sync_threads(false);
                        save_alloc_info();

                        // Prevent the optimizer from removing writes to inputs
                        // in the sample loop.
                        black_box(defer_inputs_slice);

                        // Drop inputs.
                        if mem::needs_drop::<I>() {
                            for input in defer_inputs_slice {
                                // SAFETY: We have exclusive access to inputs.
                                unsafe { drop_input(input) }
                            }
                        }
                    }
                }
            }

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let interval = unsafe {
                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
            };

            (interval, saved_alloc_info)
        }
    }

    #[inline]
    fn initial_mode(&self) -> BenchMode {
        if self.shared_context.action.is_test() {
            BenchMode::Test
        } else if let Some(sample_size) = self.options.sample_size {
            BenchMode::Collect { sample_size }
        } else {
            BenchMode::Tune { sample_size: 1 }
        }
    }

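    /// Computes user-facing statistics (timing, allocation tallies, and
    /// counter stats) from the samples collected by the benchmarking loop.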
    pub fn compute_stats(&self) -> Stats {
        let time_samples = &self.samples.time_samples;
        let alloc_info_by_sample = &self.samples.alloc_info_by_sample;

        let sample_count = time_samples.len();
        let sample_size = self.samples.sample_size;

        let total_count = self.samples.iter_count();

        let total_duration = self.samples.total_duration();
        let mean_duration = FineDuration {
            picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
        };

        // Samples sorted by duration.
        let sorted_samples = self.samples.sorted_samples();
        let median_samples = util::slice_middle(&sorted_samples);
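        // `slice_middle` returns the middle one or two samples (odd vs. even
        // count), so the medians below average over at most two entries.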

        let index_of_sample = |sample: &TimeSample| -> usize {
            util::slice_ptr_index(&self.samples.time_samples, sample)
        };

        let counter_count_for_sample =
            |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
                let counts = self.counters.counts(counter_kind);

                let index = if self.counters.uses_input_counts(counter_kind) {
                    index_of_sample(sample)
                } else {
                    0
                };

                counts.get(index).copied()
            };

        let min_duration =
            sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
        let max_duration =
            sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();

        let median_duration = if median_samples.is_empty() {
            FineDuration::default()
        } else {
            let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
            FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
        };

        let counts = KnownCounterKind::ALL.map(|counter_kind| {
            let median: MaxCountUInt = {
                let mut sum: u128 = 0;

                for sample in median_samples {
                    let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;

                    // Saturating add in case `MaxCountUInt` is wider than `u64`.
                    sum = sum.saturating_add(sample_count);
                }

                (sum / median_samples.len() as u128) as MaxCountUInt
            };

            Some(StatsSet {
                fastest: sorted_samples
                    .first()
                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
                slowest: sorted_samples
                    .last()
                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
                median,
                mean: self.counters.mean_count(counter_kind),
            })
        });

        let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
            sample
                .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
                .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
        };

        let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
            sample_alloc_info(sample)
                .map(|alloc_info| alloc_info.tallies.get(op))
                .copied()
                .unwrap_or_default()
        };

        let mut alloc_total_max_count = 0u128;
        let mut alloc_total_max_size = 0u128;
        let mut alloc_total_tallies = TotalAllocTallyMap::default();

        for alloc_info in alloc_info_by_sample.values() {
            alloc_total_max_count += alloc_info.max_count as u128;
            alloc_total_max_size += alloc_info.max_size as u128;
            alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
        }

        let sample_size = f64::from(sample_size);
        Stats {
            sample_count: sample_count as u32,
            iter_count: total_count,
            time: StatsSet {
                fastest: min_duration,
                slowest: max_duration,
                median: median_duration,
                mean: mean_duration,
            },
            max_alloc: StatsSet {
                fastest: {
                    let alloc_info = sample_alloc_info(sorted_samples.first().copied());

                    AllocTally {
                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
                            / sample_size,
                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
                            / sample_size,
                    }
                },
                slowest: {
                    let alloc_info = sample_alloc_info(sorted_samples.last().copied());

                    AllocTally {
                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
                            / sample_size,
                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
                            / sample_size,
                    }
                },
                // TODO: Switch to median of alloc info itself, rather than
                // basing off of median times.
                median: {
                    let alloc_info_for_median =
                        |index| sample_alloc_info(median_samples.get(index).copied());

                    let max_count_for_median = |index: usize| -> f64 {
                        alloc_info_for_median(index)
                            .map(|info| info.max_count as f64)
                            .unwrap_or_default()
                    };

                    let max_size_for_median = |index: usize| -> f64 {
                        alloc_info_for_median(index)
                            .map(|info| info.max_size as f64)
                            .unwrap_or_default()
                    };

                    let median_count = median_samples.len().max(1) as f64;

                    let median_max_count = max_count_for_median(0) + max_count_for_median(1);
                    let median_max_size = max_size_for_median(0) + max_size_for_median(1);

                    AllocTally {
                        count: median_max_count / median_count / sample_size,
                        size: median_max_size / median_count / sample_size,
                    }
                },
                mean: AllocTally {
                    count: alloc_total_max_count as f64 / total_count as f64,
                    size: alloc_total_max_size as f64 / total_count as f64,
                },
            }
            .transpose(),
            alloc_tallies: AllocOpMap {
                values: AllocOp::ALL
                    .map(|op| StatsSet {
                        fastest: {
                            let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);

                            AllocTally {
                                count: fastest.count as f64 / sample_size,
                                size: fastest.size as f64 / sample_size,
                            }
                        },
                        slowest: {
                            let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);

                            AllocTally {
                                count: slowest.count as f64 / sample_size,
                                size: slowest.size as f64 / sample_size,
                            }
                        },
                        median: {
                            let tally_for_median = |index: usize| -> ThreadAllocTally {
                                sample_alloc_tally(median_samples.get(index).copied(), op)
                            };

                            let a = tally_for_median(0);
                            let b = tally_for_median(1);

                            let median_count = median_samples.len().max(1) as f64;

                            let avg_count = (a.count as f64 + b.count as f64) / median_count;
                            let avg_size = (a.size as f64 + b.size as f64) / median_count;

                            AllocTally {
                                count: avg_count / sample_size,
                                size: avg_size / sample_size,
                            }
                        },
                        mean: {
                            let tally = alloc_total_tallies.get(op);
                            AllocTally {
                                count: tally.count as f64 / total_count as f64,
                                size: tally.size as f64 / total_count as f64,
                            }
                        },
                    })
                    .map(StatsSet::transpose),
            },
            counts,
        }
    }
}

impl<T> StatsSet<AllocTally<T>> {
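    /// Converts stats of allocation tallies into per-field (count and size)
    /// stats.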
    #[inline]
    pub fn transpose(self) -> AllocTally<StatsSet<T>> {
        AllocTally {
            count: StatsSet {
                fastest: self.fastest.count,
                slowest: self.slowest.count,
                median: self.median.count,
                mean: self.mean.count,
            },
            size: StatsSet {
                fastest: self.fastest.size,
                slowest: self.slowest.size,
                median: self.median.size,
                mean: self.mean.size,
            },
        }
    }
}