use std::{cmp::Ordering, num::NonZeroU64, sync::OnceLock};

use crate::{
    alloc::{AllocOp, ThreadAllocInfo},
    black_box,
    time::{FineDuration, TscTimestamp, TscUnavailable, UntaggedTimestamp},
};

/// Measures time.
#[derive(Clone, Copy, Default)]
pub(crate) enum Timer {
    /// Operating system timer.
    #[default]
    Os,

    /// CPU timestamp counter.
    Tsc {
        /// [`TscTimestamp::frequency`].
        frequency: NonZeroU64,
    },
}

impl Timer {
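    /// The number of [`TimerKind`] variants, used to size the per-kind caches
    /// below.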
    const COUNT: usize = 2;

    /// Returns all available timers.
    #[cfg(test)]
    pub fn available() -> Vec<Self> {
        let mut timers = vec![Self::Os];

        if let Ok(tsc) = Self::get_tsc() {
            timers.push(tsc);
        }

        timers
    }

    /// Attempts to get the CPU timestamp counter.
    #[inline]
    pub fn get_tsc() -> Result<Self, TscUnavailable> {
        Ok(Self::Tsc { frequency: TscTimestamp::frequency()? })
    }

    #[inline]
    pub fn kind(self) -> TimerKind {
        match self {
            Self::Os => TimerKind::Os,
            Self::Tsc { .. } => TimerKind::Tsc,
        }
    }

    /// Returns the smallest non-zero duration that this timer can measure.
    ///
    /// The result is cached.
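    ///
    /// A minimal usage sketch (illustrative only; this API is crate-internal
    /// and the measured value is machine-dependent):
    ///
    /// ```ignore
    /// let timer = Timer::default();
    ///
    /// // The smallest non-zero duration this timer was observed to measure.
    /// let precision = timer.precision();
    /// assert!(!precision.is_zero());
    ///
    /// // Subsequent calls return the cached value.
    /// assert!(timer.precision() == precision);
    /// ```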
    pub fn precision(self) -> FineDuration {
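        // One cache slot per timer kind, indexed by the `TimerKind` discriminant.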
        static CACHED: [OnceLock<FineDuration>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];

        let cached = &CACHED[self.kind() as usize];

        *cached.get_or_init(|| self.measure_precision())
    }

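    /// Measures precision by repeatedly timing back-to-back timestamp reads,
    /// adding an artificial delay if needed, and keeping the smallest non-zero
    /// duration once it has been observed consistently.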
    fn measure_precision(self) -> FineDuration {
        let timer_kind = self.kind();

        // Start with the worst possible minimum.
        let mut min_sample = FineDuration::MAX;
        let mut seen_count = 0;

        // If timing in immediate succession fails to produce a non-zero sample,
        // an artificial delay is added by looping. `usize` is intentionally
        // used to make looping cheap.
        let mut delay_len: usize = 0;

        loop {
            for _ in 0..100 {
                // Use `UntaggedTimestamp` to minimize overhead.
                let sample_start: UntaggedTimestamp;
                let sample_end: UntaggedTimestamp;

                if delay_len == 0 {
                    // Immediate succession.
                    sample_start = UntaggedTimestamp::start(timer_kind);
                    sample_end = UntaggedTimestamp::end(timer_kind);
                } else {
                    // Add delay.
                    sample_start = UntaggedTimestamp::start(timer_kind);
                    for n in 0..delay_len {
                        crate::black_box(n);
                    }
                    sample_end = UntaggedTimestamp::end(timer_kind);
                }

                // SAFETY: These values are guaranteed to be the correct variant
                // because they were created from the same `timer_kind`.
                let [sample_start, sample_end] = unsafe {
                    [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
                };

                let sample = sample_end.duration_since(sample_start, self);

                // Discard sample if irrelevant.
                if sample.is_zero() {
                    continue;
                }

                match sample.cmp(&min_sample) {
                    Ordering::Greater => {
                        // If we've already delayed a lot and still haven't hit
                        // the seen-count threshold, settle for the current
                        // minimum.
                        if delay_len > 100 {
                            return min_sample;
                        }
                    }
                    Ordering::Equal => {
                        seen_count += 1;

                        // If we've seen this min 100 times, we have high
                        // confidence this is the smallest duration.
                        if seen_count >= 100 {
                            return min_sample;
                        }
                    }
                    Ordering::Less => {
                        min_sample = sample;
                        seen_count = 0;
                    }
                }
            }

            delay_len = delay_len.saturating_add(1);
        }
    }

    /// Returns the overheads added by the benchmarker itself.
    ///
    /// The `min_time` and `max_time` options do not count this overhead as
    /// benchmarking time.
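    ///
    /// An illustrative sketch of how a caller might subtract this overhead
    /// from a raw sample (the `timer`, `raw_elapsed`, `sample_size`, and
    /// `alloc_info` values here are hypothetical):
    ///
    /// ```ignore
    /// let overhead = timer.bench_overheads().total_overhead(sample_size, &alloc_info);
    /// let adjusted_picos = raw_elapsed.picos.saturating_sub(overhead.picos);
    /// ```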
    pub fn bench_overheads(self) -> &'static TimedOverhead {
        // Miri is slow, so don't waste time on this.
        if cfg!(miri) {
            return &TimedOverhead::ZERO;
        }

        static CACHED: [OnceLock<TimedOverhead>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];

        let cached = &CACHED[self.kind() as usize];

        cached.get_or_init(|| TimedOverhead {
            sample_loop: self.sample_loop_overhead(),
            tally_alloc: self.measure_tally_alloc_overhead(),
            tally_dealloc: self.measure_tally_dealloc_overhead(),
            tally_realloc: self.measure_tally_realloc_overhead(),
        })
    }

    /// Returns the per-iteration overhead of the benchmarking sample loop.
    fn sample_loop_overhead(self) -> FineDuration {
        // Miri is slow, so don't waste time on this.
        if cfg!(miri) {
            return FineDuration::default();
        }

        static CACHED: [OnceLock<FineDuration>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];

        let cached = &CACHED[self.kind() as usize];

        *cached.get_or_init(|| self.measure_sample_loop_overhead())
    }

    /// Calculates the per-iteration overhead of the benchmarking sample loop.
    fn measure_sample_loop_overhead(self) -> FineDuration {
        let timer_kind = self.kind();

        let sample_count: usize = 100;
        let sample_size: usize = 10_000;

        // The minimum non-zero sample.
        let mut min_sample = FineDuration::default();

        for _ in 0..sample_count {
            let start = UntaggedTimestamp::start(timer_kind);

            for i in 0..sample_size {
                _ = crate::black_box(i);
            }

            let end = UntaggedTimestamp::end(timer_kind);

            // SAFETY: These values are guaranteed to be the correct variant because
            // they were created from the same `timer_kind`.
            let [start, end] =
                unsafe { [start.into_timestamp(timer_kind), end.into_timestamp(timer_kind)] };

            let mut sample = end.duration_since(start, self);
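            // Amortize the elapsed time across the loop iterations.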
            sample.picos /= sample_size as u128;

            min_sample = min_sample.clamp_to_min(sample);
        }

        min_sample
    }

    fn measure_tally_alloc_overhead(self) -> FineDuration {
        let size = black_box(0);
        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_alloc(size))
    }

    fn measure_tally_dealloc_overhead(self) -> FineDuration {
        let size = black_box(0);
        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_dealloc(size))
    }

    fn measure_tally_realloc_overhead(self) -> FineDuration {
        let new_size = black_box(0);
        let old_size = black_box(0);
        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_realloc(old_size, new_size))
    }

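    /// Measures the per-call cost of updating the current thread's
    /// [`ThreadAllocInfo`] tallies via `operation`.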
    // SAFETY: This function is not reentrant. Calling it within `operation`
    // would cause aliasing of `ThreadAllocInfo::current`.
    fn measure_alloc_info_overhead(self, operation: impl Fn(&mut ThreadAllocInfo)) -> FineDuration {
        // Initialize the current thread's alloc info.
        let alloc_info = ThreadAllocInfo::current();

        let sample_count = 100;
        let sample_size = 50_000;

        let result = self.measure_min_time(sample_count, sample_size, || {
            if let Some(mut alloc_info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                operation(unsafe { alloc_info.as_mut() });
            }
        });

        // Clear alloc info.
        if let Some(mut alloc_info) = alloc_info {
            // SAFETY: We have exclusive access.
            let alloc_info = unsafe { alloc_info.as_mut() };

            alloc_info.clear();
        }

        result
    }

    /// Calculates the smallest non-zero time to perform an operation.
    fn measure_min_time(
        self,
        sample_count: usize,
        sample_size: usize,
        operation: impl Fn(),
    ) -> FineDuration {
        let timer_kind = self.kind();

        let loop_overhead = self.sample_loop_overhead();
        let mut min_sample = FineDuration::default();

        for _ in 0..sample_count {
            let start = UntaggedTimestamp::start(timer_kind);

            for _ in 0..sample_size {
                operation();
            }

            let end = UntaggedTimestamp::end(timer_kind);

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let [start, end] =
                unsafe { [start.into_timestamp(timer_kind), end.into_timestamp(timer_kind)] };

            let mut sample = end.duration_since(start, self);
            sample.picos /= sample_size as u128;

            // Remove benchmarking loop overhead.
            sample.picos = sample.picos.saturating_sub(loop_overhead.picos);

            min_sample = min_sample.clamp_to_min(sample);
        }

        min_sample
    }
}

/// [`Timer`] kind.
#[derive(Clone, Copy, Default)]
pub(crate) enum TimerKind {
    /// Operating system timer.
    #[default]
    Os,

    /// CPU timestamp counter.
    Tsc,
}

/// The measured overhead of various benchmarking operations.
pub(crate) struct TimedOverhead {
    pub sample_loop: FineDuration,
    pub tally_alloc: FineDuration,
    pub tally_dealloc: FineDuration,
    pub tally_realloc: FineDuration,
}

impl TimedOverhead {
    pub const ZERO: Self = Self {
        sample_loop: FineDuration::ZERO,
        tally_alloc: FineDuration::ZERO,
        tally_dealloc: FineDuration::ZERO,
        tally_realloc: FineDuration::ZERO,
    };

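    /// Estimates the total overhead for a sample of `sample_size` iterations,
    /// given the allocation operations recorded in `alloc_info`. Roughly:
    ///
    /// ```text
    /// total = sample_loop * sample_size
    ///       + tally_alloc * alloc_count
    ///       + tally_dealloc * dealloc_count
    ///       + tally_realloc * (grow_count + shrink_count)
    /// ```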
    pub fn total_overhead(&self, sample_size: u32, alloc_info: &ThreadAllocInfo) -> FineDuration {
        let sample_loop_overhead = self.sample_loop.picos.saturating_mul(sample_size as u128);

        let tally_alloc_overhead = self
            .tally_alloc
            .picos
            .saturating_mul(alloc_info.tallies.get(AllocOp::Alloc).count as u128);

        let tally_dealloc_overhead = self
            .tally_dealloc
            .picos
            .saturating_mul(alloc_info.tallies.get(AllocOp::Dealloc).count as u128);

        let tally_realloc_overhead = self.tally_realloc.picos.saturating_mul(
            alloc_info.tallies.get(AllocOp::Grow).count as u128
                + alloc_info.tallies.get(AllocOp::Shrink).count as u128,
        );

        FineDuration {
            picos: sample_loop_overhead
                .saturating_add(tally_alloc_overhead)
                .saturating_add(tally_dealloc_overhead)
                .saturating_add(tally_realloc_overhead),
        }
    }
}

#[cfg(feature = "internal_benches")]
mod benches {
    use super::*;

    #[crate::bench(crate = crate)]
    fn get_tsc() -> Result<Timer, TscUnavailable> {
        Timer::get_tsc()
    }

    mod measure {
        use super::*;

        #[crate::bench(crate = crate)]
        fn precision() -> FineDuration {
            Timer::Os.measure_precision()
        }

        #[crate::bench(crate = crate)]
        fn sample_loop_overhead() -> FineDuration {
            Timer::Os.measure_sample_loop_overhead()
        }

        #[crate::bench(crate = crate)]
        fn tally_alloc_overhead() -> FineDuration {
            Timer::Os.measure_tally_alloc_overhead()
        }

        #[crate::bench(crate = crate)]
        fn tally_dealloc_overhead() -> FineDuration {
            Timer::Os.measure_tally_dealloc_overhead()
        }

        #[crate::bench(crate = crate)]
        fn tally_realloc_overhead() -> FineDuration {
            Timer::Os.measure_tally_realloc_overhead()
        }
    }
}