use std::{cmp::Ordering, num::NonZeroU64, sync::OnceLock};

use crate::{
    alloc::{AllocOp, ThreadAllocInfo},
    black_box,
    time::{FineDuration, TscTimestamp, TscUnavailable, UntaggedTimestamp},
};

/// Measures time.
#[derive(Clone, Copy, Default)]
pub(crate) enum Timer {
    /// Operating system timer.
    #[default]
    Os,

    /// CPU timestamp counter.
    Tsc {
        /// [`TscTimestamp::frequency`].
        frequency: NonZeroU64,
    },
}

impl Timer {
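    /// The number of timer kinds, used to size the per-kind caches below.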
    const COUNT: usize = 2;

    /// Returns all available timers.
    #[cfg(test)]
    pub fn available() -> Vec<Self> {
        let mut timers = vec![Self::Os];

        if let Ok(tsc) = Self::get_tsc() {
            timers.push(tsc);
        }

        timers
    }

    /// Attempts to get the CPU timestamp counter.
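    ///
    /// # Examples
    ///
    /// A minimal sketch of a hypothetical caller that falls back to the
    /// default OS timer when the TSC is unavailable:
    ///
    /// ```ignore
    /// let timer = Timer::get_tsc().unwrap_or_default();
    /// ```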
    #[inline]
    pub fn get_tsc() -> Result<Self, TscUnavailable> {
        Ok(Self::Tsc { frequency: TscTimestamp::frequency()? })
    }

    #[inline]
    pub fn kind(self) -> TimerKind {
        match self {
            Self::Os => TimerKind::Os,
            Self::Tsc { .. } => TimerKind::Tsc,
        }
    }

    /// Returns the smallest non-zero duration that this timer can measure.
    ///
    /// The result is cached.
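    ///
    /// # Examples
    ///
    /// A minimal sketch; since the result is the smallest *non-zero*
    /// measurable duration, it should never be zero:
    ///
    /// ```ignore
    /// let precision = Timer::Os.precision();
    /// assert!(!precision.is_zero());
    /// ```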
    pub fn precision(self) -> FineDuration {
        static CACHED: [OnceLock<FineDuration>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];

        let cached = &CACHED[self.kind() as usize];

        *cached.get_or_init(|| self.measure_precision())
    }

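    /// Measures precision by timing back-to-back timestamp reads, adding an
    /// increasingly long spin delay until a stable non-zero minimum emerges.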
    fn measure_precision(self) -> FineDuration {
        let timer_kind = self.kind();

        // Start with the worst possible minimum.
        let mut min_sample = FineDuration::MAX;
        let mut seen_count = 0;

        // If timing in immediate succession fails to produce a non-zero sample,
        // an artificial delay is added by looping. `usize` is intentionally
        // used to make looping cheap.
        let mut delay_len: usize = 0;

        loop {
            for _ in 0..100 {
                // Use `UntaggedTimestamp` to minimize overhead.
                let sample_start: UntaggedTimestamp;
                let sample_end: UntaggedTimestamp;

                if delay_len == 0 {
                    // Immediate succession.
                    sample_start = UntaggedTimestamp::start(timer_kind);
                    sample_end = UntaggedTimestamp::end(timer_kind);
                } else {
                    // Add delay.
                    sample_start = UntaggedTimestamp::start(timer_kind);
                    for n in 0..delay_len {
                        crate::black_box(n);
                    }
                    sample_end = UntaggedTimestamp::end(timer_kind);
                }

                // SAFETY: These values are guaranteed to be the correct variant
                // because they were created from the same `timer_kind`.
                let [sample_start, sample_end] = unsafe {
                    [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
                };

                let sample = sample_end.duration_since(sample_start, self);

                // Discard sample if irrelevant.
                if sample.is_zero() {
                    continue;
                }

                match sample.cmp(&min_sample) {
                    Ordering::Greater => {
                        // If we've already delayed a lot without hitting the
                        // seen-count threshold, settle for the current minimum.
                        if delay_len > 100 {
                            return min_sample;
                        }
                    }
                    Ordering::Equal => {
                        seen_count += 1;

                        // If we've seen this min 100 times, we have high
                        // confidence this is the smallest duration.
                        if seen_count >= 100 {
                            return min_sample;
                        }
                    }
                    Ordering::Less => {
                        min_sample = sample;
                        seen_count = 0;
                    }
                }
            }

            delay_len = delay_len.saturating_add(1);
        }
    }

    /// Returns the overheads added by the benchmarker.
    ///
    /// `min_time` and `max_time` do not count this overhead as benchmarking time.
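    ///
    /// # Examples
    ///
    /// A minimal sketch of subtracting the total overhead from a measured
    /// sample; `timer`, `sample_size`, `alloc_info`, and `sample` are
    /// hypothetical:
    ///
    /// ```ignore
    /// let overhead = timer.bench_overheads().total_overhead(sample_size, &alloc_info);
    /// sample.picos = sample.picos.saturating_sub(overhead.picos);
    /// ```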
    pub fn bench_overheads(self) -> &'static TimedOverhead {
        // Miri is slow, so don't waste time on this.
        if cfg!(miri) {
            return &TimedOverhead::ZERO;
        }

        static CACHED: [OnceLock<TimedOverhead>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];

        let cached = &CACHED[self.kind() as usize];

        cached.get_or_init(|| TimedOverhead {
            sample_loop: self.sample_loop_overhead(),
            tally_alloc: self.measure_tally_alloc_overhead(),
            tally_dealloc: self.measure_tally_dealloc_overhead(),
            tally_realloc: self.measure_tally_realloc_overhead(),
        })
    }

    /// Returns the per-iteration overhead of the benchmarking sample loop.
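    ///
    /// The result is cached, as with [`Timer::precision`].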
    fn sample_loop_overhead(self) -> FineDuration {
        // Miri is slow, so don't waste time on this.
        if cfg!(miri) {
            return FineDuration::default();
        }

        static CACHED: [OnceLock<FineDuration>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];

        let cached = &CACHED[self.kind() as usize];

        *cached.get_or_init(|| self.measure_sample_loop_overhead())
    }

    /// Calculates the per-iteration overhead of the benchmarking sample loop.
    fn measure_sample_loop_overhead(self) -> FineDuration {
        let timer_kind = self.kind();

        let sample_count: usize = 100;
        let sample_size: usize = 10_000;

        // The minimum non-zero sample.
        let mut min_sample = FineDuration::default();

        for _ in 0..sample_count {
            let start = UntaggedTimestamp::start(timer_kind);

            for i in 0..sample_size {
                _ = crate::black_box(i);
            }

            let end = UntaggedTimestamp::end(timer_kind);

            // SAFETY: These values are guaranteed to be the correct variant because
            // they were created from the same `timer_kind`.
            let [start, end] =
                unsafe { [start.into_timestamp(timer_kind), end.into_timestamp(timer_kind)] };

            let mut sample = end.duration_since(start, self);
            sample.picos /= sample_size as u128;

            min_sample = min_sample.clamp_to_min(sample);
        }

        min_sample
    }

    fn measure_tally_alloc_overhead(self) -> FineDuration {
        let size = black_box(0);
        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_alloc(size))
    }

    fn measure_tally_dealloc_overhead(self) -> FineDuration {
        let size = black_box(0);
        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_dealloc(size))
    }

    fn measure_tally_realloc_overhead(self) -> FineDuration {
        let new_size = black_box(0);
        let old_size = black_box(0);
        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_realloc(old_size, new_size))
    }

    // SAFETY: This function is not reentrant. Calling it within `operation`
    // would cause aliasing of `ThreadAllocInfo::current`.
    fn measure_alloc_info_overhead(self, operation: impl Fn(&mut ThreadAllocInfo)) -> FineDuration {
        // Initialize the current thread's alloc info.
        let alloc_info = ThreadAllocInfo::current();

        let sample_count = 100;
        let sample_size = 50_000;

        let result = self.measure_min_time(sample_count, sample_size, || {
            if let Some(mut alloc_info) = ThreadAllocInfo::try_current() {
                // SAFETY: We have exclusive access.
                operation(unsafe { alloc_info.as_mut() });
            }
        });

        // Clear alloc info.
        if let Some(mut alloc_info) = alloc_info {
            // SAFETY: We have exclusive access.
            let alloc_info = unsafe { alloc_info.as_mut() };

            alloc_info.clear();
        }

        result
    }

    /// Calculates the smallest non-zero time to perform an operation.
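    ///
    /// The cached per-iteration loop overhead is subtracted from each sample,
    /// so the result approximates the cost of `operation` alone.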
    fn measure_min_time(
        self,
        sample_count: usize,
        sample_size: usize,
        operation: impl Fn(),
    ) -> FineDuration {
        let timer_kind = self.kind();

        let loop_overhead = self.sample_loop_overhead();
        let mut min_sample = FineDuration::default();

        for _ in 0..sample_count {
            let start = UntaggedTimestamp::start(timer_kind);

            for _ in 0..sample_size {
                operation();
            }

            let end = UntaggedTimestamp::end(timer_kind);

            // SAFETY: These values are guaranteed to be the correct variant
            // because they were created from the same `timer_kind`.
            let [start, end] =
                unsafe { [start.into_timestamp(timer_kind), end.into_timestamp(timer_kind)] };

            let mut sample = end.duration_since(start, self);
            sample.picos /= sample_size as u128;

            // Remove benchmarking loop overhead.
            sample.picos = sample.picos.saturating_sub(loop_overhead.picos);

            min_sample = min_sample.clamp_to_min(sample);
        }

        min_sample
    }
}

/// [`Timer`] kind.
#[derive(Clone, Copy, Default)]
pub(crate) enum TimerKind {
    /// Operating system timer.
    #[default]
    Os,

    /// CPU timestamp counter.
    Tsc,
}

/// The measured overhead of various benchmarking operations.
pub(crate) struct TimedOverhead {
    pub sample_loop: FineDuration,
    pub tally_alloc: FineDuration,
    pub tally_dealloc: FineDuration,
    pub tally_realloc: FineDuration,
}

impl TimedOverhead {
    pub const ZERO: Self = Self {
        sample_loop: FineDuration::ZERO,
        tally_alloc: FineDuration::ZERO,
        tally_dealloc: FineDuration::ZERO,
        tally_realloc: FineDuration::ZERO,
    };

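    /// Sums the estimated overhead for one sample: the per-iteration loop
    /// overhead times `sample_size`, plus each tally's overhead times the
    /// number of matching allocation operations recorded in `alloc_info`.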
    pub fn total_overhead(&self, sample_size: u32, alloc_info: &ThreadAllocInfo) -> FineDuration {
        let sample_loop_overhead = self.sample_loop.picos.saturating_mul(sample_size as u128);

        let tally_alloc_overhead = self
            .tally_alloc
            .picos
            .saturating_mul(alloc_info.tallies.get(AllocOp::Alloc).count as u128);

        let tally_dealloc_overhead = self
            .tally_dealloc
            .picos
            .saturating_mul(alloc_info.tallies.get(AllocOp::Dealloc).count as u128);

        let tally_realloc_overhead = self.tally_realloc.picos.saturating_mul(
            alloc_info.tallies.get(AllocOp::Grow).count as u128
                + alloc_info.tallies.get(AllocOp::Shrink).count as u128,
        );

        FineDuration {
            picos: sample_loop_overhead
                .saturating_add(tally_alloc_overhead)
                .saturating_add(tally_dealloc_overhead)
                .saturating_add(tally_realloc_overhead),
        }
    }
}

#[cfg(feature = "internal_benches")]
mod benches {
    use super::*;

    #[crate::bench(crate = crate)]
    fn get_tsc() -> Result<Timer, TscUnavailable> {
        Timer::get_tsc()
    }

    mod measure {
        use super::*;

        #[crate::bench(crate = crate)]
        fn precision() -> FineDuration {
            Timer::Os.measure_precision()
        }

        #[crate::bench(crate = crate)]
        fn sample_loop_overhead() -> FineDuration {
            Timer::Os.measure_sample_loop_overhead()
        }

        #[crate::bench(crate = crate)]
        fn tally_alloc_overhead() -> FineDuration {
            Timer::Os.measure_tally_alloc_overhead()
        }

        #[crate::bench(crate = crate)]
        fn tally_dealloc_overhead() -> FineDuration {
            Timer::Os.measure_tally_dealloc_overhead()
        }

        #[crate::bench(crate = crate)]
        fn tally_realloc_overhead() -> FineDuration {
            Timer::Os.measure_tally_realloc_overhead()
        }
    }
}