1 | //! Profiling counters and their implementation. |
2 | //! |
3 | //! # Available counters |
4 | //! |
5 | //! Name (for [`Counter::by_name()`]) | Counter | OSes | CPUs |
6 | //! --------------------------------- | ------- | ---- | ---- |
7 | //! `wall-time` | [`WallTime`] | any | any |
8 | //! `instructions:u` | [`Instructions`] | Linux | `x86_64` |
9 | //! `instructions-minus-irqs:u` | [`InstructionsMinusIrqs`] | Linux | `x86_64`<br>- AMD (since K8)<br>- Intel (since Sandy Bridge) |
10 | //! `instructions-minus-r0420:u` | [`InstructionsMinusRaw0420`] | Linux | `x86_64`<br>- AMD (Zen) |
11 | //! |
12 | //! *Note: `:u` suffixes for hardware performance counters come from the Linux `perf` |
13 | //! tool, and indicate that the counter is only active while userspace code executes |
14 | //! (i.e. it's paused while the kernel handles syscalls, interrupts, etc.).* |
15 | //! |
16 | //! # Limitations and caveats |
17 | //! |
18 | //! *Note: for more information, also see the GitHub PR which first implemented hardware |
19 | //! performance counter support ([#143](https://github.com/rust-lang/measureme/pull/143)).* |
20 | //! |
21 | //! The hardware performance counters (i.e. all counters other than `wall-time`) are limited to: |
22 | //! * Linux, for out-of-the-box performance counter reads from userspace |
23 | //! * other OSes could work through custom kernel extensions/drivers, in the future |
24 | //! * `x86_64` CPUs, mostly due to lack of other available test hardware |
25 | //! * new architectures would be easier to support (on Linux) than new OSes |
26 | //! * easiest to add would be 32-bit `x86` (aka `i686`), which would reuse |
27 | //! most of the `x86_64` CPU model detection logic |
28 | //! * specific (newer) CPU models, for certain non-standard counters |
29 | //! * e.g. `instructions-minus-irqs:u` requires a "hardware interrupts" (aka "IRQs") |
30 | //! counter, which is implemented differently between vendors / models (if at all) |
31 | //! * single-threaded programs (counters only work on the thread they were created on) |
32 | //! * for profiling `rustc`, this means only "check mode" (`--emit=metadata`) |
33 | //! is supported currently (`-Z no-llvm-threads` could also work) |
34 | //! * unclear what the best approach for handling multiple threads would be |
35 | //! * changing the API (e.g. to require per-thread profiler handles) could result |
36 | //! in a more efficient implementation, but would also be less ergonomic |
37 | //! * profiling data from multithreaded programs would be harder to use due to |
38 | //! noise from synchronization mechanisms, non-deterministic work-stealing, etc. |
39 | //! |
40 | //! For ergonomic reasons, the public API doesn't vary based on `features` or target. |
41 | //! Instead, attempting to create any unsupported counter will return `Err`, just |
42 | //! like it does for any issue detected at runtime (e.g. incompatible CPU model). |
43 | //! |
44 | //! When counting instructions specifically, these factors will impact the profiling quality: |
45 | //! * high-level non-determinism (e.g. user interactions, networking) |
46 | //! * the ideal use-case is a mostly-deterministic program, e.g. a compiler like `rustc` |
47 | //! * if I/O can be isolated to separate profiling events, and doesn't impact |
48 | //! execution in a more subtle way (see below), the deterministic parts of |
49 | //! the program can still be profiled with high accuracy |
50 | //! * intentional uses of randomness may change execution paths, though for |
51 | //! cryptographic operations specifically, "constant time" implementations |
52 | //! are preferred / necessary (in order to limit an external observer's |
53 | //! ability to infer secrets), so they're not as much of a problem |
54 | //! * even otherwise-deterministic machine-local communication (to e.g. system |
55 | //! services or drivers) can behave unpredictably (especially under load) |
56 | //! * while we haven't observed this in the wild yet, it's possible for |
57 | //! file reads/writes to be split up into multiple smaller chunks |
58 | //! (and therefore take more userspace instructions to fully read/write) |
59 | //! * low-level non-determinism (e.g. ASLR, randomized `HashMap`s, timers) |
60 | //! * ASLR ("Address Space Layout Randomization"), may be provided by the OS for |
61 | //! security reasons, or accidentally caused through allocations that depend on |
62 | //! random data (even as low-entropy as e.g. the base 10 length of a process ID) |
63 | //! * on Linux ASLR can be disabled by running the process under `setarch -R` |
64 | //! * this impacts `rustc` and LLVM, which rely on keying `HashMap`s by addresses |
65 | //! (typically of interned data) as an optimization, and while non-deterministic |
66 | //! outputs are considered bugs, the instructions executed can still vary a lot, |
67 | //! even when the externally observable behavior is perfectly repeatable |
68 | //! * `HashMap`s are involved in more than one way: |
69 | //! * both the executed instructions, and the shape of the allocations depend |
70 | //! on both the hasher state and choice of keys (as the buckets are in |
71 | //! a flat array indexed by some of the lower bits of the key hashes) |
72 | //! * so every `HashMap` with keys being/containing addresses will amplify |
73 | //! ASLR and ASLR-like effects, making the entire program more sensitive |
74 | //! * the default hasher is randomized, and while `rustc` doesn't use it, |
75 | //! proc macros can (and will), and it's harder to disable than Linux ASLR |
76 | //! * most ways of measuring time will inherently never perfectly align with |
77 | //! exact points in the program's execution, making time behave like another |
78 | //! low-entropy source of randomness - this also means timers will elapse at |
79 | //! unpredictable points (which can further impact the rest of the execution) |
80 | //! * this includes the common thread scheduler technique of preempting the |
81 | //! currently executing thread with a periodic timer interrupt, so the exact |
82 | //! interleaving of multiple threads will likely not be reproducible without |
83 | //! special OS configuration, or tools that emulate a deterministic scheduler |
84 | //! * `jemalloc` (the allocator used by `rustc`, at least in official releases) |
85 | //! has a 10 second "purge timer", which can introduce an ASLR-like effect, |
86 | //! unless disabled with `MALLOC_CONF=dirty_decay_ms:0,muzzy_decay_ms:0` |
87 | //! * hardware flaws (whether in the design or implementation) |
88 | //! * hardware interrupts ("IRQs") and exceptions (like page faults) cause |
89 | //! overcounting (1 instruction per interrupt, possibly the `iret` from the |
90 | //! kernel handler back to the interrupted userspace program) |
91 | //! * this is the reason why `instructions-minus-irqs:u` should be preferred |
92 | //! to `instructions:u`, where the former is available |
93 | //! * there are system-wide options (e.g. `CONFIG_NO_HZ_FULL`) for removing |
94 | //! some interrupts from the cores used for profiling, but they're not as |
95 | //! complete of a solution, nor easy to set up in the first place |
96 | //! * AMD Zen CPUs have a speculative execution feature (dubbed `SpecLockMap`), |
97 | //! which can cause non-deterministic overcounting for instructions following |
98 | //! an atomic instruction (such as found in heap allocators, or `measureme`) |
99 | //! * this is automatically detected, with a `log` message pointing the user |
100 | //! to <https://github.com/mozilla/rr/wiki/Zen> for guidance on how to |
101 | //! disable `SpecLockMap` on their system (sadly requires root access) |
102 | //! |
103 | //! Even if some of the above caveats apply for some profiling setup, as long as |
104 | //! the counters function, they can still be used, and compared with `wall-time`. |
105 | //! Chances are, they will still have less variance, as everything that impacts |
106 | //! instruction counts will also impact any time measurements. |
107 | //! |
108 | //! Also keep in mind that instruction counts do not properly reflect all kinds |
109 | //! of workloads, e.g. SIMD throughput and cache locality are unaccounted for. |
110 | |
111 | // FIXME: Use a cargo feature for accurate_seqlock_rdpmc and unserialized_rdpmc |
112 | // so we don't need this: |
113 | #![allow (unexpected_cfgs)] |
114 | |
115 | use std::error::Error; |
116 | use std::time::Instant; |
117 | |
// HACK(eddyb) semantically this is a `warn!`, but it goes through `error!`
// because that is the only log level enabled by default - see also
// https://github.com/rust-lang/rust/issues/76824
//
// The message literal is prefixed with "[WARNING] " so the log output still
// distinguishes these from real errors; any remaining tokens (format
// arguments) are forwarded to `error!` untouched.
macro_rules! really_warn {
    ($msg:literal $($rest:tt)*) => {
        error!(concat!("[WARNING] ", $msg) $($rest)*)
    };
}
126 | |
127 | pub enum Counter { |
128 | WallTime(WallTime), |
129 | Instructions(Instructions), |
130 | InstructionsMinusIrqs(InstructionsMinusIrqs), |
131 | InstructionsMinusRaw0420(InstructionsMinusRaw0420), |
132 | } |
133 | |
134 | impl Counter { |
135 | pub fn by_name(name: &str) -> Result<Self, Box<dyn Error + Send + Sync>> { |
136 | Ok(match name { |
137 | WallTime::NAME => Counter::WallTime(WallTime::new()), |
138 | Instructions::NAME => Counter::Instructions(Instructions::new()?), |
139 | InstructionsMinusIrqs::NAME => { |
140 | Counter::InstructionsMinusIrqs(InstructionsMinusIrqs::new()?) |
141 | } |
142 | InstructionsMinusRaw0420::NAME => { |
143 | Counter::InstructionsMinusRaw0420(InstructionsMinusRaw0420::new()?) |
144 | } |
145 | _ => return Err(format!(" {:?} is not a valid counter name" , name).into()), |
146 | }) |
147 | } |
148 | |
149 | pub(super) fn describe_as_json(&self) -> String { |
150 | let (name, units) = match self { |
151 | Counter::WallTime(_) => ( |
152 | WallTime::NAME, |
153 | r#"[["ns", 1], ["μs", 1000], ["ms", 1000000], ["s", 1000000000]]"# , |
154 | ), |
155 | Counter::Instructions(_) => (Instructions::NAME, r#"[["instructions", 1]]"# ), |
156 | Counter::InstructionsMinusIrqs(_) => { |
157 | (InstructionsMinusIrqs::NAME, r#"[["instructions", 1]]"# ) |
158 | } |
159 | Counter::InstructionsMinusRaw0420(_) => { |
160 | (InstructionsMinusRaw0420::NAME, r#"[["instructions", 1]]"# ) |
161 | } |
162 | }; |
163 | format!(r#" {{ "name": " {}", "units": {} }}"# , name, units) |
164 | } |
165 | |
166 | #[inline ] |
167 | pub(super) fn since_start(&self) -> u64 { |
168 | match self { |
169 | Counter::WallTime(counter) => counter.since_start(), |
170 | Counter::Instructions(counter) => counter.since_start(), |
171 | Counter::InstructionsMinusIrqs(counter) => counter.since_start(), |
172 | Counter::InstructionsMinusRaw0420(counter) => counter.since_start(), |
173 | } |
174 | } |
175 | } |
176 | |
/// Nanosecond-precision "monotonic clock", backed by [`std::time::Instant`].
///
/// Can be obtained with `Counter::by_name("wall-time")`.
pub struct WallTime {
    /// Instant at which this counter was created; readings are relative to it.
    start: Instant,
}

impl WallTime {
    /// Name under which [`Counter::by_name()`] exposes this counter.
    const NAME: &'static str = "wall-time";

    /// Start a new wall-time counter at the current instant.
    pub fn new() -> Self {
        let start = Instant::now();
        Self { start }
    }

    /// Nanoseconds elapsed since this counter was created.
    #[inline]
    fn since_start(&self) -> u64 {
        let elapsed = self.start.elapsed();
        // `as_nanos()` yields a `u128`; the cast keeps its low 64 bits.
        elapsed.as_nanos() as u64
    }
}
198 | |
199 | /// "Instructions retired" hardware performance counter (userspace-only). |
200 | /// |
201 | /// Can be obtained with `Counter::by_name("instructions:u")`. |
202 | pub struct Instructions { |
203 | instructions: hw::Counter, |
204 | start: u64, |
205 | } |
206 | |
207 | impl Instructions { |
208 | const NAME: &'static str = "instructions:u" ; |
209 | |
210 | pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> { |
211 | let model: CpuModel = hw::CpuModel::detect()?; |
212 | let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?; |
213 | let start: u64 = instructions.read(); |
214 | Ok(Instructions { |
215 | instructions, |
216 | start, |
217 | }) |
218 | } |
219 | |
220 | #[inline ] |
221 | fn since_start(&self) -> u64 { |
222 | self.instructions.read().wrapping_sub(self.start) |
223 | } |
224 | } |
225 | |
226 | /// More accurate [`Instructions`] (subtracting hardware interrupt counts). |
227 | /// |
228 | /// Can be obtained with `Counter::by_name("instructions-minus-irqs:u")`. |
229 | pub struct InstructionsMinusIrqs { |
230 | instructions: hw::Counter, |
231 | irqs: hw::Counter, |
232 | start: u64, |
233 | } |
234 | |
235 | impl InstructionsMinusIrqs { |
236 | const NAME: &'static str = "instructions-minus-irqs:u" ; |
237 | |
238 | pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> { |
239 | let model: CpuModel = hw::CpuModel::detect()?; |
240 | let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?; |
241 | let irqs: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Irqs)?; |
242 | let (start_instructions: u64, start_irqs: u64) = (&instructions, &irqs).read(); |
243 | let start: u64 = start_instructions.wrapping_sub(start_irqs); |
244 | Ok(InstructionsMinusIrqs { |
245 | instructions, |
246 | irqs, |
247 | start, |
248 | }) |
249 | } |
250 | |
251 | #[inline ] |
252 | fn since_start(&self) -> u64 { |
253 | let (instructions: u64, irqs: u64) = (&self.instructions, &self.irqs).read(); |
254 | instructions.wrapping_sub(irqs).wrapping_sub(self.start) |
255 | } |
256 | } |
257 | |
258 | /// (Experimental) Like [`InstructionsMinusIrqs`] (but using an undocumented `r0420:u` counter). |
259 | /// |
260 | /// Can be obtained with `Counter::by_name("instructions-minus-r0420:u")`. |
261 | // |
262 | // HACK(eddyb) this is a variant of `instructions-minus-irqs:u`, where `r0420` |
263 | // is subtracted, instead of the usual "hardware interrupts" (aka IRQs). |
264 | // `r0420` is an undocumented counter on AMD Zen CPUs which appears to count |
265 | // both hardware interrupts and exceptions (such as page faults), though |
266 | // it's unclear yet what exactly it's counting (could even be `iret`s). |
267 | pub struct InstructionsMinusRaw0420(InstructionsMinusIrqs); |
268 | |
269 | impl InstructionsMinusRaw0420 { |
270 | const NAME: &'static str = "instructions-minus-r0420:u" ; |
271 | |
272 | pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> { |
273 | let model: CpuModel = hw::CpuModel::detect()?; |
274 | let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?; |
275 | let irqs: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Raw0420)?; |
276 | let (start_instructions: u64, start_irqs: u64) = (&instructions, &irqs).read(); |
277 | let start: u64 = start_instructions.wrapping_sub(start_irqs); |
278 | Ok(InstructionsMinusRaw0420(InstructionsMinusIrqs { |
279 | instructions, |
280 | irqs, |
281 | start, |
282 | })) |
283 | } |
284 | |
285 | #[inline ] |
286 | fn since_start(&self) -> u64 { |
287 | self.0.since_start() |
288 | } |
289 | } |
290 | |
/// Reading of one hardware counter, or of several hardware counters at once.
///
/// Each implementor picks its own `Output`: in this file, a single counter
/// reads as a `u64`, and a pair of counter references as a `(u64, u64)`.
trait HwCounterRead {
    /// The value(s) produced by one call to [`HwCounterRead::read`].
    type Output;

    /// Read the current value(s) of the counter(s).
    fn read(&self) -> Self::Output;
}
295 | |
/// The kinds of hardware counters that `hw::Counter::new` knows how to open.
enum HwCounterType {
    /// The generic "instructions" hardware event
    /// (`PERF_TYPE_HARDWARE` / `PERF_COUNT_HW_INSTRUCTIONS`).
    Instructions,
    /// "Hardware interrupts", opened as a raw event whose config is looked up
    /// from the detected CPU model.
    Irqs,
    /// The raw `0x0420` event (aka `r0420`).
    Raw0420,
}
301 | |
/// Suffix appended to diagnostics about CPUs that were not expected, asking
/// the user to file an issue.
const BUG_REPORT_MSG: &str =
    "please report this to https://github.com/rust-lang/measureme/issues/new";
304 | |
305 | /// Linux x86_64 implementation based on `perf_event_open` and `rdpmc`. |
306 | #[cfg (all(target_arch = "x86_64" , target_os = "linux" ))] |
307 | mod hw { |
308 | use memmap2::{Mmap, MmapOptions}; |
309 | use perf_event_open_sys::{bindings::*, perf_event_open}; |
310 | use std::arch::asm; |
311 | use std::convert::TryInto; |
312 | use std::error::Error; |
313 | use std::fs; |
314 | use std::mem; |
315 | use std::os::unix::io::FromRawFd; |
316 | |
317 | pub(super) struct Counter { |
318 | mmap: Mmap, |
319 | reg_idx: u32, |
320 | } |
321 | |
322 | impl Counter { |
323 | pub(super) fn new( |
324 | model: &CpuModel, |
325 | counter_type: super::HwCounterType, |
326 | ) -> Result<Self, Box<dyn Error + Send + Sync>> { |
327 | let (type_, hw_id) = match counter_type { |
328 | super::HwCounterType::Instructions => { |
329 | (PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS) |
330 | } |
331 | super::HwCounterType::Irqs => (PERF_TYPE_RAW, model.irqs_counter_config()?), |
332 | super::HwCounterType::Raw0420 => { |
333 | match model { |
334 | CpuModel::Amd(AmdGen::Zen) => {} |
335 | |
336 | _ => really_warn!( |
337 | "Counter::new: the undocumented `r0420` performance \ |
338 | counter has only been observed on AMD Zen CPUs" |
339 | ), |
340 | } |
341 | |
342 | (PERF_TYPE_RAW, 0x04_20) |
343 | } |
344 | }; |
345 | Self::with_type_and_hw_id(type_, hw_id) |
346 | } |
347 | |
348 | fn with_type_and_hw_id( |
349 | type_: perf_type_id, |
350 | hw_id: u32, |
351 | ) -> Result<Self, Box<dyn Error + Send + Sync>> { |
352 | let mut attrs = perf_event_attr { |
353 | size: mem::size_of::<perf_event_attr>().try_into().unwrap(), |
354 | type_, |
355 | config: hw_id.into(), |
356 | ..perf_event_attr::default() |
357 | }; |
358 | |
359 | // Only record same-thread, any CPUs, and only userspace (no kernel/hypervisor). |
360 | // NOTE(eddyb) `pid = 0`, despite talking about "process id", means |
361 | // "calling process/thread", *not* "any thread in the calling process" |
362 | // (i.e. "process" is interchangeable with "main thread of the process") |
363 | // FIXME(eddyb) introduce per-thread counters and/or use `inherit` |
364 | // (and `inherit_stat`? though they might not be appropriate here) |
365 | // to be able to read the counter on more than just the initial thread. |
366 | let pid = 0; |
367 | let cpu = -1; |
368 | let group_fd = -1; |
369 | attrs.set_exclude_kernel(1); |
370 | attrs.set_exclude_hv(1); |
371 | |
372 | let file = unsafe { |
373 | let fd = |
374 | perf_event_open(&mut attrs, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC.into()); |
375 | if fd < 0 { |
376 | Err(std::io::Error::from_raw_os_error(-fd)) |
377 | } else { |
378 | Ok(fs::File::from_raw_fd(fd)) |
379 | } |
380 | }; |
381 | let file = file.map_err(|e| format!("perf_event_open failed: {:?}" , e))?; |
382 | |
383 | let mmap = unsafe { |
384 | MmapOptions::new() |
385 | .len(mem::size_of::<perf_event_mmap_page>()) |
386 | .map(&file) |
387 | }; |
388 | let mmap = mmap.map_err(|e| format!("perf_event_mmap_page: mmap failed: {:?}" , e))?; |
389 | |
390 | let mut counter = Counter { mmap, reg_idx: 0 }; |
391 | |
392 | let (version, compat_version, caps, index, pmc_width) = counter |
393 | .access_mmap_page_with_seqlock(|mp| { |
394 | ( |
395 | mp.version, |
396 | mp.compat_version, |
397 | unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 }, |
398 | mp.index, |
399 | mp.pmc_width, |
400 | ) |
401 | }); |
402 | |
403 | info!( |
404 | "Counter::new: version= {} compat_version= {} index= {:#x}" , |
405 | version, compat_version, index, |
406 | ); |
407 | |
408 | if caps.cap_user_rdpmc() == 0 { |
409 | return Err(format!( |
410 | "perf_event_mmap_page: missing cap_user_rdpmc {}" , |
411 | if caps.cap_bit0_is_deprecated() == 0 && caps.cap_bit0() == 1 { |
412 | " (ignoring legacy/broken rdpmc support)" |
413 | } else { |
414 | "" |
415 | } |
416 | ) |
417 | .into()); |
418 | } |
419 | |
420 | if index == 0 { |
421 | return Err(format!( |
422 | "perf_event_mmap_page: no allocated hardware register (ran out?)" |
423 | ) |
424 | .into()); |
425 | } |
426 | counter.reg_idx = index - 1; |
427 | |
428 | if (cfg!(not(accurate_seqlock_rdpmc)) || true) && pmc_width != 48 { |
429 | return Err(format!( |
430 | "perf_event_mmap_page: {}-bit hardware counter found, only 48-bit supported" , |
431 | pmc_width |
432 | ) |
433 | .into()); |
434 | } |
435 | |
436 | Ok(counter) |
437 | } |
438 | |
439 | /// Try to access the mmap page, retrying the `attempt` closure as long |
440 | /// as the "seqlock" sequence number changes (which indicates the kernel |
441 | /// has updated one or more fields within the mmap page). |
442 | #[inline ] |
443 | fn access_mmap_page_with_seqlock<T>( |
444 | &self, |
445 | attempt: impl Fn(&perf_event_mmap_page) -> T, |
446 | ) -> T { |
447 | // FIXME(eddyb) it's probably UB to use regular reads, especially |
448 | // from behind `&T`, with the only synchronization being barriers. |
449 | // Probably needs atomic reads, and stronger ones at that, for the |
450 | // `lock` field, than the fields (which would be `Relaxed`?). |
451 | let mmap_page = unsafe { &*(self.mmap.as_ptr() as *const perf_event_mmap_page) }; |
452 | let barrier = || std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire); |
453 | |
454 | loop { |
455 | // Grab the "seqlock" - the kernel will update this value when it |
456 | // updates any of the other fields that may be read in `attempt`. |
457 | let seq_lock = mmap_page.lock; |
458 | barrier(); |
459 | |
460 | let result = attempt(mmap_page); |
461 | |
462 | // If nothing has changed, we're done. Otherwise, keep retrying. |
463 | barrier(); |
464 | if mmap_page.lock == seq_lock { |
465 | return result; |
466 | } |
467 | } |
468 | } |
469 | } |
470 | |
471 | impl super::HwCounterRead for Counter { |
472 | type Output = u64; |
473 | |
474 | #[inline ] |
475 | fn read(&self) -> u64 { |
476 | // HACK(eddyb) keep the accurate code around while not using it, |
477 | // to minimize overhead without losing the more complex implementation. |
478 | let (counter, offset, pmc_width) = if cfg!(accurate_seqlock_rdpmc) && false { |
479 | self.access_mmap_page_with_seqlock(|mp| { |
480 | let caps = unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 }; |
481 | assert_ne!(caps.cap_user_rdpmc(), 0); |
482 | |
483 | ( |
484 | rdpmc(mp.index.checked_sub(1).unwrap()), |
485 | mp.offset, |
486 | mp.pmc_width, |
487 | ) |
488 | }) |
489 | } else { |
490 | (rdpmc(self.reg_idx), 0, 48) |
491 | }; |
492 | |
493 | let counter = offset + (counter as i64); |
494 | |
495 | // Sign-extend the `pmc_width`-bit value to `i64`. |
496 | (counter << (64 - pmc_width) >> (64 - pmc_width)) as u64 |
497 | } |
498 | } |
499 | |
500 | impl super::HwCounterRead for (&Counter, &Counter) { |
501 | type Output = (u64, u64); |
502 | |
503 | #[inline ] |
504 | fn read(&self) -> (u64, u64) { |
505 | // HACK(eddyb) keep the accurate code around while not using it, |
506 | // to minimize overhead without losing the more complex implementation. |
507 | if (cfg!(accurate_seqlock_rdpmc) || cfg!(unserialized_rdpmc)) && false { |
508 | return (self.0.read(), self.1.read()); |
509 | } |
510 | |
511 | let pmc_width = 48; |
512 | |
513 | let (a_counter, b_counter) = rdpmc_pair(self.0.reg_idx, self.1.reg_idx); |
514 | |
515 | // Sign-extend the `pmc_width`-bit values to `i64`. |
516 | ( |
517 | ((a_counter as i64) << (64 - pmc_width) >> (64 - pmc_width)) as u64, |
518 | ((b_counter as i64) << (64 - pmc_width) >> (64 - pmc_width)) as u64, |
519 | ) |
520 | } |
521 | } |
522 | |
523 | /// Read the hardware performance counter indicated by `reg_idx`. |
524 | /// |
525 | /// If the counter is signed, sign extension should be performed based on |
526 | /// the width of the register (32 to 64 bits, e.g. 48-bit seems common). |
527 | #[inline (always)] |
528 | fn rdpmc(reg_idx: u32) -> u64 { |
529 | // NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`). |
530 | if cfg!(unserialized_rdpmc) && false { |
531 | // FIXME(eddyb) the Intel and AMD manuals warn about the need for |
532 | // "serializing instructions" before/after `rdpmc`, if avoiding any |
533 | // reordering is desired, but do not agree on the full set of usable |
534 | // "serializing instructions" (e.g. `mfence` isn't listed in both). |
535 | // |
536 | // The only usable, and guaranteed to work, "serializing instruction" |
537 | // appears to be `cpuid`, but it doesn't seem easy to use, especially |
538 | // due to the overlap in registers with `rdpmc` itself, and it might |
539 | // have too high of a cost, compared to serialization benefits (if any). |
540 | unserialized_rdpmc(reg_idx) |
541 | } else { |
542 | serialize_instruction_execution(); |
543 | unserialized_rdpmc(reg_idx) |
544 | } |
545 | } |
546 | |
547 | /// Read two hardware performance counters at once (see `rdpmc`). |
548 | /// |
549 | /// Should be more efficient/accurate than two `rdpmc` calls, as it |
550 | /// only requires one "serializing instruction", rather than two. |
551 | #[inline (always)] |
552 | fn rdpmc_pair(a_reg_idx: u32, b_reg_idx: u32) -> (u64, u64) { |
553 | serialize_instruction_execution(); |
554 | (unserialized_rdpmc(a_reg_idx), unserialized_rdpmc(b_reg_idx)) |
555 | } |
556 | |
/// Serialize instruction execution by running a dummy `cpuid(0)`.
///
/// The results of `cpuid` are discarded; `rbx` is saved and restored around
/// it, and the other registers it writes are declared as clobbered.
#[inline(always)]
fn serialize_instruction_execution() {
    unsafe {
        asm!(
            // Select `cpuid` leaf 0.
            "xor %eax, %eax", // Intel syntax: "xor eax, eax"
            // LLVM sometimes reserves `ebx` for its internal use, so it can't
            // be listed as a clobber - stash it in a scratch register instead.
            "mov %rbx, {tmp_rbx:r}", // Intel syntax: "mov {tmp_rbx:r}, rbx"
            "cpuid",
            "mov {tmp_rbx:r}, %rbx", // Intel syntax: "mov rbx, {tmp_rbx:r}"
            tmp_rbx = lateout(reg) _,
            // `cpuid` clobbers.
            lateout("eax") _,
            lateout("ecx") _,
            lateout("edx") _,
            // Older versions of LLVM do not support modifiers in
            // Intel syntax inline asm; whenever Rust minimum LLVM version
            // supports Intel syntax inline asm, remove `att_syntax` and replace
            // the instructions above with their Intel syntax version (from
            // the comments).
            options(nostack, att_syntax),
        );
    }
}
583 | |
/// Read the hardware performance counter in register `reg_idx` with a bare
/// `rdpmc`, i.e. without any serialization around it.
///
/// If the counter is signed, sign extension should be performed based on
/// the width of the register (32 to 64 bits, e.g. 48-bit seems common).
#[inline(always)]
fn unserialized_rdpmc(reg_idx: u32) -> u64 {
    let lo: u32;
    let hi: u32;
    unsafe {
        asm!(
            "rdpmc",
            // The register index goes in `ecx`; the value comes back split
            // between `eax` (low half) and `edx` (high half).
            in("ecx") reg_idx,
            lateout("eax") lo,
            lateout("edx") hi,
            options(nostack, att_syntax),
        );
    }
    (u64::from(hi) << 32) | u64::from(lo)
}
607 | |
608 | /// Categorization of `x86_64` CPUs, primarily based on how they |
609 | /// support for counting "hardware interrupts" (documented or not). |
610 | pub(super) enum CpuModel { |
611 | Amd(AmdGen), |
612 | Intel(IntelGen), |
613 | } |
614 | |
615 | pub(super) enum AmdGen { |
616 | /// K8 (Hammer) to Jaguar / Puma. |
617 | PreZen, |
618 | |
619 | /// Zen / Zen+ / Zen 2. |
620 | Zen, |
621 | |
622 | /// Unknown AMD CPU, contemporary to/succeeding Zen/Zen+/Zen 2, |
623 | /// but likely similar to them. |
624 | UnknownMaybeZenLike, |
625 | } |
626 | |
627 | pub(super) enum IntelGen { |
628 | /// Intel CPU predating Sandy Bridge. These are the only CPUs we |
629 | /// can't support (more) accurate instruction counting on, as they |
630 | /// don't (appear to) have any way to count "hardware interrupts". |
631 | PreBridge, |
632 | |
633 | /// Sandy Bridge / Ivy Bridge: |
634 | /// * client: Sandy Bridge (M/H) / Ivy Bridge (M/H/Gladden) |
635 | /// * server: Sandy Bridge (E/EN/EP) / Ivy Bridge (E/EN/EP/EX) |
636 | /// |
637 | /// Intel doesn't document support for counting "hardware interrupts" |
638 | /// prior to Skylake, but testing found that `HW_INTERRUPTS.RECEIVED` |
639 | /// from Skylake has existed, with the same config, as far back as |
640 | /// "Sandy Bridge" (but before that it mapped to a different event). |
641 | /// |
642 | /// These are the (pre-Skylake) *Bridge CPU models confirmed so far: |
643 | /// * Sandy Bridge (client) Family 6 Model 42 |
644 | /// Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz (@alyssais) |
645 | /// * Ivy Bridge (client) Family 6 Model 58 |
646 | /// Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz (@eddyb) |
647 | /// |
648 | /// We later found this paper, which on page 5 lists 12 counters, |
649 | /// for each of Nehalem/Westmere, Sandy Bridge and Ivy Bridge: |
650 | /// http://web.eece.maine.edu/~vweaver/projects/deterministic/deterministic_counters.pdf |
651 | /// It appears that both Sandy Bridge and Ivy Bridge used to have |
652 | /// `HW_INTERRUPTS.RECEIVED` documented, before Intel removed every |
653 | /// mention of the counter from newer versions of their manuals. |
654 | Bridge, |
655 | |
656 | /// Haswell / Broadwell: |
657 | /// * client: Haswell (S/ULT/GT3e) / Broadwell (U/Y/S/H/C/W) |
658 | /// * server: Haswell (E/EP/EX) / Broadwell (E/EP/EX/DE/Hewitt Lake) |
659 | /// |
660 | /// Equally as undocumented as "Sandy Bridge / Ivy Bridge" (see above). |
661 | /// |
662 | /// These are the (pre-Skylake) *Well CPU models confirmed so far: |
663 | /// * Haswell (client) Family 6 Model 60 |
664 | /// Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz (@m-ou-se) |
665 | /// * Haswell (server) Family 6 Model 63 |
666 | /// Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz (@cuviper) |
667 | /// * Haswell (client + GT3e) Family 6 Model 70 |
668 | /// Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz (@nagisa) |
669 | /// Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz (@m-ou-se) |
670 | Well, |
671 | |
672 | /// Skylake / Skylake-derived: |
673 | /// * client: Skylake (Y/U/DT/H/S) / Kaby Lake (Y/U/DT/H/S/X) / Coffee Lake (U/S/H/E) |
674 | /// * server: Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W) |
675 | /// |
676 | /// Both "client" and "server" product lines have documented support |
677 | /// for counting "hardware interrupts" (`HW_INTERRUPTS.RECEIVED`). |
678 | /// |
679 | /// Intel does not make it clear that future product lines, such as |
680 | /// "Ice Lake", will continue to support this (or with what config), |
681 | /// and even "Comet Lake" (aka "10th gen") isn't explicitly listed. |
682 | Lake, |
683 | |
684 | /// Unknown Intel CPU, contemporary to/succeeding *Bridge/*Well/*Lake, |
685 | /// but likely similar to them. |
686 | UnknownMaybeLakeLike, |
687 | } |
688 | |
689 | impl CpuModel { |
690 | /// Detect the model of the current CPU using `cpuid`. |
691 | pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> { |
692 | let cpuid0 = unsafe { std::arch::x86_64::__cpuid(0) }; |
693 | let cpuid1 = unsafe { std::arch::x86_64::__cpuid(1) }; |
694 | let mut vendor = [0; 12]; |
695 | vendor[0..4].copy_from_slice(&cpuid0.ebx.to_le_bytes()); |
696 | vendor[4..8].copy_from_slice(&cpuid0.edx.to_le_bytes()); |
697 | vendor[8..12].copy_from_slice(&cpuid0.ecx.to_le_bytes()); |
698 | |
699 | let vendor = std::str::from_utf8(&vendor).map_err(|_| { |
700 | format!( |
701 | "cpuid returned non-UTF-8 vendor name: cpuid(0)= {:?} cpuid(1)= {:?}" , |
702 | cpuid0, cpuid1 |
703 | ) |
704 | })?; |
705 | |
706 | let version = cpuid1.eax; |
707 | |
708 | let mut family = (version >> 8) & 0xf; |
709 | if family == 15 { |
710 | // Extended family. |
711 | family += (version >> 20) & 0xff; |
712 | } |
713 | |
714 | let mut model = (version >> 4) & 0xf; |
715 | if family >= 15 || vendor == "GenuineIntel" && family == 6 { |
716 | // Extended model. |
717 | model += ((version >> 16) & 0xf) << 4; |
718 | } |
719 | |
720 | info!( |
721 | "CpuModel::detect: vendor= {:?} family= {} model= {}" , |
722 | vendor, family, model |
723 | ); |
724 | |
725 | match vendor { |
726 | "AuthenticAMD" => { |
727 | use self::AmdGen::*; |
728 | |
729 | let (gen, name) = match (family, model) { |
730 | (0..=14, _) | (19, _) => { |
731 | return Err(format!( |
732 | "impossible AMD64 CPU detected (Family {} Model {}); {}" , |
733 | family, |
734 | model, |
735 | super::BUG_REPORT_MSG |
736 | ) |
737 | .into()); |
738 | } |
739 | |
740 | (15, _) => (PreZen, "K8 (Hammer)" ), |
741 | (16, _) => (PreZen, "K10 (Barcelona/Shanghai/Istanbul)" ), |
742 | (17, _) => (PreZen, "K8+K10 hybrid (Turion X2 Ultra)" ), |
743 | (18, _) => (PreZen, "Fusion" ), |
744 | (20, _) => (PreZen, "Bobcat" ), |
745 | (21, _) => (PreZen, "Bulldozer / Piledriver / Steamroller / Excavator" ), |
746 | (22, _) => (PreZen, "Jaguar / Puma" ), |
747 | |
748 | (23, 1) => (Zen, "Zen (Naples/Whitehaven/Summit Ridge/Snowy Owl)" ), |
749 | (23, 17) => (Zen, "Zen (Raven Ridge)" ), |
750 | (23, 24) => (Zen, "Zen (Banded Kestrel/Dali) / Zen+ (Picasso)" ), |
751 | (23, 8) => (Zen, "Zen+ (Pinnacle Ridge)" ), |
752 | (23, 49) => (Zen, "Zen 2 (Rome/Castle Peak)" ), |
753 | (23, 113) => (Zen, "Zen 2 (Matisse)" ), |
754 | |
755 | (23..=0xffff_ffff, _) => { |
756 | really_warn!( |
757 | "CpuModel::detect: unknown AMD CPU (Family {} Model {}), \ |
758 | assuming Zen-like; {}" , |
759 | family, |
760 | model, |
761 | super::BUG_REPORT_MSG |
762 | ); |
763 | |
764 | (UnknownMaybeZenLike, "" ) |
765 | } |
766 | }; |
767 | |
768 | if !name.is_empty() { |
769 | info!("CpuModel::detect: known AMD CPU: {}" , name); |
770 | } |
771 | |
772 | // The `SpecLockMap` (speculative atomic aka `lock` instruction |
773 | // execution, unclear what "Map" refers to) feature in AMD Zen CPUs |
774 | // causes non-deterministic overcounting of atomic instructions, |
775 | // presumably whenever it has to roll back the speculation |
776 | // (as in, the performance counters aren't rolled back). |
777 | // Even this this may be rare when uncontended, it adds up. |
778 | // |
779 | // There is an MSR bit (`MSRC001_1020[54]`) that's not officially |
780 | // documented, but which several motherboards and profiling tools |
781 | // set whenever IBS (Instruction-Based Sampling) is in use, and |
782 | // it is sometimes referred to as "disabling `SpecLockMap`" |
783 | // (hence having a name for the feature that speculates `lock`s). |
784 | // |
785 | // One way we could detect that the bit has been set would be to |
786 | // parse `uname().release` (aka `uname -r`) and look for versions |
787 | // which are known to include the patch suggested in this thread: |
788 | // https://github.com/mozilla/rr/issues/2034#issuecomment-693761247 |
789 | // |
790 | // However, one may set the bit using e.g. `wrmsr`, even on older |
791 | // kernels, so a more reliable approach is to execute some atomics |
792 | // and look at the `SpecLockMapCommit` (`r0825:u`) Zen counter, |
793 | // which only reliably remains `0` when `SpecLockMap` is disabled. |
794 | if matches!(gen, Zen | UnknownMaybeZenLike) { |
795 | if let Ok(spec_lock_map_commit) = |
796 | Counter::with_type_and_hw_id(PERF_TYPE_RAW, 0x08_25) |
797 | { |
798 | use super::HwCounterRead; |
799 | |
800 | let start_spec_lock_map_commit = spec_lock_map_commit.read(); |
801 | |
802 | // Execute an atomic (`lock`) instruction, which should |
803 | // start speculative execution for following instructions |
804 | // (as long as `SpecLockMap` isn't disabled). |
805 | let mut atomic: u64 = 0; |
806 | let mut _tmp: u64 = 0; |
807 | unsafe { |
808 | asm!( |
809 | // Intel syntax: "lock xadd [{atomic}], {tmp}" |
810 | "lock xadd {tmp}, ( {atomic})" , |
811 | |
812 | atomic = in(reg) &mut atomic, |
813 | tmp = inout(reg) _tmp, |
814 | |
815 | // Older versions of LLVM do not support modifiers in |
816 | // Intel syntax inline asm; whenever Rust minimum LLVM |
817 | // version supports Intel syntax inline asm, remove |
818 | // and replace above instructions with Intel syntax |
819 | // version (from comments). |
820 | options(att_syntax), |
821 | ); |
822 | } |
823 | |
824 | if spec_lock_map_commit.read() != start_spec_lock_map_commit { |
825 | really_warn!( |
826 | "CpuModel::detect: SpecLockMap detected, in AMD {} CPU; \ |
827 | this may add some non-deterministic noise - \ |
828 | for information on disabling SpecLockMap, see \ |
829 | https://github.com/mozilla/rr/wiki/Zen" , |
830 | name |
831 | ); |
832 | } |
833 | } |
834 | } |
835 | |
836 | Ok(CpuModel::Amd(gen)) |
837 | } |
838 | |
839 | "GenuineIntel" => { |
840 | use self::IntelGen::*; |
841 | |
842 | let (gen, name) = match (family, model) { |
843 | // No need to name these, they're unsupported anyway. |
844 | (0..=5, _) => (PreBridge, "" ), |
845 | (15, _) => (PreBridge, "Netburst" ), |
846 | (6, 0..=41) => (PreBridge, "" ), |
847 | |
848 | // Older Xeon Phi CPUs, misplaced in Family 6. |
849 | (6, 87) => (PreBridge, "Knights Landing" ), |
850 | (6, 133) => (PreBridge, "Knights Mill" ), |
851 | |
852 | // Older Atom CPUs, interleaved with other CPUs. |
853 | // FIXME(eddyb) figure out if these are like *Bridge/*Well. |
854 | (6, 53) | (6, 54) => (PreBridge, "Saltwell" ), |
855 | (6, 55) | (6, 74) | (6, 77) | (6, 90) | (6, 93) => { |
856 | (PreBridge, "Silvermont" ) |
857 | } |
858 | (6, 76) => (PreBridge, "Airmont (Cherry Trail/Braswell)" ), |
859 | |
860 | // Older server CPUs, numbered out of order. |
861 | (6, 44) => (PreBridge, "Westmere (Gulftown/EP)" ), |
862 | (6, 46) => (PreBridge, "Nehalem (EX)" ), |
863 | (6, 47) => (PreBridge, "Westmere (EX)" ), |
864 | |
865 | (6, 42) => (Bridge, "Sandy Bridge (M/H)" ), |
866 | (6, 45) => (Bridge, "Sandy Bridge (E/EN/EP)" ), |
867 | (6, 58) => (Bridge, "Ivy Bridge (M/H/Gladden)" ), |
868 | (6, 62) => (Bridge, "Ivy Bridge (E/EN/EP/EX)" ), |
869 | |
870 | (6, 60) => (Well, "Haswell (S)" ), |
871 | (6, 61) => (Well, "Broadwell (U/Y/S)" ), |
872 | (6, 63) => (Well, "Haswell (E/EP/EX)" ), |
873 | (6, 69) => (Well, "Haswell (ULT)" ), |
874 | (6, 70) => (Well, "Haswell (GT3e)" ), |
875 | (6, 71) => (Well, "Broadwell (H/C/W)" ), |
876 | (6, 79) => (Well, "Broadwell (E/EP/EX)" ), |
877 | (6, 86) => (Well, "Broadwell (DE/Hewitt Lake)" ), |
878 | |
879 | (6, 78) => (Lake, "Skylake (Y/U)" ), |
880 | (6, 85) => (Lake, "Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)" ), |
881 | (6, 94) => (Lake, "Skylake (DT/H/S)" ), |
882 | (6, 142) => (Lake, "Kaby Lake (Y/U) / Coffee Lake (U)" ), |
883 | (6, 158) => (Lake, "Kaby Lake (DT/H/S/X) / Coffee Lake (S/H/E)" ), |
884 | |
885 | (6..=14, _) | (16..=0xffff_ffff, _) => { |
886 | really_warn!( |
887 | "CpuModel::detect: unknown Intel CPU (Family {} Model {}), \ |
888 | assuming Skylake-like; {}" , |
889 | family, |
890 | model, |
891 | super::BUG_REPORT_MSG |
892 | ); |
893 | |
894 | (UnknownMaybeLakeLike, "" ) |
895 | } |
896 | }; |
897 | |
898 | if !name.is_empty() { |
899 | info!("CpuModel::detect: known Intel CPU: {}" , name); |
900 | } |
901 | |
902 | Ok(CpuModel::Intel(gen)) |
903 | } |
904 | |
905 | _ => Err(format!( |
906 | "cpuid returned unknown CPU vendor {:?}; version= {:#x}" , |
907 | vendor, version |
908 | ) |
909 | .into()), |
910 | } |
911 | } |
912 | |
913 | /// Return the hardware performance counter configuration for |
914 | /// counting "hardware interrupts" (documented or not). |
915 | fn irqs_counter_config(&self) -> Result<u32, Box<dyn Error + Send + Sync>> { |
916 | match self { |
917 | CpuModel::Amd(model) => match model { |
918 | AmdGen::PreZen => Ok(0x00_cf), |
919 | AmdGen::Zen | AmdGen::UnknownMaybeZenLike => Ok(0x00_2c), |
920 | }, |
921 | CpuModel::Intel(model) => match model { |
922 | IntelGen::PreBridge => Err(format!( |
923 | "counting IRQs not yet supported on Intel CPUs \ |
924 | predating Sandy Bridge; {}" , |
925 | super::BUG_REPORT_MSG |
926 | ) |
927 | .into()), |
928 | IntelGen::Bridge |
929 | | IntelGen::Well |
930 | | IntelGen::Lake |
931 | | IntelGen::UnknownMaybeLakeLike => Ok(0x01_cb), |
932 | }, |
933 | } |
934 | } |
935 | } |
936 | } |
937 | |
/// Fallback `hw` module for every target other than `x86_64` Linux.
///
/// Both `Counter` and `CpuModel` are uninhabited here, so no value of
/// either type can ever exist, and `CpuModel::detect` always fails.
#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))]
mod hw {
    use std::error::Error;

    /// Hardware counter placeholder; has no variants, so it cannot be constructed.
    pub(super) enum Counter {}

    impl Counter {
        /// Cannot actually be called: it requires a `&CpuModel`, and
        /// `CpuModel` has no values in this module.
        pub(super) fn new(
            model: &CpuModel,
            _: super::HwCounterType,
        ) -> Result<Self, Box<dyn Error + Send + Sync>> {
            // An empty match on an uninhabited type type-checks as any result.
            match *model {}
        }
    }

    impl super::HwCounterRead for Counter {
        type Output = u64;

        /// Unreachable in practice, as no `Counter` value can exist.
        #[inline]
        fn read(&self) -> Self::Output {
            match *self {}
        }
    }

    impl super::HwCounterRead for (&Counter, &Counter) {
        type Output = (u64, u64);

        /// Unreachable in practice, as no `Counter` value can exist.
        #[inline]
        fn read(&self) -> Self::Output {
            match *self.0 {}
        }
    }

    /// CPU model placeholder; has no variants, so it cannot be constructed.
    pub(super) enum CpuModel {}

    impl CpuModel {
        /// Always returns an error listing why hardware counters are
        /// unavailable on the current target (architecture and/or OS).
        pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {
            // HACK(eddyb) mark `really_warn!` (and transitively `log` macros)
            // and `BUG_REPORT_MSG` as "used" to silence warnings.
            if false {
                really_warn!("unsupported; {}", super::BUG_REPORT_MSG);
            }

            // Collect every applicable reason, then report them all at once.
            let mut reasons: Vec<&str> = Vec::with_capacity(2);

            if cfg!(not(target_arch = "x86_64")) {
                reasons.push("only supported architecture is x86_64");
            }

            if cfg!(not(target_os = "linux")) {
                reasons.push("only supported OS is Linux");
            }

            Err(reasons.join("; ").into())
        }
    }
}
1001 | |