counters.rs source code [crates/measureme-11.0.1/src/counters.rs]

1	//! Profiling counters and their implementation.
2	//!
3	//! # Available counters
4	//!
//! Name (for [`Counter::by_name()`]) | Counter | OSes | CPUs
//! --------------------------------- | ------- | ---- | ----
//! `wall-time` | [`WallTime`] | any | any
//! `instructions:u` | [`Instructions`] | Linux | `x86_64`
//! `instructions-minus-irqs:u` | [`InstructionsMinusIrqs`] | Linux | `x86_64`<br>- AMD (since K8)<br>- Intel (since Sandy Bridge)
//! `instructions-minus-r0420:u` | [`InstructionsMinusRaw0420`] | Linux | `x86_64`<br>- AMD (Zen)
11	//!
//! *Note: `:u` suffixes for hardware performance counters come from the Linux `perf`
//! tool, and indicate that the counter is only active while userspace code executes
//! (i.e. it's paused while the kernel handles syscalls, interrupts, etc.).*
15	//!
16	//! # Limitations and caveats
17	//!
//! *Note: for more information, also see the GitHub PR which first implemented hardware
//! performance counter support ([#143](https://github.com/rust-lang/measureme/pull/143)).*
20	//!
//! The hardware performance counters (i.e. all counters other than `wall-time`) are limited to:
//! * Linux, for out-of-the-box performance counter reads from userspace
//!   * other OSes could work through custom kernel extensions/drivers, in the future
//! * `x86_64` CPUs, mostly due to lack of other available test hardware
//!   * new architectures would be easier to support (on Linux) than new OSes
//!   * easiest to add would be 32-bit `x86` (aka `i686`), which would reuse
//!     most of the `x86_64` CPU model detection logic
//! * specific (newer) CPU models, for certain non-standard counters
//!   * e.g. `instructions-minus-irqs:u` requires a "hardware interrupts" (aka "IRQs")
//!     counter, which is implemented differently between vendors / models (if at all)
//! * single-threaded programs (counters only work on the thread they were created on)
//!   * for profiling `rustc`, this means only "check mode" (`--emit=metadata`),
//!     is supported currently (`-Z no-llvm-threads` could also work)
//!   * unclear what the best approach for handling multiple threads would be
//!   * changing the API (e.g. to require per-thread profiler handles) could result
//!     in a more efficient implementation, but would also be less ergonomic
//!   * profiling data from multithreaded programs would be harder to use due to
//!     noise from synchronization mechanisms, non-deterministic work-stealing, etc.
39	//!
40	//! For ergonomic reasons, the public API doesn't vary based on `features` or target.
41	//! Instead, attempting to create any unsupported counter will return `Err`, just
42	//! like it does for any issue detected at runtime (e.g. incompatible CPU model).
43	//!
//! When counting instructions specifically, these factors will impact the profiling quality:
//! * high-level non-determinism (e.g. user interactions, networking)
//!   * the ideal use-case is a mostly-deterministic program, e.g. a compiler like `rustc`
//!   * if I/O can be isolated to separate profiling events, and doesn't impact
//!     execution in a more subtle way (see below), the deterministic parts of
//!     the program can still be profiled with high accuracy
//!   * intentional uses of randomness may change execution paths, though for
//!     cryptographic operations specifically, "constant time" implementations
//!     are preferred / necessary (in order to limit an external observer's
//!     ability to infer secrets), so they're not as much of a problem
//!   * even otherwise-deterministic machine-local communication (to e.g. system
//!     services or drivers) can behave unpredictably (especially under load)
//!     * while we haven't observed this in the wild yet, it's possible for
//!       file reads/writes to be split up into multiple smaller chunks
//!       (and therefore take more userspace instructions to fully read/write)
//! * low-level non-determinism (e.g. ASLR, randomized `HashMap`s, timers)
//!   * ASLR ("Address Space Layout Randomization"), may be provided by the OS for
//!     security reasons, or accidentally caused through allocations that depend on
//!     random data (even as low-entropy as e.g. the base 10 length of a process ID)
//!     * on Linux ASLR can be disabled by running the process under `setarch -R`
//!     * this impacts `rustc` and LLVM, which rely on keying `HashMap`s by addresses
//!       (typically of interned data) as an optimization, and while non-deterministic
//!       outputs are considered bugs, the instructions executed can still vary a lot,
//!       even when the externally observable behavior is perfectly repeatable
//!   * `HashMap`s are involved in more than one way:
//!     * both the executed instructions, and the shape of the allocations depend
//!       on both the hasher state and choice of keys (as the buckets are in
//!       a flat array indexed by some of the lower bits of the key hashes)
//!     * so every `HashMap` with keys being/containing addresses will amplify
//!       ASLR and ASLR-like effects, making the entire program more sensitive
//!     * the default hasher is randomized, and while `rustc` doesn't use it,
//!       proc macros can (and will), and it's harder to disable than Linux ASLR
//!   * most ways of measuring time will inherently never perfectly align with
//!     exact points in the program's execution, making time behave like another
//!     low-entropy source of randomness - this also means timers will elapse at
//!     unpredictable points (which can further impact the rest of the execution)
//!     * this includes the common thread scheduler technique of preempting the
//!       currently executing thread with a periodic timer interrupt, so the exact
//!       interleaving of multiple threads will likely not be reproducible without
//!       special OS configuration, or tools that emulate a deterministic scheduler
//!     * `jemalloc` (the allocator used by `rustc`, at least in official releases)
//!       has a 10 second "purge timer", which can introduce an ASLR-like effect,
//!       unless disabled with `MALLOC_CONF=dirty_decay_ms:0,muzzy_decay_ms:0`
//! * hardware flaws (whether in the design or implementation)
//!   * hardware interrupts ("IRQs") and exceptions (like page faults) cause
//!     overcounting (1 instruction per interrupt, possibly the `iret` from the
//!     kernel handler back to the interrupted userspace program)
//!     * this is the reason why `instructions-minus-irqs:u` should be preferred
//!       to `instructions:u`, where the former is available
//!     * there are system-wide options (e.g. `CONFIG_NO_HZ_FULL`) for removing
//!       some interrupts from the cores used for profiling, but they're not as
//!       complete of a solution, nor easy to set up in the first place
//!   * AMD Zen CPUs have a speculative execution feature (dubbed `SpecLockMap`),
//!     which can cause non-deterministic overcounting for instructions following
//!     an atomic instruction (such as found in heap allocators, or `measureme`)
//!     * this is automatically detected, with a `log` message pointing the user
//!       to <https://github.com/mozilla/rr/wiki/Zen> for guidance on how to
//!       disable `SpecLockMap` on their system (sadly requires root access)
102	//!
103	//! Even if some of the above caveats apply for some profiling setup, as long as
104	//! the counters function, they can still be used, and compared with `wall-time`.
105	//! Chances are, they will still have less variance, as everything that impacts
106	//! instruction counts will also impact any time measurements.
107	//!
108	//! Also keep in mind that instruction counts do not properly reflect all kinds
109	//! of workloads, e.g. SIMD throughput and cache locality are unaccounted for.
110
111	// FIXME: Use a cargo feature for accurate_seqlock_rdpmc and unserialized_rdpmc
112	// so we don't need this:
113	#![allow(unexpected_cfgs)]
114
115	use std::error::Error;
116	use std::time::Instant;
117
// HACK(eddyb) semantically this is a `warn!`, but it is routed through
// `error!` because that's the only log level enabled by default - see also
// https://github.com/rust-lang/rust/issues/76824
//
// The format string must be a literal so that it can be prefixed with
// "[WARNING] " at compile time; any remaining tokens are forwarded as-is.
macro_rules! really_warn {
    ($fmt:literal $($args:tt)*) => {
        error!(concat!("[WARNING] ", $fmt) $($args)*)
    };
}
126
127	pub enum Counter {
128	WallTime(WallTime),
129	Instructions(Instructions),
130	InstructionsMinusIrqs(InstructionsMinusIrqs),
131	InstructionsMinusRaw0420(InstructionsMinusRaw0420),
132	}
133
134	impl Counter {
135	pub fn by_name(name: &str) -> Result<Self, Box<dyn Error + Send + Sync>> {
136	Ok(match name {
137	WallTime::NAME => Counter::WallTime(WallTime::new()),
138	Instructions::NAME => Counter::Instructions(Instructions::new()?),
139	InstructionsMinusIrqs::NAME => {
140	Counter::InstructionsMinusIrqs(InstructionsMinusIrqs::new()?)
141	}
142	InstructionsMinusRaw0420::NAME => {
143	Counter::InstructionsMinusRaw0420(InstructionsMinusRaw0420::new()?)
144	}
145	_ => return Err(format!("{:?} is not a valid counter name", name).into()),
146	})
147	}
148
149	pub(super) fn describe_as_json(&self) -> String {
150	let (name, units) = match self {
151	Counter::WallTime(_) => (
152	WallTime::NAME,
153	r#"[["ns", 1], ["μs", 1000], ["ms", 1000000], ["s", 1000000000]]"#,
154	),
155	Counter::Instructions(_) => (Instructions::NAME, r#"[["instructions", 1]]"#),
156	Counter::InstructionsMinusIrqs(_) => {
157	(InstructionsMinusIrqs::NAME, r#"[["instructions", 1]]"#)
158	}
159	Counter::InstructionsMinusRaw0420(_) => {
160	(InstructionsMinusRaw0420::NAME, r#"[["instructions", 1]]"#)
161	}
162	};
163	format!(r#"`{{` "name": "{}", "units": {} `}}`"#, name, units)
164	}
165
166	#[inline]
167	pub(super) fn since_start(&self) -> u64 {
168	match self {
169	Counter::WallTime(counter) => counter.since_start(),
170	Counter::Instructions(counter) => counter.since_start(),
171	Counter::InstructionsMinusIrqs(counter) => counter.since_start(),
172	Counter::InstructionsMinusRaw0420(counter) => counter.since_start(),
173	}
174	}
175	}
176
/// "Monotonic clock" with nanosecond precision (using [`std::time::Instant`]).
///
/// Can be obtained with `Counter::by_name("wall-time")`.
pub struct WallTime {
    /// The instant at which this counter was created.
    start: Instant,
}

impl WallTime {
    const NAME: &'static str = "wall-time";

    /// Starts a new wall-time counter, measuring from "now".
    pub fn new() -> Self {
        let start = Instant::now();
        Self { start }
    }

    /// Nanoseconds elapsed since [`WallTime::new`] was called
    /// (the 128-bit nanosecond count is truncated to `u64`).
    #[inline]
    fn since_start(&self) -> u64 {
        let elapsed = self.start.elapsed();
        elapsed.as_nanos() as u64
    }
}
198
199	/// "Instructions retired" hardware performance counter (userspace-only).
200	///
201	/// Can be obtained with `Counter::by_name("instructions:u")`.
202	pub struct Instructions {
203	instructions: hw::Counter,
204	start: u64,
205	}
206
207	impl Instructions {
208	const NAME: &'static str = "instructions:u";
209
210	pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> {
211	let model: CpuModel = hw::CpuModel::detect()?;
212	let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?;
213	let start: u64 = instructions.read();
214	Ok(Instructions {
215	instructions,
216	start,
217	})
218	}
219
220	#[inline]
221	fn since_start(&self) -> u64 {
222	self.instructions.read().wrapping_sub(self.start)
223	}
224	}
225
226	/// More accurate [`Instructions`] (subtracting hardware interrupt counts).
227	///
228	/// Can be obtained with `Counter::by_name("instructions-minus-irqs:u")`.
229	pub struct InstructionsMinusIrqs {
230	instructions: hw::Counter,
231	irqs: hw::Counter,
232	start: u64,
233	}
234
235	impl InstructionsMinusIrqs {
236	const NAME: &'static str = "instructions-minus-irqs:u";
237
238	pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> {
239	let model: CpuModel = hw::CpuModel::detect()?;
240	let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?;
241	let irqs: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Irqs)?;
242	let (start_instructions: u64, start_irqs: u64) = (&instructions, &irqs).read();
243	let start: u64 = start_instructions.wrapping_sub(start_irqs);
244	Ok(InstructionsMinusIrqs {
245	instructions,
246	irqs,
247	start,
248	})
249	}
250
251	#[inline]
252	fn since_start(&self) -> u64 {
253	let (instructions: u64, irqs: u64) = (&self.instructions, &self.irqs).read();
254	instructions.wrapping_sub(irqs).wrapping_sub(self.start)
255	}
256	}
257
258	/// (Experimental) Like [`InstructionsMinusIrqs`] (but using an undocumented `r0420:u` counter).
259	///
260	/// Can be obtained with `Counter::by_name("instructions-minus-r0420:u")`.
261	//
262	// HACK(eddyb) this is a variant of `instructions-minus-irqs:u`, where `r0420`
263	// is subtracted, instead of the usual "hardware interrupts" (aka IRQs).
264	// `r0420` is an undocumented counter on AMD Zen CPUs which appears to count
265	// both hardware interrupts and exceptions (such as page faults), though
266	// it's unclear yet what exactly it's counting (could even be `iret`s).
267	pub struct InstructionsMinusRaw0420(InstructionsMinusIrqs);
268
269	impl InstructionsMinusRaw0420 {
270	const NAME: &'static str = "instructions-minus-r0420:u";
271
272	pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> {
273	let model: CpuModel = hw::CpuModel::detect()?;
274	let instructions: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Instructions)?;
275	let irqs: Counter = hw::Counter::new(&model, counter_type:HwCounterType::Raw0420)?;
276	let (start_instructions: u64, start_irqs: u64) = (&instructions, &irqs).read();
277	let start: u64 = start_instructions.wrapping_sub(start_irqs);
278	Ok(InstructionsMinusRaw0420(InstructionsMinusIrqs {
279	instructions,
280	irqs,
281	start,
282	}))
283	}
284
285	#[inline]
286	fn since_start(&self) -> u64 {
287	self.0.since_start()
288	}
289	}
290
/// Reading of one or more hardware counters in a single operation.
trait HwCounterRead {
    /// The value(s) produced by one read.
    type Output;

    /// Reads the current counter value(s).
    fn read(&self) -> Self::Output;
}
295
/// The kinds of hardware counters that `hw::Counter::new` can set up.
enum HwCounterType {
    /// Counter used by [`Instructions`] (and as the minuend of the
    /// "instructions minus ..." counters).
    Instructions,
    /// Counter subtracted by [`InstructionsMinusIrqs`].
    Irqs,
    /// Counter subtracted by [`InstructionsMinusRaw0420`].
    Raw0420,
}
301
/// Suffix appended to diagnostics that indicate a likely bug in this crate.
const BUG_REPORT_MSG: &str =
    "please report this to https://github.com/rust-lang/measureme/issues/new";
304
305	/// Linux x86_64 implementation based on `perf_event_open` and `rdpmc`.
306	#[cfg(all(target_arch = "x86_64", target_os = "linux"))]
307	mod hw {
308	use memmap2::{Mmap, MmapOptions};
309	use perf_event_open_sys::{bindings::*, perf_event_open};
310	use std::arch::asm;
311	use std::convert::TryInto;
312	use std::error::Error;
313	use std::fs;
314	use std::mem;
315	use std::os::unix::io::FromRawFd;
316
317	pub(super) struct Counter {
318	mmap: Mmap,
319	reg_idx: u32,
320	}
321
322	impl Counter {
323	pub(super) fn new(
324	model: &CpuModel,
325	counter_type: super::HwCounterType,
326	) -> Result<Self, Box<dyn Error + Send + Sync>> {
327	let (type_, hw_id) = match counter_type {
328	super::HwCounterType::Instructions => {
329	(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)
330	}
331	super::HwCounterType::Irqs => (PERF_TYPE_RAW, model.irqs_counter_config()?),
332	super::HwCounterType::Raw0420 => {
333	match model {
334	CpuModel::Amd(AmdGen::Zen) => {}
335
336	_ => really_warn!(
337	"Counter::new: the undocumented `r0420` performance \
338	counter has only been observed on AMD Zen CPUs"
339	),
340	}
341
342	(PERF_TYPE_RAW, `0x04_20`)
343	}
344	};
345	Self::with_type_and_hw_id(type_, hw_id)
346	}
347
348	fn with_type_and_hw_id(
349	type_: perf_type_id,
350	hw_id: u32,
351	) -> Result<Self, Box<dyn Error + Send + Sync>> {
352	let mut attrs = perf_event_attr {
353	size: mem::size_of::<perf_event_attr>().try_into().unwrap(),
354	type_,
355	config: hw_id.into(),
356	..perf_event_attr::default()
357	};
358
359	// Only record same-thread, any CPUs, and only userspace (no kernel/hypervisor).
360	// NOTE(eddyb) `pid = 0`, despite talking about "process id", means
361	// "calling process/thread", not* "any thread in the calling process"*
362	// (i.e. "process" is interchangeable with "main thread of the process")
363	// FIXME(eddyb) introduce per-thread counters and/or use `inherit`
364	// (and `inherit_stat`? though they might not be appropriate here)
365	// to be able to read the counter on more than just the initial thread.
366	let pid = `0`;
367	let cpu = `-1`;
368	let group_fd = `-1`;
369	attrs.set_exclude_kernel(`1`);
370	attrs.set_exclude_hv(`1`);
371
372	let file = unsafe {
373	let fd =
374	perf_event_open(&mut attrs, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC.into());
375	if fd < `0` {
376	Err(std::io::Error::from_raw_os_error(-fd))
377	} else {
378	Ok(fs::File::from_raw_fd(fd))
379	}
380	};
381	let file = file.map_err(\|e\| format!("perf_event_open failed: {:?}", e))?;
382
383	let mmap = unsafe {
384	MmapOptions::new()
385	.len(mem::size_of::<perf_event_mmap_page>())
386	.map(&file)
387	};
388	let mmap = mmap.map_err(\|e\| format!("perf_event_mmap_page: mmap failed: {:?}", e))?;
389
390	let mut counter = Counter { mmap, reg_idx: `0` };
391
392	let (version, compat_version, caps, index, pmc_width) = counter
393	.access_mmap_page_with_seqlock(\|mp\| {
394	(
395	mp.version,
396	mp.compat_version,
397	unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 },
398	mp.index,
399	mp.pmc_width,
400	)
401	});
402
403	info!(
404	"Counter::new: version={} compat_version={} index={:#x}",
405	version, compat_version, index,
406	);
407
408	if caps.cap_user_rdpmc() == `0` {
409	return Err(format!(
410	"perf_event_mmap_page: missing cap_user_rdpmc{}",
411	if caps.cap_bit0_is_deprecated() == `0` && caps.cap_bit0() == `1` {
412	" (ignoring legacy/broken rdpmc support)"
413	} else {
414	""
415	}
416	)
417	.into());
418	}
419
420	if index == `0` {
421	return Err(format!(
422	"perf_event_mmap_page: no allocated hardware register (ran out?)"
423	)
424	.into());
425	}
426	counter.reg_idx = index - `1`;
427
428	if (cfg!(not(accurate_seqlock_rdpmc)) \|\| `true`) && pmc_width != `48` {
429	return Err(format!(
430	"perf_event_mmap_page: {}-bit hardware counter found, only 48-bit supported",
431	pmc_width
432	)
433	.into());
434	}
435
436	Ok(counter)
437	}
438
439	/// Try to access the mmap page, retrying the `attempt` closure as long
440	/// as the "seqlock" sequence number changes (which indicates the kernel
441	/// has updated one or more fields within the mmap page).
442	#[inline]
443	fn access_mmap_page_with_seqlock<T>(
444	&self,
445	attempt: impl Fn(&perf_event_mmap_page) -> T,
446	) -> T {
447	// FIXME(eddyb) it's probably UB to use regular reads, especially
448	// from behind `&T`, with the only synchronization being barriers.
449	// Probably needs atomic reads, and stronger ones at that, for the
450	// `lock` field, than the fields (which would be `Relaxed`?).
451	let mmap_page = unsafe { &(self.mmap.as_ptr() as const perf_event_mmap_page) };
452	let barrier = \|\| std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire);
453
454	loop {
455	// Grab the "seqlock" - the kernel will update this value when it
456	// updates any of the other fields that may be read in `attempt`.
457	let seq_lock = mmap_page.lock;
458	barrier();
459
460	let result = attempt(mmap_page);
461
462	// If nothing has changed, we're done. Otherwise, keep retrying.
463	barrier();
464	if mmap_page.lock == seq_lock {
465	return result;
466	}
467	}
468	}
469	}
470
471	impl super::HwCounterRead for Counter {
472	type Output = u64;
473
474	#[inline]
475	fn read(&self) -> u64 {
476	// HACK(eddyb) keep the accurate code around while not using it,
477	// to minimize overhead without losing the more complex implementation.
478	let (counter, offset, pmc_width) = if cfg!(accurate_seqlock_rdpmc) && `false` {
479	self.access_mmap_page_with_seqlock(\|mp\| {
480	let caps = unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 };
481	assert_ne!(caps.cap_user_rdpmc(), `0`);
482
483	(
484	rdpmc(mp.index.checked_sub(`1`).unwrap()),
485	mp.offset,
486	mp.pmc_width,
487	)
488	})
489	} else {
490	(rdpmc(self.reg_idx), `0`, `48`)
491	};
492
493	let counter = offset + (counter as i64);
494
495	// Sign-extend the `pmc_width`-bit value to `i64`.
496	(counter << (`64` - pmc_width) >> (`64` - pmc_width)) as u64
497	}
498	}
499
500	impl super::HwCounterRead for (&Counter, &Counter) {
501	type Output = (u64, u64);
502
503	#[inline]
504	fn read(&self) -> (u64, u64) {
505	// HACK(eddyb) keep the accurate code around while not using it,
506	// to minimize overhead without losing the more complex implementation.
507	if (cfg!(accurate_seqlock_rdpmc) \|\| cfg!(unserialized_rdpmc)) && `false` {
508	return (self.0.read(), self.1.read());
509	}
510
511	let pmc_width = `48`;
512
513	let (a_counter, b_counter) = rdpmc_pair(self.0.reg_idx, self.1.reg_idx);
514
515	// Sign-extend the `pmc_width`-bit values to `i64`.
516	(
517	((a_counter as i64) << (`64` - pmc_width) >> (`64` - pmc_width)) as u64,
518	((b_counter as i64) << (`64` - pmc_width) >> (`64` - pmc_width)) as u64,
519	)
520	}
521	}
522
523	/// Read the hardware performance counter indicated by `reg_idx`.
524	///
525	/// If the counter is signed, sign extension should be performed based on
526	/// the width of the register (32 to 64 bits, e.g. 48-bit seems common).
527	#[inline(always)]
528	fn rdpmc(reg_idx: u32) -> u64 {
529	// NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`).
530	if cfg!(unserialized_rdpmc) && `false` {
531	// FIXME(eddyb) the Intel and AMD manuals warn about the need for
532	// "serializing instructions" before/after `rdpmc`, if avoiding any
533	// reordering is desired, but do not agree on the full set of usable
534	// "serializing instructions" (e.g. `mfence` isn't listed in both).
535	//
536	// The only usable, and guaranteed to work, "serializing instruction"
537	// appears to be `cpuid`, but it doesn't seem easy to use, especially
538	// due to the overlap in registers with `rdpmc` itself, and it might
539	// have too high of a cost, compared to serialization benefits (if any).
540	unserialized_rdpmc(reg_idx)
541	} else {
542	serialize_instruction_execution();
543	unserialized_rdpmc(reg_idx)
544	}
545	}
546
547	/// Read two hardware performance counters at once (see `rdpmc`).
548	///
549	/// Should be more efficient/accurate than two `rdpmc` calls, as it
550	/// only requires one "serializing instruction", rather than two.
551	#[inline(always)]
552	fn rdpmc_pair(a_reg_idx: u32, b_reg_idx: u32) -> (u64, u64) {
553	serialize_instruction_execution();
554	(unserialized_rdpmc(a_reg_idx), unserialized_rdpmc(b_reg_idx))
555	}
556
557	/// Dummy `cpuid(0)` to serialize instruction execution.
558	#[inline(always)]
559	fn serialize_instruction_execution() {
560	unsafe {
561	asm!(
562	"xor %eax, %eax", // Intel syntax: "xor eax, eax"
563	// LLVM sometimes reserves `ebx` for its internal use, so we need to use
564	// a scratch register for it instead.
565	"mov %rbx, {tmp_rbx:r}", // Intel syntax: "mov {tmp_rbx:r}, rbx"
566	"cpuid",
567	"mov {tmp_rbx:r}, %rbx", // Intel syntax: "mov rbx, {tmp_rbx:r}"
568	tmp_rbx = lateout(reg) _,
569	// `cpuid` clobbers.
570	lateout("eax") _,
571	lateout("edx") _,
572	lateout("ecx") _,
573
574	options(nostack),
575	// Older versions of LLVM do not support modifiers in
576	// Intel syntax inline asm; whenever Rust minimum LLVM version
577	// supports Intel syntax inline asm, remove and replace above
578	// instructions with Intel syntax version (from comments).
579	options(att_syntax),
580	);
581	}
582	}
583
584	/// Read the hardware performance counter indicated by `reg_idx`.
585	///
586	/// If the counter is signed, sign extension should be performed based on
587	/// the width of the register (32 to 64 bits, e.g. 48-bit seems common).
588	#[inline(always)]
589	fn unserialized_rdpmc(reg_idx: u32) -> u64 {
590	let (lo, hi): (u32, u32);
591	unsafe {
592	asm!(
593	"rdpmc",
594	in("ecx") reg_idx,
595	lateout("eax") lo,
596	lateout("edx") hi,
597	options(nostack),
598	// Older versions of LLVM do not support modifiers in
599	// Intel syntax inline asm; whenever Rust minimum LLVM version
600	// supports Intel syntax inline asm, remove and replace above
601	// instructions with Intel syntax version (from comments).
602	options(att_syntax),
603	);
604	}
605	lo as u64 \| (hi as u64) << `32`
606	}
607
608	/// Categorization of `x86_64` CPUs, primarily based on how they
609	/// support for counting "hardware interrupts" (documented or not).
610	pub(super) enum CpuModel {
611	Amd(AmdGen),
612	Intel(IntelGen),
613	}
614
615	pub(super) enum AmdGen {
616	/// K8 (Hammer) to Jaguar / Puma.
617	PreZen,
618
619	/// Zen / Zen+ / Zen 2.
620	Zen,
621
622	/// Unknown AMD CPU, contemporary to/succeeding Zen/Zen+/Zen 2,
623	/// but likely similar to them.
624	UnknownMaybeZenLike,
625	}
626
627	pub(super) enum IntelGen {
628	/// Intel CPU predating Sandy Bridge. These are the only CPUs we
629	/// can't support (more) accurate instruction counting on, as they
630	/// don't (appear to) have any way to count "hardware interrupts".
631	PreBridge,
632
633	/// Sandy Bridge / Ivy Bridge:
634	/// client: Sandy Bridge (M/H) / Ivy Bridge (M/H/Gladden)*
635	/// server: Sandy Bridge (E/EN/EP) / Ivy Bridge (E/EN/EP/EX)*
636	///
637	/// Intel doesn't document support for counting "hardware interrupts"
638	/// prior to Skylake, but testing found that `HW_INTERRUPTS.RECEIVED`
639	/// from Skylake has existed, with the same config, as far back as
640	/// "Sandy Bridge" (but before that it mapped to a different event).
641	///
642	/// These are the (pre-Skylake) Bridge CPU models confirmed so far:*
643	/// Sandy Bridge (client) Family 6 Model 42*
644	/// Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz (@alyssais)
645	/// Ivy Bridge (client) Family 6 Model 58*
646	/// Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz (@eddyb)
647	///
648	/// We later found this paper, which on page 5 lists 12 counters,
649	/// for each of Nehalem/Westmere, Sandy Bridge and Ivy Bridge:
650	/// http://web.eece.maine.edu/~vweaver/projects/deterministic/deterministic_counters.pdf
651	/// It appears that both Sandy Bridge and Ivy Bridge used to have
652	/// `HW_INTERRUPTS.RECEIVED` documented, before Intel removed every
653	/// mention of the counter from newer versions of their manuals.
654	Bridge,
655
656	/// Haswell / Broadwell:
657	/// client: Haswell (S/ULT/GT3e) / Broadwell (U/Y/S/H/C/W)*
658	/// server: Haswell (E/EP/EX) / Broadwell (E/EP/EX/DE/Hewitt Lake)*
659	///
660	/// Equally as undocumented as "Sandy Bridge / Ivy Bridge" (see above).
661	///
662	/// These are the (pre-Skylake) Well CPU models confirmed so far:*
663	/// Haswell (client) Family 6 Model 60*
664	/// Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz (@m-ou-se)
665	/// Haswell (server) Family 6 Model 63*
666	/// Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz (@cuviper)
667	/// Haswell (client + GT3e) Family 6 Model 70*
668	/// Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz (@nagisa)
669	/// Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz (@m-ou-se)
670	Well,
671
672	/// Skylake / Skylake-derived:
673	/// client: Skylake (Y/U/DT/H/S) / Kaby Lake (Y/U/DT/H/S/X) / Coffee Lake (U/S/H/E)*
674	/// server: Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)*
675	///
676	/// Both "client" and "server" product lines have documented support
677	/// for counting "hardware interrupts" (`HW_INTERRUPTS.RECEIVED`).
678	///
679	/// Intel does not make it clear that future product lines, such as
680	/// "Ice Lake", will continue to support this (or with what config),
681	/// and even "Comet Lake" (aka "10th gen") isn't explicitly listed.
682	Lake,
683
684	/// Unknown Intel CPU, contemporary to/succeeding Bridge/Well/Lake,*
685	/// but likely similar to them.
686	UnknownMaybeLakeLike,
687	}
688
689	impl CpuModel {
690	/// Detect the model of the current CPU using `cpuid`.
691	pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {
692	let cpuid0 = unsafe { std::arch::x86_64::__cpuid(`0`) };
693	let cpuid1 = unsafe { std::arch::x86_64::__cpuid(`1`) };
694	let mut vendor = [`0`; `12`];
695	vendor[`0`..`4`].copy_from_slice(&cpuid0.ebx.to_le_bytes());
696	vendor[`4`..`8`].copy_from_slice(&cpuid0.edx.to_le_bytes());
697	vendor[`8`..`12`].copy_from_slice(&cpuid0.ecx.to_le_bytes());
698
699	let vendor = std::str::from_utf8(&vendor).map_err(\|_\| {
700	format!(
701	"cpuid returned non-UTF-8 vendor name: cpuid(0)={:?} cpuid(1)={:?}",
702	cpuid0, cpuid1
703	)
704	})?;
705
706	let version = cpuid1.eax;
707
708	let mut family = (version >> `8`) & `0xf`;
709	if family == `15` {
710	// Extended family.
711	family += (version >> `20`) & `0xff`;
712	}
713
714	let mut model = (version >> `4`) & `0xf`;
715	if family >= `15` \|\| vendor == "GenuineIntel" && family == `6` {
716	// Extended model.
717	model += ((version >> `16`) & `0xf`) << `4`;
718	}
719
720	info!(
721	"CpuModel::detect: vendor={:?} family={} model={}",
722	vendor, family, model
723	);
724
725	match vendor {
726	"AuthenticAMD" => {
727	use self::AmdGen::*;
728
729	let (gen, name) = match (family, model) {
730	(`0`..=`14`, _) \| (`19`, _) => {
731	return Err(format!(
732	"impossible AMD64 CPU detected (Family {} Model {}); {}",
733	family,
734	model,
735	super::BUG_REPORT_MSG
736	)
737	.into());
738	}
739
740	(`15`, _) => (PreZen, "K8 (Hammer)"),
741	(`16`, _) => (PreZen, "K10 (Barcelona/Shanghai/Istanbul)"),
742	(`17`, _) => (PreZen, "K8+K10 hybrid (Turion X2 Ultra)"),
743	(`18`, _) => (PreZen, "Fusion"),
744	(`20`, _) => (PreZen, "Bobcat"),
745	(`21`, _) => (PreZen, "Bulldozer / Piledriver / Steamroller / Excavator"),
746	(`22`, _) => (PreZen, "Jaguar / Puma"),
747
748	(`23`, `1`) => (Zen, "Zen (Naples/Whitehaven/Summit Ridge/Snowy Owl)"),
749	(`23`, `17`) => (Zen, "Zen (Raven Ridge)"),
750	(`23`, `24`) => (Zen, "Zen (Banded Kestrel/Dali) / Zen+ (Picasso)"),
751	(`23`, `8`) => (Zen, "Zen+ (Pinnacle Ridge)"),
752	(`23`, `49`) => (Zen, "Zen 2 (Rome/Castle Peak)"),
753	(`23`, `113`) => (Zen, "Zen 2 (Matisse)"),
754
755	(`23`..=`0xffff_ffff`, _) => {
756	really_warn!(
757	"CpuModel::detect: unknown AMD CPU (Family {} Model {}), \
758	assuming Zen-like; {}",
759	family,
760	model,
761	super::BUG_REPORT_MSG
762	);
763
764	(UnknownMaybeZenLike, "")
765	}
766	};
767
768	if !name.is_empty() {
769	info!("CpuModel::detect: known AMD CPU: {}", name);
770	}
771
772	// The `SpecLockMap` (speculative atomic aka `lock` instruction
773	// execution, unclear what "Map" refers to) feature in AMD Zen CPUs
774	// causes non-deterministic overcounting of atomic instructions,
775	// presumably whenever it has to roll back the speculation
776	// (as in, the performance counters aren't rolled back).
                    // Even though this may be rare when uncontended, it adds up.
778	//
779	// There is an MSR bit (`MSRC001_1020[54]`) that's not officially
780	// documented, but which several motherboards and profiling tools
781	// set whenever IBS (Instruction-Based Sampling) is in use, and
782	// it is sometimes referred to as "disabling `SpecLockMap`"
783	// (hence having a name for the feature that speculates `lock`s).
784	//
785	// One way we could detect that the bit has been set would be to
786	// parse `uname().release` (aka `uname -r`) and look for versions
787	// which are known to include the patch suggested in this thread:
788	// https://github.com/mozilla/rr/issues/2034#issuecomment-693761247
789	//
790	// However, one may set the bit using e.g. `wrmsr`, even on older
791	// kernels, so a more reliable approach is to execute some atomics
792	// and look at the `SpecLockMapCommit` (`r0825:u`) Zen counter,
793	// which only reliably remains `0` when `SpecLockMap` is disabled.
794	if matches!(gen, Zen \| UnknownMaybeZenLike) {
795	if let Ok(spec_lock_map_commit) =
796	Counter::with_type_and_hw_id(PERF_TYPE_RAW, `0x08_25`)
797	{
798	use super::HwCounterRead;
799
800	let start_spec_lock_map_commit = spec_lock_map_commit.read();
801
802	// Execute an atomic (`lock`) instruction, which should
803	// start speculative execution for following instructions
804	// (as long as `SpecLockMap` isn't disabled).
805	let mut atomic: u64 = `0`;
806	let mut _tmp: u64 = `0`;
807	unsafe {
808	asm!(
809	// Intel syntax: "lock xadd [{atomic}], {tmp}"
810	"lock xadd {tmp}, ({atomic})",
811
812	atomic = in(reg) &mut atomic,
813	tmp = inout(reg) _tmp,
814
815	// Older versions of LLVM do not support modifiers in
816	// Intel syntax inline asm; whenever Rust minimum LLVM
817	// version supports Intel syntax inline asm, remove
818	// and replace above instructions with Intel syntax
819	// version (from comments).
820	options(att_syntax),
821	);
822	}
823
824	if spec_lock_map_commit.read() != start_spec_lock_map_commit {
825	really_warn!(
826	"CpuModel::detect: SpecLockMap detected, in AMD {} CPU; \
827	this may add some non-deterministic noise - \
828	for information on disabling SpecLockMap, see \
829	https://github.com/mozilla/rr/wiki/Zen",
830	name
831	);
832	}
833	}
834	}
835
836	Ok(CpuModel::Amd(gen))
837	}
838
839	"GenuineIntel" => {
840	use self::IntelGen::*;
841
842	let (gen, name) = match (family, model) {
843	// No need to name these, they're unsupported anyway.
844	(`0`..=`5`, _) => (PreBridge, ""),
845	(`15`, _) => (PreBridge, "Netburst"),
846	(`6`, `0`..=`41`) => (PreBridge, ""),
847
848	// Older Xeon Phi CPUs, misplaced in Family 6.
849	(`6`, `87`) => (PreBridge, "Knights Landing"),
850	(`6`, `133`) => (PreBridge, "Knights Mill"),
851
852	// Older Atom CPUs, interleaved with other CPUs.
853	// FIXME(eddyb) figure out if these are like Bridge/Well.
854	(`6`, `53`) \| (`6`, `54`) => (PreBridge, "Saltwell"),
855	(`6`, `55`) \| (`6`, `74`) \| (`6`, `77`) \| (`6`, `90`) \| (`6`, `93`) => {
856	(PreBridge, "Silvermont")
857	}
858	(`6`, `76`) => (PreBridge, "Airmont (Cherry Trail/Braswell)"),
859
860	// Older server CPUs, numbered out of order.
861	(`6`, `44`) => (PreBridge, "Westmere (Gulftown/EP)"),
862	(`6`, `46`) => (PreBridge, "Nehalem (EX)"),
863	(`6`, `47`) => (PreBridge, "Westmere (EX)"),
864
865	(`6`, `42`) => (Bridge, "Sandy Bridge (M/H)"),
866	(`6`, `45`) => (Bridge, "Sandy Bridge (E/EN/EP)"),
867	(`6`, `58`) => (Bridge, "Ivy Bridge (M/H/Gladden)"),
868	(`6`, `62`) => (Bridge, "Ivy Bridge (E/EN/EP/EX)"),
869
870	(`6`, `60`) => (Well, "Haswell (S)"),
871	(`6`, `61`) => (Well, "Broadwell (U/Y/S)"),
872	(`6`, `63`) => (Well, "Haswell (E/EP/EX)"),
873	(`6`, `69`) => (Well, "Haswell (ULT)"),
874	(`6`, `70`) => (Well, "Haswell (GT3e)"),
875	(`6`, `71`) => (Well, "Broadwell (H/C/W)"),
876	(`6`, `79`) => (Well, "Broadwell (E/EP/EX)"),
877	(`6`, `86`) => (Well, "Broadwell (DE/Hewitt Lake)"),
878
879	(`6`, `78`) => (Lake, "Skylake (Y/U)"),
880	(`6`, `85`) => (Lake, "Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)"),
881	(`6`, `94`) => (Lake, "Skylake (DT/H/S)"),
882	(`6`, `142`) => (Lake, "Kaby Lake (Y/U) / Coffee Lake (U)"),
883	(`6`, `158`) => (Lake, "Kaby Lake (DT/H/S/X) / Coffee Lake (S/H/E)"),
884
885	(`6`..=`14`, _) \| (`16`..=`0xffff_ffff`, _) => {
886	really_warn!(
887	"CpuModel::detect: unknown Intel CPU (Family {} Model {}), \
888	assuming Skylake-like; {}",
889	family,
890	model,
891	super::BUG_REPORT_MSG
892	);
893
894	(UnknownMaybeLakeLike, "")
895	}
896	};
897
898	if !name.is_empty() {
899	info!("CpuModel::detect: known Intel CPU: {}", name);
900	}
901
902	Ok(CpuModel::Intel(gen))
903	}
904
905	_ => Err(format!(
906	"cpuid returned unknown CPU vendor {:?}; version={:#x}",
907	vendor, version
908	)
909	.into()),
910	}
911	}
912
913	/// Return the hardware performance counter configuration for
914	/// counting "hardware interrupts" (documented or not).
915	fn irqs_counter_config(&self) -> Result<u32, Box<dyn Error + Send + Sync>> {
916	match self {
917	CpuModel::Amd(model) => match model {
918	AmdGen::PreZen => Ok(`0x00_cf`),
919	AmdGen::Zen \| AmdGen::UnknownMaybeZenLike => Ok(`0x00_2c`),
920	},
921	CpuModel::Intel(model) => match model {
922	IntelGen::PreBridge => Err(format!(
923	"counting IRQs not yet supported on Intel CPUs \
924	predating Sandy Bridge; {}",
925	super::BUG_REPORT_MSG
926	)
927	.into()),
928	IntelGen::Bridge
929	\| IntelGen::Well
930	\| IntelGen::Lake
931	\| IntelGen::UnknownMaybeLakeLike => Ok(`0x01_cb`),
932	},
933	}
934	}
935	}
936	}
937
/// Fallback `hw` module for every target other than `x86_64` Linux.
///
/// Both `Counter` and `CpuModel` are empty enums here, so no value of
/// either type can ever exist: `CpuModel::detect` always returns an error,
/// and the remaining methods only need an empty `match` as their body.
#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))]
mod hw {
    use std::error::Error;

    /// Uninhabited counterpart of the hardware counter type.
    pub(super) enum Counter {}

    impl Counter {
        /// Can never actually be called, as it requires a `CpuModel`
        /// reference and `CpuModel` has no values on this target.
        pub(super) fn new(
            model: &CpuModel,
            _counter_type: super::HwCounterType,
        ) -> Result<Self, Box<dyn Error + Send + Sync>> {
            match *model {}
        }
    }

    impl super::HwCounterRead for Counter {
        type Output = u64;

        #[inline]
        fn read(&self) -> Self::Output {
            // `Counter` has no variants, so the empty match covers everything.
            match *self {}
        }
    }

    impl super::HwCounterRead for (&Counter, &Counter) {
        type Output = (u64, u64);

        #[inline]
        fn read(&self) -> Self::Output {
            // Either element would do; both point at an uninhabited type.
            let (first, _second) = *self;
            match *first {}
        }
    }

    /// Uninhabited counterpart of the CPU model type.
    pub(super) enum CpuModel {}

    impl CpuModel {
        /// Always fails, with a message listing each reason (architecture
        /// and/or OS) why hardware counters are unsupported on this target,
        /// separated by `"; "`.
        pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {
            // HACK(eddyb) mark `really_warn!` (and transitively `log` macros)
            // and `BUG_REPORT_MSG` as "used" to silence warnings.
            if false {
                really_warn!("unsupported; {}", super::BUG_REPORT_MSG);
            }

            // Gather every applicable reason, in a fixed order
            // (architecture first, then OS).
            let mut reasons = Vec::new();
            if cfg!(not(target_arch = "x86_64")) {
                reasons.push("only supported architecture is x86_64");
            }
            if cfg!(not(target_os = "linux")) {
                reasons.push("only supported OS is Linux");
            }

            Err(reasons.join("; ").into())
        }
    }
}
1001