1//! Implement syscalls using the vDSO.
2//!
3//! <https://man7.org/linux/man-pages/man7/vdso.7.html>
4//!
5//! # Safety
6//!
7//! Similar to syscalls.rs, this file performs raw system calls, and sometimes
8//! passes them uninitialized memory buffers. This file also calls vDSO
9//! functions.
10#![allow(unsafe_code)]
11
12#[cfg(target_arch = "x86")]
13use super::reg::{ArgReg, RetReg, SyscallNumber, A0, A1, A2, A3, A4, A5, R0};
14use super::vdso;
15#[cfg(target_arch = "x86")]
16use core::arch::global_asm;
17#[cfg(feature = "process")]
18#[cfg(any(
19 target_arch = "x86_64",
20 target_arch = "x86",
21 target_arch = "riscv64",
22 target_arch = "powerpc64"
23))]
24use core::ffi::c_void;
25use core::mem::transmute;
26use core::ptr::null_mut;
27use core::sync::atomic::AtomicPtr;
28use core::sync::atomic::Ordering::Relaxed;
29#[cfg(target_pointer_width = "32")]
30#[cfg(feature = "time")]
31use linux_raw_sys::general::timespec as __kernel_old_timespec;
32#[cfg(any(
33 all(
34 feature = "process",
35 any(
36 target_arch = "x86_64",
37 target_arch = "x86",
38 target_arch = "riscv64",
39 target_arch = "powerpc64"
40 )
41 ),
42 feature = "time"
43))]
44use {super::c, super::conv::ret, core::mem::MaybeUninit};
45#[cfg(feature = "time")]
46use {
47 super::conv::c_int,
48 crate::clockid::{ClockId, DynamicClockId},
49 crate::io,
50 crate::timespec::Timespec,
51 linux_raw_sys::general::{__kernel_clockid_t, __kernel_timespec},
52};
53
/// Read the current time of `which_clock` through the cached `clock_gettime`
/// entry point.
///
/// The entry point is loaded from `CLOCK_GETTIME`; if it has not been set up
/// yet, `init_clock_gettime` initializes it first.
///
/// # Panics
///
/// Panics if the callee reports a nonzero status, so that the result buffer
/// is never read while it may still be uninitialized.
#[cfg(feature = "time")]
#[inline]
pub(crate) fn clock_gettime(which_clock: ClockId) -> __kernel_timespec {
    // SAFETY: `CLOCK_GETTIME` contains either null or the address of a
    // function with an ABI like libc `clock_gettime`, and calling it has the
    // side effect of writing to the result buffer, and no others.
    unsafe {
        let mut result = MaybeUninit::<__kernel_timespec>::uninit();
        // A null pointer transmutes to `None`, which means "not initialized
        // yet".
        let loaded =
            transmute::<*mut Function, Option<ClockGettimeType>>(CLOCK_GETTIME.load(Relaxed));
        let callee = match loaded {
            Some(callee) => callee,
            None => init_clock_gettime(),
        };
        let r0 = callee(which_clock as c::c_int, result.as_mut_ptr());
        // The `ClockId` enum only contains clocks which never fail. It may be
        // tempting to change this to `debug_assert_eq`, however they can still
        // fail on uncommon kernel configs, so we leave this in place to ensure
        // that we don't execute undefined behavior if they ever do fail.
        assert_eq!(r0, 0);
        result.assume_init()
    }
}
75
/// Read the current time of a clock that is only known at run time.
///
/// Returns `Err(io::Errno::INVAL)` when the cached entry point reports
/// `-EINVAL`. For any other nonzero status the raw system call is retried
/// and its result is propagated.
#[cfg(feature = "time")]
#[inline]
pub(crate) fn clock_gettime_dynamic(which_clock: DynamicClockId<'_>) -> io::Result<Timespec> {
    // Translate the requested clock into the raw kernel clock id.
    let raw_id: __kernel_clockid_t = match which_clock {
        DynamicClockId::Known(known) => known as __kernel_clockid_t,
        DynamicClockId::RealtimeAlarm => c::CLOCK_REALTIME_ALARM as __kernel_clockid_t,
        DynamicClockId::Tai => c::CLOCK_TAI as __kernel_clockid_t,
        DynamicClockId::Boottime => c::CLOCK_BOOTTIME as __kernel_clockid_t,
        DynamicClockId::BoottimeAlarm => c::CLOCK_BOOTTIME_ALARM as __kernel_clockid_t,
        DynamicClockId::Dynamic(fd) => {
            // See `FD_TO_CLOCKID` in Linux's `clock_gettime` documentation.
            use crate::backend::fd::AsRawFd;
            const CLOCKFD: i32 = 3;
            let inverted_fd = !fd.as_raw_fd();
            ((inverted_fd << 3) | CLOCKFD) as __kernel_clockid_t
        }
    };

    // SAFETY: `CLOCK_GETTIME` contains either null or the address of a
    // function with an ABI like libc `clock_gettime`, and calling it has the
    // side effect of writing to the result buffer, and no others.
    unsafe {
        const NEG_EINVAL: c::c_int = -(c::EINVAL as c::c_int);

        let mut out = MaybeUninit::<Timespec>::uninit();
        let func: ClockGettimeType = match transmute(CLOCK_GETTIME.load(Relaxed)) {
            Some(func) => func,
            None => init_clock_gettime(),
        };

        let status = func(raw_id, out.as_mut_ptr());
        if status == NEG_EINVAL {
            return Err(io::Errno::INVAL);
        }
        if status != 0 {
            // Any other failure: retry with the real system call and report
            // whatever it says.
            _rustix_clock_gettime_via_syscall(raw_id, out.as_mut_ptr())?;
        }
        Ok(out.assume_init())
    }
}
113
/// Return the number of the CPU the calling thread is running on, using the
/// cached `getcpu` entry point.
///
/// The entry point is loaded from `GETCPU`; if it has not been set up yet,
/// `init_getcpu` initializes it first. Only the CPU number is requested; the
/// node and cache arguments are passed as null.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
#[inline]
pub(crate) fn sched_getcpu() -> usize {
    // SAFETY: `GETCPU` contains either null or the address of a function with
    // an ABI like libc `getcpu`, and calling it has the side effect of writing
    // to the result buffers, and no others.
    unsafe {
        let mut cpu = MaybeUninit::<u32>::uninit();
        // A null pointer transmutes to `None`, which means "not initialized
        // yet".
        let loaded = transmute::<*mut Function, Option<GetcpuType>>(GETCPU.load(Relaxed));
        let callee = match loaded {
            Some(callee) => callee,
            None => init_getcpu(),
        };
        let r0 = callee(cpu.as_mut_ptr(), null_mut(), null_mut());
        debug_assert_eq!(r0, 0);
        cpu.assume_init() as usize
    }
}
137
/// System call wrappers for 32-bit x86 that go through the entry point
/// cached in `SYSCALL` instead of issuing the trap instruction inline.
#[cfg(target_arch = "x86")]
pub(super) mod x86_via_vdso {
    use super::{transmute, ArgReg, Relaxed, RetReg, SyscallNumber, A0, A1, A2, A3, A4, A5, R0};
    use crate::backend::arch::asm;

    /// Fetch the cached system call entry point, initializing it on first
    /// use (a null pointer transmutes to `None`).
    #[inline]
    unsafe fn entry_point() -> super::SyscallType {
        match transmute(super::SYSCALL.load(Relaxed)) {
            Some(entry) => entry,
            None => super::init_syscall(),
        }
    }

    /// Indirect system call with no arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall0(nr: SyscallNumber<'_>) -> RetReg<R0> {
        asm::indirect_syscall0(entry_point(), nr)
    }

    /// Indirect system call with one argument.
    #[inline]
    pub(in crate::backend) unsafe fn syscall1<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
    ) -> RetReg<R0> {
        asm::indirect_syscall1(entry_point(), nr, a0)
    }

    /// Indirect system call with one argument that never returns.
    #[inline]
    pub(in crate::backend) unsafe fn syscall1_noreturn<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
    ) -> ! {
        asm::indirect_syscall1_noreturn(entry_point(), nr, a0)
    }

    /// Indirect system call with two arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall2<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
    ) -> RetReg<R0> {
        asm::indirect_syscall2(entry_point(), nr, a0, a1)
    }

    /// Indirect system call with three arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall3<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
    ) -> RetReg<R0> {
        asm::indirect_syscall3(entry_point(), nr, a0, a1, a2)
    }

    /// Indirect system call with four arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall4<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
        a3: ArgReg<'a, A3>,
    ) -> RetReg<R0> {
        asm::indirect_syscall4(entry_point(), nr, a0, a1, a2, a3)
    }

    /// Indirect system call with five arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall5<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
        a3: ArgReg<'a, A3>,
        a4: ArgReg<'a, A4>,
    ) -> RetReg<R0> {
        asm::indirect_syscall5(entry_point(), nr, a0, a1, a2, a3, a4)
    }

    /// Indirect system call with six arguments.
    #[inline]
    pub(in crate::backend) unsafe fn syscall6<'a>(
        nr: SyscallNumber<'a>,
        a0: ArgReg<'a, A0>,
        a1: ArgReg<'a, A1>,
        a2: ArgReg<'a, A2>,
        a3: ArgReg<'a, A3>,
        a4: ArgReg<'a, A4>,
        a5: ArgReg<'a, A5>,
    ) -> RetReg<R0> {
        asm::indirect_syscall6(entry_point(), nr, a0, a1, a2, a3, a4, a5)
    }

    // With the indirect call, it isn't meaningful to do a separate
    // `_readonly` optimization.
    #[allow(unused_imports)]
    pub(in crate::backend) use {
        syscall0 as syscall0_readonly, syscall1 as syscall1_readonly,
        syscall2 as syscall2_readonly, syscall3 as syscall3_readonly,
        syscall4 as syscall4_readonly, syscall5 as syscall5_readonly,
        syscall6 as syscall6_readonly,
    };
}
261
/// Signature of the `clock_gettime` implementation whose address is kept in
/// `CLOCK_GETTIME`: it takes a clock id and a pointer to the `Timespec` to
/// fill in, and returns a C `int` status. Callers in this file treat `0` as
/// success and compare failures against negated errno values.
#[cfg(feature = "time")]
type ClockGettimeType = unsafe extern "C" fn(c::c_int, *mut Timespec) -> c::c_int;

/// Signature of the `getcpu` implementation whose address is kept in
/// `GETCPU`: pointers for the CPU number, the node number, and a third
/// untyped argument, returning a C `int` status.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
type GetcpuType = unsafe extern "C" fn(*mut u32, *mut u32, *mut c_void) -> c::c_int;

/// The underlying syscall functions are only called from asm, using the
/// special syscall calling convention to pass arguments and return values,
/// which the signature here doesn't reflect.
#[cfg(target_arch = "x86")]
pub(super) type SyscallType = unsafe extern "C" fn();
279
/// Initialize `CLOCK_GETTIME` and return its value.
///
/// This is the slow path taken when the cached pointer is still null; it is
/// marked `#[cold]` to keep it out of the callers' fast path.
#[cfg(feature = "time")]
#[cold]
fn init_clock_gettime() -> ClockGettimeType {
    init();
    // SAFETY: Load the function address from static storage that we just
    // initialized.
    unsafe { transmute::<*mut Function, ClockGettimeType>(CLOCK_GETTIME.load(Relaxed)) }
}
289
/// Initialize `GETCPU` and return its value.
///
/// This is the slow path taken when the cached pointer is still null; it is
/// marked `#[cold]` to keep it out of the callers' fast path.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
#[cold]
fn init_getcpu() -> GetcpuType {
    init();
    // SAFETY: Load the function address from static storage that we just
    // initialized.
    unsafe { transmute::<*mut Function, GetcpuType>(GETCPU.load(Relaxed)) }
}
305
/// Run the one-time initialization and hand back the system call entry
/// point now stored in `SYSCALL`.
#[cfg(target_arch = "x86")]
#[cold]
fn init_syscall() -> SyscallType {
    init();
    // SAFETY: `init` has just stored a function address in `SYSCALL`, so the
    // pointer read back here can be reinterpreted as that function.
    unsafe {
        let entry = SYSCALL.load(Relaxed);
        transmute(entry)
    }
}
315
/// `AtomicPtr` can't hold a `fn` pointer, so we use a `*` pointer to this
/// placeholder type, and cast it as needed.
struct Function;

// The cached entry points below are plain `static`s rather than `static mut`:
// `AtomicPtr` already provides interior mutability, and every access in this
// file goes through `load`, `store`, or `compare_exchange`, which only need a
// shared reference. A null value means "not initialized yet".

/// Cached address of the `clock_gettime` implementation.
#[cfg(feature = "time")]
static CLOCK_GETTIME: AtomicPtr<Function> = AtomicPtr::new(null_mut());

/// Cached address of the `getcpu` implementation.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
static GETCPU: AtomicPtr<Function> = AtomicPtr::new(null_mut());

/// Cached address of the x86 system call entry point.
#[cfg(target_arch = "x86")]
static SYSCALL: AtomicPtr<Function> = AtomicPtr::new(null_mut());
331
/// Fallback `clock_gettime` with the same C ABI as the vDSO entry point,
/// implemented with a real system call.
///
/// Returns `0` on success, or the negated raw OS error code on failure.
#[cfg(feature = "time")]
unsafe extern "C" fn rustix_clock_gettime_via_syscall(
    clockid: c::c_int,
    res: *mut Timespec,
) -> c::c_int {
    match _rustix_clock_gettime_via_syscall(clockid, res) {
        Ok(()) => 0,
        Err(err) => err.raw_os_error().wrapping_neg(),
    }
}
342
/// `clock_gettime` via a real system call on 32-bit targets.
///
/// Tries `clock_gettime64` first; if the kernel answers `ENOSYS`, falls back
/// to the legacy `clock_gettime` call and widens its result.
#[cfg(feature = "time")]
#[cfg(target_pointer_width = "32")]
unsafe fn _rustix_clock_gettime_via_syscall(
    clockid: c::c_int,
    res: *mut Timespec,
) -> io::Result<()> {
    let outcome = ret(syscall!(__NR_clock_gettime64, c_int(clockid), res));
    if let Err(io::Errno::NOSYS) = outcome {
        return _rustix_clock_gettime_via_syscall_old(clockid, res);
    }
    outcome
}
355
/// Legacy `clock_gettime` system call on 32-bit targets, with its old-style
/// `timespec` result converted into a `Timespec` written to `res`.
///
/// `res` is only written when the system call succeeds.
#[cfg(feature = "time")]
#[cfg(target_pointer_width = "32")]
unsafe fn _rustix_clock_gettime_via_syscall_old(
    clockid: c::c_int,
    res: *mut Timespec,
) -> io::Result<()> {
    // Ordinarily `rustix` doesn't like to emulate system calls, but in the
    // case of time APIs, it's specific to Linux, specific to 32-bit
    // architectures *and* specific to old kernel versions, and it's not that
    // hard to fix up here, so that no other code needs to worry about this.
    let mut legacy = MaybeUninit::<__kernel_old_timespec>::uninit();
    ret(syscall!(__NR_clock_gettime, c_int(clockid), &mut legacy))?;

    // The call succeeded, so the kernel has filled in the buffer.
    let legacy = legacy.assume_init();
    *res = Timespec {
        tv_sec: legacy.tv_sec.into(),
        tv_nsec: legacy.tv_nsec.into(),
    };
    Ok(())
}
380
/// `clock_gettime` via a real system call on 64-bit targets, writing the
/// result to `res`.
#[cfg(feature = "time")]
#[cfg(target_pointer_width = "64")]
unsafe fn _rustix_clock_gettime_via_syscall(
    clockid: c::c_int,
    res: *mut Timespec,
) -> io::Result<()> {
    ret(syscall!(__NR_clock_gettime, c_int(clockid), res))
}
389
/// Fallback `getcpu` with the same C ABI as the vDSO entry point,
/// implemented with a real system call.
///
/// Returns `0` on success, or the negated raw OS error code on failure.
#[cfg(feature = "process")]
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "riscv64",
    target_arch = "powerpc64"
))]
unsafe extern "C" fn rustix_getcpu_via_syscall(
    cpu: *mut u32,
    node: *mut u32,
    unused: *mut c_void,
) -> c::c_int {
    match ret(syscall!(__NR_getcpu, cpu, node, unused)) {
        Ok(()) => 0,
        Err(err) => err.raw_os_error().wrapping_neg(),
    }
}
407
// Declaration of the assembly-defined fallback system call entry point for
// 32-bit x86; `minimal_init` stores its address in `SYSCALL`.
#[cfg(target_arch = "x86")]
extern "C" {
    /// A symbol pointing to an `int 0x80` instruction. This “function” is only
    /// called from assembly, and only with the x86 syscall calling convention,
    /// so its signature here is not its true signature.
    ///
    /// This extern block and the `global_asm!` below can be replaced with
    /// `#[naked]` if it's stabilized.
    fn rustix_int_0x80();
}
418
// Define the `rustix_int_0x80` symbol: an `int 0x80` instruction followed by
// `ret`. The symbol is emitted as weak and hidden, in its own
// `.text.rustix_int_0x80` section, with CFI start/end directives around the
// two instructions.
#[cfg(target_arch = "x86")]
global_asm!(
    r#"
 .section .text.rustix_int_0x80,"ax",@progbits
 .p2align 4
 .weak rustix_int_0x80
 .hidden rustix_int_0x80
 .type rustix_int_0x80, @function
rustix_int_0x80:
 .cfi_startproc
 int 0x80
 ret
 .cfi_endproc
 .size rustix_int_0x80, .-rustix_int_0x80
"#
);
435
/// Install the system-call-based fallbacks into every cached entry point
/// that is still null.
///
/// Entries that already hold a value are left untouched, so calling this
/// more than once is harmless.
fn minimal_init() {
    // SAFETY: Store default function addresses in static storage so that if we
    // end up making any system calls while we read the vDSO, they'll work. If
    // the memory happens to already be initialized, this is redundant, but not
    // harmful.
    unsafe {
        #[cfg(feature = "time")]
        {
            let fallback = rustix_clock_gettime_via_syscall as *mut Function;
            // A failed exchange just means a value is already present.
            let _ = CLOCK_GETTIME.compare_exchange(null_mut(), fallback, Relaxed, Relaxed);
        }

        #[cfg(feature = "process")]
        #[cfg(any(
            target_arch = "x86_64",
            target_arch = "x86",
            target_arch = "riscv64",
            target_arch = "powerpc64"
        ))]
        {
            let fallback = rustix_getcpu_via_syscall as *mut Function;
            let _ = GETCPU.compare_exchange(null_mut(), fallback, Relaxed, Relaxed);
        }

        #[cfg(target_arch = "x86")]
        {
            let fallback = rustix_int_0x80 as *mut Function;
            let _ = SYSCALL.compare_exchange(null_mut(), fallback, Relaxed, Relaxed);
        }
    }
}
485
486fn init() {
487 minimal_init();
488
489 if let Some(vdso) = vdso::Vdso::new() {
490 #[cfg(feature = "time")]
491 {
492 // Look up the platform-specific `clock_gettime` symbol as
493 // documented [here], except on 32-bit platforms where we look up
494 // the `64`-suffixed variant and fail if we don't find it.
495 //
496 // [here]: https://man7.org/linux/man-pages/man7/vdso.7.html
497 #[cfg(target_arch = "x86_64")]
498 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime"));
499 #[cfg(target_arch = "arm")]
500 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime64"));
501 #[cfg(target_arch = "aarch64")]
502 let ptr = vdso.sym(cstr!("LINUX_2.6.39"), cstr!("__kernel_clock_gettime"));
503 #[cfg(target_arch = "x86")]
504 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime64"));
505 #[cfg(target_arch = "riscv64")]
506 let ptr = vdso.sym(cstr!("LINUX_4.15"), cstr!("__vdso_clock_gettime"));
507 #[cfg(target_arch = "powerpc64")]
508 let ptr = vdso.sym(cstr!("LINUX_2.6.15"), cstr!("__kernel_clock_gettime"));
509 #[cfg(any(target_arch = "mips", target_arch = "mips32r6"))]
510 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime64"));
511 #[cfg(any(target_arch = "mips64", target_arch = "mips64r6"))]
512 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_clock_gettime"));
513
514 // On all 64-bit platforms, the 64-bit `clock_gettime` symbols are
515 // always available.
516 #[cfg(target_pointer_width = "64")]
517 let ok = true;
518
519 // On some 32-bit platforms, the 64-bit `clock_gettime` symbols are
520 // not available on older kernel versions.
521 #[cfg(any(
522 target_arch = "arm",
523 target_arch = "mips",
524 target_arch = "mips32r6",
525 target_arch = "x86"
526 ))]
527 let ok = !ptr.is_null();
528
529 if ok {
530 assert!(!ptr.is_null());
531
532 // SAFETY: Store the computed function addresses in static
533 // storage so that we don't need to compute it again (but if
534 // we do, it doesn't hurt anything).
535 unsafe {
536 CLOCK_GETTIME.store(ptr.cast(), Relaxed);
537 }
538 }
539 }
540
541 #[cfg(feature = "process")]
542 #[cfg(any(
543 target_arch = "x86_64",
544 target_arch = "x86",
545 target_arch = "riscv64",
546 target_arch = "powerpc64"
547 ))]
548 {
549 // Look up the platform-specific `getcpu` symbol as documented
550 // [here].
551 //
552 // [here]: https://man7.org/linux/man-pages/man7/vdso.7.html
553 #[cfg(target_arch = "x86_64")]
554 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_getcpu"));
555 #[cfg(target_arch = "x86")]
556 let ptr = vdso.sym(cstr!("LINUX_2.6"), cstr!("__vdso_getcpu"));
557 #[cfg(target_arch = "riscv64")]
558 let ptr = vdso.sym(cstr!("LINUX_4.15"), cstr!("__kernel_getcpu"));
559 #[cfg(target_arch = "powerpc64")]
560 let ptr = vdso.sym(cstr!("LINUX_2.6.15"), cstr!("__kernel_getcpu"));
561
562 #[cfg(any(
563 target_arch = "x86_64",
564 target_arch = "riscv64",
565 target_arch = "powerpc64"
566 ))]
567 let ok = true;
568
569 // On 32-bit x86, the symbol doesn't appear present sometimes.
570 #[cfg(target_arch = "x86")]
571 let ok = !ptr.is_null();
572
573 #[cfg(any(
574 target_arch = "aarch64",
575 target_arch = "arm",
576 target_arch = "mips",
577 target_arch = "mips32r6",
578 target_arch = "mips64",
579 target_arch = "mips64r6"
580 ))]
581 let ok = false;
582
583 if ok {
584 assert!(!ptr.is_null());
585
586 // SAFETY: Store the computed function addresses in static
587 // storage so that we don't need to compute it again (but if
588 // we do, it doesn't hurt anything).
589 unsafe {
590 GETCPU.store(ptr.cast(), Relaxed);
591 }
592 }
593 }
594
595 // On x86, also look up the vsyscall entry point.
596 #[cfg(target_arch = "x86")]
597 {
598 let ptr = vdso.sym(cstr!("LINUX_2.5"), cstr!("__kernel_vsyscall"));
599 assert!(!ptr.is_null());
600
601 // SAFETY: As above, store the computed function addresses in
602 // static storage.
603 unsafe {
604 SYSCALL.store(ptr.cast(), Relaxed);
605 }
606 }
607 }
608}
609