1//! This module contains specializations that can offload `io::copy()` operations on file descriptor
2//! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3//!
4//! Specialization is only applied to wholly std-owned types so that user code can't observe
5//! that the `Read` and `Write` traits are not used.
6//!
7//! Since a copy operation involves a reader and writer side where each can consist of different types
8//! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9//! a single method on all possible combinations.
10//!
11//! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12//! traits and then specialized on by the `Copier::copy` method.
13//!
14//! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15//! additional prerequisites and constraints imposed by the wrapper types.
16//!
17//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20//! Since those syscalls have requirements that cannot be fully checked in advance it attempts
21//! to use them one after another (guided by hints) to figure out which one works and
22//! falls back to the generic read-write copy loop if none of them does.
23//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
24//! until the copy operation is completed.
25//!
26//! Advantages of using these syscalls:
27//!
28//! * fewer context switches since reads and writes are coalesced into a single syscall
29//! and more bytes are transferred per syscall. This translates to higher throughput
30//! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
31//! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
32//! consuming less disk space
33//! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
34//! a naive copy loop would move every byte through the CPU.
35//!
36//! Drawbacks:
37//!
38//! * copy operations smaller than the default buffer size can under some circumstances, especially
39//! on older kernels, incur more syscalls than the naive approach would. As mentioned above
40//! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
41//! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
42//! progress, they can hit a performance cliff.
43//! * complexity
44
45use crate::cmp::min;
46use crate::fs::{File, Metadata};
47use crate::io::copy::generic_copy;
48use crate::io::{
49 BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
50 Write,
51};
52use crate::mem::ManuallyDrop;
53use crate::net::TcpStream;
54use crate::os::unix::fs::FileTypeExt;
55use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
56use crate::os::unix::net::UnixStream;
57use crate::process::{ChildStderr, ChildStdin, ChildStdout};
58use crate::ptr;
59use crate::sync::atomic::{AtomicBool, AtomicU8, Ordering};
60use crate::sys::cvt;
61use crate::sys::weak::syscall;
62#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd")))]
63use libc::sendfile as sendfile64;
64#[cfg(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd"))]
65use libc::sendfile64;
66use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
67
68#[cfg(test)]
69mod tests;
70
71pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
72 read: &mut R,
73 write: &mut W,
74) -> Result<u64> {
75 let copier: Copier<'_, '_, R, W> = Copier { read, write };
76 SpecCopy::copy(self:copier)
77}
78
/// This type represents either the inferred `FileType` of a `RawFd` based on the source
/// type from which it was extracted or the actual metadata
///
/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
/// type may be wrong.
enum FdMeta {
    /// Actual metadata queried from the file descriptor.
    Metadata(Metadata),
    /// Assumed to be a socket based on the Rust type the descriptor came from.
    Socket,
    /// Assumed to be a pipe based on the Rust type the descriptor came from.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
91
/// Identifies which side of the copy operation a file descriptor is used on.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor is read from.
    Input,
    /// The descriptor is written to.
    Output,
}
97
98impl FdMeta {
99 fn maybe_fifo(&self) -> bool {
100 match self {
101 FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
102 FdMeta::Socket => false,
103 FdMeta::Pipe => true,
104 FdMeta::NoneObtained => true,
105 }
106 }
107
108 fn potential_sendfile_source(&self) -> bool {
109 match self {
110 // procfs erroneously shows 0 length on non-empty readable files.
111 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
112 // thus there would be benefit from attempting sendfile
113 FdMeta::Metadata(meta)
114 if meta.file_type().is_file() && meta.len() > 0
115 || meta.file_type().is_block_device() =>
116 {
117 true
118 }
119 _ => false,
120 }
121 }
122
123 fn copy_file_range_candidate(&self, f: FdHandle) -> bool {
124 match self {
125 // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
126 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
127 FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => {
128 true
129 }
130 FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true,
131 _ => false,
132 }
133 }
134}
135
136/// Returns true either if changes made to the source after a sendfile/splice call won't become
137/// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
138/// a file into a pipe, the pipe being the source in this case).
139///
140/// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
141/// the Read/Write API semantics of io::copy.
142///
143/// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
144/// regular file into a TcpSocket which will be treated as a socket here without checking.
145fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
146 match (source, sink) {
147 // Data arriving from a socket is safe because the sender can't modify the socket buffer.
148 // Data arriving from a pipe is safe(-ish) because either the sender *copied*
149 // the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
150 // thus promising not to modify the data later.
151 (FdMeta::Socket, _) => true,
152 (FdMeta::Pipe, _) => true,
153 (FdMeta::Metadata(meta: &Metadata), _)
154 if meta.file_type().is_fifo() || meta.file_type().is_socket() =>
155 {
156 true
157 }
158 // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
159 // only happens for pages sitting in send buffers or pipes.
160 (_, FdMeta::Metadata(meta: &Metadata))
161 if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
162 {
163 true
164 }
165 _ => false,
166 }
167}
168
/// File type hints plus the raw file descriptor (if any) of one side of a copy,
/// as reported by `CopyRead::properties` / `CopyWrite::properties`.
struct CopyParams(FdMeta, Option<RawFd>);
170
/// Bundles the reader and writer of a single copy operation so that `SpecCopy`
/// can be specialized on the combination of both types.
struct Copier<'a, 'b, R, W>
where
    R: Read + ?Sized,
    W: Write + ?Sized,
{
    read: &'a mut R,
    write: &'b mut W,
}
175
/// Specialization point for the copy operation; implemented for `Copier`.
trait SpecCopy {
    /// Performs the copy and returns the number of bytes transferred.
    fn copy(self) -> Result<u64>;
}
179
180impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
181 default fn copy(self) -> Result<u64> {
182 generic_copy(self.read, self.write)
183 }
184}
185
186impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
187 fn copy(self) -> Result<u64> {
188 let (reader, writer) = (self.read, self.write);
189 let r_cfg = reader.properties();
190 let w_cfg = writer.properties();
191
192 // before direct operations on file descriptors ensure that all source and sink buffers are empty
193 let mut flush = || -> crate::io::Result<u64> {
194 let bytes = reader.drain_to(writer, u64::MAX)?;
195 // BufWriter buffered bytes have already been accounted for in earlier write() calls
196 writer.flush()?;
197 Ok(bytes)
198 };
199
200 let mut written = 0u64;
201
202 if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
203 (r_cfg, w_cfg)
204 {
205 written += flush()?;
206 let max_write = reader.min_limit();
207
208 if input_meta.copy_file_range_candidate(FdHandle::Input)
209 && output_meta.copy_file_range_candidate(FdHandle::Output)
210 {
211 let result = copy_regular_files(readfd, writefd, max_write);
212 result.update_take(reader);
213
214 match result {
215 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
216 CopyResult::Error(e, _) => return Err(e),
217 CopyResult::Fallback(bytes) => written += bytes,
218 }
219 }
220
221 // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
222 // to any writable file descriptor. On older kernels the writer side can only be a socket.
223 // So we just try and fallback if needed.
224 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
225 // fall back to the generic copy loop.
226 if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
227 {
228 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
229 result.update_take(reader);
230
231 match result {
232 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
233 CopyResult::Error(e, _) => return Err(e),
234 CopyResult::Fallback(bytes) => written += bytes,
235 }
236 }
237
238 if (input_meta.maybe_fifo() || output_meta.maybe_fifo())
239 && safe_kernel_copy(&input_meta, &output_meta)
240 {
241 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
242 result.update_take(reader);
243
244 match result {
245 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
246 CopyResult::Error(e, _) => return Err(e),
247 CopyResult::Fallback(0) => { /* use the fallback below */ }
248 CopyResult::Fallback(_) => {
249 unreachable!("splice should not return > 0 bytes on the fallback path")
250 }
251 }
252 }
253 }
254
255 // fallback if none of the more specialized syscalls wants to work with these file descriptors
256 match generic_copy(reader, writer) {
257 Ok(bytes) => Ok(bytes + written),
258 err => err,
259 }
260 }
261}
262
/// Specialization trait for readers that can expose an underlying file descriptor,
/// possibly through wrapper types such as `Take` or `BufReader`.
#[rustc_specialization_trait]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// The default implementation is for unbuffered readers: it transfers nothing.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    ///
    /// The default implementation does nothing.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data `BufReader` buffers and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
290
/// Specialization trait for writers that can expose an underlying file descriptor,
/// possibly through wrapper types such as `BufWriter`.
#[rustc_specialization_trait]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
296
297impl<T> CopyRead for &mut T
298where
299 T: CopyRead,
300{
301 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
302 (**self).drain_to(writer, limit)
303 }
304
305 fn taken(&mut self, bytes: u64) {
306 (**self).taken(bytes);
307 }
308
309 fn min_limit(&self) -> u64 {
310 (**self).min_limit()
311 }
312
313 fn properties(&self) -> CopyParams {
314 (**self).properties()
315 }
316}
317
318impl<T> CopyWrite for &mut T
319where
320 T: CopyWrite,
321{
322 fn properties(&self) -> CopyParams {
323 (**self).properties()
324 }
325}
326
327impl CopyRead for File {
328 fn properties(&self) -> CopyParams {
329 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
330 }
331}
332
333impl CopyRead for &File {
334 fn properties(&self) -> CopyParams {
335 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
336 }
337}
338
339impl CopyWrite for File {
340 fn properties(&self) -> CopyParams {
341 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
342 }
343}
344
345impl CopyWrite for &File {
346 fn properties(&self) -> CopyParams {
347 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
348 }
349}
350
351impl CopyRead for TcpStream {
352 fn properties(&self) -> CopyParams {
353 // avoid the stat syscall since we can be fairly sure it's a socket
354 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
355 }
356}
357
358impl CopyRead for &TcpStream {
359 fn properties(&self) -> CopyParams {
360 // avoid the stat syscall since we can be fairly sure it's a socket
361 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
362 }
363}
364
365impl CopyWrite for TcpStream {
366 fn properties(&self) -> CopyParams {
367 // avoid the stat syscall since we can be fairly sure it's a socket
368 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
369 }
370}
371
372impl CopyWrite for &TcpStream {
373 fn properties(&self) -> CopyParams {
374 // avoid the stat syscall since we can be fairly sure it's a socket
375 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
376 }
377}
378
379impl CopyRead for UnixStream {
380 fn properties(&self) -> CopyParams {
381 // avoid the stat syscall since we can be fairly sure it's a socket
382 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
383 }
384}
385
386impl CopyRead for &UnixStream {
387 fn properties(&self) -> CopyParams {
388 // avoid the stat syscall since we can be fairly sure it's a socket
389 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
390 }
391}
392
393impl CopyWrite for UnixStream {
394 fn properties(&self) -> CopyParams {
395 // avoid the stat syscall since we can be fairly sure it's a socket
396 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
397 }
398}
399
400impl CopyWrite for &UnixStream {
401 fn properties(&self) -> CopyParams {
402 // avoid the stat syscall since we can be fairly sure it's a socket
403 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
404 }
405}
406
407impl CopyWrite for ChildStdin {
408 fn properties(&self) -> CopyParams {
409 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
410 }
411}
412
413impl CopyRead for ChildStdout {
414 fn properties(&self) -> CopyParams {
415 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
416 }
417}
418
419impl CopyRead for ChildStderr {
420 fn properties(&self) -> CopyParams {
421 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
422 }
423}
424
425impl CopyRead for StdinLock<'_> {
426 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
427 let buf_reader: &mut BufReader = self.as_mut_buf();
428 let buf: &[u8] = buf_reader.buffer();
429 let buf: &[u8] = &buf[0..min(v1:buf.len(), v2:outer_limit.try_into().unwrap_or(default:usize::MAX))];
430 let bytes_drained: usize = buf.len();
431 writer.write_all(buf)?;
432 buf_reader.consume(amt:bytes_drained);
433
434 Ok(bytes_drained as u64)
435 }
436
437 fn properties(&self) -> CopyParams {
438 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
439 }
440}
441
442impl CopyWrite for StdoutLock<'_> {
443 fn properties(&self) -> CopyParams {
444 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
445 }
446}
447
448impl CopyWrite for StderrLock<'_> {
449 fn properties(&self) -> CopyParams {
450 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
451 }
452}
453
454impl<T: CopyRead> CopyRead for Take<T> {
455 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
456 let local_limit: u64 = self.limit();
457 let combined_limit: u64 = min(v1:outer_limit, v2:local_limit);
458 let bytes_drained: u64 = self.get_mut().drain_to(writer, combined_limit)?;
459 // update limit since read() was bypassed
460 self.set_limit(local_limit - bytes_drained);
461
462 Ok(bytes_drained)
463 }
464
465 fn taken(&mut self, bytes: u64) {
466 self.set_limit(self.limit() - bytes);
467 self.get_mut().taken(bytes);
468 }
469
470 fn min_limit(&self) -> u64 {
471 min(v1:Take::limit(self), self.get_ref().min_limit())
472 }
473
474 fn properties(&self) -> CopyParams {
475 self.get_ref().properties()
476 }
477}
478
479impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> {
480 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
481 let buf = self.buffer();
482 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
483 let bytes = buf.len();
484 writer.write_all(buf)?;
485 self.consume(bytes);
486
487 let remaining = outer_limit - bytes as u64;
488
489 // in case of nested bufreaders we also need to drain the ones closer to the source
490 let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
491
492 Ok(bytes as u64 + inner_bytes)
493 }
494
495 fn taken(&mut self, bytes: u64) {
496 self.get_mut().taken(bytes);
497 }
498
499 fn min_limit(&self) -> u64 {
500 self.get_ref().min_limit()
501 }
502
503 fn properties(&self) -> CopyParams {
504 self.get_ref().properties()
505 }
506}
507
508impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> {
509 fn properties(&self) -> CopyParams {
510 self.get_ref().properties()
511 }
512}
513
514fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
515 let fd: i32 = fd.as_raw_fd();
516 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
517 match file.metadata() {
518 Ok(meta: Metadata) => FdMeta::Metadata(meta),
519 Err(_) => FdMeta::NoneObtained,
520 }
521}
522
/// Outcome of one attempt to copy via a specialized syscall.
///
/// Each variant carries the number of bytes that attempt has already copied.
pub(super) enum CopyResult {
    /// The copy finished (EOF or the length limit was reached).
    Ended(u64),
    /// The copy failed with an error that should be reported to the caller.
    Error(Error, u64),
    /// The syscall cannot be used here; the caller should try another strategy
    /// for the remaining data.
    Fallback(u64),
}
528
529impl CopyResult {
530 fn update_take(&self, reader: &mut impl CopyRead) {
531 match *self {
532 CopyResult::Fallback(bytes: u64)
533 | CopyResult::Ended(bytes: u64)
534 | CopyResult::Error(_, bytes: u64) => reader.taken(bytes),
535 }
536 }
537}
538
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
545
546/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
547/// As the name says, it only works on regular files.
548///
549/// Callers must handle fallback to a generic copy loop.
550/// `Fallback` may indicate non-zero number of bytes already written
551/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
552pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
553 use crate::cmp;
554
555 const NOT_PROBED: u8 = 0;
556 const UNAVAILABLE: u8 = 1;
557 const AVAILABLE: u8 = 2;
558
559 // Kernel prior to 4.5 don't have copy_file_range
560 // We store the availability in a global to avoid unnecessary syscalls
561 static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED);
562
563 syscall! {
564 fn copy_file_range(
565 fd_in: libc::c_int,
566 off_in: *mut libc::loff_t,
567 fd_out: libc::c_int,
568 off_out: *mut libc::loff_t,
569 len: libc::size_t,
570 flags: libc::c_uint
571 ) -> libc::ssize_t
572 }
573
574 match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
575 NOT_PROBED => {
576 // EPERM can indicate seccomp filters or an immutable file.
577 // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported
578 // and some other error (ENOSYS or EPERM) if it's not available
579 let result = unsafe {
580 cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0))
581 };
582
583 if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) {
584 HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
585 } else {
586 HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed);
587 return CopyResult::Fallback(0);
588 }
589 }
590 UNAVAILABLE => return CopyResult::Fallback(0),
591 _ => {}
592 };
593
594 let mut written = 0u64;
595 while written < max_len {
596 let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
597 // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
598 // this allows us to copy large chunks without hitting EOVERFLOW,
599 // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
600 let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
601 let copy_result = unsafe {
602 // We actually don't have to adjust the offsets,
603 // because copy_file_range adjusts the file offset automatically
604 cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0))
605 };
606
607 match copy_result {
608 Ok(0) if written == 0 => {
609 // fallback to work around several kernel bugs where copy_file_range will fail to
610 // copy any bytes and return 0 instead of an error if
611 // - reading virtual files from the proc filesystem which appear to have 0 size
612 // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
613 // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
614 return CopyResult::Fallback(0);
615 }
616 Ok(0) => return CopyResult::Ended(written), // reached EOF
617 Ok(ret) => written += ret as u64,
618 Err(err) => {
619 return match err.raw_os_error() {
620 // when file offset + max_length > u64::MAX
621 Some(EOVERFLOW) => CopyResult::Fallback(written),
622 Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => {
623 // Try fallback io::copy if either:
624 // - Kernel version is < 4.5 (ENOSYS¹)
625 // - Files are mounted on different fs (EXDEV)
626 // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
627 // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
628 // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
629 // - the writer fd was opened with O_APPEND (EBADF²)
630 // and no bytes were written successfully yet. (All these errnos should
631 // not be returned if something was already written, but they happen in
632 // the wild, see #91152.)
633 //
634 // ¹ these cases should be detected by the initial probe but we handle them here
635 // anyway in case syscall interception changes during runtime
636 // ² actually invalid file descriptors would cause this too, but in that case
637 // the fallback code path is expected to encounter the same error again
638 CopyResult::Fallback(0)
639 }
640 _ => CopyResult::Error(err, written),
641 };
642 }
643 }
644 }
645 CopyResult::Ended(written)
646}
647
/// Selects which syscall `sendfile_splice` uses.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
653
654/// performs splice or sendfile between file descriptors
655/// Does _not_ fall back to a generic copy loop.
656fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
657 static HAS_SENDFILE: AtomicBool = AtomicBool::new(true);
658 static HAS_SPLICE: AtomicBool = AtomicBool::new(true);
659
660 // Android builds use feature level 14, but the libc wrapper for splice is
661 // gated on feature level 21+, so we have to invoke the syscall directly.
662 #[cfg(target_os = "android")]
663 syscall! {
664 fn splice(
665 srcfd: libc::c_int,
666 src_offset: *const i64,
667 dstfd: libc::c_int,
668 dst_offset: *const i64,
669 len: libc::size_t,
670 flags: libc::c_int
671 ) -> libc::ssize_t
672 }
673
674 #[cfg(target_os = "linux")]
675 use libc::splice;
676
677 match mode {
678 SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
679 return CopyResult::Fallback(0);
680 }
681 SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
682 return CopyResult::Fallback(0);
683 }
684 _ => (),
685 }
686
687 let mut written = 0u64;
688 while written < len {
689 // according to its manpage that's the maximum size sendfile() will copy per invocation
690 let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
691
692 let result = match mode {
693 SpliceMode::Sendfile => {
694 cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
695 }
696 SpliceMode::Splice => cvt(unsafe {
697 splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
698 }),
699 };
700
701 match result {
702 Ok(0) => break, // EOF
703 Ok(ret) => written += ret as u64,
704 Err(err) => {
705 return match err.raw_os_error() {
706 Some(ENOSYS | EPERM) => {
707 // syscall not supported (ENOSYS)
708 // syscall is disallowed, e.g. by seccomp (EPERM)
709 match mode {
710 SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
711 SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
712 }
713 assert_eq!(written, 0);
714 CopyResult::Fallback(0)
715 }
716 Some(EINVAL) => {
717 // splice/sendfile do not support this particular file descriptor (EINVAL)
718 assert_eq!(written, 0);
719 CopyResult::Fallback(0)
720 }
721 Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
722 CopyResult::Fallback(written)
723 }
724 _ => CopyResult::Error(err, written),
725 };
726 }
727 }
728 }
729 CopyResult::Ended(written)
730}
731