1//! This module contains specializations that can offload `io::copy()` operations on file descriptor
2//! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3//!
4//! Specialization is only applied to wholly std-owned types so that user code can't observe
5//! that the `Read` and `Write` traits are not used.
6//!
7//! Since a copy operation involves a reader and writer side where each can consist of different types
8//! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9//! a single method on all possible combinations.
10//!
11//! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12//! traits and then specialized on by the `Copier::copy` method.
13//!
14//! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15//! additional prerequisites and constraints imposed by the wrapper types.
16//!
17//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20//! Since those syscalls have requirements that cannot be fully checked in advance it attempts
21//! to use them one after another (guided by hints) to figure out which one works and
22//! falls back to the generic read-write copy loop if none of them does.
23//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
24//! until the copy operation is completed.
25//!
26//! Advantages of using these syscalls:
27//!
28//! * fewer context switches since reads and writes are coalesced into a single syscall
29//! and more bytes are transferred per syscall. This translates to higher throughput
30//! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
31//! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
32//! consuming less disk space
33//! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
34//! a naive copy loop would move every byte through the CPU.
35//!
36//! Drawbacks:
37//!
38//! * copy operations smaller than the default buffer size can under some circumstances, especially
39//! on older kernels, incur more syscalls than the naive approach would. As mentioned above
40//! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
41//! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
42//! progress, they can hit a performance cliff.
43//! * complexity
44
45#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd")))]
46use libc::sendfile as sendfile64;
47#[cfg(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd"))]
48use libc::sendfile64;
49use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
50
51use crate::cmp::min;
52use crate::fs::{File, Metadata};
53use crate::io::copy::generic_copy;
54use crate::io::{
55 BufRead, BufReader, BufWriter, Error, PipeReader, PipeWriter, Read, Result, StderrLock,
56 StdinLock, StdoutLock, Take, Write,
57};
58use crate::mem::ManuallyDrop;
59use crate::net::TcpStream;
60use crate::os::unix::fs::FileTypeExt;
61use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
62use crate::os::unix::net::UnixStream;
63use crate::process::{ChildStderr, ChildStdin, ChildStdout};
64use crate::ptr;
65use crate::sync::atomic::{Atomic, AtomicBool, AtomicU8, Ordering};
66use crate::sys::cvt;
67use crate::sys::fs::CachedFileMetadata;
68use crate::sys::weak::syscall;
69
70#[cfg(test)]
71mod tests;
72
73pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
74 read: &mut R,
75 write: &mut W,
76) -> Result<u64> {
77 let copier: Copier<'_, '_, R, W> = Copier { read, write };
78 SpecCopy::copy(self:copier)
79}
80
/// This type represents either the inferred `FileType` of a `RawFd` based on the source
/// type from which it was extracted or the actual metadata
///
/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
/// type may be wrong.
enum FdMeta {
    /// Actual metadata obtained for the file descriptor.
    Metadata(Metadata),
    /// Inferred from the source type to be a socket; no metadata lookup was performed.
    Socket,
    /// Inferred from the source type to be a pipe; no metadata lookup was performed.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
93
/// Identifies which side of a copy operation a file descriptor is used on.
///
/// Passed to `FdMeta::copy_file_range_candidate`, whose requirements differ for the
/// input and the output side.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor is the source being read from.
    Input,
    /// The descriptor is the sink being written to.
    Output,
}
99
100impl FdMeta {
101 fn maybe_fifo(&self) -> bool {
102 match self {
103 FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
104 FdMeta::Socket => false,
105 FdMeta::Pipe => true,
106 FdMeta::NoneObtained => true,
107 }
108 }
109
110 fn potential_sendfile_source(&self) -> bool {
111 match self {
112 // procfs erroneously shows 0 length on non-empty readable files.
113 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
114 // thus there would be benefit from attempting sendfile
115 FdMeta::Metadata(meta)
116 if meta.file_type().is_file() && meta.len() > 0
117 || meta.file_type().is_block_device() =>
118 {
119 true
120 }
121 _ => false,
122 }
123 }
124
125 fn copy_file_range_candidate(&self, f: FdHandle) -> bool {
126 match self {
127 // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
128 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
129 FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => {
130 true
131 }
132 FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true,
133 _ => false,
134 }
135 }
136}
137
138/// Returns true either if changes made to the source after a sendfile/splice call won't become
139/// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
140/// a file into a pipe, the pipe being the source in this case).
141///
142/// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
143/// the Read/Write API semantics of io::copy.
144///
145/// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
146/// regular file into a TcpSocket which will be treated as a socket here without checking.
147fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
148 match (source, sink) {
149 // Data arriving from a socket is safe because the sender can't modify the socket buffer.
150 // Data arriving from a pipe is safe(-ish) because either the sender *copied*
151 // the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
152 // thus promising not to modify the data later.
153 (FdMeta::Socket, _) => true,
154 (FdMeta::Pipe, _) => true,
155 (FdMeta::Metadata(meta: &Metadata), _)
156 if meta.file_type().is_fifo() || meta.file_type().is_socket() =>
157 {
158 true
159 }
160 // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
161 // only happens for pages sitting in send buffers or pipes.
162 (_, FdMeta::Metadata(meta: &Metadata))
163 if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
164 {
165 true
166 }
167 _ => false,
168 }
169}
170
/// What a reader or writer reports about itself via `properties()`: a metadata hint for the
/// underlying file descriptor and the raw descriptor itself. `Copier::copy` only attempts the
/// syscall-based paths when both sides provide `Some` descriptor.
struct CopyParams(FdMeta, Option<RawFd>);
172
/// Pairs the reader and writer of a single copy operation so that `SpecCopy` can be
/// specialized on the combination of both types.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source side of the copy.
    read: &'a mut R,
    /// Sink side of the copy.
    write: &'b mut W,
}
177
/// Specialization hook for the copy operation; implemented for `Copier` with a generic
/// default and a file-descriptor aware specialization.
trait SpecCopy {
    /// Performs the copy and returns the number of bytes transferred.
    fn copy(self) -> Result<u64>;
}
181
182impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
183 default fn copy(self) -> Result<u64> {
184 generic_copy(self.read, self.write)
185 }
186}
187
/// Specialized copy for std-owned readers/writers that can expose a raw file descriptor.
///
/// Tries, in order, `copy_file_range`, `sendfile` and `splice` (each gated by hints from the
/// descriptors' metadata) and finally falls back to the generic read/write loop.
impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
    fn copy(self) -> Result<u64> {
        let (reader, writer) = (self.read, self.write);
        let r_cfg = reader.properties();
        let w_cfg = writer.properties();

        // before direct operations on file descriptors ensure that all source and sink buffers are empty
        let mut flush = || -> Result<u64> {
            let bytes = reader.drain_to(writer, u64::MAX)?;
            // BufWriter buffered bytes have already been accounted for in earlier write() calls
            writer.flush()?;
            Ok(bytes)
        };

        // Bytes transferred so far by buffer draining and by syscalls that later asked for a fallback.
        let mut written = 0u64;

        // The syscall paths are only attempted when both sides yielded a file descriptor.
        if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
            (r_cfg, w_cfg)
        {
            written += flush()?;
            // Only valid after the buffers were drained above, see `CopyRead::min_limit`.
            let max_write = reader.min_limit();

            if input_meta.copy_file_range_candidate(FdHandle::Input)
                && output_meta.copy_file_range_candidate(FdHandle::Output)
            {
                let result = copy_regular_files(readfd, writefd, max_write);
                // Keep `Take` limits in sync with what the syscall consumed, whatever the outcome.
                result.update_take(reader);

                match result {
                    CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
                    CopyResult::Error(e, _) => return Err(e),
                    CopyResult::Fallback(bytes) => written += bytes,
                }
            }

            // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
            // to any writable file descriptor. On older kernels the writer side can only be a socket.
            // So we just try and fallback if needed.
            // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
            // fall back to the generic copy loop.
            if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
            {
                let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
                result.update_take(reader);

                match result {
                    CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
                    CopyResult::Error(e, _) => return Err(e),
                    CopyResult::Fallback(bytes) => written += bytes,
                }
            }

            // splice is only attempted when at least one side may be a pipe.
            if (input_meta.maybe_fifo() || output_meta.maybe_fifo())
                && safe_kernel_copy(&input_meta, &output_meta)
            {
                let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
                result.update_take(reader);

                match result {
                    CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
                    CopyResult::Error(e, _) => return Err(e),
                    CopyResult::Fallback(0) => { /* use the fallback below */ }
                    CopyResult::Fallback(_) => {
                        unreachable!("splice should not return > 0 bytes on the fallback path")
                    }
                }
            }
        }

        // fallback if none of the more specialized syscalls wants to work with these file descriptors
        match generic_copy(reader, writer) {
            // Add the bytes already moved by the draining and the syscall attempts above.
            Ok(bytes) => Ok(bytes + written),
            err => err,
        }
    }
}
264
/// Reader-side specialization trait.
///
/// Implemented for std-owned readers and wrappers so that `Copier::copy` can reach the
/// underlying file descriptor and bring wrapper state (buffers, `Take` limits) into a shape
/// where the wrappers can be bypassed.
#[rustc_specialization_trait]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// Returns the number of bytes transferred; the default (for unbuffered readers) is 0.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    ///
    /// The default does nothing.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data `BufReader` buffers and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
292
/// Writer-side specialization trait, the counterpart of `CopyRead`.
#[rustc_specialization_trait]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
298
299impl<T> CopyRead for &mut T
300where
301 T: CopyRead,
302{
303 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
304 (**self).drain_to(writer, limit)
305 }
306
307 fn taken(&mut self, bytes: u64) {
308 (**self).taken(bytes);
309 }
310
311 fn min_limit(&self) -> u64 {
312 (**self).min_limit()
313 }
314
315 fn properties(&self) -> CopyParams {
316 (**self).properties()
317 }
318}
319
320impl<T> CopyWrite for &mut T
321where
322 T: CopyWrite,
323{
324 fn properties(&self) -> CopyParams {
325 (**self).properties()
326 }
327}
328
329impl CopyRead for File {
330 fn properties(&self) -> CopyParams {
331 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
332 }
333}
334
335impl CopyRead for &File {
336 fn properties(&self) -> CopyParams {
337 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
338 }
339}
340
341impl CopyWrite for File {
342 fn properties(&self) -> CopyParams {
343 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
344 }
345}
346
347impl CopyWrite for &File {
348 fn properties(&self) -> CopyParams {
349 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
350 }
351}
352
353impl CopyRead for TcpStream {
354 fn properties(&self) -> CopyParams {
355 // avoid the stat syscall since we can be fairly sure it's a socket
356 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
357 }
358}
359
360impl CopyRead for &TcpStream {
361 fn properties(&self) -> CopyParams {
362 // avoid the stat syscall since we can be fairly sure it's a socket
363 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
364 }
365}
366
367impl CopyWrite for TcpStream {
368 fn properties(&self) -> CopyParams {
369 // avoid the stat syscall since we can be fairly sure it's a socket
370 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
371 }
372}
373
374impl CopyWrite for &TcpStream {
375 fn properties(&self) -> CopyParams {
376 // avoid the stat syscall since we can be fairly sure it's a socket
377 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
378 }
379}
380
381impl CopyRead for UnixStream {
382 fn properties(&self) -> CopyParams {
383 // avoid the stat syscall since we can be fairly sure it's a socket
384 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
385 }
386}
387
388impl CopyRead for &UnixStream {
389 fn properties(&self) -> CopyParams {
390 // avoid the stat syscall since we can be fairly sure it's a socket
391 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
392 }
393}
394
395impl CopyWrite for UnixStream {
396 fn properties(&self) -> CopyParams {
397 // avoid the stat syscall since we can be fairly sure it's a socket
398 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
399 }
400}
401
402impl CopyWrite for &UnixStream {
403 fn properties(&self) -> CopyParams {
404 // avoid the stat syscall since we can be fairly sure it's a socket
405 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
406 }
407}
408
409impl CopyRead for PipeReader {
410 fn properties(&self) -> CopyParams {
411 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
412 }
413}
414
415impl CopyRead for &PipeReader {
416 fn properties(&self) -> CopyParams {
417 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
418 }
419}
420
421impl CopyWrite for PipeWriter {
422 fn properties(&self) -> CopyParams {
423 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
424 }
425}
426
427impl CopyWrite for &PipeWriter {
428 fn properties(&self) -> CopyParams {
429 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
430 }
431}
432
433impl CopyWrite for ChildStdin {
434 fn properties(&self) -> CopyParams {
435 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
436 }
437}
438
439impl CopyRead for ChildStdout {
440 fn properties(&self) -> CopyParams {
441 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
442 }
443}
444
445impl CopyRead for ChildStderr {
446 fn properties(&self) -> CopyParams {
447 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
448 }
449}
450
451impl CopyRead for StdinLock<'_> {
452 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
453 let buf_reader: &mut BufReader = self.as_mut_buf();
454 let buf: &[u8] = buf_reader.buffer();
455 let buf: &[u8] = &buf[0..min(v1:buf.len(), v2:outer_limit.try_into().unwrap_or(default:usize::MAX))];
456 let bytes_drained: usize = buf.len();
457 writer.write_all(buf)?;
458 buf_reader.consume(amount:bytes_drained);
459
460 Ok(bytes_drained as u64)
461 }
462
463 fn properties(&self) -> CopyParams {
464 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
465 }
466}
467
468impl CopyWrite for StdoutLock<'_> {
469 fn properties(&self) -> CopyParams {
470 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
471 }
472}
473
474impl CopyWrite for StderrLock<'_> {
475 fn properties(&self) -> CopyParams {
476 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
477 }
478}
479
480impl<T: CopyRead> CopyRead for Take<T> {
481 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
482 let local_limit: u64 = self.limit();
483 let combined_limit: u64 = min(v1:outer_limit, v2:local_limit);
484 let bytes_drained: u64 = self.get_mut().drain_to(writer, combined_limit)?;
485 // update limit since read() was bypassed
486 self.set_limit(local_limit - bytes_drained);
487
488 Ok(bytes_drained)
489 }
490
491 fn taken(&mut self, bytes: u64) {
492 self.set_limit(self.limit() - bytes);
493 self.get_mut().taken(bytes);
494 }
495
496 fn min_limit(&self) -> u64 {
497 min(v1:Take::limit(self), self.get_ref().min_limit())
498 }
499
500 fn properties(&self) -> CopyParams {
501 self.get_ref().properties()
502 }
503}
504
505impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> {
506 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
507 let buf = self.buffer();
508 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
509 let bytes = buf.len();
510 writer.write_all(buf)?;
511 self.consume(bytes);
512
513 let remaining = outer_limit - bytes as u64;
514
515 // in case of nested bufreaders we also need to drain the ones closer to the source
516 let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
517
518 Ok(bytes as u64 + inner_bytes)
519 }
520
521 fn taken(&mut self, bytes: u64) {
522 self.get_mut().taken(bytes);
523 }
524
525 fn min_limit(&self) -> u64 {
526 self.get_ref().min_limit()
527 }
528
529 fn properties(&self) -> CopyParams {
530 self.get_ref().properties()
531 }
532}
533
534impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> {
535 fn properties(&self) -> CopyParams {
536 self.get_ref().properties()
537 }
538}
539
540impl CopyRead for CachedFileMetadata {
541 fn properties(&self) -> CopyParams {
542 CopyParams(FdMeta::Metadata(self.1.clone()), Some(self.0.as_raw_fd()))
543 }
544}
545
546impl CopyWrite for CachedFileMetadata {
547 fn properties(&self) -> CopyParams {
548 CopyParams(FdMeta::Metadata(self.1.clone()), Some(self.0.as_raw_fd()))
549 }
550}
551
552fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
553 let fd: i32 = fd.as_raw_fd();
554 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
555 match file.metadata() {
556 Ok(meta: Metadata) => FdMeta::Metadata(meta),
557 Err(_) => FdMeta::NoneObtained,
558 }
559}
560
/// Outcome of one of the syscall-based copy attempts.
///
/// Every variant carries the number of bytes that attempt already transferred.
pub(super) enum CopyResult {
    /// The copy finished (EOF or the requested length was reached).
    Ended(u64),
    /// The copy failed with an error that should be reported to the caller.
    Error(Error, u64),
    /// The syscall could not be used for these descriptors; the caller should try another strategy.
    Fallback(u64),
}
566
567impl CopyResult {
568 fn update_take(&self, reader: &mut impl CopyRead) {
569 match *self {
570 CopyResult::Fallback(bytes: u64)
571 | CopyResult::Ended(bytes: u64)
572 | CopyResult::Error(_, bytes: u64) => reader.taken(bytes),
573 }
574 }
575}
576
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
583
/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
/// As the name says, it only works on regular files.
///
/// Copies at most `max_len` bytes from `reader` to `writer`, using and advancing the
/// current file offsets of both descriptors.
///
/// Callers must handle fallback to a generic copy loop.
/// `Fallback` may indicate non-zero number of bytes already written
/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
    use crate::cmp;

    // States of the process-wide availability cache below.
    const NOT_PROBED: u8 = 0;
    const UNAVAILABLE: u8 = 1;
    const AVAILABLE: u8 = 2;

    // Kernels prior to 4.5 don't have copy_file_range
    // We store the availability in a global to avoid unnecessary syscalls
    static HAS_COPY_FILE_RANGE: Atomic<u8> = AtomicU8::new(NOT_PROBED);

    let mut have_probed = match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
        NOT_PROBED => false,
        UNAVAILABLE => return CopyResult::Fallback(0),
        _ => true,
    };

    syscall!(
        fn copy_file_range(
            fd_in: libc::c_int,
            off_in: *mut libc::loff_t,
            fd_out: libc::c_int,
            off_out: *mut libc::loff_t,
            len: libc::size_t,
            flags: libc::c_uint,
        ) -> libc::ssize_t;
    );

    // Returns `AVAILABLE` or `UNAVAILABLE`, never `NOT_PROBED`.
    fn probe_copy_file_range_support() -> u8 {
        // In some cases, we cannot determine availability from the first
        // `copy_file_range` call. In this case, we probe with an invalid file
        // descriptor so that the results are easily interpretable.
        match unsafe {
            cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0))
                .map_err(|e| e.raw_os_error())
        } {
            Err(Some(EPERM | ENOSYS)) => UNAVAILABLE,
            Err(Some(EBADF)) => AVAILABLE,
            Ok(_) => panic!("unexpected copy_file_range probe success"),
            // Treat other errors as the syscall
            // being unavailable.
            Err(_) => UNAVAILABLE,
        }
    }

    let mut written = 0u64;
    while written < max_len {
        let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
        // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
        // this allows us to copy large chunks without hitting EOVERFLOW,
        // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
        let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
        let copy_result = unsafe {
            // We actually don't have to adjust the offsets,
            // because copy_file_range adjusts the file offset automatically
            cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0))
        };

        // A successful call proves the syscall exists, so remember that without a separate probe.
        if !have_probed && copy_result.is_ok() {
            have_probed = true;
            HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
        }

        match copy_result {
            Ok(0) if written == 0 => {
                // fallback to work around several kernel bugs where copy_file_range will fail to
                // copy any bytes and return 0 instead of an error if
                // - reading virtual files from the proc filesystem which appear to have 0 size
                //   but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
                // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
                return CopyResult::Fallback(0);
            }
            Ok(0) => return CopyResult::Ended(written), // reached EOF
            Ok(ret) => written += ret as u64,
            Err(err) => {
                return match err.raw_os_error() {
                    // when file offset + max_length > u64::MAX
                    Some(EOVERFLOW) => CopyResult::Fallback(written),
                    Some(raw_os_error @ (ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF))
                        if written == 0 =>
                    {
                        if !have_probed {
                            let available = if matches!(raw_os_error, ENOSYS | EOPNOTSUPP | EPERM) {
                                // EPERM can indicate seccomp filters or an
                                // immutable file. To distinguish these
                                // cases we probe with invalid file
                                // descriptors which should result in EBADF
                                // if the syscall is supported and EPERM or
                                // ENOSYS if it's not available.
                                //
                                // For EOPNOTSUPP, see below. In the case of
                                // ENOSYS, we try to cover for faulty FUSE
                                // drivers.
                                probe_copy_file_range_support()
                            } else {
                                AVAILABLE
                            };
                            HAS_COPY_FILE_RANGE.store(available, Ordering::Relaxed);
                        }

                        // Try fallback io::copy if either:
                        // - Kernel version is < 4.5 (ENOSYS¹)
                        // - Files are mounted on different fs (EXDEV)
                        // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
                        // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
                        // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
                        // - the writer fd was opened with O_APPEND (EBADF²)
                        // and no bytes were written successfully yet. (All these errnos should
                        // not be returned if something was already written, but they happen in
                        // the wild, see #91152.)
                        //
                        // ¹ these cases should be detected by the initial probe but we handle them here
                        //   anyway in case syscall interception changes during runtime
                        // ² actually invalid file descriptors would cause this too, but in that case
                        //   the fallback code path is expected to encounter the same error again
                        CopyResult::Fallback(0)
                    }
                    // Any other error, or one of the above after bytes were already copied, is reported.
                    _ => CopyResult::Error(err, written),
                };
            }
        }
    }
    // `max_len` bytes have been copied.
    CopyResult::Ended(written)
}
714
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
720
/// performs splice or sendfile between file descriptors
/// Does _not_ fall back to a generic copy loop.
///
/// Transfers at most `len` bytes from `reader` to `writer` using the syscall selected by `mode`.
///
/// Returns
/// * `Ended(n)` once the source reports EOF or `len` bytes have been transferred,
/// * `Fallback(0)` if the syscall is unavailable/forbidden (`ENOSYS`, `EPERM`) or does not
///   support these descriptors (`EINVAL`),
/// * `Fallback(n)` if `sendfile` fails with `EOVERFLOW`,
/// * `Error(e, n)` for any other error.
fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
    // Process-wide flags, cleared once the respective syscall reported ENOSYS or EPERM.
    static HAS_SENDFILE: Atomic<bool> = AtomicBool::new(true);
    static HAS_SPLICE: Atomic<bool> = AtomicBool::new(true);

    // Android builds use feature level 14, but the libc wrapper for splice is
    // gated on feature level 21+, so we have to invoke the syscall directly.
    #[cfg(target_os = "android")]
    syscall!(
        fn splice(
            srcfd: libc::c_int,
            src_offset: *const i64,
            dstfd: libc::c_int,
            dst_offset: *const i64,
            len: libc::size_t,
            flags: libc::c_int,
        ) -> libc::ssize_t;
    );

    #[cfg(target_os = "linux")]
    use libc::splice;

    // Skip the syscall entirely if an earlier call found it to be unavailable.
    match mode {
        SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
            return CopyResult::Fallback(0);
        }
        SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
            return CopyResult::Fallback(0);
        }
        _ => (),
    }

    let mut written = 0u64;
    while written < len {
        // according to its manpage that's the maximum size sendfile() will copy per invocation
        let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;

        // Null offset pointers make both syscalls use the descriptors' current positions.
        let result = match mode {
            SpliceMode::Sendfile => {
                cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
            }
            SpliceMode::Splice => cvt(unsafe {
                splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
            }),
        };

        match result {
            Ok(0) => break, // EOF
            Ok(ret) => written += ret as u64,
            Err(err) => {
                return match err.raw_os_error() {
                    Some(ENOSYS | EPERM) => {
                        // syscall not supported (ENOSYS)
                        // syscall is disallowed, e.g. by seccomp (EPERM)
                        match mode {
                            SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
                            SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
                        }
                        // These errors are only expected on the very first call.
                        assert_eq!(written, 0);
                        CopyResult::Fallback(0)
                    }
                    Some(EINVAL) => {
                        // splice/sendfile do not support this particular file descriptor (EINVAL)
                        assert_eq!(written, 0);
                        CopyResult::Fallback(0)
                    }
                    // Only sendfile treats EOVERFLOW as a fallback; bytes may already have been moved.
                    Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
                        CopyResult::Fallback(written)
                    }
                    _ => CopyResult::Error(err, written),
                };
            }
        }
    }
    CopyResult::Ended(written)
}
798

Provided by KDAB

Privacy Policy
Learn Rust with the experts
Find out more