kernel_copy.rs source code [crates/std/src/sys/pal/unix/kernel_copy.rs]

1	//! This module contains specializations that can offload `io::copy()` operations on file descriptor
2	//! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3	//!
4	//! Specialization is only applied to wholly std-owned types so that user code can't observe
5	//! that the `Read` and `Write` traits are not used.
6	//!
7	//! Since a copy operation involves a reader and writer side where each can consist of different types
8	//! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9	//! a single method on all possible combinations.
10	//!
11	//! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12	//! traits and then specialized on by the `Copier::copy` method.
13	//!
14	//! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15	//! additional prerequisites and constraints imposed by the wrapper types.
16	//!
17	//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18	//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19	//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20	//! Since those syscalls have requirements that cannot be fully checked in advance it attempts
21	//! to use them one after another (guided by hints) to figure out which one works and
22	//! falls back to the generic read-write copy loop if none of them does.
23	//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
24	//! until the copy operation is completed.
25	//!
//! Advantages of using these syscalls:
//!
//! * fewer context switches since reads and writes are coalesced into a single syscall
//!   and more bytes are transferred per syscall. This translates to higher throughput
//!   and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
//! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
//!   consuming less disk space
//! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
//!   a naive copy loop would move every byte through the CPU.
35	//!
36	//! Drawbacks:
37	//!
//! * copy operations smaller than the default buffer size can under some circumstances, especially
//!   on older kernels, incur more syscalls than the naive approach would. As mentioned above
//!   the syscall selection is guided by hints to minimize this possibility but they are not perfect.
//! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
//!   progress, they can hit a performance cliff.
//! * complexity
44
45	use crate::cmp::min;
46	use crate::fs::{File, Metadata};
47	use crate::io::copy::generic_copy;
48	use crate::io::{
49	BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
50	Write,
51	};
52	use crate::mem::ManuallyDrop;
53	use crate::net::TcpStream;
54	use crate::os::unix::fs::FileTypeExt;
55	use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
56	use crate::os::unix::net::UnixStream;
57	use crate::process::{ChildStderr, ChildStdin, ChildStdout};
58	use crate::ptr;
59	use crate::sync::atomic::{AtomicBool, AtomicU8, Ordering};
60	use crate::sys::cvt;
61	use crate::sys::weak::syscall;
62	#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd")))]
63	use libc::sendfile as sendfile64;
64	#[cfg(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd"))]
65	use libc::sendfile64;
66	use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
67
68	#[cfg(test)]
69	mod tests;
70
71	pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
72	read: &mut R,
73	write: &mut W,
74	) -> Result<u64> {
75	let copier: Copier<'_, '_, R, W> = Copier { read, write };
76	SpecCopy::copy(self:copier)
77	}
78
/// Describes what kind of file a `RawFd` refers to.
///
/// The kind is either inferred from the std type the descriptor was extracted from
/// or taken from actual metadata.
///
/// All methods on this type are hints only: because of `AsRawFd` and `FromRawFd`
/// the inferred kind may be wrong.
enum FdMeta {
    /// Metadata obtained for the descriptor.
    Metadata(Metadata),
    /// Inferred to be a socket from the source type.
    Socket,
    /// Inferred to be a pipe from the source type.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
91
/// Which side of the copy operation a file descriptor is used on.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor is read from.
    Input,
    /// The descriptor is written to.
    Output,
}
97
98	impl FdMeta {
99	fn maybe_fifo(&self) -> bool {
100	match self {
101	FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
102	FdMeta::Socket => `false`,
103	FdMeta::Pipe => `true`,
104	FdMeta::NoneObtained => `true`,
105	}
106	}
107
108	fn potential_sendfile_source(&self) -> bool {
109	match self {
110	// procfs erroneously shows 0 length on non-empty readable files.
111	// and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
112	// thus there would be benefit from attempting sendfile
113	FdMeta::Metadata(meta)
114	if meta.file_type().is_file() && meta.len() > `0`
115	\|\| meta.file_type().is_block_device() =>
116	{
117	`true`
118	}
119	_ => `false`,
120	}
121	}
122
123	fn copy_file_range_candidate(&self, f: FdHandle) -> bool {
124	match self {
125	// copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
126	// without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
127	FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > `0` => {
128	`true`
129	}
130	FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => `true`,
131	_ => `false`,
132	}
133	}
134	}
135
136	/// Returns true either if changes made to the source after a sendfile/splice call won't become
137	/// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
138	/// a file into a pipe, the pipe being the source in this case).
139	///
140	/// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
141	/// the Read/Write API semantics of io::copy.
142	///
143	/// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
144	/// regular file into a TcpSocket which will be treated as a socket here without checking.
145	fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
146	match (source, sink) {
147	// Data arriving from a socket is safe because the sender can't modify the socket buffer.
148	// Data arriving from a pipe is safe(-ish) because either the sender copied
149	// the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
150	// thus promising not to modify the data later.
151	(FdMeta::Socket, _) => `true`,
152	(FdMeta::Pipe, _) => `true`,
153	(FdMeta::Metadata(meta: &Metadata), _)
154	if meta.file_type().is_fifo() \|\| meta.file_type().is_socket() =>
155	{
156	`true`
157	}
158	// Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
159	// only happens for pages sitting in send buffers or pipes.
160	(_, FdMeta::Metadata(meta: &Metadata))
161	if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
162	{
163	`true`
164	}
165	_ => `false`,
166	}
167	}
168
169	struct CopyParams(FdMeta, Option<RawFd>);
170
/// Pairs the reader and writer of one copy operation so that [`SpecCopy`]
/// can be specialized on both types at once.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source of the copy.
    read: &'a mut R,
    /// Destination of the copy.
    write: &'b mut W,
}
175
/// Specialization hook for the copy operation.
trait SpecCopy {
    /// Performs the copy and returns the number of bytes copied.
    fn copy(self) -> Result<u64>;
}
179
180	impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
181	default fn copy(self) -> Result<u64> {
182	generic_copy(self.read, self.write)
183	}
184	}
185
186	impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
187	fn copy(self) -> Result<u64> {
188	let (reader, writer) = (self.read, self.write);
189	let r_cfg = reader.properties();
190	let w_cfg = writer.properties();
191
192	// before direct operations on file descriptors ensure that all source and sink buffers are empty
193	let mut flush = \|\| -> crate::io::Result<u64> {
194	let bytes = reader.drain_to(writer, u64::MAX)?;
195	// BufWriter buffered bytes have already been accounted for in earlier write() calls
196	writer.flush()?;
197	Ok(bytes)
198	};
199
200	let mut written = `0u64`;
201
202	if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
203	(r_cfg, w_cfg)
204	{
205	written += flush()?;
206	let max_write = reader.min_limit();
207
208	if input_meta.copy_file_range_candidate(FdHandle::Input)
209	&& output_meta.copy_file_range_candidate(FdHandle::Output)
210	{
211	let result = copy_regular_files(readfd, writefd, max_write);
212	result.update_take(reader);
213
214	match result {
215	CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
216	CopyResult::Error(e, _) => return Err(e),
217	CopyResult::Fallback(bytes) => written += bytes,
218	}
219	}
220
221	// on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
222	// to any writable file descriptor. On older kernels the writer side can only be a socket.
223	// So we just try and fallback if needed.
224	// If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
225	// fall back to the generic copy loop.
226	if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
227	{
228	let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
229	result.update_take(reader);
230
231	match result {
232	CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
233	CopyResult::Error(e, _) => return Err(e),
234	CopyResult::Fallback(bytes) => written += bytes,
235	}
236	}
237
238	if (input_meta.maybe_fifo() \|\| output_meta.maybe_fifo())
239	&& safe_kernel_copy(&input_meta, &output_meta)
240	{
241	let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
242	result.update_take(reader);
243
244	match result {
245	CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
246	CopyResult::Error(e, _) => return Err(e),
247	CopyResult::Fallback(`0`) => { / use the fallback below / }
248	CopyResult::Fallback(_) => {
249	unreachable!("splice should not return > 0 bytes on the fallback path")
250	}
251	}
252	}
253	}
254
255	// fallback if none of the more specialized syscalls wants to work with these file descriptors
256	match generic_copy(reader, writer) {
257	Ok(bytes) => Ok(bytes + written),
258	err => err,
259	}
260	}
261	}
262
263	#[rustc_specialization_trait]
264	trait CopyRead: Read {
265	/// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
266	/// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
267	/// transferred, whichever occurs sooner.
268	/// If nested buffers are present the outer buffers must be drained first.
269	///
270	/// This is necessary to directly bypass the wrapper types while preserving the data order
271	/// when operating directly on the underlying file descriptors.
272	fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
273	Ok(`0`)
274	}
275
276	/// Updates `Take` wrappers to remove the number of bytes copied.
277	fn taken(&mut self, _bytes: u64) {}
278
279	/// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
280	/// This method does not account for data `BufReader` buffers and would underreport
281	/// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
282	/// after draining the buffers via `drain_to`.
283	fn min_limit(&self) -> u64 {
284	u64::MAX
285	}
286
287	/// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
288	fn properties(&self) -> CopyParams;
289	}
290
291	#[rustc_specialization_trait]
292	trait CopyWrite: Write {
293	/// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
294	fn properties(&self) -> CopyParams;
295	}
296
297	impl<T> CopyRead for &mut T
298	where
299	T: CopyRead,
300	{
301	fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
302	(**self).drain_to(writer, limit)
303	}
304
305	fn taken(&mut self, bytes: u64) {
306	(**self).taken(bytes);
307	}
308
309	fn min_limit(&self) -> u64 {
310	(**self).min_limit()
311	}
312
313	fn properties(&self) -> CopyParams {
314	(**self).properties()
315	}
316	}
317
318	impl<T> CopyWrite for &mut T
319	where
320	T: CopyWrite,
321	{
322	fn properties(&self) -> CopyParams {
323	(**self).properties()
324	}
325	}
326
327	impl CopyRead for File {
328	fn properties(&self) -> CopyParams {
329	CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
330	}
331	}
332
333	impl CopyRead for &File {
334	fn properties(&self) -> CopyParams {
335	CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
336	}
337	}
338
339	impl CopyWrite for File {
340	fn properties(&self) -> CopyParams {
341	CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
342	}
343	}
344
345	impl CopyWrite for &File {
346	fn properties(&self) -> CopyParams {
347	CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
348	}
349	}
350
351	impl CopyRead for TcpStream {
352	fn properties(&self) -> CopyParams {
353	// avoid the stat syscall since we can be fairly sure it's a socket
354	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
355	}
356	}
357
358	impl CopyRead for &TcpStream {
359	fn properties(&self) -> CopyParams {
360	// avoid the stat syscall since we can be fairly sure it's a socket
361	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
362	}
363	}
364
365	impl CopyWrite for TcpStream {
366	fn properties(&self) -> CopyParams {
367	// avoid the stat syscall since we can be fairly sure it's a socket
368	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
369	}
370	}
371
372	impl CopyWrite for &TcpStream {
373	fn properties(&self) -> CopyParams {
374	// avoid the stat syscall since we can be fairly sure it's a socket
375	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
376	}
377	}
378
379	impl CopyRead for UnixStream {
380	fn properties(&self) -> CopyParams {
381	// avoid the stat syscall since we can be fairly sure it's a socket
382	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
383	}
384	}
385
386	impl CopyRead for &UnixStream {
387	fn properties(&self) -> CopyParams {
388	// avoid the stat syscall since we can be fairly sure it's a socket
389	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
390	}
391	}
392
393	impl CopyWrite for UnixStream {
394	fn properties(&self) -> CopyParams {
395	// avoid the stat syscall since we can be fairly sure it's a socket
396	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
397	}
398	}
399
400	impl CopyWrite for &UnixStream {
401	fn properties(&self) -> CopyParams {
402	// avoid the stat syscall since we can be fairly sure it's a socket
403	CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
404	}
405	}
406
407	impl CopyWrite for ChildStdin {
408	fn properties(&self) -> CopyParams {
409	CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
410	}
411	}
412
413	impl CopyRead for ChildStdout {
414	fn properties(&self) -> CopyParams {
415	CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
416	}
417	}
418
419	impl CopyRead for ChildStderr {
420	fn properties(&self) -> CopyParams {
421	CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
422	}
423	}
424
425	impl CopyRead for StdinLock<'_> {
426	fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
427	let buf_reader: &mut BufReader = self.as_mut_buf();
428	let buf: &[u8] = buf_reader.buffer();
429	let buf: &[u8] = &buf[`0`..min(v1:buf.len(), v2:outer_limit.try_into().unwrap_or(default:usize::MAX))];
430	let bytes_drained: usize = buf.len();
431	writer.write_all(buf)?;
432	buf_reader.consume(amt:bytes_drained);
433
434	Ok(bytes_drained as u64)
435	}
436
437	fn properties(&self) -> CopyParams {
438	CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
439	}
440	}
441
442	impl CopyWrite for StdoutLock<'_> {
443	fn properties(&self) -> CopyParams {
444	CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
445	}
446	}
447
448	impl CopyWrite for StderrLock<'_> {
449	fn properties(&self) -> CopyParams {
450	CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
451	}
452	}
453
454	impl<T: CopyRead> CopyRead for Take<T> {
455	fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
456	let local_limit: u64 = self.limit();
457	let combined_limit: u64 = min(v1:outer_limit, v2:local_limit);
458	let bytes_drained: u64 = self.get_mut().drain_to(writer, combined_limit)?;
459	// update limit since read() was bypassed
460	self.set_limit(local_limit - bytes_drained);
461
462	Ok(bytes_drained)
463	}
464
465	fn taken(&mut self, bytes: u64) {
466	self.set_limit(self.limit() - bytes);
467	self.get_mut().taken(bytes);
468	}
469
470	fn min_limit(&self) -> u64 {
471	min(v1:Take::limit(self), self.get_ref().min_limit())
472	}
473
474	fn properties(&self) -> CopyParams {
475	self.get_ref().properties()
476	}
477	}
478
479	impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> {
480	fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
481	let buf = self.buffer();
482	let buf = &buf[`0`..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
483	let bytes = buf.len();
484	writer.write_all(buf)?;
485	self.consume(bytes);
486
487	let remaining = outer_limit - bytes as u64;
488
489	// in case of nested bufreaders we also need to drain the ones closer to the source
490	let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
491
492	Ok(bytes as u64 + inner_bytes)
493	}
494
495	fn taken(&mut self, bytes: u64) {
496	self.get_mut().taken(bytes);
497	}
498
499	fn min_limit(&self) -> u64 {
500	self.get_ref().min_limit()
501	}
502
503	fn properties(&self) -> CopyParams {
504	self.get_ref().properties()
505	}
506	}
507
508	impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> {
509	fn properties(&self) -> CopyParams {
510	self.get_ref().properties()
511	}
512	}
513
514	fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
515	let fd: i32 = fd.as_raw_fd();
516	let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
517	match file.metadata() {
518	Ok(meta: Metadata) => FdMeta::Metadata(meta),
519	Err(_) => FdMeta::NoneObtained,
520	}
521	}
522
523	pub(super) enum CopyResult {
524	Ended(u64),
525	Error(Error, u64),
526	Fallback(u64),
527	}
528
529	impl CopyResult {
530	fn update_take(&self, reader: &mut impl CopyRead) {
531	match *self {
532	CopyResult::Fallback(bytes: u64)
533	\| CopyResult::Ended(bytes: u64)
534	\| CopyResult::Error(_, bytes: u64) => reader.taken(bytes),
535	}
536	}
537	}
538
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
545
546	/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
547	/// As the name says, it only works on regular files.
548	///
549	/// Callers must handle fallback to a generic copy loop.
550	/// `Fallback` may indicate non-zero number of bytes already written
551	/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
552	pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
553	use crate::cmp;
554
555	const NOT_PROBED: u8 = `0`;
556	const UNAVAILABLE: u8 = `1`;
557	const AVAILABLE: u8 = `2`;
558
559	// Kernel prior to 4.5 don't have copy_file_range
560	// We store the availability in a global to avoid unnecessary syscalls
561	static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED);
562
563	syscall! {
564	fn copy_file_range(
565	fd_in: libc::c_int,
566	off_in: *mut libc::loff_t,
567	fd_out: libc::c_int,
568	off_out: *mut libc::loff_t,
569	len: libc::size_t,
570	flags: libc::c_uint
571	) -> libc::ssize_t
572	}
573
574	match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
575	NOT_PROBED => {
576	// EPERM can indicate seccomp filters or an immutable file.
577	// To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported
578	// and some other error (ENOSYS or EPERM) if it's not available
579	let result = unsafe {
580	cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), `1`, `0`))
581	};
582
583	if matches!(result.map_err(\|e\| e.raw_os_error()), Err(Some(EBADF))) {
584	HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
585	} else {
586	HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed);
587	return CopyResult::Fallback(`0`);
588	}
589	}
590	UNAVAILABLE => return CopyResult::Fallback(`0`),
591	_ => {}
592	};
593
594	let mut written = `0u64`;
595	while written < max_len {
596	let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
597	// cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
598	// this allows us to copy large chunks without hitting EOVERFLOW,
599	// unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
600	let bytes_to_copy = cmp::min(bytes_to_copy as usize, `0x4000_0000usize`);
601	let copy_result = unsafe {
602	// We actually don't have to adjust the offsets,
603	// because copy_file_range adjusts the file offset automatically
604	cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, `0`))
605	};
606
607	match copy_result {
608	Ok(`0`) if written == `0` => {
609	// fallback to work around several kernel bugs where copy_file_range will fail to
610	// copy any bytes and return 0 instead of an error if
611	// - reading virtual files from the proc filesystem which appear to have 0 size
612	// but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
613	// - copying from an overlay filesystem in docker. reported to occur on fedora 32.
614	return CopyResult::Fallback(`0`);
615	}
616	Ok(`0`) => return CopyResult::Ended(written), // reached EOF
617	Ok(ret) => written += ret as u64,
618	Err(err) => {
619	return match err.raw_os_error() {
620	// when file offset + max_length > u64::MAX
621	Some(EOVERFLOW) => CopyResult::Fallback(written),
622	Some(ENOSYS \| EXDEV \| EINVAL \| EPERM \| EOPNOTSUPP \| EBADF) if written == `0` => {
623	// Try fallback io::copy if either:
624	// - Kernel version is < 4.5 (ENOSYS¹)
625	// - Files are mounted on different fs (EXDEV)
626	// - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
627	// - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
628	// - copy_file_range cannot be used with pipes or device nodes (EINVAL)
629	// - the writer fd was opened with O_APPEND (EBADF²)
630	// and no bytes were written successfully yet. (All these errnos should
631	// not be returned if something was already written, but they happen in
632	// the wild, see #91152.)
633	//
634	// ¹ these cases should be detected by the initial probe but we handle them here
635	// anyway in case syscall interception changes during runtime
636	// ² actually invalid file descriptors would cause this too, but in that case
637	// the fallback code path is expected to encounter the same error again
638	CopyResult::Fallback(`0`)
639	}
640	_ => CopyResult::Error(err, written),
641	};
642	}
643	}
644	}
645	CopyResult::Ended(written)
646	}
647
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
653
654	/// performs splice or sendfile between file descriptors
655	/// Does _not_ fall back to a generic copy loop.
656	fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
657	static HAS_SENDFILE: AtomicBool = AtomicBool::new(`true`);
658	static HAS_SPLICE: AtomicBool = AtomicBool::new(`true`);
659
660	// Android builds use feature level 14, but the libc wrapper for splice is
661	// gated on feature level 21+, so we have to invoke the syscall directly.
662	#[cfg(target_os = "android")]
663	syscall! {
664	fn splice(
665	srcfd: libc::c_int,
666	src_offset: *const i64,
667	dstfd: libc::c_int,
668	dst_offset: *const i64,
669	len: libc::size_t,
670	flags: libc::c_int
671	) -> libc::ssize_t
672	}
673
674	#[cfg(target_os = "linux")]
675	use libc::splice;
676
677	match mode {
678	SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
679	return CopyResult::Fallback(`0`);
680	}
681	SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
682	return CopyResult::Fallback(`0`);
683	}
684	_ => (),
685	}
686
687	let mut written = `0u64`;
688	while written < len {
689	// according to its manpage that's the maximum size sendfile() will copy per invocation
690	let chunk_size = crate::cmp::min(len - written, `0x7ffff000_u64`) as usize;
691
692	let result = match mode {
693	SpliceMode::Sendfile => {
694	cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
695	}
696	SpliceMode::Splice => cvt(unsafe {
697	splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, `0`)
698	}),
699	};
700
701	match result {
702	Ok(`0`) => break, // EOF
703	Ok(ret) => written += ret as u64,
704	Err(err) => {
705	return match err.raw_os_error() {
706	Some(ENOSYS \| EPERM) => {
707	// syscall not supported (ENOSYS)
708	// syscall is disallowed, e.g. by seccomp (EPERM)
709	match mode {
710	SpliceMode::Sendfile => HAS_SENDFILE.store(`false`, Ordering::Relaxed),
711	SpliceMode::Splice => HAS_SPLICE.store(`false`, Ordering::Relaxed),
712	}
713	assert_eq!(written, `0`);
714	CopyResult::Fallback(`0`)
715	}
716	Some(EINVAL) => {
717	// splice/sendfile do not support this particular file descriptor (EINVAL)
718	assert_eq!(written, `0`);
719	CopyResult::Fallback(`0`)
720	}
721	Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
722	CopyResult::Fallback(written)
723	}
724	_ => CopyResult::Error(err, written),
725	};
726	}
727	}
728	}
729	CopyResult::Ended(written)
730	}
731