1 | //! This module contains specializations that can offload `io::copy()` operations on file descriptor |
2 | //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`. |
3 | //! |
4 | //! Specialization is only applied to wholly std-owned types so that user code can't observe |
5 | //! that the `Read` and `Write` traits are not used. |
6 | //! |
7 | //! Since a copy operation involves a reader and writer side where each can consist of different types |
8 | //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize |
9 | //! a single method on all possible combinations. |
10 | //! |
11 | //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization |
12 | //! traits and then specialized on by the `Copier::copy` method. |
13 | //! |
14 | //! `Copier` uses the specialization traits to unpack the underlying file descriptors and |
15 | //! additional prerequisites and constraints imposed by the wrapper types. |
16 | //! |
17 | //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they |
18 | //! can be safely bypassed it will attempt to use the `copy_file_range(2)`, |
19 | //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors. |
20 | //! Since those syscalls have requirements that cannot be fully checked in advance it attempts |
21 | //! to use them one after another (guided by hints) to figure out which one works and |
22 | //! falls back to the generic read-write copy loop if none of them does. |
23 | //! Once a working syscall is found for a pair of file descriptors it will be called in a loop |
24 | //! until the copy operation is completed. |
25 | //! |
26 | //! Advantages of using these syscalls: |
27 | //! |
28 | //! * fewer context switches since reads and writes are coalesced into a single syscall |
29 | //! and more bytes are transferred per syscall. This translates to higher throughput |
30 | //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing. |
31 | //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and |
32 | //! consuming less disk space |
33 | //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while |
34 | //! a naive copy loop would move every byte through the CPU. |
35 | //! |
36 | //! Drawbacks: |
37 | //! |
38 | //! * copy operations smaller than the default buffer size can under some circumstances, especially |
39 | //! on older kernels, incur more syscalls than the naive approach would. As mentioned above |
40 | //! the syscall selection is guided by hints to minimize this possibility but they are not perfect. |
41 | //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report |
42 | //! progress, they can hit a performance cliff. |
43 | //! * complexity |
44 | |
45 | use crate::cmp::min; |
46 | use crate::fs::{File, Metadata}; |
47 | use crate::io::copy::generic_copy; |
48 | use crate::io::{ |
49 | BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take, |
50 | Write, |
51 | }; |
52 | use crate::mem::ManuallyDrop; |
53 | use crate::net::TcpStream; |
54 | use crate::os::unix::fs::FileTypeExt; |
55 | use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd}; |
56 | use crate::os::unix::net::UnixStream; |
57 | use crate::process::{ChildStderr, ChildStdin, ChildStdout}; |
58 | use crate::ptr; |
59 | use crate::sync::atomic::{AtomicBool, AtomicU8, Ordering}; |
60 | use crate::sys::cvt; |
61 | use crate::sys::weak::syscall; |
62 | #[cfg (not(any(all(target_os = "linux" , target_env = "gnu" ), target_os = "hurd" )))] |
63 | use libc::sendfile as sendfile64; |
64 | #[cfg (any(all(target_os = "linux" , target_env = "gnu" ), target_os = "hurd" ))] |
65 | use libc::sendfile64; |
66 | use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV}; |
67 | |
68 | #[cfg (test)] |
69 | mod tests; |
70 | |
71 | pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>( |
72 | read: &mut R, |
73 | write: &mut W, |
74 | ) -> Result<u64> { |
75 | let copier: Copier<'_, '_, R, W> = Copier { read, write }; |
76 | SpecCopy::copy(self:copier) |
77 | } |
78 | |
/// This type represents either the inferred `FileType` of a `RawFd` based on the source
/// type from which it was extracted or the actual metadata
///
/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
/// type may be wrong.
enum FdMeta {
    /// Actual metadata obtained by querying the file descriptor (see `fd_to_meta`).
    Metadata(Metadata),
    /// Inferred from the Rust type (e.g. `TcpStream`, `UnixStream`) without querying the fd.
    Socket,
    /// Inferred from the Rust type (e.g. `ChildStdin`, `ChildStdout`) without querying the fd.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
91 | |
/// Tells `FdMeta::copy_file_range_candidate` which side of the copy operation
/// the file descriptor is used on.
#[derive (PartialEq)]
enum FdHandle {
    /// The file descriptor is the source of the copy.
    Input,
    /// The file descriptor is the destination of the copy.
    Output,
}
97 | |
98 | impl FdMeta { |
99 | fn maybe_fifo(&self) -> bool { |
100 | match self { |
101 | FdMeta::Metadata(meta) => meta.file_type().is_fifo(), |
102 | FdMeta::Socket => false, |
103 | FdMeta::Pipe => true, |
104 | FdMeta::NoneObtained => true, |
105 | } |
106 | } |
107 | |
108 | fn potential_sendfile_source(&self) -> bool { |
109 | match self { |
110 | // procfs erroneously shows 0 length on non-empty readable files. |
111 | // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall |
112 | // thus there would be benefit from attempting sendfile |
113 | FdMeta::Metadata(meta) |
114 | if meta.file_type().is_file() && meta.len() > 0 |
115 | || meta.file_type().is_block_device() => |
116 | { |
117 | true |
118 | } |
119 | _ => false, |
120 | } |
121 | } |
122 | |
123 | fn copy_file_range_candidate(&self, f: FdHandle) -> bool { |
124 | match self { |
125 | // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached |
126 | // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range |
127 | FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => { |
128 | true |
129 | } |
130 | FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true, |
131 | _ => false, |
132 | } |
133 | } |
134 | } |
135 | |
136 | /// Returns true either if changes made to the source after a sendfile/splice call won't become |
137 | /// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing |
138 | /// a file into a pipe, the pipe being the source in this case). |
139 | /// |
140 | /// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold |
141 | /// the Read/Write API semantics of io::copy. |
142 | /// |
143 | /// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a |
144 | /// regular file into a TcpSocket which will be treated as a socket here without checking. |
145 | fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool { |
146 | match (source, sink) { |
147 | // Data arriving from a socket is safe because the sender can't modify the socket buffer. |
148 | // Data arriving from a pipe is safe(-ish) because either the sender *copied* |
149 | // the bytes into the pipe OR explicitly performed an operation that enables zero-copy, |
150 | // thus promising not to modify the data later. |
151 | (FdMeta::Socket, _) => true, |
152 | (FdMeta::Pipe, _) => true, |
153 | (FdMeta::Metadata(meta: &Metadata), _) |
154 | if meta.file_type().is_fifo() || meta.file_type().is_socket() => |
155 | { |
156 | true |
157 | } |
158 | // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue |
159 | // only happens for pages sitting in send buffers or pipes. |
160 | (_, FdMeta::Metadata(meta: &Metadata)) |
161 | if !meta.file_type().is_fifo() && !meta.file_type().is_socket() => |
162 | { |
163 | true |
164 | } |
165 | _ => false, |
166 | } |
167 | } |
168 | |
/// Result of `CopyRead::properties` / `CopyWrite::properties`: the metadata hint for one side
/// of the copy together with its raw file descriptor, if one is available.
struct CopyParams(FdMeta, Option<RawFd>);
170 | |
/// Pairs the reader and writer of one copy operation so that `SpecCopy` can be
/// implemented (and specialized) on the combination of both types.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source side of the copy.
    read: &'a mut R,
    /// Destination side of the copy.
    write: &'b mut W,
}
175 | |
/// Specialization hook for the copy operation, implemented on `Copier`.
trait SpecCopy {
    /// Performs the copy and returns the number of bytes transferred.
    fn copy(self) -> Result<u64>;
}
179 | |
180 | impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> { |
181 | default fn copy(self) -> Result<u64> { |
182 | generic_copy(self.read, self.write) |
183 | } |
184 | } |
185 | |
186 | impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> { |
187 | fn copy(self) -> Result<u64> { |
188 | let (reader, writer) = (self.read, self.write); |
189 | let r_cfg = reader.properties(); |
190 | let w_cfg = writer.properties(); |
191 | |
192 | // before direct operations on file descriptors ensure that all source and sink buffers are empty |
193 | let mut flush = || -> crate::io::Result<u64> { |
194 | let bytes = reader.drain_to(writer, u64::MAX)?; |
195 | // BufWriter buffered bytes have already been accounted for in earlier write() calls |
196 | writer.flush()?; |
197 | Ok(bytes) |
198 | }; |
199 | |
200 | let mut written = 0u64; |
201 | |
202 | if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) = |
203 | (r_cfg, w_cfg) |
204 | { |
205 | written += flush()?; |
206 | let max_write = reader.min_limit(); |
207 | |
208 | if input_meta.copy_file_range_candidate(FdHandle::Input) |
209 | && output_meta.copy_file_range_candidate(FdHandle::Output) |
210 | { |
211 | let result = copy_regular_files(readfd, writefd, max_write); |
212 | result.update_take(reader); |
213 | |
214 | match result { |
215 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), |
216 | CopyResult::Error(e, _) => return Err(e), |
217 | CopyResult::Fallback(bytes) => written += bytes, |
218 | } |
219 | } |
220 | |
221 | // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices) |
222 | // to any writable file descriptor. On older kernels the writer side can only be a socket. |
223 | // So we just try and fallback if needed. |
224 | // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead |
225 | // fall back to the generic copy loop. |
226 | if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta) |
227 | { |
228 | let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write); |
229 | result.update_take(reader); |
230 | |
231 | match result { |
232 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), |
233 | CopyResult::Error(e, _) => return Err(e), |
234 | CopyResult::Fallback(bytes) => written += bytes, |
235 | } |
236 | } |
237 | |
238 | if (input_meta.maybe_fifo() || output_meta.maybe_fifo()) |
239 | && safe_kernel_copy(&input_meta, &output_meta) |
240 | { |
241 | let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write); |
242 | result.update_take(reader); |
243 | |
244 | match result { |
245 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), |
246 | CopyResult::Error(e, _) => return Err(e), |
247 | CopyResult::Fallback(0) => { /* use the fallback below */ } |
248 | CopyResult::Fallback(_) => { |
249 | unreachable!("splice should not return > 0 bytes on the fallback path" ) |
250 | } |
251 | } |
252 | } |
253 | } |
254 | |
255 | // fallback if none of the more specialized syscalls wants to work with these file descriptors |
256 | match generic_copy(reader, writer) { |
257 | Ok(bytes) => Ok(bytes + written), |
258 | err => err, |
259 | } |
260 | } |
261 | } |
262 | |
/// Specialization trait for the reader side of a copy operation.
///
/// Implementors expose their underlying file descriptor via `properties` and, if they
/// are wrappers, override the remaining methods so that they can be bypassed safely.
#[rustc_specialization_trait ]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// The default implementation has nothing to drain and reports 0 bytes.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    ///
    /// The default implementation does nothing.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data `BufReader` buffers and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
290 | |
/// Specialization trait for the writer side of a copy operation.
#[rustc_specialization_trait ]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
296 | |
297 | impl<T> CopyRead for &mut T |
298 | where |
299 | T: CopyRead, |
300 | { |
301 | fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> { |
302 | (**self).drain_to(writer, limit) |
303 | } |
304 | |
305 | fn taken(&mut self, bytes: u64) { |
306 | (**self).taken(bytes); |
307 | } |
308 | |
309 | fn min_limit(&self) -> u64 { |
310 | (**self).min_limit() |
311 | } |
312 | |
313 | fn properties(&self) -> CopyParams { |
314 | (**self).properties() |
315 | } |
316 | } |
317 | |
318 | impl<T> CopyWrite for &mut T |
319 | where |
320 | T: CopyWrite, |
321 | { |
322 | fn properties(&self) -> CopyParams { |
323 | (**self).properties() |
324 | } |
325 | } |
326 | |
327 | impl CopyRead for File { |
328 | fn properties(&self) -> CopyParams { |
329 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
330 | } |
331 | } |
332 | |
333 | impl CopyRead for &File { |
334 | fn properties(&self) -> CopyParams { |
335 | CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) |
336 | } |
337 | } |
338 | |
339 | impl CopyWrite for File { |
340 | fn properties(&self) -> CopyParams { |
341 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
342 | } |
343 | } |
344 | |
345 | impl CopyWrite for &File { |
346 | fn properties(&self) -> CopyParams { |
347 | CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) |
348 | } |
349 | } |
350 | |
351 | impl CopyRead for TcpStream { |
352 | fn properties(&self) -> CopyParams { |
353 | // avoid the stat syscall since we can be fairly sure it's a socket |
354 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
355 | } |
356 | } |
357 | |
358 | impl CopyRead for &TcpStream { |
359 | fn properties(&self) -> CopyParams { |
360 | // avoid the stat syscall since we can be fairly sure it's a socket |
361 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
362 | } |
363 | } |
364 | |
365 | impl CopyWrite for TcpStream { |
366 | fn properties(&self) -> CopyParams { |
367 | // avoid the stat syscall since we can be fairly sure it's a socket |
368 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
369 | } |
370 | } |
371 | |
372 | impl CopyWrite for &TcpStream { |
373 | fn properties(&self) -> CopyParams { |
374 | // avoid the stat syscall since we can be fairly sure it's a socket |
375 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
376 | } |
377 | } |
378 | |
379 | impl CopyRead for UnixStream { |
380 | fn properties(&self) -> CopyParams { |
381 | // avoid the stat syscall since we can be fairly sure it's a socket |
382 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
383 | } |
384 | } |
385 | |
386 | impl CopyRead for &UnixStream { |
387 | fn properties(&self) -> CopyParams { |
388 | // avoid the stat syscall since we can be fairly sure it's a socket |
389 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
390 | } |
391 | } |
392 | |
393 | impl CopyWrite for UnixStream { |
394 | fn properties(&self) -> CopyParams { |
395 | // avoid the stat syscall since we can be fairly sure it's a socket |
396 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
397 | } |
398 | } |
399 | |
400 | impl CopyWrite for &UnixStream { |
401 | fn properties(&self) -> CopyParams { |
402 | // avoid the stat syscall since we can be fairly sure it's a socket |
403 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
404 | } |
405 | } |
406 | |
407 | impl CopyWrite for ChildStdin { |
408 | fn properties(&self) -> CopyParams { |
409 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
410 | } |
411 | } |
412 | |
413 | impl CopyRead for ChildStdout { |
414 | fn properties(&self) -> CopyParams { |
415 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
416 | } |
417 | } |
418 | |
419 | impl CopyRead for ChildStderr { |
420 | fn properties(&self) -> CopyParams { |
421 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
422 | } |
423 | } |
424 | |
425 | impl CopyRead for StdinLock<'_> { |
426 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { |
427 | let buf_reader: &mut BufReader = self.as_mut_buf(); |
428 | let buf: &[u8] = buf_reader.buffer(); |
429 | let buf: &[u8] = &buf[0..min(v1:buf.len(), v2:outer_limit.try_into().unwrap_or(default:usize::MAX))]; |
430 | let bytes_drained: usize = buf.len(); |
431 | writer.write_all(buf)?; |
432 | buf_reader.consume(amt:bytes_drained); |
433 | |
434 | Ok(bytes_drained as u64) |
435 | } |
436 | |
437 | fn properties(&self) -> CopyParams { |
438 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
439 | } |
440 | } |
441 | |
442 | impl CopyWrite for StdoutLock<'_> { |
443 | fn properties(&self) -> CopyParams { |
444 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
445 | } |
446 | } |
447 | |
448 | impl CopyWrite for StderrLock<'_> { |
449 | fn properties(&self) -> CopyParams { |
450 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
451 | } |
452 | } |
453 | |
454 | impl<T: CopyRead> CopyRead for Take<T> { |
455 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { |
456 | let local_limit: u64 = self.limit(); |
457 | let combined_limit: u64 = min(v1:outer_limit, v2:local_limit); |
458 | let bytes_drained: u64 = self.get_mut().drain_to(writer, combined_limit)?; |
459 | // update limit since read() was bypassed |
460 | self.set_limit(local_limit - bytes_drained); |
461 | |
462 | Ok(bytes_drained) |
463 | } |
464 | |
465 | fn taken(&mut self, bytes: u64) { |
466 | self.set_limit(self.limit() - bytes); |
467 | self.get_mut().taken(bytes); |
468 | } |
469 | |
470 | fn min_limit(&self) -> u64 { |
471 | min(v1:Take::limit(self), self.get_ref().min_limit()) |
472 | } |
473 | |
474 | fn properties(&self) -> CopyParams { |
475 | self.get_ref().properties() |
476 | } |
477 | } |
478 | |
479 | impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> { |
480 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { |
481 | let buf = self.buffer(); |
482 | let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; |
483 | let bytes = buf.len(); |
484 | writer.write_all(buf)?; |
485 | self.consume(bytes); |
486 | |
487 | let remaining = outer_limit - bytes as u64; |
488 | |
489 | // in case of nested bufreaders we also need to drain the ones closer to the source |
490 | let inner_bytes = self.get_mut().drain_to(writer, remaining)?; |
491 | |
492 | Ok(bytes as u64 + inner_bytes) |
493 | } |
494 | |
495 | fn taken(&mut self, bytes: u64) { |
496 | self.get_mut().taken(bytes); |
497 | } |
498 | |
499 | fn min_limit(&self) -> u64 { |
500 | self.get_ref().min_limit() |
501 | } |
502 | |
503 | fn properties(&self) -> CopyParams { |
504 | self.get_ref().properties() |
505 | } |
506 | } |
507 | |
508 | impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> { |
509 | fn properties(&self) -> CopyParams { |
510 | self.get_ref().properties() |
511 | } |
512 | } |
513 | |
514 | fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta { |
515 | let fd: i32 = fd.as_raw_fd(); |
516 | let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) }); |
517 | match file.metadata() { |
518 | Ok(meta: Metadata) => FdMeta::Metadata(meta), |
519 | Err(_) => FdMeta::NoneObtained, |
520 | } |
521 | } |
522 | |
/// Outcome of one syscall-based copy attempt.
/// Every variant carries the number of bytes that attempt transferred.
pub(super) enum CopyResult {
    /// The copy ran to completion (end of input or the requested length was reached).
    Ended(u64),
    /// The syscall failed with an error that must be reported to the caller.
    Error(Error, u64),
    /// The syscall cannot be used for these descriptors; another strategy has to take over.
    Fallback(u64),
}
528 | |
529 | impl CopyResult { |
530 | fn update_take(&self, reader: &mut impl CopyRead) { |
531 | match *self { |
532 | CopyResult::Fallback(bytes: u64) |
533 | | CopyResult::Ended(bytes: u64) |
534 | | CopyResult::Error(_, bytes: u64) => reader.taken(bytes), |
535 | } |
536 | } |
537 | } |
538 | |
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
545 | |
546 | /// Linux-specific implementation that will attempt to use copy_file_range for copy offloading. |
547 | /// As the name says, it only works on regular files. |
548 | /// |
549 | /// Callers must handle fallback to a generic copy loop. |
550 | /// `Fallback` may indicate non-zero number of bytes already written |
551 | /// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`). |
552 | pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult { |
553 | use crate::cmp; |
554 | |
555 | const NOT_PROBED: u8 = 0; |
556 | const UNAVAILABLE: u8 = 1; |
557 | const AVAILABLE: u8 = 2; |
558 | |
559 | // Kernel prior to 4.5 don't have copy_file_range |
560 | // We store the availability in a global to avoid unnecessary syscalls |
561 | static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED); |
562 | |
563 | syscall! { |
564 | fn copy_file_range( |
565 | fd_in: libc::c_int, |
566 | off_in: *mut libc::loff_t, |
567 | fd_out: libc::c_int, |
568 | off_out: *mut libc::loff_t, |
569 | len: libc::size_t, |
570 | flags: libc::c_uint |
571 | ) -> libc::ssize_t |
572 | } |
573 | |
574 | match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) { |
575 | NOT_PROBED => { |
576 | // EPERM can indicate seccomp filters or an immutable file. |
577 | // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported |
578 | // and some other error (ENOSYS or EPERM) if it's not available |
579 | let result = unsafe { |
580 | cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0)) |
581 | }; |
582 | |
583 | if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) { |
584 | HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed); |
585 | } else { |
586 | HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed); |
587 | return CopyResult::Fallback(0); |
588 | } |
589 | } |
590 | UNAVAILABLE => return CopyResult::Fallback(0), |
591 | _ => {} |
592 | }; |
593 | |
594 | let mut written = 0u64; |
595 | while written < max_len { |
596 | let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64); |
597 | // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position |
598 | // this allows us to copy large chunks without hitting EOVERFLOW, |
599 | // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required |
600 | let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize); |
601 | let copy_result = unsafe { |
602 | // We actually don't have to adjust the offsets, |
603 | // because copy_file_range adjusts the file offset automatically |
604 | cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0)) |
605 | }; |
606 | |
607 | match copy_result { |
608 | Ok(0) if written == 0 => { |
609 | // fallback to work around several kernel bugs where copy_file_range will fail to |
610 | // copy any bytes and return 0 instead of an error if |
611 | // - reading virtual files from the proc filesystem which appear to have 0 size |
612 | // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19. |
613 | // - copying from an overlay filesystem in docker. reported to occur on fedora 32. |
614 | return CopyResult::Fallback(0); |
615 | } |
616 | Ok(0) => return CopyResult::Ended(written), // reached EOF |
617 | Ok(ret) => written += ret as u64, |
618 | Err(err) => { |
619 | return match err.raw_os_error() { |
620 | // when file offset + max_length > u64::MAX |
621 | Some(EOVERFLOW) => CopyResult::Fallback(written), |
622 | Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => { |
623 | // Try fallback io::copy if either: |
624 | // - Kernel version is < 4.5 (ENOSYS¹) |
625 | // - Files are mounted on different fs (EXDEV) |
626 | // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP) |
627 | // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM) |
628 | // - copy_file_range cannot be used with pipes or device nodes (EINVAL) |
629 | // - the writer fd was opened with O_APPEND (EBADF²) |
630 | // and no bytes were written successfully yet. (All these errnos should |
631 | // not be returned if something was already written, but they happen in |
632 | // the wild, see #91152.) |
633 | // |
634 | // ¹ these cases should be detected by the initial probe but we handle them here |
635 | // anyway in case syscall interception changes during runtime |
636 | // ² actually invalid file descriptors would cause this too, but in that case |
637 | // the fallback code path is expected to encounter the same error again |
638 | CopyResult::Fallback(0) |
639 | } |
640 | _ => CopyResult::Error(err, written), |
641 | }; |
642 | } |
643 | } |
644 | } |
645 | CopyResult::Ended(written) |
646 | } |
647 | |
/// Selects which syscall `sendfile_splice` issues.
#[derive (PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
653 | |
654 | /// performs splice or sendfile between file descriptors |
655 | /// Does _not_ fall back to a generic copy loop. |
656 | fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult { |
657 | static HAS_SENDFILE: AtomicBool = AtomicBool::new(true); |
658 | static HAS_SPLICE: AtomicBool = AtomicBool::new(true); |
659 | |
660 | // Android builds use feature level 14, but the libc wrapper for splice is |
661 | // gated on feature level 21+, so we have to invoke the syscall directly. |
662 | #[cfg (target_os = "android" )] |
663 | syscall! { |
664 | fn splice( |
665 | srcfd: libc::c_int, |
666 | src_offset: *const i64, |
667 | dstfd: libc::c_int, |
668 | dst_offset: *const i64, |
669 | len: libc::size_t, |
670 | flags: libc::c_int |
671 | ) -> libc::ssize_t |
672 | } |
673 | |
674 | #[cfg (target_os = "linux" )] |
675 | use libc::splice; |
676 | |
677 | match mode { |
678 | SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => { |
679 | return CopyResult::Fallback(0); |
680 | } |
681 | SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => { |
682 | return CopyResult::Fallback(0); |
683 | } |
684 | _ => (), |
685 | } |
686 | |
687 | let mut written = 0u64; |
688 | while written < len { |
689 | // according to its manpage that's the maximum size sendfile() will copy per invocation |
690 | let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize; |
691 | |
692 | let result = match mode { |
693 | SpliceMode::Sendfile => { |
694 | cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) }) |
695 | } |
696 | SpliceMode::Splice => cvt(unsafe { |
697 | splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0) |
698 | }), |
699 | }; |
700 | |
701 | match result { |
702 | Ok(0) => break, // EOF |
703 | Ok(ret) => written += ret as u64, |
704 | Err(err) => { |
705 | return match err.raw_os_error() { |
706 | Some(ENOSYS | EPERM) => { |
707 | // syscall not supported (ENOSYS) |
708 | // syscall is disallowed, e.g. by seccomp (EPERM) |
709 | match mode { |
710 | SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed), |
711 | SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed), |
712 | } |
713 | assert_eq!(written, 0); |
714 | CopyResult::Fallback(0) |
715 | } |
716 | Some(EINVAL) => { |
717 | // splice/sendfile do not support this particular file descriptor (EINVAL) |
718 | assert_eq!(written, 0); |
719 | CopyResult::Fallback(0) |
720 | } |
721 | Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => { |
722 | CopyResult::Fallback(written) |
723 | } |
724 | _ => CopyResult::Error(err, written), |
725 | }; |
726 | } |
727 | } |
728 | } |
729 | CopyResult::Ended(written) |
730 | } |
731 | |