1 | //! This module contains specializations that can offload `io::copy()` operations on file descriptor |
---|---|
2 | //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`. |
3 | //! |
4 | //! Specialization is only applied to wholly std-owned types so that user code can't observe |
5 | //! that the `Read` and `Write` traits are not used. |
6 | //! |
7 | //! Since a copy operation involves a reader and writer side where each can consist of different types |
8 | //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize |
9 | //! a single method on all possible combinations. |
10 | //! |
11 | //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization |
12 | //! traits and then specialized on by the `Copier::copy` method. |
13 | //! |
14 | //! `Copier` uses the specialization traits to unpack the underlying file descriptors and |
15 | //! additional prerequisites and constraints imposed by the wrapper types. |
16 | //! |
17 | //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they |
18 | //! can be safely bypassed it will attempt to use the `copy_file_range(2)`, |
19 | //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors. |
20 | //! Since those syscalls have requirements that cannot be fully checked in advance it attempts |
21 | //! to use them one after another (guided by hints) to figure out which one works and |
22 | //! falls back to the generic read-write copy loop if none of them does. |
23 | //! Once a working syscall is found for a pair of file descriptors it will be called in a loop |
24 | //! until the copy operation is completed. |
25 | //! |
26 | //! Advantages of using these syscalls: |
27 | //! |
28 | //! * fewer context switches since reads and writes are coalesced into a single syscall |
29 | //! and more bytes are transferred per syscall. This translates to higher throughput |
30 | //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing. |
31 | //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and |
32 | //! consuming less disk space |
33 | //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while |
34 | //! a naive copy loop would move every byte through the CPU. |
35 | //! |
36 | //! Drawbacks: |
37 | //! |
38 | //! * copy operations smaller than the default buffer size can under some circumstances, especially |
39 | //! on older kernels, incur more syscalls than the naive approach would. As mentioned above |
40 | //! the syscall selection is guided by hints to minimize this possibility but they are not perfect. |
41 | //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report |
42 | //! progress, they can hit a performance cliff. |
43 | //! * complexity |
44 | |
45 | #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd")))] |
46 | use libc::sendfile as sendfile64; |
47 | #[cfg(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd"))] |
48 | use libc::sendfile64; |
49 | use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV}; |
50 | |
51 | use crate::cmp::min; |
52 | use crate::fs::{File, Metadata}; |
53 | use crate::io::copy::generic_copy; |
54 | use crate::io::{ |
55 | BufRead, BufReader, BufWriter, Error, PipeReader, PipeWriter, Read, Result, StderrLock, |
56 | StdinLock, StdoutLock, Take, Write, |
57 | }; |
58 | use crate::mem::ManuallyDrop; |
59 | use crate::net::TcpStream; |
60 | use crate::os::unix::fs::FileTypeExt; |
61 | use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd}; |
62 | use crate::os::unix::net::UnixStream; |
63 | use crate::process::{ChildStderr, ChildStdin, ChildStdout}; |
64 | use crate::ptr; |
65 | use crate::sync::atomic::{Atomic, AtomicBool, AtomicU8, Ordering}; |
66 | use crate::sys::cvt; |
67 | use crate::sys::fs::CachedFileMetadata; |
68 | use crate::sys::weak::syscall; |
69 | |
70 | #[cfg(test)] |
71 | mod tests; |
72 | |
73 | pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>( |
74 | read: &mut R, |
75 | write: &mut W, |
76 | ) -> Result<u64> { |
77 | let copier: Copier<'_, '_, R, W> = Copier { read, write }; |
78 | SpecCopy::copy(self:copier) |
79 | } |
80 | |
/// This type represents either the inferred `FileType` of a `RawFd` based on the source
/// type from which it was extracted or the actual metadata
///
/// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
/// type may be wrong.
enum FdMeta {
    /// Actual metadata obtained for the file descriptor.
    Metadata(Metadata),
    /// Inferred from the source type being a socket type; no stat was performed.
    Socket,
    /// Inferred from the source type being a pipe type; no stat was performed.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
93 | |
/// Which side of the copy operation a file descriptor is used on.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor is the source being read from.
    Input,
    /// The descriptor is the sink being written to.
    Output,
}
99 | |
100 | impl FdMeta { |
101 | fn maybe_fifo(&self) -> bool { |
102 | match self { |
103 | FdMeta::Metadata(meta) => meta.file_type().is_fifo(), |
104 | FdMeta::Socket => false, |
105 | FdMeta::Pipe => true, |
106 | FdMeta::NoneObtained => true, |
107 | } |
108 | } |
109 | |
110 | fn potential_sendfile_source(&self) -> bool { |
111 | match self { |
112 | // procfs erroneously shows 0 length on non-empty readable files. |
113 | // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall |
114 | // thus there would be benefit from attempting sendfile |
115 | FdMeta::Metadata(meta) |
116 | if meta.file_type().is_file() && meta.len() > 0 |
117 | || meta.file_type().is_block_device() => |
118 | { |
119 | true |
120 | } |
121 | _ => false, |
122 | } |
123 | } |
124 | |
125 | fn copy_file_range_candidate(&self, f: FdHandle) -> bool { |
126 | match self { |
127 | // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached |
128 | // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range |
129 | FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => { |
130 | true |
131 | } |
132 | FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true, |
133 | _ => false, |
134 | } |
135 | } |
136 | } |
137 | |
138 | /// Returns true either if changes made to the source after a sendfile/splice call won't become |
139 | /// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing |
140 | /// a file into a pipe, the pipe being the source in this case). |
141 | /// |
142 | /// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold |
143 | /// the Read/Write API semantics of io::copy. |
144 | /// |
145 | /// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a |
146 | /// regular file into a TcpSocket which will be treated as a socket here without checking. |
147 | fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool { |
148 | match (source, sink) { |
149 | // Data arriving from a socket is safe because the sender can't modify the socket buffer. |
150 | // Data arriving from a pipe is safe(-ish) because either the sender *copied* |
151 | // the bytes into the pipe OR explicitly performed an operation that enables zero-copy, |
152 | // thus promising not to modify the data later. |
153 | (FdMeta::Socket, _) => true, |
154 | (FdMeta::Pipe, _) => true, |
155 | (FdMeta::Metadata(meta: &Metadata), _) |
156 | if meta.file_type().is_fifo() || meta.file_type().is_socket() => |
157 | { |
158 | true |
159 | } |
160 | // Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue |
161 | // only happens for pages sitting in send buffers or pipes. |
162 | (_, FdMeta::Metadata(meta: &Metadata)) |
163 | if !meta.file_type().is_fifo() && !meta.file_type().is_socket() => |
164 | { |
165 | true |
166 | } |
167 | _ => false, |
168 | } |
169 | } |
170 | |
/// Result of `properties()`: the type hint/metadata for one side of the copy,
/// together with its raw file descriptor if one is available.
struct CopyParams(FdMeta, Option<RawFd>);
172 | |
/// Pairs the reader and writer of one copy operation so that `SpecCopy` can be
/// specialized on the combination of both types.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source side of the copy.
    read: &'a mut R,
    /// Sink side of the copy.
    write: &'b mut W,
}
177 | |
/// Specialization point for the copy operation; implemented for `Copier`.
trait SpecCopy {
    /// Performs the copy, consuming the copier, and returns the number of bytes copied.
    fn copy(self) -> Result<u64>;
}
181 | |
182 | impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> { |
183 | default fn copy(self) -> Result<u64> { |
184 | generic_copy(self.read, self.write) |
185 | } |
186 | } |
187 | |
188 | impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> { |
189 | fn copy(self) -> Result<u64> { |
190 | let (reader, writer) = (self.read, self.write); |
191 | let r_cfg = reader.properties(); |
192 | let w_cfg = writer.properties(); |
193 | |
194 | // before direct operations on file descriptors ensure that all source and sink buffers are empty |
195 | let mut flush = || -> Result<u64> { |
196 | let bytes = reader.drain_to(writer, u64::MAX)?; |
197 | // BufWriter buffered bytes have already been accounted for in earlier write() calls |
198 | writer.flush()?; |
199 | Ok(bytes) |
200 | }; |
201 | |
202 | let mut written = 0u64; |
203 | |
204 | if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) = |
205 | (r_cfg, w_cfg) |
206 | { |
207 | written += flush()?; |
208 | let max_write = reader.min_limit(); |
209 | |
210 | if input_meta.copy_file_range_candidate(FdHandle::Input) |
211 | && output_meta.copy_file_range_candidate(FdHandle::Output) |
212 | { |
213 | let result = copy_regular_files(readfd, writefd, max_write); |
214 | result.update_take(reader); |
215 | |
216 | match result { |
217 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), |
218 | CopyResult::Error(e, _) => return Err(e), |
219 | CopyResult::Fallback(bytes) => written += bytes, |
220 | } |
221 | } |
222 | |
223 | // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices) |
224 | // to any writable file descriptor. On older kernels the writer side can only be a socket. |
225 | // So we just try and fallback if needed. |
226 | // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead |
227 | // fall back to the generic copy loop. |
228 | if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta) |
229 | { |
230 | let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write); |
231 | result.update_take(reader); |
232 | |
233 | match result { |
234 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), |
235 | CopyResult::Error(e, _) => return Err(e), |
236 | CopyResult::Fallback(bytes) => written += bytes, |
237 | } |
238 | } |
239 | |
240 | if (input_meta.maybe_fifo() || output_meta.maybe_fifo()) |
241 | && safe_kernel_copy(&input_meta, &output_meta) |
242 | { |
243 | let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write); |
244 | result.update_take(reader); |
245 | |
246 | match result { |
247 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), |
248 | CopyResult::Error(e, _) => return Err(e), |
249 | CopyResult::Fallback(0) => { /* use the fallback below */ } |
250 | CopyResult::Fallback(_) => { |
251 | unreachable!("splice should not return > 0 bytes on the fallback path") |
252 | } |
253 | } |
254 | } |
255 | } |
256 | |
257 | // fallback if none of the more specialized syscalls wants to work with these file descriptors |
258 | match generic_copy(reader, writer) { |
259 | Ok(bytes) => Ok(bytes + written), |
260 | err => err, |
261 | } |
262 | } |
263 | } |
264 | |
/// Specialization trait for the reader side of a copy operation.
///
/// All methods except `properties` have no-op defaults suitable for types without
/// buffers or `Take` limits.
#[rustc_specialization_trait]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// Returns the number of bytes transferred; the default transfers nothing.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data in `BufReader` buffers and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
292 | |
/// Specialization trait for the writer side of a copy operation.
#[rustc_specialization_trait]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
298 | |
299 | impl<T> CopyRead for &mut T |
300 | where |
301 | T: CopyRead, |
302 | { |
303 | fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> { |
304 | (**self).drain_to(writer, limit) |
305 | } |
306 | |
307 | fn taken(&mut self, bytes: u64) { |
308 | (**self).taken(bytes); |
309 | } |
310 | |
311 | fn min_limit(&self) -> u64 { |
312 | (**self).min_limit() |
313 | } |
314 | |
315 | fn properties(&self) -> CopyParams { |
316 | (**self).properties() |
317 | } |
318 | } |
319 | |
320 | impl<T> CopyWrite for &mut T |
321 | where |
322 | T: CopyWrite, |
323 | { |
324 | fn properties(&self) -> CopyParams { |
325 | (**self).properties() |
326 | } |
327 | } |
328 | |
329 | impl CopyRead for File { |
330 | fn properties(&self) -> CopyParams { |
331 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
332 | } |
333 | } |
334 | |
335 | impl CopyRead for &File { |
336 | fn properties(&self) -> CopyParams { |
337 | CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) |
338 | } |
339 | } |
340 | |
341 | impl CopyWrite for File { |
342 | fn properties(&self) -> CopyParams { |
343 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
344 | } |
345 | } |
346 | |
347 | impl CopyWrite for &File { |
348 | fn properties(&self) -> CopyParams { |
349 | CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) |
350 | } |
351 | } |
352 | |
353 | impl CopyRead for TcpStream { |
354 | fn properties(&self) -> CopyParams { |
355 | // avoid the stat syscall since we can be fairly sure it's a socket |
356 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
357 | } |
358 | } |
359 | |
360 | impl CopyRead for &TcpStream { |
361 | fn properties(&self) -> CopyParams { |
362 | // avoid the stat syscall since we can be fairly sure it's a socket |
363 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
364 | } |
365 | } |
366 | |
367 | impl CopyWrite for TcpStream { |
368 | fn properties(&self) -> CopyParams { |
369 | // avoid the stat syscall since we can be fairly sure it's a socket |
370 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
371 | } |
372 | } |
373 | |
374 | impl CopyWrite for &TcpStream { |
375 | fn properties(&self) -> CopyParams { |
376 | // avoid the stat syscall since we can be fairly sure it's a socket |
377 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
378 | } |
379 | } |
380 | |
381 | impl CopyRead for UnixStream { |
382 | fn properties(&self) -> CopyParams { |
383 | // avoid the stat syscall since we can be fairly sure it's a socket |
384 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
385 | } |
386 | } |
387 | |
388 | impl CopyRead for &UnixStream { |
389 | fn properties(&self) -> CopyParams { |
390 | // avoid the stat syscall since we can be fairly sure it's a socket |
391 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
392 | } |
393 | } |
394 | |
395 | impl CopyWrite for UnixStream { |
396 | fn properties(&self) -> CopyParams { |
397 | // avoid the stat syscall since we can be fairly sure it's a socket |
398 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
399 | } |
400 | } |
401 | |
402 | impl CopyWrite for &UnixStream { |
403 | fn properties(&self) -> CopyParams { |
404 | // avoid the stat syscall since we can be fairly sure it's a socket |
405 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) |
406 | } |
407 | } |
408 | |
409 | impl CopyRead for PipeReader { |
410 | fn properties(&self) -> CopyParams { |
411 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
412 | } |
413 | } |
414 | |
415 | impl CopyRead for &PipeReader { |
416 | fn properties(&self) -> CopyParams { |
417 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
418 | } |
419 | } |
420 | |
421 | impl CopyWrite for PipeWriter { |
422 | fn properties(&self) -> CopyParams { |
423 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
424 | } |
425 | } |
426 | |
427 | impl CopyWrite for &PipeWriter { |
428 | fn properties(&self) -> CopyParams { |
429 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
430 | } |
431 | } |
432 | |
433 | impl CopyWrite for ChildStdin { |
434 | fn properties(&self) -> CopyParams { |
435 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
436 | } |
437 | } |
438 | |
439 | impl CopyRead for ChildStdout { |
440 | fn properties(&self) -> CopyParams { |
441 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
442 | } |
443 | } |
444 | |
445 | impl CopyRead for ChildStderr { |
446 | fn properties(&self) -> CopyParams { |
447 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) |
448 | } |
449 | } |
450 | |
451 | impl CopyRead for StdinLock<'_> { |
452 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { |
453 | let buf_reader: &mut BufReader |
454 | let buf: &[u8] = buf_reader.buffer(); |
455 | let buf: &[u8] = &buf[0..min(v1:buf.len(), v2:outer_limit.try_into().unwrap_or(default:usize::MAX))]; |
456 | let bytes_drained: usize = buf.len(); |
457 | writer.write_all(buf)?; |
458 | buf_reader.consume(amount:bytes_drained); |
459 | |
460 | Ok(bytes_drained as u64) |
461 | } |
462 | |
463 | fn properties(&self) -> CopyParams { |
464 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
465 | } |
466 | } |
467 | |
468 | impl CopyWrite for StdoutLock<'_> { |
469 | fn properties(&self) -> CopyParams { |
470 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
471 | } |
472 | } |
473 | |
474 | impl CopyWrite for StderrLock<'_> { |
475 | fn properties(&self) -> CopyParams { |
476 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) |
477 | } |
478 | } |
479 | |
480 | impl<T: CopyRead> CopyRead for Take<T> { |
481 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { |
482 | let local_limit: u64 = self.limit(); |
483 | let combined_limit: u64 = min(v1:outer_limit, v2:local_limit); |
484 | let bytes_drained: u64 = self.get_mut().drain_to(writer, combined_limit)?; |
485 | // update limit since read() was bypassed |
486 | self.set_limit(local_limit - bytes_drained); |
487 | |
488 | Ok(bytes_drained) |
489 | } |
490 | |
491 | fn taken(&mut self, bytes: u64) { |
492 | self.set_limit(self.limit() - bytes); |
493 | self.get_mut().taken(bytes); |
494 | } |
495 | |
496 | fn min_limit(&self) -> u64 { |
497 | min(v1:Take::limit(self), self.get_ref().min_limit()) |
498 | } |
499 | |
500 | fn properties(&self) -> CopyParams { |
501 | self.get_ref().properties() |
502 | } |
503 | } |
504 | |
505 | impl<T: ?Sized + CopyRead> CopyRead for BufReader<T> { |
506 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { |
507 | let buf = self.buffer(); |
508 | let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; |
509 | let bytes = buf.len(); |
510 | writer.write_all(buf)?; |
511 | self.consume(bytes); |
512 | |
513 | let remaining = outer_limit - bytes as u64; |
514 | |
515 | // in case of nested bufreaders we also need to drain the ones closer to the source |
516 | let inner_bytes = self.get_mut().drain_to(writer, remaining)?; |
517 | |
518 | Ok(bytes as u64 + inner_bytes) |
519 | } |
520 | |
521 | fn taken(&mut self, bytes: u64) { |
522 | self.get_mut().taken(bytes); |
523 | } |
524 | |
525 | fn min_limit(&self) -> u64 { |
526 | self.get_ref().min_limit() |
527 | } |
528 | |
529 | fn properties(&self) -> CopyParams { |
530 | self.get_ref().properties() |
531 | } |
532 | } |
533 | |
534 | impl<T: ?Sized + CopyWrite> CopyWrite for BufWriter<T> { |
535 | fn properties(&self) -> CopyParams { |
536 | self.get_ref().properties() |
537 | } |
538 | } |
539 | |
540 | impl CopyRead for CachedFileMetadata { |
541 | fn properties(&self) -> CopyParams { |
542 | CopyParams(FdMeta::Metadata(self.1.clone()), Some(self.0.as_raw_fd())) |
543 | } |
544 | } |
545 | |
546 | impl CopyWrite for CachedFileMetadata { |
547 | fn properties(&self) -> CopyParams { |
548 | CopyParams(FdMeta::Metadata(self.1.clone()), Some(self.0.as_raw_fd())) |
549 | } |
550 | } |
551 | |
552 | fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta { |
553 | let fd: i32 = fd.as_raw_fd(); |
554 | let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) }); |
555 | match file.metadata() { |
556 | Ok(meta: Metadata) => FdMeta::Metadata(meta), |
557 | Err(_) => FdMeta::NoneObtained, |
558 | } |
559 | } |
560 | |
/// Outcome of one of the syscall-based copy attempts.
///
/// Every variant carries the number of bytes that attempt has already moved.
pub(super) enum CopyResult {
    /// The copy finished (end of input or the requested length was reached).
    Ended(u64),
    /// The syscall failed with an error that is reported to the caller.
    Error(Error, u64),
    /// The syscall is not usable for these descriptors; the caller must continue
    /// with another strategy.
    Fallback(u64),
}
566 | |
567 | impl CopyResult { |
568 | fn update_take(&self, reader: &mut impl CopyRead) { |
569 | match *self { |
570 | CopyResult::Fallback(bytes: u64) |
571 | | CopyResult::Ended(bytes: u64) |
572 | | CopyResult::Error(_, bytes: u64) => reader.taken(bytes), |
573 | } |
574 | } |
575 | } |
576 | |
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
583 | |
/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
/// As the name says, it only works on regular files.
///
/// Copies at most `max_len` bytes from `reader` to `writer`, using and advancing the
/// descriptors' own file offsets.
///
/// Callers must handle fallback to a generic copy loop.
/// `Fallback` may indicate non-zero number of bytes already written
/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
    use crate::cmp;

    // States of the process-wide availability cache below.
    const NOT_PROBED: u8 = 0;
    const UNAVAILABLE: u8 = 1;
    const AVAILABLE: u8 = 2;

    // Kernel prior to 4.5 don't have copy_file_range
    // We store the availability in a global to avoid unnecessary syscalls
    static HAS_COPY_FILE_RANGE: Atomic<u8> = AtomicU8::new(NOT_PROBED);

    // Bail out early if an earlier call already determined that the syscall is unavailable.
    let mut have_probed = match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
        NOT_PROBED => false,
        UNAVAILABLE => return CopyResult::Fallback(0),
        _ => true,
    };

    syscall!(
        fn copy_file_range(
            fd_in: libc::c_int,
            off_in: *mut libc::loff_t,
            fd_out: libc::c_int,
            off_out: *mut libc::loff_t,
            len: libc::size_t,
            flags: libc::c_uint,
        ) -> libc::ssize_t;
    );

    fn probe_copy_file_range_support() -> u8 {
        // In some cases, we cannot determine availability from the first
        // `copy_file_range` call. In this case, we probe with an invalid file
        // descriptor so that the results are easily interpretable.
        match unsafe {
            cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0))
                .map_err(|e| e.raw_os_error())
        } {
            Err(Some(EPERM | ENOSYS)) => UNAVAILABLE,
            Err(Some(EBADF)) => AVAILABLE,
            Ok(_) => panic!("unexpected copy_file_range probe success"),
            // Treat other errors as the syscall
            // being unavailable.
            Err(_) => UNAVAILABLE,
        }
    }

    let mut written = 0u64;
    while written < max_len {
        let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
        // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
        // this allows us to copy large chunks without hitting EOVERFLOW,
        // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
        let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
        let copy_result = unsafe {
            // We actually don't have to adjust the offsets,
            // because copy_file_range adjusts the file offset automatically
            cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0))
        };

        // A successful call proves the syscall exists; remember that for later calls.
        if !have_probed && copy_result.is_ok() {
            have_probed = true;
            HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
        }

        match copy_result {
            Ok(0) if written == 0 => {
                // fallback to work around several kernel bugs where copy_file_range will fail to
                // copy any bytes and return 0 instead of an error if
                // - reading virtual files from the proc filesystem which appear to have 0 size
                //   but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
                // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
                return CopyResult::Fallback(0);
            }
            Ok(0) => return CopyResult::Ended(written), // reached EOF
            Ok(ret) => written += ret as u64,
            Err(err) => {
                return match err.raw_os_error() {
                    // when file offset + max_length > u64::MAX
                    Some(EOVERFLOW) => CopyResult::Fallback(written),
                    Some(raw_os_error @ (ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF))
                        if written == 0 =>
                    {
                        if !have_probed {
                            let available = if matches!(raw_os_error, ENOSYS | EOPNOTSUPP | EPERM) {
                                // EPERM can indicate seccomp filters or an
                                // immutable file. To distinguish these
                                // cases we probe with invalid file
                                // descriptors which should result in EBADF
                                // if the syscall is supported and EPERM or
                                // ENOSYS if it's not available.
                                //
                                // For EOPNOTSUPP, see below. In the case of
                                // ENOSYS, we try to cover for faulty FUSE
                                // drivers.
                                probe_copy_file_range_support()
                            } else {
                                AVAILABLE
                            };
                            HAS_COPY_FILE_RANGE.store(available, Ordering::Relaxed);
                        }

                        // Try fallback io::copy if either:
                        // - Kernel version is < 4.5 (ENOSYS¹)
                        // - Files are mounted on different fs (EXDEV)
                        // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
                        // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
                        // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
                        // - the writer fd was opened with O_APPEND (EBADF²)
                        // and no bytes were written successfully yet. (All these errnos should
                        // not be returned if something was already written, but they happen in
                        // the wild, see #91152.)
                        //
                        // ¹ these cases should be detected by the initial probe but we handle them here
                        //   anyway in case syscall interception changes during runtime
                        // ² actually invalid file descriptors would cause this too, but in that case
                        //   the fallback code path is expected to encounter the same error again
                        CopyResult::Fallback(0)
                    }
                    // Any other error, or one of the above after bytes were already written,
                    // is reported together with the byte count so far.
                    _ => CopyResult::Error(err, written),
                };
            }
        }
    }
    // `max_len` bytes have been copied.
    CopyResult::Ended(written)
}
714 | |
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
720 | |
/// Performs splice or sendfile between file descriptors, selected by `mode`.
/// Does _not_ fall back to a generic copy loop.
///
/// Copies at most `len` bytes from `reader` to `writer` in chunks, using the descriptors'
/// own file offsets. Returns `Ended` on EOF or once `len` bytes were copied, `Fallback`
/// when the syscall is unavailable or does not support these descriptors, and `Error`
/// for every other failure.
fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
    // Process-wide flags, cleared once the respective syscall reported ENOSYS or EPERM.
    static HAS_SENDFILE: Atomic<bool> = AtomicBool::new(true);
    static HAS_SPLICE: Atomic<bool> = AtomicBool::new(true);

    // Android builds use feature level 14, but the libc wrapper for splice is
    // gated on feature level 21+, so we have to invoke the syscall directly.
    #[cfg(target_os = "android")]
    syscall!(
        fn splice(
            srcfd: libc::c_int,
            src_offset: *const i64,
            dstfd: libc::c_int,
            dst_offset: *const i64,
            len: libc::size_t,
            flags: libc::c_int,
        ) -> libc::ssize_t;
    );

    #[cfg(target_os = "linux")]
    use libc::splice;

    // Skip the syscall entirely if an earlier call found it to be unavailable.
    match mode {
        SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
            return CopyResult::Fallback(0);
        }
        SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
            return CopyResult::Fallback(0);
        }
        _ => (),
    }

    let mut written = 0u64;
    while written < len {
        // according to its manpage that's the maximum size sendfile() will copy per invocation
        let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;

        let result = match mode {
            SpliceMode::Sendfile => {
                // note the argument order: sendfile takes the output descriptor first
                cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
            }
            SpliceMode::Splice => cvt(unsafe {
                splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
            }),
        };

        match result {
            Ok(0) => break, // EOF
            Ok(ret) => written += ret as u64,
            Err(err) => {
                return match err.raw_os_error() {
                    Some(ENOSYS | EPERM) => {
                        // syscall not supported (ENOSYS)
                        // syscall is disallowed, e.g. by seccomp (EPERM)
                        match mode {
                            SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
                            SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
                        }
                        // panics if this error shows up after bytes were already copied
                        assert_eq!(written, 0);
                        CopyResult::Fallback(0)
                    }
                    Some(EINVAL) => {
                        // splice/sendfile do not support this particular file descriptor (EINVAL)
                        assert_eq!(written, 0);
                        CopyResult::Fallback(0)
                    }
                    // only sendfile may hand back a partial byte count on the fallback path
                    Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
                        CopyResult::Fallback(written)
                    }
                    _ => CopyResult::Error(err, written),
                };
            }
        }
    }
    CopyResult::Ended(written)
}
798 |
Definitions
- copy_spec
- FdMeta
- Metadata
- Socket
- Pipe
- NoneObtained
- FdHandle
- Input
- Output
- maybe_fifo
- potential_sendfile_source
- copy_file_range_candidate
- safe_kernel_copy
- CopyParams
- Copier
- read
- write
- SpecCopy
- copy
- copy
- copy
- CopyRead
- drain_to
- taken
- min_limit
- properties
- CopyWrite
- properties
- drain_to
- taken
- min_limit
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- properties
- drain_to
- properties
- properties
- properties
- drain_to
- taken
- min_limit
- properties
- drain_to
- taken
- min_limit
- properties
- properties
- properties
- properties
- fd_to_meta
- CopyResult
- Ended
- Error
- Fallback
- update_take
- copy_regular_files
- copy_file_range
- probe_copy_file_range_support
- SpliceMode
- Sendfile
- Splice
- sendfile_splice
Learn Rust with the experts
Find out more