1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #include <linux/slab.h> |
3 | #include <linux/stat.h> |
4 | #include <linux/sched/xacct.h> |
5 | #include <linux/fcntl.h> |
6 | #include <linux/file.h> |
7 | #include <linux/uio.h> |
8 | #include <linux/fsnotify.h> |
9 | #include <linux/security.h> |
10 | #include <linux/export.h> |
11 | #include <linux/syscalls.h> |
12 | #include <linux/pagemap.h> |
13 | #include <linux/splice.h> |
14 | #include <linux/compat.h> |
15 | #include <linux/mount.h> |
16 | #include <linux/fs.h> |
17 | #include <linux/dax.h> |
18 | #include <linux/overflow.h> |
19 | #include "internal.h" |
20 | |
21 | #include <linux/uaccess.h> |
22 | #include <asm/unistd.h> |
23 | |
24 | /* |
25 | * Performs necessary checks before doing a clone. |
26 | * |
27 | * Can adjust amount of bytes to clone via @req_count argument. |
28 | * Returns appropriate error code that caller should return or |
29 | * zero in case the clone should be allowed. |
30 | */ |
31 | static int generic_remap_checks(struct file *file_in, loff_t pos_in, |
32 | struct file *file_out, loff_t pos_out, |
33 | loff_t *req_count, unsigned int remap_flags) |
34 | { |
35 | struct inode *inode_in = file_in->f_mapping->host; |
36 | struct inode *inode_out = file_out->f_mapping->host; |
37 | uint64_t count = *req_count; |
38 | uint64_t bcount; |
39 | loff_t size_in, size_out; |
40 | loff_t bs = inode_out->i_sb->s_blocksize; |
41 | int ret; |
42 | |
43 | /* The start of both ranges must be aligned to an fs block. */ |
44 | if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) |
45 | return -EINVAL; |
46 | |
47 | /* Ensure offsets don't wrap. */ |
48 | if (pos_in + count < pos_in || pos_out + count < pos_out) |
49 | return -EINVAL; |
50 | |
51 | size_in = i_size_read(inode: inode_in); |
52 | size_out = i_size_read(inode: inode_out); |
53 | |
54 | /* Dedupe requires both ranges to be within EOF. */ |
55 | if ((remap_flags & REMAP_FILE_DEDUP) && |
56 | (pos_in >= size_in || pos_in + count > size_in || |
57 | pos_out >= size_out || pos_out + count > size_out)) |
58 | return -EINVAL; |
59 | |
60 | /* Ensure the infile range is within the infile. */ |
61 | if (pos_in >= size_in) |
62 | return -EINVAL; |
63 | count = min(count, size_in - (uint64_t)pos_in); |
64 | |
65 | ret = generic_write_check_limits(file: file_out, pos: pos_out, count: &count); |
66 | if (ret) |
67 | return ret; |
68 | |
69 | /* |
70 | * If the user wanted us to link to the infile's EOF, round up to the |
71 | * next block boundary for this check. |
72 | * |
73 | * Otherwise, make sure the count is also block-aligned, having |
74 | * already confirmed the starting offsets' block alignment. |
75 | */ |
76 | if (pos_in + count == size_in && |
77 | (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { |
78 | bcount = ALIGN(size_in, bs) - pos_in; |
79 | } else { |
80 | if (!IS_ALIGNED(count, bs)) |
81 | count = ALIGN_DOWN(count, bs); |
82 | bcount = count; |
83 | } |
84 | |
85 | /* Don't allow overlapped cloning within the same file. */ |
86 | if (inode_in == inode_out && |
87 | pos_out + bcount > pos_in && |
88 | pos_out < pos_in + bcount) |
89 | return -EINVAL; |
90 | |
91 | /* |
92 | * We shortened the request but the caller can't deal with that, so |
93 | * bounce the request back to userspace. |
94 | */ |
95 | if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) |
96 | return -EINVAL; |
97 | |
98 | *req_count = count; |
99 | return 0; |
100 | } |
101 | |
102 | static int remap_verify_area(struct file *file, loff_t pos, loff_t len, |
103 | bool write) |
104 | { |
105 | loff_t tmp; |
106 | |
107 | if (unlikely(pos < 0 || len < 0)) |
108 | return -EINVAL; |
109 | |
110 | if (unlikely(check_add_overflow(pos, len, &tmp))) |
111 | return -EINVAL; |
112 | |
113 | return security_file_permission(file, mask: write ? MAY_WRITE : MAY_READ); |
114 | } |
115 | |
116 | /* |
117 | * Ensure that we don't remap a partial EOF block in the middle of something |
118 | * else. Assume that the offsets have already been checked for block |
119 | * alignment. |
120 | * |
121 | * For clone we only link a partial EOF block above or at the destination file's |
122 | * EOF. For deduplication we accept a partial EOF block only if it ends at the |
123 | * destination file's EOF (can not link it into the middle of a file). |
124 | * |
125 | * Shorten the request if possible. |
126 | */ |
127 | static int generic_remap_check_len(struct inode *inode_in, |
128 | struct inode *inode_out, |
129 | loff_t pos_out, |
130 | loff_t *len, |
131 | unsigned int remap_flags) |
132 | { |
133 | u64 blkmask = i_blocksize(node: inode_in) - 1; |
134 | loff_t new_len = *len; |
135 | |
136 | if ((*len & blkmask) == 0) |
137 | return 0; |
138 | |
139 | if (pos_out + *len < i_size_read(inode: inode_out)) |
140 | new_len &= ~blkmask; |
141 | |
142 | if (new_len == *len) |
143 | return 0; |
144 | |
145 | if (remap_flags & REMAP_FILE_CAN_SHORTEN) { |
146 | *len = new_len; |
147 | return 0; |
148 | } |
149 | |
150 | return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; |
151 | } |
152 | |
153 | /* Read a page's worth of file data into the page cache. */ |
154 | static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos) |
155 | { |
156 | return read_mapping_folio(mapping: file->f_mapping, index: pos >> PAGE_SHIFT, file); |
157 | } |
158 | |
159 | /* |
160 | * Lock two folios, ensuring that we lock in offset order if the folios |
161 | * are from the same file. |
162 | */ |
163 | static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2) |
164 | { |
165 | /* Always lock in order of increasing index. */ |
166 | if (folio1->index > folio2->index) |
167 | swap(folio1, folio2); |
168 | |
169 | folio_lock(folio: folio1); |
170 | if (folio1 != folio2) |
171 | folio_lock(folio: folio2); |
172 | } |
173 | |
/*
 * Unlock a pair of folios.  When both arguments name the same folio it is
 * unlocked exactly once.
 */
static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
{
	folio_unlock(folio1);
	if (folio2 == folio1)
		return;
	folio_unlock(folio2);
}
181 | |
182 | /* |
183 | * Compare extents of two files to see if they are the same. |
184 | * Caller must have locked both inodes to prevent write races. |
185 | */ |
186 | static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff, |
187 | struct file *dest, loff_t dstoff, |
188 | loff_t len, bool *is_same) |
189 | { |
190 | bool same = true; |
191 | int error = -EINVAL; |
192 | |
193 | while (len) { |
194 | struct folio *src_folio, *dst_folio; |
195 | void *src_addr, *dst_addr; |
196 | loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff), |
197 | PAGE_SIZE - offset_in_page(dstoff)); |
198 | |
199 | cmp_len = min(cmp_len, len); |
200 | if (cmp_len <= 0) |
201 | goto out_error; |
202 | |
203 | src_folio = vfs_dedupe_get_folio(file: src, pos: srcoff); |
204 | if (IS_ERR(ptr: src_folio)) { |
205 | error = PTR_ERR(ptr: src_folio); |
206 | goto out_error; |
207 | } |
208 | dst_folio = vfs_dedupe_get_folio(file: dest, pos: dstoff); |
209 | if (IS_ERR(ptr: dst_folio)) { |
210 | error = PTR_ERR(ptr: dst_folio); |
211 | folio_put(folio: src_folio); |
212 | goto out_error; |
213 | } |
214 | |
215 | vfs_lock_two_folios(folio1: src_folio, folio2: dst_folio); |
216 | |
217 | /* |
218 | * Now that we've locked both folios, make sure they're still |
219 | * mapped to the file data we're interested in. If not, |
220 | * someone is invalidating pages on us and we lose. |
221 | */ |
222 | if (!folio_test_uptodate(folio: src_folio) || !folio_test_uptodate(folio: dst_folio) || |
223 | src_folio->mapping != src->f_mapping || |
224 | dst_folio->mapping != dest->f_mapping) { |
225 | same = false; |
226 | goto unlock; |
227 | } |
228 | |
229 | src_addr = kmap_local_folio(folio: src_folio, |
230 | offset_in_folio(src_folio, srcoff)); |
231 | dst_addr = kmap_local_folio(folio: dst_folio, |
232 | offset_in_folio(dst_folio, dstoff)); |
233 | |
234 | flush_dcache_folio(folio: src_folio); |
235 | flush_dcache_folio(folio: dst_folio); |
236 | |
237 | if (memcmp(p: src_addr, q: dst_addr, size: cmp_len)) |
238 | same = false; |
239 | |
240 | kunmap_local(dst_addr); |
241 | kunmap_local(src_addr); |
242 | unlock: |
243 | vfs_unlock_two_folios(folio1: src_folio, folio2: dst_folio); |
244 | folio_put(folio: dst_folio); |
245 | folio_put(folio: src_folio); |
246 | |
247 | if (!same) |
248 | break; |
249 | |
250 | srcoff += cmp_len; |
251 | dstoff += cmp_len; |
252 | len -= cmp_len; |
253 | } |
254 | |
255 | *is_same = same; |
256 | return 0; |
257 | |
258 | out_error: |
259 | return error; |
260 | } |
261 | |
262 | /* |
263 | * Check that the two inodes are eligible for cloning, the ranges make |
264 | * sense, and then flush all dirty data. Caller must ensure that the |
265 | * inodes have been locked against any other modifications. |
266 | * |
267 | * If there's an error, then the usual negative error code is returned. |
268 | * Otherwise returns 0 with *len set to the request length. |
269 | */ |
270 | int |
271 | __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, |
272 | struct file *file_out, loff_t pos_out, |
273 | loff_t *len, unsigned int remap_flags, |
274 | const struct iomap_ops *dax_read_ops) |
275 | { |
276 | struct inode *inode_in = file_inode(f: file_in); |
277 | struct inode *inode_out = file_inode(f: file_out); |
278 | bool same_inode = (inode_in == inode_out); |
279 | int ret; |
280 | |
281 | /* Don't touch certain kinds of inodes */ |
282 | if (IS_IMMUTABLE(inode_out)) |
283 | return -EPERM; |
284 | |
285 | if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) |
286 | return -ETXTBSY; |
287 | |
288 | /* Don't reflink dirs, pipes, sockets... */ |
289 | if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) |
290 | return -EISDIR; |
291 | if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) |
292 | return -EINVAL; |
293 | |
294 | /* Zero length dedupe exits immediately; reflink goes to EOF. */ |
295 | if (*len == 0) { |
296 | loff_t isize = i_size_read(inode: inode_in); |
297 | |
298 | if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) |
299 | return 0; |
300 | if (pos_in > isize) |
301 | return -EINVAL; |
302 | *len = isize - pos_in; |
303 | if (*len == 0) |
304 | return 0; |
305 | } |
306 | |
307 | /* Check that we don't violate system file offset limits. */ |
308 | ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, req_count: len, |
309 | remap_flags); |
310 | if (ret || *len == 0) |
311 | return ret; |
312 | |
313 | /* Wait for the completion of any pending IOs on both files */ |
314 | inode_dio_wait(inode: inode_in); |
315 | if (!same_inode) |
316 | inode_dio_wait(inode: inode_out); |
317 | |
318 | ret = filemap_write_and_wait_range(mapping: inode_in->i_mapping, |
319 | lstart: pos_in, lend: pos_in + *len - 1); |
320 | if (ret) |
321 | return ret; |
322 | |
323 | ret = filemap_write_and_wait_range(mapping: inode_out->i_mapping, |
324 | lstart: pos_out, lend: pos_out + *len - 1); |
325 | if (ret) |
326 | return ret; |
327 | |
328 | /* |
329 | * Check that the extents are the same. |
330 | */ |
331 | if (remap_flags & REMAP_FILE_DEDUP) { |
332 | bool is_same = false; |
333 | |
334 | if (!IS_DAX(inode_in)) |
335 | ret = vfs_dedupe_file_range_compare(src: file_in, srcoff: pos_in, |
336 | dest: file_out, dstoff: pos_out, len: *len, is_same: &is_same); |
337 | else if (dax_read_ops) |
338 | ret = dax_dedupe_file_range_compare(src: inode_in, srcoff: pos_in, |
339 | dest: inode_out, destoff: pos_out, len: *len, is_same: &is_same, |
340 | ops: dax_read_ops); |
341 | else |
342 | return -EINVAL; |
343 | if (ret) |
344 | return ret; |
345 | if (!is_same) |
346 | return -EBADE; |
347 | } |
348 | |
349 | ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, |
350 | remap_flags); |
351 | if (ret || *len == 0) |
352 | return ret; |
353 | |
354 | /* If can't alter the file contents, we're done. */ |
355 | if (!(remap_flags & REMAP_FILE_DEDUP)) |
356 | ret = file_modified(file: file_out); |
357 | |
358 | return ret; |
359 | } |
360 | |
361 | int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, |
362 | struct file *file_out, loff_t pos_out, |
363 | loff_t *len, unsigned int remap_flags) |
364 | { |
365 | return __generic_remap_file_range_prep(file_in, pos_in, file_out, |
366 | pos_out, len, remap_flags, NULL); |
367 | } |
368 | EXPORT_SYMBOL(generic_remap_file_range_prep); |
369 | |
370 | loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, |
371 | struct file *file_out, loff_t pos_out, |
372 | loff_t len, unsigned int remap_flags) |
373 | { |
374 | loff_t ret; |
375 | |
376 | WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); |
377 | |
378 | if (file_inode(f: file_in)->i_sb != file_inode(f: file_out)->i_sb) |
379 | return -EXDEV; |
380 | |
381 | ret = generic_file_rw_checks(file_in, file_out); |
382 | if (ret < 0) |
383 | return ret; |
384 | |
385 | if (!file_in->f_op->remap_file_range) |
386 | return -EOPNOTSUPP; |
387 | |
388 | ret = remap_verify_area(file: file_in, pos: pos_in, len, write: false); |
389 | if (ret) |
390 | return ret; |
391 | |
392 | ret = remap_verify_area(file: file_out, pos: pos_out, len, write: true); |
393 | if (ret) |
394 | return ret; |
395 | |
396 | ret = file_in->f_op->remap_file_range(file_in, pos_in, |
397 | file_out, pos_out, len, remap_flags); |
398 | if (ret < 0) |
399 | return ret; |
400 | |
401 | fsnotify_access(file: file_in); |
402 | fsnotify_modify(file: file_out); |
403 | return ret; |
404 | } |
405 | EXPORT_SYMBOL(do_clone_file_range); |
406 | |
407 | loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, |
408 | struct file *file_out, loff_t pos_out, |
409 | loff_t len, unsigned int remap_flags) |
410 | { |
411 | loff_t ret; |
412 | |
413 | file_start_write(file: file_out); |
414 | ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, |
415 | remap_flags); |
416 | file_end_write(file: file_out); |
417 | |
418 | return ret; |
419 | } |
420 | EXPORT_SYMBOL(vfs_clone_file_range); |
421 | |
422 | /* Check whether we are allowed to dedupe the destination file */ |
423 | static bool allow_file_dedupe(struct file *file) |
424 | { |
425 | struct mnt_idmap *idmap = file_mnt_idmap(file); |
426 | struct inode *inode = file_inode(f: file); |
427 | |
428 | if (capable(CAP_SYS_ADMIN)) |
429 | return true; |
430 | if (file->f_mode & FMODE_WRITE) |
431 | return true; |
432 | if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), current_fsuid())) |
433 | return true; |
434 | if (!inode_permission(idmap, inode, MAY_WRITE)) |
435 | return true; |
436 | return false; |
437 | } |
438 | |
439 | loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, |
440 | struct file *dst_file, loff_t dst_pos, |
441 | loff_t len, unsigned int remap_flags) |
442 | { |
443 | loff_t ret; |
444 | |
445 | WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | |
446 | REMAP_FILE_CAN_SHORTEN)); |
447 | |
448 | ret = mnt_want_write_file(file: dst_file); |
449 | if (ret) |
450 | return ret; |
451 | |
452 | /* |
453 | * This is redundant if called from vfs_dedupe_file_range(), but other |
454 | * callers need it and it's not performance sesitive... |
455 | */ |
456 | ret = remap_verify_area(file: src_file, pos: src_pos, len, write: false); |
457 | if (ret) |
458 | goto out_drop_write; |
459 | |
460 | ret = remap_verify_area(file: dst_file, pos: dst_pos, len, write: true); |
461 | if (ret) |
462 | goto out_drop_write; |
463 | |
464 | ret = -EPERM; |
465 | if (!allow_file_dedupe(file: dst_file)) |
466 | goto out_drop_write; |
467 | |
468 | ret = -EXDEV; |
469 | if (file_inode(f: src_file)->i_sb != file_inode(f: dst_file)->i_sb) |
470 | goto out_drop_write; |
471 | |
472 | ret = -EISDIR; |
473 | if (S_ISDIR(file_inode(dst_file)->i_mode)) |
474 | goto out_drop_write; |
475 | |
476 | ret = -EINVAL; |
477 | if (!dst_file->f_op->remap_file_range) |
478 | goto out_drop_write; |
479 | |
480 | if (len == 0) { |
481 | ret = 0; |
482 | goto out_drop_write; |
483 | } |
484 | |
485 | ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, |
486 | dst_pos, len, remap_flags | REMAP_FILE_DEDUP); |
487 | out_drop_write: |
488 | mnt_drop_write_file(file: dst_file); |
489 | |
490 | return ret; |
491 | } |
492 | EXPORT_SYMBOL(vfs_dedupe_file_range_one); |
493 | |
494 | int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) |
495 | { |
496 | struct file_dedupe_range_info *info; |
497 | struct inode *src = file_inode(f: file); |
498 | u64 off; |
499 | u64 len; |
500 | int i; |
501 | int ret; |
502 | u16 count = same->dest_count; |
503 | loff_t deduped; |
504 | |
505 | if (!(file->f_mode & FMODE_READ)) |
506 | return -EINVAL; |
507 | |
508 | if (same->reserved1 || same->reserved2) |
509 | return -EINVAL; |
510 | |
511 | off = same->src_offset; |
512 | len = same->src_length; |
513 | |
514 | if (S_ISDIR(src->i_mode)) |
515 | return -EISDIR; |
516 | |
517 | if (!S_ISREG(src->i_mode)) |
518 | return -EINVAL; |
519 | |
520 | if (!file->f_op->remap_file_range) |
521 | return -EOPNOTSUPP; |
522 | |
523 | ret = remap_verify_area(file, pos: off, len, write: false); |
524 | if (ret < 0) |
525 | return ret; |
526 | ret = 0; |
527 | |
528 | if (off + len > i_size_read(inode: src)) |
529 | return -EINVAL; |
530 | |
531 | /* Arbitrary 1G limit on a single dedupe request, can be raised. */ |
532 | len = min_t(u64, len, 1 << 30); |
533 | |
534 | /* pre-format output fields to sane values */ |
535 | for (i = 0; i < count; i++) { |
536 | same->info[i].bytes_deduped = 0ULL; |
537 | same->info[i].status = FILE_DEDUPE_RANGE_SAME; |
538 | } |
539 | |
540 | for (i = 0, info = same->info; i < count; i++, info++) { |
541 | struct fd dst_fd = fdget(fd: info->dest_fd); |
542 | struct file *dst_file = dst_fd.file; |
543 | |
544 | if (!dst_file) { |
545 | info->status = -EBADF; |
546 | goto next_loop; |
547 | } |
548 | |
549 | if (info->reserved) { |
550 | info->status = -EINVAL; |
551 | goto next_fdput; |
552 | } |
553 | |
554 | deduped = vfs_dedupe_file_range_one(file, off, dst_file, |
555 | info->dest_offset, len, |
556 | REMAP_FILE_CAN_SHORTEN); |
557 | if (deduped == -EBADE) |
558 | info->status = FILE_DEDUPE_RANGE_DIFFERS; |
559 | else if (deduped < 0) |
560 | info->status = deduped; |
561 | else |
562 | info->bytes_deduped = len; |
563 | |
564 | next_fdput: |
565 | fdput(fd: dst_fd); |
566 | next_loop: |
567 | if (fatal_signal_pending(current)) |
568 | break; |
569 | } |
570 | return ret; |
571 | } |
572 | EXPORT_SYMBOL(vfs_dedupe_file_range); |
573 | |