| 1 | // SPDX-License-Identifier: LGPL-2.1 |
| 2 | /* |
| 3 | * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. |
| 4 | * Written by Takashi Sato <t-sato@yk.jp.nec.com> |
| 5 | * Akira Fujita <a-fujita@rs.jp.nec.com> |
| 6 | */ |
| 7 | |
| 8 | #include <linux/fs.h> |
| 9 | #include <linux/quotaops.h> |
| 10 | #include <linux/slab.h> |
| 11 | #include <linux/sched/mm.h> |
| 12 | #include "ext4_jbd2.h" |
| 13 | #include "ext4.h" |
| 14 | #include "ext4_extents.h" |
| 15 | |
| 16 | #include <trace/events/ext4.h> |
| 17 | |
| 18 | struct mext_data { |
| 19 | struct inode *orig_inode; /* Origin file inode */ |
| 20 | struct inode *donor_inode; /* Donor file inode */ |
| 21 | struct ext4_map_blocks orig_map;/* Origin file's move mapping */ |
| 22 | ext4_lblk_t donor_lblk; /* Start block of the donor file */ |
| 23 | }; |
| 24 | |
| 25 | /** |
| 26 | * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem |
| 27 | * @first: inode to be locked |
| 28 | * @second: inode to be locked |
| 29 | * |
| 30 | * Acquire write lock of i_data_sem of the two inodes |
| 31 | */ |
| 32 | void |
| 33 | ext4_double_down_write_data_sem(struct inode *first, struct inode *second) |
| 34 | { |
| 35 | if (first < second) { |
| 36 | down_write(sem: &EXT4_I(first)->i_data_sem); |
| 37 | down_write_nested(sem: &EXT4_I(second)->i_data_sem, subclass: I_DATA_SEM_OTHER); |
| 38 | } else { |
| 39 | down_write(sem: &EXT4_I(second)->i_data_sem); |
| 40 | down_write_nested(sem: &EXT4_I(first)->i_data_sem, subclass: I_DATA_SEM_OTHER); |
| 41 | } |
| 42 | } |
| 43 | |
| 44 | /** |
| 45 | * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem |
| 46 | * |
| 47 | * @orig_inode: original inode structure to be released its lock first |
| 48 | * @donor_inode: donor inode structure to be released its lock second |
| 49 | * Release write lock of i_data_sem of two inodes (orig and donor). |
| 50 | */ |
| 51 | void |
| 52 | ext4_double_up_write_data_sem(struct inode *orig_inode, |
| 53 | struct inode *donor_inode) |
| 54 | { |
| 55 | up_write(sem: &EXT4_I(orig_inode)->i_data_sem); |
| 56 | up_write(sem: &EXT4_I(donor_inode)->i_data_sem); |
| 57 | } |
| 58 | |
| 59 | /* Grab and lock folio on both @inode1 and @inode2 by inode order. */ |
| 60 | static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, |
| 61 | pgoff_t index1, pgoff_t index2, size_t len, |
| 62 | struct folio *folio[2]) |
| 63 | { |
| 64 | struct address_space *mapping[2]; |
| 65 | unsigned int flags; |
| 66 | fgf_t fgp_flags = FGP_WRITEBEGIN; |
| 67 | |
| 68 | BUG_ON(!inode1 || !inode2); |
| 69 | if (inode1 < inode2) { |
| 70 | mapping[0] = inode1->i_mapping; |
| 71 | mapping[1] = inode2->i_mapping; |
| 72 | } else { |
| 73 | swap(index1, index2); |
| 74 | mapping[0] = inode2->i_mapping; |
| 75 | mapping[1] = inode1->i_mapping; |
| 76 | } |
| 77 | |
| 78 | flags = memalloc_nofs_save(); |
| 79 | fgp_flags |= fgf_set_order(size: len); |
| 80 | folio[0] = __filemap_get_folio(mapping: mapping[0], index: index1, fgf_flags: fgp_flags, |
| 81 | gfp: mapping_gfp_mask(mapping: mapping[0])); |
| 82 | if (IS_ERR(ptr: folio[0])) { |
| 83 | memalloc_nofs_restore(flags); |
| 84 | return PTR_ERR(ptr: folio[0]); |
| 85 | } |
| 86 | |
| 87 | folio[1] = __filemap_get_folio(mapping: mapping[1], index: index2, fgf_flags: fgp_flags, |
| 88 | gfp: mapping_gfp_mask(mapping: mapping[1])); |
| 89 | memalloc_nofs_restore(flags); |
| 90 | if (IS_ERR(ptr: folio[1])) { |
| 91 | folio_unlock(folio: folio[0]); |
| 92 | folio_put(folio: folio[0]); |
| 93 | return PTR_ERR(ptr: folio[1]); |
| 94 | } |
| 95 | /* |
| 96 | * __filemap_get_folio() may not wait on folio's writeback if |
| 97 | * BDI not demand that. But it is reasonable to be very conservative |
| 98 | * here and explicitly wait on folio's writeback |
| 99 | */ |
| 100 | folio_wait_writeback(folio: folio[0]); |
| 101 | folio_wait_writeback(folio: folio[1]); |
| 102 | if (inode1 > inode2) |
| 103 | swap(folio[0], folio[1]); |
| 104 | |
| 105 | return 0; |
| 106 | } |
| 107 | |
/* Unlock both folios and drop one reference on each, folio[0] first. */
static void mext_folio_double_unlock(struct folio *folio[2])
{
	int i;

	for (i = 0; i < 2; i++) {
		folio_unlock(folio[i]);
		folio_put(folio[i]);
	}
}
| 115 | |
| 116 | /* Force folio buffers uptodate w/o dropping folio's lock */ |
| 117 | static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to) |
| 118 | { |
| 119 | struct inode *inode = folio->mapping->host; |
| 120 | sector_t block; |
| 121 | struct buffer_head *bh, *head; |
| 122 | unsigned int blocksize, block_start, block_end; |
| 123 | int nr = 0; |
| 124 | bool partial = false; |
| 125 | |
| 126 | BUG_ON(!folio_test_locked(folio)); |
| 127 | BUG_ON(folio_test_writeback(folio)); |
| 128 | |
| 129 | if (folio_test_uptodate(folio)) |
| 130 | return 0; |
| 131 | |
| 132 | blocksize = i_blocksize(node: inode); |
| 133 | head = folio_buffers(folio); |
| 134 | if (!head) |
| 135 | head = create_empty_buffers(folio, blocksize, b_state: 0); |
| 136 | |
| 137 | block = folio_pos(folio) >> inode->i_blkbits; |
| 138 | block_end = 0; |
| 139 | bh = head; |
| 140 | do { |
| 141 | block_start = block_end; |
| 142 | block_end = block_start + blocksize; |
| 143 | if (block_end <= from || block_start >= to) { |
| 144 | if (!buffer_uptodate(bh)) |
| 145 | partial = true; |
| 146 | continue; |
| 147 | } |
| 148 | if (buffer_uptodate(bh)) |
| 149 | continue; |
| 150 | if (!buffer_mapped(bh)) { |
| 151 | int err = ext4_get_block(inode, iblock: block, bh_result: bh, create: 0); |
| 152 | if (err) |
| 153 | return err; |
| 154 | if (!buffer_mapped(bh)) { |
| 155 | folio_zero_range(folio, start: block_start, length: blocksize); |
| 156 | set_buffer_uptodate(bh); |
| 157 | continue; |
| 158 | } |
| 159 | } |
| 160 | lock_buffer(bh); |
| 161 | if (buffer_uptodate(bh)) { |
| 162 | unlock_buffer(bh); |
| 163 | continue; |
| 164 | } |
| 165 | ext4_read_bh_nowait(bh, op_flags: 0, NULL, simu_fail: false); |
| 166 | nr++; |
| 167 | } while (block++, (bh = bh->b_this_page) != head); |
| 168 | |
| 169 | /* No io required */ |
| 170 | if (!nr) |
| 171 | goto out; |
| 172 | |
| 173 | bh = head; |
| 174 | do { |
| 175 | if (bh_offset(bh) + blocksize <= from) |
| 176 | continue; |
| 177 | if (bh_offset(bh) >= to) |
| 178 | break; |
| 179 | wait_on_buffer(bh); |
| 180 | if (buffer_uptodate(bh)) |
| 181 | continue; |
| 182 | return -EIO; |
| 183 | } while ((bh = bh->b_this_page) != head); |
| 184 | out: |
| 185 | if (!partial) |
| 186 | folio_mark_uptodate(folio); |
| 187 | return 0; |
| 188 | } |
| 189 | |
/* What has to be done for one candidate range, as decided by mext_move_begin(). */
enum mext_move_type {
	MEXT_SKIP_EXTENT,	/* donor range is not movable, do nothing */
	MEXT_MOVE_EXTENT,	/* swap the extents, no data copy needed */
	MEXT_COPY_DATA		/* swap the extents and copy the data */
};
| 191 | |
| 192 | /* |
| 193 | * Start to move extent between the origin inode and the donor inode, |
| 194 | * hold one folio for each inode and check the candidate moving extent |
| 195 | * mapping status again. |
| 196 | */ |
| 197 | static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], |
| 198 | enum mext_move_type *move_type) |
| 199 | { |
| 200 | struct inode *orig_inode = mext->orig_inode; |
| 201 | struct inode *donor_inode = mext->donor_inode; |
| 202 | unsigned int blkbits = orig_inode->i_blkbits; |
| 203 | struct ext4_map_blocks donor_map = {0}; |
| 204 | loff_t orig_pos, donor_pos; |
| 205 | size_t move_len; |
| 206 | int ret; |
| 207 | |
| 208 | orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits; |
| 209 | donor_pos = ((loff_t)mext->donor_lblk) << blkbits; |
| 210 | ret = mext_folio_double_lock(inode1: orig_inode, inode2: donor_inode, |
| 211 | index1: orig_pos >> PAGE_SHIFT, index2: donor_pos >> PAGE_SHIFT, |
| 212 | len: ((size_t)mext->orig_map.m_len) << blkbits, folio); |
| 213 | if (ret) |
| 214 | return ret; |
| 215 | |
| 216 | /* |
| 217 | * Check the origin inode's mapping information again under the |
| 218 | * folio lock, as we do not hold the i_data_sem at all times, and |
| 219 | * it may change during the concurrent write-back operation. |
| 220 | */ |
| 221 | if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) { |
| 222 | ret = -ESTALE; |
| 223 | goto error; |
| 224 | } |
| 225 | |
| 226 | /* Adjust the moving length according to the length of shorter folio. */ |
| 227 | move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, |
| 228 | folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); |
| 229 | move_len >>= blkbits; |
| 230 | if (move_len < mext->orig_map.m_len) |
| 231 | mext->orig_map.m_len = move_len; |
| 232 | |
| 233 | donor_map.m_lblk = mext->donor_lblk; |
| 234 | donor_map.m_len = mext->orig_map.m_len; |
| 235 | donor_map.m_flags = 0; |
| 236 | ret = ext4_map_blocks(NULL, inode: donor_inode, map: &donor_map, flags: 0); |
| 237 | if (ret < 0) |
| 238 | goto error; |
| 239 | |
| 240 | /* Adjust the moving length according to the donor mapping length. */ |
| 241 | mext->orig_map.m_len = donor_map.m_len; |
| 242 | |
| 243 | /* Skip moving if the donor range is a hole or a delalloc extent. */ |
| 244 | if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) |
| 245 | *move_type = MEXT_SKIP_EXTENT; |
| 246 | /* If both mapping ranges are unwritten, no need to copy data. */ |
| 247 | else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) && |
| 248 | (donor_map.m_flags & EXT4_MAP_UNWRITTEN)) |
| 249 | *move_type = MEXT_MOVE_EXTENT; |
| 250 | else |
| 251 | *move_type = MEXT_COPY_DATA; |
| 252 | |
| 253 | return 0; |
| 254 | error: |
| 255 | mext_folio_double_unlock(folio); |
| 256 | return ret; |
| 257 | } |
| 258 | |
| 259 | /* |
| 260 | * Re-create the new moved mapping buffers of the original inode and commit |
| 261 | * the entire written range. |
| 262 | */ |
| 263 | static int mext_folio_mkwrite(struct inode *inode, struct folio *folio, |
| 264 | size_t from, size_t to) |
| 265 | { |
| 266 | unsigned int blocksize = i_blocksize(node: inode); |
| 267 | struct buffer_head *bh, *head; |
| 268 | size_t block_start, block_end; |
| 269 | sector_t block; |
| 270 | int ret; |
| 271 | |
| 272 | head = folio_buffers(folio); |
| 273 | if (!head) |
| 274 | head = create_empty_buffers(folio, blocksize, b_state: 0); |
| 275 | |
| 276 | block = folio_pos(folio) >> inode->i_blkbits; |
| 277 | block_end = 0; |
| 278 | bh = head; |
| 279 | do { |
| 280 | block_start = block_end; |
| 281 | block_end = block_start + blocksize; |
| 282 | if (block_end <= from || block_start >= to) |
| 283 | continue; |
| 284 | |
| 285 | ret = ext4_get_block(inode, iblock: block, bh_result: bh, create: 0); |
| 286 | if (ret) |
| 287 | return ret; |
| 288 | } while (block++, (bh = bh->b_this_page) != head); |
| 289 | |
| 290 | block_commit_write(folio, from, to); |
| 291 | return 0; |
| 292 | } |
| 293 | |
| 294 | /* |
| 295 | * Save the data in original inode extent blocks and replace one folio size |
| 296 | * aligned original inode extent with one or one partial donor inode extent, |
| 297 | * and then write out the saved data in new original inode blocks. Pass out |
| 298 | * the replaced block count through m_len. Return 0 on success, and an error |
| 299 | * code otherwise. |
| 300 | */ |
| 301 | static int mext_move_extent(struct mext_data *mext, u64 *m_len) |
| 302 | { |
| 303 | struct inode *orig_inode = mext->orig_inode; |
| 304 | struct inode *donor_inode = mext->donor_inode; |
| 305 | struct ext4_map_blocks *orig_map = &mext->orig_map; |
| 306 | unsigned int blkbits = orig_inode->i_blkbits; |
| 307 | struct folio *folio[2] = {NULL, NULL}; |
| 308 | loff_t from, length; |
| 309 | enum mext_move_type move_type = 0; |
| 310 | handle_t *handle; |
| 311 | u64 r_len = 0; |
| 312 | unsigned int credits; |
| 313 | int ret, ret2; |
| 314 | |
| 315 | *m_len = 0; |
| 316 | trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode, |
| 317 | donor_lblk: mext->donor_lblk); |
| 318 | credits = ext4_chunk_trans_extent(inode: orig_inode, nrblocks: 0) * 2; |
| 319 | handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits); |
| 320 | if (IS_ERR(ptr: handle)) { |
| 321 | ret = PTR_ERR(ptr: handle); |
| 322 | goto out; |
| 323 | } |
| 324 | |
| 325 | ret = mext_move_begin(mext, folio, move_type: &move_type); |
| 326 | if (ret) |
| 327 | goto stop_handle; |
| 328 | |
| 329 | if (move_type == MEXT_SKIP_EXTENT) |
| 330 | goto unlock; |
| 331 | |
| 332 | /* |
| 333 | * Copy the data. First, read the original inode data into the page |
| 334 | * cache. Then, release the existing mapping relationships and swap |
| 335 | * the extent. Finally, re-establish the new mapping relationships |
| 336 | * and dirty the page cache. |
| 337 | */ |
| 338 | if (move_type == MEXT_COPY_DATA) { |
| 339 | from = offset_in_folio(folio[0], |
| 340 | ((loff_t)orig_map->m_lblk) << blkbits); |
| 341 | length = ((loff_t)orig_map->m_len) << blkbits; |
| 342 | |
| 343 | ret = mext_folio_mkuptodate(folio: folio[0], from, to: from + length); |
| 344 | if (ret) |
| 345 | goto unlock; |
| 346 | } |
| 347 | |
| 348 | if (!filemap_release_folio(folio: folio[0], gfp: 0) || |
| 349 | !filemap_release_folio(folio: folio[1], gfp: 0)) { |
| 350 | ret = -EBUSY; |
| 351 | goto unlock; |
| 352 | } |
| 353 | |
| 354 | /* Move extent */ |
| 355 | ext4_double_down_write_data_sem(first: orig_inode, second: donor_inode); |
| 356 | *m_len = ext4_swap_extents(handle, inode1: orig_inode, inode2: donor_inode, |
| 357 | lblk1: orig_map->m_lblk, lblk2: mext->donor_lblk, |
| 358 | count: orig_map->m_len, mark_unwritten: 1, err: &ret); |
| 359 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 360 | |
| 361 | /* A short-length swap cannot occur after a successful swap extent. */ |
| 362 | if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len))) |
| 363 | ret = -EIO; |
| 364 | |
| 365 | if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT)) |
| 366 | goto unlock; |
| 367 | |
| 368 | /* Copy data */ |
| 369 | length = (*m_len) << blkbits; |
| 370 | ret2 = mext_folio_mkwrite(inode: orig_inode, folio: folio[0], from, to: from + length); |
| 371 | if (ret2) { |
| 372 | if (!ret) |
| 373 | ret = ret2; |
| 374 | goto repair_branches; |
| 375 | } |
| 376 | /* |
| 377 | * Even in case of data=writeback it is reasonable to pin |
| 378 | * inode to transaction, to prevent unexpected data loss. |
| 379 | */ |
| 380 | ret2 = ext4_jbd2_inode_add_write(handle, inode: orig_inode, |
| 381 | start_byte: ((loff_t)orig_map->m_lblk) << blkbits, length); |
| 382 | if (!ret) |
| 383 | ret = ret2; |
| 384 | unlock: |
| 385 | mext_folio_double_unlock(folio); |
| 386 | stop_handle: |
| 387 | ext4_journal_stop(handle); |
| 388 | out: |
| 389 | trace_ext4_move_extent_exit(orig_inode, orig_lblk: orig_map->m_lblk, donor_inode, |
| 390 | donor_lblk: mext->donor_lblk, m_len: orig_map->m_len, move_len: *m_len, |
| 391 | move_type, ret); |
| 392 | return ret; |
| 393 | |
| 394 | repair_branches: |
| 395 | ret2 = 0; |
| 396 | ext4_double_down_write_data_sem(first: orig_inode, second: donor_inode); |
| 397 | r_len = ext4_swap_extents(handle, inode1: donor_inode, inode2: orig_inode, |
| 398 | lblk1: mext->donor_lblk, lblk2: orig_map->m_lblk, |
| 399 | count: *m_len, mark_unwritten: 0, err: &ret2); |
| 400 | ext4_double_up_write_data_sem(orig_inode, donor_inode); |
| 401 | if (ret2 || r_len != *m_len) { |
| 402 | ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk), |
| 403 | EIO, "Unable to copy data block, data will be lost!" ); |
| 404 | ret = -EIO; |
| 405 | } |
| 406 | *m_len = 0; |
| 407 | goto unlock; |
| 408 | } |
| 409 | |
| 410 | /* |
| 411 | * Check the validity of the basic filesystem environment and the |
| 412 | * inodes' support status. |
| 413 | */ |
| 414 | static int mext_check_validity(struct inode *orig_inode, |
| 415 | struct inode *donor_inode) |
| 416 | { |
| 417 | struct super_block *sb = orig_inode->i_sb; |
| 418 | |
| 419 | /* origin and donor should be different inodes */ |
| 420 | if (orig_inode == donor_inode) { |
| 421 | ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n" , |
| 422 | orig_inode->i_ino, donor_inode->i_ino); |
| 423 | return -EINVAL; |
| 424 | } |
| 425 | |
| 426 | /* origin and donor should belone to the same filesystem */ |
| 427 | if (orig_inode->i_sb != donor_inode->i_sb) { |
| 428 | ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n" , |
| 429 | orig_inode->i_ino, donor_inode->i_ino); |
| 430 | return -EINVAL; |
| 431 | } |
| 432 | |
| 433 | /* Regular file check */ |
| 434 | if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { |
| 435 | ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n" , |
| 436 | orig_inode->i_ino, donor_inode->i_ino); |
| 437 | return -EINVAL; |
| 438 | } |
| 439 | |
| 440 | if (ext4_has_feature_bigalloc(sb)) { |
| 441 | ext4_msg(sb, KERN_ERR, |
| 442 | "Online defrag not supported with bigalloc" ); |
| 443 | return -EOPNOTSUPP; |
| 444 | } |
| 445 | |
| 446 | if (IS_DAX(orig_inode)) { |
| 447 | ext4_msg(sb, KERN_ERR, |
| 448 | "Online defrag not supported with DAX" ); |
| 449 | return -EOPNOTSUPP; |
| 450 | } |
| 451 | |
| 452 | /* |
| 453 | * TODO: it's not obvious how to swap blocks for inodes with full |
| 454 | * journaling enabled. |
| 455 | */ |
| 456 | if (ext4_should_journal_data(inode: orig_inode) || |
| 457 | ext4_should_journal_data(inode: donor_inode)) { |
| 458 | ext4_msg(sb, KERN_ERR, |
| 459 | "Online defrag not supported with data journaling" ); |
| 460 | return -EOPNOTSUPP; |
| 461 | } |
| 462 | |
| 463 | if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { |
| 464 | ext4_msg(sb, KERN_ERR, |
| 465 | "Online defrag not supported for encrypted files" ); |
| 466 | return -EOPNOTSUPP; |
| 467 | } |
| 468 | |
| 469 | /* Ext4 move extent supports only extent based file */ |
| 470 | if (!(ext4_test_inode_flag(inode: orig_inode, bit: EXT4_INODE_EXTENTS)) || |
| 471 | !(ext4_test_inode_flag(inode: donor_inode, bit: EXT4_INODE_EXTENTS))) { |
| 472 | ext4_msg(sb, KERN_ERR, |
| 473 | "Online defrag not supported for non-extent files" ); |
| 474 | return -EOPNOTSUPP; |
| 475 | } |
| 476 | |
| 477 | if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { |
| 478 | ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n" , |
| 479 | orig_inode->i_ino, donor_inode->i_ino); |
| 480 | return -EINVAL; |
| 481 | } |
| 482 | |
| 483 | if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) { |
| 484 | ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n" , |
| 485 | orig_inode->i_ino, donor_inode->i_ino); |
| 486 | return -EPERM; |
| 487 | } |
| 488 | |
| 489 | /* Ext4 move extent does not support swap files */ |
| 490 | if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { |
| 491 | ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n" , |
| 492 | orig_inode->i_ino, donor_inode->i_ino); |
| 493 | return -ETXTBSY; |
| 494 | } |
| 495 | |
| 496 | if (ext4_is_quota_file(inode: orig_inode) || ext4_is_quota_file(inode: donor_inode)) { |
| 497 | ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n" , |
| 498 | orig_inode->i_ino, donor_inode->i_ino); |
| 499 | return -EOPNOTSUPP; |
| 500 | } |
| 501 | |
| 502 | if ((!orig_inode->i_size) || (!donor_inode->i_size)) { |
| 503 | ext4_debug("ext4 move extent: File size is 0 byte\n" ); |
| 504 | return -EINVAL; |
| 505 | } |
| 506 | |
| 507 | return 0; |
| 508 | } |
| 509 | |
| 510 | /* |
| 511 | * Check the moving range of ext4_move_extents() whether the files can be |
| 512 | * exchanged with each other, and adjust the length to fit within the file |
| 513 | * size. Return 0 on success, or a negative error value on failure. |
| 514 | */ |
| 515 | static int mext_check_adjust_range(struct inode *orig_inode, |
| 516 | struct inode *donor_inode, __u64 orig_start, |
| 517 | __u64 donor_start, __u64 *len) |
| 518 | { |
| 519 | __u64 orig_eof, donor_eof; |
| 520 | |
| 521 | /* Start offset should be same */ |
| 522 | if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != |
| 523 | (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { |
| 524 | ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n" , |
| 525 | orig_inode->i_ino, donor_inode->i_ino); |
| 526 | return -EINVAL; |
| 527 | } |
| 528 | |
| 529 | if ((orig_start >= EXT_MAX_BLOCKS) || |
| 530 | (donor_start >= EXT_MAX_BLOCKS) || |
| 531 | (*len > EXT_MAX_BLOCKS) || |
| 532 | (donor_start + *len >= EXT_MAX_BLOCKS) || |
| 533 | (orig_start + *len >= EXT_MAX_BLOCKS)) { |
| 534 | ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n" , |
| 535 | EXT_MAX_BLOCKS, |
| 536 | orig_inode->i_ino, donor_inode->i_ino); |
| 537 | return -EINVAL; |
| 538 | } |
| 539 | |
| 540 | orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode)); |
| 541 | donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode)); |
| 542 | if (orig_eof <= orig_start) |
| 543 | *len = 0; |
| 544 | else if (orig_eof < orig_start + *len - 1) |
| 545 | *len = orig_eof - orig_start; |
| 546 | if (donor_eof <= donor_start) |
| 547 | *len = 0; |
| 548 | else if (donor_eof < donor_start + *len - 1) |
| 549 | *len = donor_eof - donor_start; |
| 550 | if (!*len) { |
| 551 | ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n" , |
| 552 | orig_inode->i_ino, donor_inode->i_ino); |
| 553 | return -EINVAL; |
| 554 | } |
| 555 | |
| 556 | return 0; |
| 557 | } |
| 558 | |
| 559 | /** |
| 560 | * ext4_move_extents - Exchange the specified range of a file |
| 561 | * |
| 562 | * @o_filp: file structure of the original file |
| 563 | * @d_filp: file structure of the donor file |
| 564 | * @orig_blk: start offset in block for orig |
| 565 | * @donor_blk: start offset in block for donor |
| 566 | * @len: the number of blocks to be moved |
| 567 | * @moved_len: moved block length |
| 568 | * |
| 569 | * This function returns 0 and moved block length is set in moved_len |
| 570 | * if succeed, otherwise returns error value. |
| 571 | */ |
| 572 | int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, |
| 573 | __u64 donor_blk, __u64 len, __u64 *moved_len) |
| 574 | { |
| 575 | struct inode *orig_inode = file_inode(f: o_filp); |
| 576 | struct inode *donor_inode = file_inode(f: d_filp); |
| 577 | struct mext_data mext; |
| 578 | struct super_block *sb = orig_inode->i_sb; |
| 579 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 580 | int retries = 0; |
| 581 | u64 m_len; |
| 582 | int ret; |
| 583 | |
| 584 | *moved_len = 0; |
| 585 | |
| 586 | /* Protect orig and donor inodes against a truncate */ |
| 587 | lock_two_nondirectories(orig_inode, donor_inode); |
| 588 | |
| 589 | ret = mext_check_validity(orig_inode, donor_inode); |
| 590 | if (ret) |
| 591 | goto out; |
| 592 | |
| 593 | /* Wait for all existing dio workers */ |
| 594 | inode_dio_wait(inode: orig_inode); |
| 595 | inode_dio_wait(inode: donor_inode); |
| 596 | |
| 597 | /* Check and adjust the specified move_extent range. */ |
| 598 | ret = mext_check_adjust_range(orig_inode, donor_inode, orig_start: orig_blk, |
| 599 | donor_start: donor_blk, len: &len); |
| 600 | if (ret) |
| 601 | goto out; |
| 602 | |
| 603 | mext.orig_inode = orig_inode; |
| 604 | mext.donor_inode = donor_inode; |
| 605 | while (len) { |
| 606 | mext.orig_map.m_lblk = orig_blk; |
| 607 | mext.orig_map.m_len = len; |
| 608 | mext.orig_map.m_flags = 0; |
| 609 | mext.donor_lblk = donor_blk; |
| 610 | |
| 611 | ret = ext4_map_blocks(NULL, inode: orig_inode, map: &mext.orig_map, flags: 0); |
| 612 | if (ret < 0) |
| 613 | goto out; |
| 614 | |
| 615 | /* Skip moving if it is a hole or a delalloc extent. */ |
| 616 | if (mext.orig_map.m_flags & |
| 617 | (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) { |
| 618 | ret = mext_move_extent(mext: &mext, m_len: &m_len); |
| 619 | *moved_len += m_len; |
| 620 | if (!ret) |
| 621 | goto next; |
| 622 | |
| 623 | /* Move failed or partially failed. */ |
| 624 | if (m_len) { |
| 625 | orig_blk += m_len; |
| 626 | donor_blk += m_len; |
| 627 | len -= m_len; |
| 628 | } |
| 629 | if (ret == -ESTALE) |
| 630 | continue; |
| 631 | if (ret == -ENOSPC && |
| 632 | ext4_should_retry_alloc(sb, retries: &retries)) |
| 633 | continue; |
| 634 | if (ret == -EBUSY && |
| 635 | sbi->s_journal && retries++ < 4 && |
| 636 | jbd2_journal_force_commit_nested(sbi->s_journal)) |
| 637 | continue; |
| 638 | |
| 639 | goto out; |
| 640 | } |
| 641 | next: |
| 642 | orig_blk += mext.orig_map.m_len; |
| 643 | donor_blk += mext.orig_map.m_len; |
| 644 | len -= mext.orig_map.m_len; |
| 645 | retries = 0; |
| 646 | } |
| 647 | |
| 648 | out: |
| 649 | if (*moved_len) { |
| 650 | ext4_discard_preallocations(orig_inode); |
| 651 | ext4_discard_preallocations(donor_inode); |
| 652 | } |
| 653 | |
| 654 | unlock_two_nondirectories(orig_inode, donor_inode); |
| 655 | return ret; |
| 656 | } |
| 657 | |