// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/iomap.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_metadata_csum(inode->i_sb))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_metadata_csum(inode->i_sb))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}
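
/*
 * Layout sketch for the checksum above (assuming the usual 12-byte
 * header and 12-byte entries): the crc32c is seeded with the per-inode
 * i_csum_seed and covers the node from the start of the header through
 * all eh_max entry slots, i.e. EXT4_EXTENT_TAIL_OFFSET(eh) bytes; the
 * 4-byte ext4_extent_tail holding et_checksum sits immediately after.
 * Only on-disk (non-root) nodes carry a tail -- the root lives in
 * i_data and is covered by the inode checksum instead, which is why
 * __ext4_ext_check() below verifies checksums only when
 * ext_depth(inode) != depth.
 */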

static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path **ppath,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags);

static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
	/*
	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
	 * moment, get_block can be called only for blocks inside i_size since
	 * page cache has been already dropped and writes are blocked by
	 * i_rwsem. So we can safely drop the i_data_sem here.
	 */
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	ext4_discard_preallocations(inode);
	up_write(&EXT4_I(inode)->i_data_sem);
	*dropped = 1;
	return 0;
}

static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth, i;

	if (!path)
		return;
	depth = path->p_depth;
	for (i = 0; i <= depth; i++, path++) {
		brelse(path->p_bh);
		path->p_bh = NULL;
	}
}

void ext4_free_ext_path(struct ext4_ext_path *path)
{
	ext4_ext_drop_refs(path);
	kfree(path);
}

/*
 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
 * transaction with 'restart_cred' credits. The function drops i_data_sem
 * when restarting transaction and gets it after transaction is restarted.
 *
 * The function returns 0 on success, 1 if transaction had to be restarted,
 * and < 0 in case of fatal error.
 */
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
				int check_cred, int restart_cred,
				int revoke_cred)
{
	int ret;
	int dropped = 0;

	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
	if (dropped)
		down_write(&EXT4_I(inode)->i_data_sem);
	return ret;
}
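
/*
 * Typical caller pattern, as a hedged sketch (variable names are
 * illustrative): the caller holds i_data_sem for writing; a return of 1
 * means the transaction was restarted and the lock was temporarily
 * dropped, so any cached extent path may be stale and must be looked up
 * again:
 *
 *	err = ext4_datasem_ensure_credits(handle, inode, credits,
 *					  credits, revoke_credits);
 *	if (err < 0)
 *		goto out;
 *	if (err > 0)
 *		... revalidate / re-read the extent path ...
 */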

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err = 0;

	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, inode->i_sb,
						    path->p_bh, EXT4_JTR_NONE);
		/*
		 * The extent buffer's verified bit will be set again in
		 * __ext4_ext_dirty(). We could leave an inconsistent
		 * buffer if the extent updating procedure breaks off due
		 * to some error, so force it to be checked again.
		 */
		if (!err)
			clear_buffer_verified(path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int __ext4_ext_dirty(const char *where, unsigned int line,
			    handle_t *handle, struct inode *inode,
			    struct ext4_ext_path *path)
{
	int err;

	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
		/* Extents updating done, re-set verified flag */
		if (!err)
			set_buffer_verified(path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

#define ext4_ext_dirty(handle, inode, path) \
		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object's sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file. However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space. Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}
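
/*
 * Worked example for the interpolation above (illustrative numbers):
 * if the nearest extent maps logical block 100 to physical block 5000,
 * the goal for logical block 110 is 5000 + (110 - 100) = 5010, and the
 * goal for logical block 90 is 5000 - (100 - 90) = 4990, keeping the
 * new allocation physically close to the data it will sit next to.
 */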

/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 5)
		size = 5;
#endif
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}
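
/*
 * Worked capacity example (a sketch assuming the common on-disk sizes:
 * 12-byte header, 12-byte extent and index entries, 60-byte i_data):
 * an external node in a 4KiB block holds (4096 - 12) / 12 = 340
 * extents or indices, with the few leftover bytes available for the
 * checksum tail, while the in-inode root holds only (60 - 12) / 12 = 4
 * entries. The AGGRESSIVE_TEST clamps above shrink these limits so
 * deep trees can be exercised with small files.
 */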

static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
			   int nofail)
{
	struct ext4_ext_path *path = *ppath;
	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
	int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;

	if (nofail)
		flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;

	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
			flags);
}

static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext4_ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);
	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);

	/*
	 * We allow neither:
	 *  - zero length
	 *  - overflow/wrap-around
	 */
	if (lblock + len <= lblock)
		return 0;
	return ext4_inode_block_valid(inode, block, len);
}
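
/*
 * Example of the wrap check above (illustrative values): with
 * lblock == 0xfffffff0 and len == 0x20, lblock + len wraps to 0x10,
 * so "lblock + len <= lblock" catches both zero-length extents
 * (len == 0) and 32-bit logical-block overflow in one comparison.
 */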

static int ext4_valid_extent_idx(struct inode *inode,
				 struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

	return ext4_inode_block_valid(inode, block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				     struct ext4_extent_header *eh,
				     ext4_lblk_t lblk, ext4_fsblk_t *pblk,
				     int depth)
{
	unsigned short entries;
	ext4_lblk_t lblock = 0;
	ext4_lblk_t cur = 0;

	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);

		/*
		 * The logical block in the first entry should equal
		 * the number in the index block.
		 */
		if (depth != ext_depth(inode) &&
		    lblk != le32_to_cpu(ext->ee_block))
			return 0;
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;

			/* Check for overlapping extents */
			lblock = le32_to_cpu(ext->ee_block);
			if (lblock < cur) {
				*pblk = ext4_ext_pblock(ext);
				return 0;
			}
			cur = lblock + ext4_ext_get_actual_len(ext);
			ext++;
			entries--;
		}
	} else {
		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);

		/*
		 * The logical block in the first entry should equal
		 * the number in the parent index block.
		 */
		if (depth != ext_depth(inode) &&
		    lblk != le32_to_cpu(ext_idx->ei_block))
			return 0;
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;

			/* Check for overlapping index extents */
			lblock = le32_to_cpu(ext_idx->ei_block);
			if (lblock < cur) {
				*pblk = ext4_idx_pblock(ext_idx);
				return 0;
			}
			ext_idx++;
			entries--;
			cur = lblock + 1;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
	const char *error_msg;
	int max = 0, err = -EFSCORRUPTED;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
		error_msg = "eh_entries is 0 but eh_depth is > 0";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	if (unlikely(depth > 32)) {
		error_msg = "too large eh_depth";
		goto corrupted;
	}
	/* Verify checksum on non-root extent tree nodes */
	if (ext_depth(inode) != depth &&
	    !ext4_extent_block_csum_verify(inode, eh)) {
		error_msg = "extent tree corrupted";
		err = -EFSBADCRC;
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode_err(inode, function, line, 0, -err,
			     "pblk %llu bad header/extent: %s - magic %x, "
			     "entries %u, max %u(%u), depth %u(%u)",
			     (unsigned long long) pblk, error_msg,
			     le16_to_cpu(eh->eh_magic),
			     le16_to_cpu(eh->eh_entries),
			     le16_to_cpu(eh->eh_max),
			     max, le16_to_cpu(eh->eh_depth), depth);
	return err;
}

#define ext4_ext_check(inode, eh, depth, pblk)			\
	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}

static void ext4_cache_extents(struct inode *inode,
			       struct ext4_extent_header *eh)
{
	struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
	ext4_lblk_t prev = 0;
	int i;

	for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
		unsigned int status = EXTENT_STATUS_WRITTEN;
		ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
		int len = ext4_ext_get_actual_len(ex);

		if (prev && (prev != lblk))
			ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
					     EXTENT_STATUS_HOLE);

		if (ext4_ext_is_unwritten(ex))
			status = EXTENT_STATUS_UNWRITTEN;
		ext4_es_cache_extent(inode, lblk, len,
				     ext4_ext_pblock(ex), status);
		prev = lblk + len;
	}
}
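
/*
 * Example of the hole insertion above (illustrative extents): for a
 * leaf mapping logical blocks [0..3] and [8..11], the first pass
 * caches [0..3] and leaves prev = 4; the second pass sees lblk = 8 !=
 * prev, so the gap [4..7] is cached as a 4-block EXTENT_STATUS_HOLE
 * (pblk ~0) before the [8..11] extent itself is cached.
 */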

static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
			 struct inode *inode, struct ext4_extent_idx *idx,
			 int depth, int flags)
{
	struct buffer_head *bh;
	int err;
	gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
	ext4_fsblk_t pblk;

	if (flags & EXT4_EX_NOFAIL)
		gfp_flags |= __GFP_NOFAIL;

	pblk = ext4_idx_pblock(idx);
	bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);

	if (!bh_uptodate_or_lock(bh)) {
		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
		err = ext4_read_bh(bh, 0, NULL);
		if (err < 0)
			goto errout;
	}
	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
		return bh;
	err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
			       depth, pblk, le32_to_cpu(idx->ei_block));
	if (err)
		goto errout;
	set_buffer_verified(bh);
	/*
	 * If this is a leaf block, cache all of its entries
	 */
	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
		struct ext4_extent_header *eh = ext_block_hdr(bh);
		ext4_cache_extents(inode, eh);
	}
	return bh;
errout:
	put_bh(bh);
	return ERR_PTR(err);
}

#define read_extent_tree_block(inode, idx, depth, flags)		\
	__read_extent_tree_block(__func__, __LINE__, (inode), (idx),	\
				 (depth), (flags))

/*
 * This function is called to cache a file's extent information in the
 * extent status tree
 */
int ext4_ext_precache(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_ext_path *path = NULL;
	struct buffer_head *bh;
	int i = 0, depth, ret = 0;

	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		return 0;	/* not an extent-mapped inode */

	down_read(&ei->i_data_sem);
	depth = ext_depth(inode);

	/* Don't cache anything if there are no external extent blocks */
	if (!depth) {
		up_read(&ei->i_data_sem);
		return ret;
	}

	path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
		       GFP_NOFS);
	if (path == NULL) {
		up_read(&ei->i_data_sem);
		return -ENOMEM;
	}

	path[0].p_hdr = ext_inode_hdr(inode);
	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
	if (ret)
		goto out;
	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
	while (i >= 0) {
		/*
		 * If this is a leaf block or we've reached the end of
		 * the index block, go up
		 */
		if ((i == depth) ||
		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
			i--;
			continue;
		}
		bh = read_extent_tree_block(inode, path[i].p_idx++,
					    depth - i - 1,
					    EXT4_EX_FORCE_CACHE);
		if (IS_ERR(bh)) {
			ret = PTR_ERR(bh);
			break;
		}
		i++;
		path[i].p_bh = bh;
		path[i].p_hdr = ext_block_hdr(bh);
		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
	}
	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
	up_read(&ei->i_data_sem);
	ext4_free_ext_path(path);
	return ret;
}

#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug(inode, "path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug(inode, " %d->%llu",
				  le32_to_cpu(path->p_idx->ei_block),
				  ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug(inode, " %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_unwritten(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug(inode, " []");
	}
	ext_debug(inode, "\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug(inode, "Displaying leaf extents\n");

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_unwritten(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug(inode, "\n");
}

static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
			ext4_fsblk_t newblock, int level)
{
	int depth = ext_depth(inode);
	struct ext4_extent *ex;

	if (depth != level) {
		struct ext4_extent_idx *idx;
		idx = path[level].p_idx;
		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
			ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
				  level, le32_to_cpu(idx->ei_block),
				  ext4_idx_pblock(idx), newblock);
			idx++;
		}

		return;
	}

	ex = path[depth].p_ext;
	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
			  le32_to_cpu(ex->ee_block),
			  ext4_ext_pblock(ex),
			  ext4_ext_is_unwritten(ex),
			  ext4_ext_get_actual_len(ex),
			  newblock);
		ex++;
	}
}

#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
		       struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug(inode, "binsearch for %u(idx): ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
			  le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
			  r, le32_to_cpu(r->ei_block));

		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
	}

	path->p_idx = l - 1;
	ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 && le32_to_cpu(ix->ei_block) <=
			    le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
			       <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		   struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug(inode, "binsearch for %u: ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
			  le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
			  r, le32_to_cpu(r->ee_block));

		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
	}

	path->p_ext = l - 1;
	ext_debug(inode, " -> %d:%llu:[%d]%d ",
		  le32_to_cpu(path->p_ext->ee_block),
		  ext4_ext_pblock(path->p_ext),
		  ext4_ext_is_unwritten(path->p_ext),
		  ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
			       <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}
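
/*
 * Example of the invariant both searches maintain (illustrative leaf):
 * for entries starting at logical blocks 0, 16 and 64, a lookup of
 * block 20 ends with l pointing at the 64 entry and p_ext = l - 1,
 * i.e. the 16 entry -- the rightmost entry whose start is <= the
 * target. The caller still has to check whether the target actually
 * falls inside that extent's length.
 */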

void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	eh->eh_generation = 0;
	ext4_mark_inode_dirty(handle, inode);
}

struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
		 struct ext4_ext_path **orig_path, int flags)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
	short int depth, i, ppos = 0;
	int ret;
	gfp_t gfp_flags = GFP_NOFS;

	if (flags & EXT4_EX_NOFAIL)
		gfp_flags |= __GFP_NOFAIL;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);
	if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
		EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
				 depth);
		ret = -EFSCORRUPTED;
		goto err;
	}

	if (path) {
		ext4_ext_drop_refs(path);
		if (depth > path[0].p_maxdepth) {
			kfree(path);
			*orig_path = path = NULL;
		}
	}
	if (!path) {
		/* account possible depth increase */
		path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
			       gfp_flags);
		if (unlikely(!path))
			return ERR_PTR(-ENOMEM);
		path[0].p_maxdepth = depth + 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
		ext4_cache_extents(inode, eh);
	/* walk through the tree */
	while (i) {
		ext_debug(inode, "depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
		if (IS_ERR(bh)) {
			ret = PTR_ERR(bh);
			goto err;
		}

		eh = ext_block_hdr(bh);
		ppos++;
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_free_ext_path(path);
	if (orig_path)
		*orig_path = NULL;
	return ERR_PTR(ret);
}
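
/*
 * Minimal usage sketch (error handling trimmed, names as in this file):
 * callers look up the leaf for a logical block, inspect path[depth],
 * and must free the path when done, since ext4_find_extent() pins one
 * buffer_head per tree level:
 *
 *	path = ext4_find_extent(inode, lblk, NULL, 0);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	depth = path->p_depth;
 *	ex = path[depth].p_ext;		(NULL for an empty leaf)
 *	...
 *	ext4_free_ext_path(path);
 */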

/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				 struct ext4_ext_path *curp,
				 int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EFSCORRUPTED;
	}

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d >= eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EFSCORRUPTED;
	}

	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		ext_debug(inode, "insert new index %d after: %llu\n",
			  logical, ptr);
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		ext_debug(inode, "insert new index %d before: %llu\n",
			  logical, ptr);
		ix = curp->p_idx;
	}

	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
		return -EFSCORRUPTED;
	}

	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
	BUG_ON(len < 0);
	if (len > 0) {
		ext_debug(inode, "insert new index %d: "
				"move %d indices from 0x%p to 0x%p\n",
				logical, len, ix, ix + 1);
		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EFSCORRUPTED;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}
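
/*
 * Worked example of the shift above (illustrative entries): inserting
 * logical block 200 into a node whose index entries start at 100, 300
 * and 500 (with curp->p_idx at the 100 entry) computes
 * ix = curp->p_idx + 1 and len = 2, so the 300 and 500 entries move one
 * slot right and the new (200 -> ptr) entry lands in the freed slot
 * before eh_entries is bumped.
 */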

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
			  unsigned int flags,
			  struct ext4_ext_path *path,
			  struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	gfp_t gfp_flags = GFP_NOFS;
	int err = 0;
	size_t ext_size = 0;

	if (flags & EXT4_EX_NOFAIL)
		gfp_flags |= __GFP_NOFAIL;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EFSCORRUPTED;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug(inode, "leaf will be split."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug(inode, "leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If error occurs, then we break processing
	 * and mark filesystem read-only. index won't
	 * be inserted and tree will be in consistent
	 * state. Next mount will repair buffers too.
	 */

	/*
	 * Get array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * upon them.
	 */
	ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err, flags);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EFSCORRUPTED;
		goto cleanup;
	}
	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
	if (unlikely(!bh)) {
		err = -ENOMEM;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
					     EXT4_JTR_NONE);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	neh->eh_generation = 0;

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EFSCORRUPTED;
		goto cleanup;
	}
	/* start copy from next extent */
	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
	ext4_ext_show_move(inode, path, newblock, depth);
	if (m) {
		struct ext4_extent *ex;
		ex = EXT_FIRST_EXTENT(neh);
		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	/* zero out unused area in the extent block */
	ext_size = sizeof(struct ext4_extent_header) +
		sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EFSCORRUPTED;
		goto cleanup;
	}
	if (k)
		ext_debug(inode, "create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (unlikely(!bh)) {
			err = -ENOMEM;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
						     EXT4_JTR_NONE);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		neh->eh_generation = 0;
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
			  i, newblock, le32_to_cpu(border), oldblock);

		/* move remainder of path[i] to the new index block */
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EFSCORRUPTED;
			goto cleanup;
		}
		/* start copy indexes */
		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
		ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
			  EXT_MAX_INDEX(path[i].p_hdr));
		ext4_ext_show_move(inode, path, newblock, i);
		if (m) {
			memmove(++fidx, path[i].p_idx,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		/* zero out unused area in the extent block */
		ext_size = sizeof(struct ext4_extent_header) +
			(sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
		memset(bh->b_data + ext_size, 0,
		       inode->i_sb->s_blocksize - ext_size);
		ext4_extent_block_csum_set(inode, neh);
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
				 unsigned int flags)
{
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock, goal = 0;
	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
	int err = 0;
	size_t ext_size = 0;

	/* Try to prepend new index to old one */
	if (ext_depth(inode))
		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
	if (goal > le32_to_cpu(es->s_first_data_block)) {
		flags |= EXT4_MB_HINT_TRY_GOAL;
		goal--;
	} else
		goal = ext4_inode_to_goal_block(inode);
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
	if (unlikely(!bh))
		return -ENOMEM;
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
					     EXT4_JTR_NONE);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	ext_size = sizeof(EXT4_I(inode)->i_data);
	/* move top-level index/leaf into new block */
	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
	/* zero out unused area in the extent block */
	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate e_max right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	set_buffer_verified(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* Update top-level index: num,max,pointer */
	neh = ext_inode_hdr(inode);
	neh->eh_entries = cpu_to_le16(1);
	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
	if (neh->eh_depth == 0) {
		/* Root extent block becomes index block */
		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
		EXT_FIRST_INDEX(neh)->ei_block =
			EXT_FIRST_EXTENT(neh)->ee_block;
	}
	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	le16_add_cpu(&neh->eh_depth, 1);
	err = ext4_mark_inode_dirty(handle, inode);
out:
	brelse(bh);

	return err;
}
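
/*
 * Depth-growth sketch (illustrative): a full depth-0 root holding 4
 * extents in i_data becomes, after one call, a depth-1 tree whose root
 * holds a single index pointing at a freshly allocated block that now
 * contains those 4 extents. The new block inherits the old root's
 * contents verbatim; only eh_max is recomputed for the larger
 * block-sized node, and the root itself is rewritten as an index node
 * of depth + 1.
 */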

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
				    unsigned int mb_flags,
				    unsigned int gb_flags,
				    struct ext4_ext_path **ppath,
				    struct ext4_extent *newext)
{
	struct ext4_ext_path *path = *ppath;
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up to the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		path = ext4_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    ppath, gb_flags);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
		if (err)
			goto out;

		/* refill path */
		path = ext4_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    ppath, gb_flags);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys.
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys.
 * the return value contains 0 (success) or an error code
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EFSCORRUPTED;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EFSCORRUPTED;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
				  le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
				  depth);
				return -EFSCORRUPTED;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EFSCORRUPTED;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * Search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys.
 * If none exists, return 0 with @phys set to 0. We return 1 when
 * we find an allocated block, in which case ret_ex is valid.
 * Or return a (< 0) error code.
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
				 struct ext4_extent *ret_ex)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EFSCORRUPTED;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EFSCORRUPTED;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EFSCORRUPTED;
			}
		}
		goto found_extent;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EFSCORRUPTED;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		goto found_extent;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	while (++depth < path->p_depth) {
		/* subtract from p_depth to get proper eh_depth */
		bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		eh = ext_block_hdr(bh);
		ix = EXT_FIRST_INDEX(eh);
		put_bh(bh);
	}

	bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
	if (IS_ERR(bh))
		return PTR_ERR(bh);
	eh = ext_block_hdr(bh);
	ex = EXT_FIRST_EXTENT(eh);
found_extent:
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	if (ret_ex)
		*ret_ex = *ex;
	if (bh)
		put_bh(bh);
	return 1;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCKS;

	while (depth >= 0) {
		struct ext4_ext_path *p = &path[depth];

		if (depth == path->p_depth) {
			/* leaf */
			if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
				return le32_to_cpu(p->p_ext[1].ee_block);
		} else {
			/* index */
			if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
				return le32_to_cpu(p->p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* zero-tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCKS;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCKS;
}
1698 | |
1699 | /* |
1700 | * ext4_ext_correct_indexes: |
1701 | * if leaf gets modified and modified extent is first in the leaf, |
1702 | * then we have to correct all indexes above. |
1703 | * TODO: do we need to correct tree in all cases? |
1704 | */ |
1705 | static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, |
1706 | struct ext4_ext_path *path) |
1707 | { |
1708 | struct ext4_extent_header *eh; |
1709 | int depth = ext_depth(inode); |
1710 | struct ext4_extent *ex; |
1711 | __le32 border; |
1712 | int k, err = 0; |
1713 | |
1714 | eh = path[depth].p_hdr; |
1715 | ex = path[depth].p_ext; |
1716 | |
1717 | if (unlikely(ex == NULL || eh == NULL)) { |
1718 | EXT4_ERROR_INODE(inode, |
1719 | "ex %p == NULL or eh %p == NULL" , ex, eh); |
1720 | return -EFSCORRUPTED; |
1721 | } |
1722 | |
1723 | if (depth == 0) { |
1724 | /* there is no tree at all */ |
1725 | return 0; |
1726 | } |
1727 | |
1728 | if (ex != EXT_FIRST_EXTENT(eh)) { |
1729 | /* we correct tree if first leaf got modified only */ |
1730 | return 0; |
1731 | } |
1732 | |
1733 | /* |
1734 | * TODO: we need correction if border is smaller than current one |
1735 | */ |
1736 | k = depth - 1; |
1737 | border = path[depth].p_ext->ee_block; |
err = ext4_ext_get_access(handle, inode, path + k);
1739 | if (err) |
1740 | return err; |
1741 | path[k].p_idx->ei_block = border; |
1742 | err = ext4_ext_dirty(handle, inode, path + k); |
1743 | if (err) |
1744 | return err; |
1745 | |
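/*
 * Propagate the new border towards the root for as long as the
 * just-updated index is the first entry in its index block.
 */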
1746 | while (k--) { |
1747 | /* change all left-side indexes */ |
1748 | if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) |
1749 | break; |
err = ext4_ext_get_access(handle, inode, path + k);
1751 | if (err) |
1752 | break; |
1753 | path[k].p_idx->ei_block = border; |
1754 | err = ext4_ext_dirty(handle, inode, path + k); |
1755 | if (err) |
1756 | break; |
1757 | } |
1758 | |
1759 | return err; |
1760 | } |
1761 | |
1762 | static int ext4_can_extents_be_merged(struct inode *inode, |
1763 | struct ext4_extent *ex1, |
1764 | struct ext4_extent *ex2) |
1765 | { |
1766 | unsigned short ext1_ee_len, ext2_ee_len; |
1767 | |
if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
return 0;

ext1_ee_len = ext4_ext_get_actual_len(ex1);
ext2_ee_len = ext4_ext_get_actual_len(ex2);
1773 | |
1774 | if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != |
1775 | le32_to_cpu(ex2->ee_block)) |
1776 | return 0; |
1777 | |
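/*
 * The merged length must still fit in ee_len: initialized extents
 * are capped at EXT_INIT_MAX_LEN, and unwritten extents at
 * EXT_UNWRITTEN_MAX_LEN, since the high bit of ee_len is used to
 * mark an extent unwritten.
 */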
1778 | if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) |
1779 | return 0; |
1780 | |
if (ext4_ext_is_unwritten(ex1) &&
1782 | ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) |
1783 | return 0; |
1784 | #ifdef AGGRESSIVE_TEST |
1785 | if (ext1_ee_len >= 4) |
1786 | return 0; |
1787 | #endif |
1788 | |
if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1790 | return 1; |
1791 | return 0; |
1792 | } |
1793 | |
/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards the right. If you want to merge towards
 * the left, pass "ex - 1" as argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
1801 | static int ext4_ext_try_to_merge_right(struct inode *inode, |
1802 | struct ext4_ext_path *path, |
1803 | struct ext4_extent *ex) |
1804 | { |
1805 | struct ext4_extent_header *eh; |
1806 | unsigned int depth, len; |
1807 | int merge_done = 0, unwritten; |
1808 | |
1809 | depth = ext_depth(inode); |
1810 | BUG_ON(path[depth].p_hdr == NULL); |
1811 | eh = path[depth].p_hdr; |
1812 | |
while (ex < EXT_LAST_EXTENT(eh)) {
if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
break;
/* merge with next extent! */
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(ex + 1));
if (unwritten)
ext4_ext_mark_unwritten(ex);

if (ex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - ex - 1)
* sizeof(struct ext4_extent);
memmove(ex + 1, ex + 2, len);
}
le16_add_cpu(&eh->eh_entries, -1);
merge_done = 1;
WARN_ON(eh->eh_entries == 0);
if (!eh->eh_entries)
EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1833 | } |
1834 | |
1835 | return merge_done; |
1836 | } |
1837 | |
1838 | /* |
1839 | * This function does a very simple check to see if we can collapse |
1840 | * an extent tree with a single extent tree leaf block into the inode. |
1841 | */ |
1842 | static void ext4_ext_try_to_merge_up(handle_t *handle, |
1843 | struct inode *inode, |
1844 | struct ext4_ext_path *path) |
1845 | { |
1846 | size_t s; |
unsigned max_root = ext4_ext_space_root(inode, 0);
1848 | ext4_fsblk_t blk; |
1849 | |
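/*
 * Collapsing is only possible when the tree has depth 1, the root
 * holds a single index, and the lone leaf's entries fit in the
 * space available in the inode's i_data root.
 */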
1850 | if ((path[0].p_depth != 1) || |
1851 | (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || |
1852 | (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) |
1853 | return; |
1854 | |
1855 | /* |
1856 | * We need to modify the block allocation bitmap and the block |
1857 | * group descriptor to release the extent tree block. If we |
1858 | * can't get the journal credits, give up. |
1859 | */ |
if (ext4_journal_extend(handle, 2,
ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1862 | return; |
1863 | |
1864 | /* |
1865 | * Copy the extent data up to the inode |
1866 | */ |
blk = ext4_idx_pblock(path[0].p_idx);
1868 | s = le16_to_cpu(path[1].p_hdr->eh_entries) * |
1869 | sizeof(struct ext4_extent_idx); |
1870 | s += sizeof(struct ext4_extent_header); |
1871 | |
1872 | path[1].p_maxdepth = path[0].p_maxdepth; |
1873 | memcpy(path[0].p_hdr, path[1].p_hdr, s); |
1874 | path[0].p_depth = 0; |
1875 | path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + |
1876 | (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); |
1877 | path[0].p_hdr->eh_max = cpu_to_le16(max_root); |
1878 | |
brelse(path[1].p_bh);
ext4_free_blocks(handle, inode, NULL, blk, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1882 | } |
1883 | |
1884 | /* |
1885 | * This function tries to merge the @ex extent to neighbours in the tree, then |
1886 | * tries to collapse the extent tree into the inode. |
1887 | */ |
1888 | static void ext4_ext_try_to_merge(handle_t *handle, |
1889 | struct inode *inode, |
1890 | struct ext4_ext_path *path, |
1891 | struct ext4_extent *ex) |
1892 | { |
1893 | struct ext4_extent_header *eh; |
1894 | unsigned int depth; |
1895 | int merge_done = 0; |
1896 | |
1897 | depth = ext_depth(inode); |
1898 | BUG_ON(path[depth].p_hdr == NULL); |
1899 | eh = path[depth].p_hdr; |
1900 | |
if (ex > EXT_FIRST_EXTENT(eh))
merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);

if (!merge_done)
(void) ext4_ext_try_to_merge_right(inode, path, ex);

ext4_ext_try_to_merge_up(handle, inode, path);
1908 | } |
1909 | |
1910 | /* |
1911 | * check if a portion of the "newext" extent overlaps with an |
1912 | * existing extent. |
1913 | * |
1914 | * If there is an overlap discovered, it updates the length of the newext |
1915 | * such that there will be no overlap, and then returns 1. |
1916 | * If there is no overlap found, it returns 0. |
1917 | */ |
1918 | static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, |
1919 | struct inode *inode, |
1920 | struct ext4_extent *newext, |
1921 | struct ext4_ext_path *path) |
1922 | { |
1923 | ext4_lblk_t b1, b2; |
1924 | unsigned int depth, len1; |
1925 | unsigned int ret = 0; |
1926 | |
1927 | b1 = le32_to_cpu(newext->ee_block); |
len1 = ext4_ext_get_actual_len(newext);
1929 | depth = ext_depth(inode); |
1930 | if (!path[depth].p_ext) |
1931 | goto out; |
1932 | b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); |
1933 | |
1934 | /* |
1935 | * get the next allocated block if the extent in the path |
1936 | * is before the requested block(s) |
1937 | */ |
1938 | if (b2 < b1) { |
1939 | b2 = ext4_ext_next_allocated_block(path); |
1940 | if (b2 == EXT_MAX_BLOCKS) |
1941 | goto out; |
1942 | b2 = EXT4_LBLK_CMASK(sbi, b2); |
1943 | } |
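/*
 * On bigalloc file systems, b2 is rounded down to a cluster
 * boundary: a new extent must not claim blocks in a cluster that
 * already backs another extent. (EXT4_LBLK_CMASK is a no-op when
 * the cluster ratio is 1.)
 */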
1944 | |
/* check for wrap through zero on extent logical start block */
1946 | if (b1 + len1 < b1) { |
1947 | len1 = EXT_MAX_BLOCKS - b1; |
1948 | newext->ee_len = cpu_to_le16(len1); |
1949 | ret = 1; |
1950 | } |
1951 | |
1952 | /* check for overlap */ |
1953 | if (b1 + len1 > b2) { |
1954 | newext->ee_len = cpu_to_le16(b2 - b1); |
1955 | ret = 1; |
1956 | } |
1957 | out: |
1958 | return ret; |
1959 | } |
1960 | |
/*
 * ext4_ext_insert_extent:
 * tries to merge the requested extent into an existing extent or
 * inserts the requested extent as a new one into the tree,
 * creating a new leaf in the no-space case.
 */
1967 | int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, |
1968 | struct ext4_ext_path **ppath, |
1969 | struct ext4_extent *newext, int gb_flags) |
1970 | { |
1971 | struct ext4_ext_path *path = *ppath; |
1972 | struct ext4_extent_header *eh; |
1973 | struct ext4_extent *ex, *fex; |
1974 | struct ext4_extent *nearex; /* nearest extent */ |
1975 | struct ext4_ext_path *npath = NULL; |
1976 | int depth, len, err; |
1977 | ext4_lblk_t next; |
1978 | int mb_flags = 0, unwritten; |
1979 | |
1980 | if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
1981 | mb_flags |= EXT4_MB_DELALLOC_RESERVED; |
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1984 | return -EFSCORRUPTED; |
1985 | } |
1986 | depth = ext_depth(inode); |
1987 | ex = path[depth].p_ext; |
1988 | eh = path[depth].p_hdr; |
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1991 | return -EFSCORRUPTED; |
1992 | } |
1993 | |
1994 | /* try to insert block into found extent and return */ |
1995 | if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { |
1996 | |
/*
 * Try to see whether we should rather test the extent to the
 * right of ex, or the one to its left. This is because
 * ext4_find_extent() can return either an extent on the
 * left, or on the right from the searched position. This
 * will make merging more effective.
 */
if (ex < EXT_LAST_EXTENT(eh) &&
(le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex) <
le32_to_cpu(newext->ee_block))) {
ex += 1;
goto prepend;
} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
(le32_to_cpu(newext->ee_block) +
ext4_ext_get_actual_len(newext) <
le32_to_cpu(ex->ee_block)))
ex -= 1;
2015 | |
/* Try to append newex to the ex */
if (ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
"(from %llu)\n",
ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
return err;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
nearex = ex;
goto merge;
2037 | } |
2038 | |
2039 | prepend: |
/* Try to prepend newex to the ex */
if (ext4_can_extents_be_merged(inode, newext, ex)) {
ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
"(from %llu)\n",
le32_to_cpu(newext->ee_block),
ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
return err;

unwritten = ext4_ext_is_unwritten(ex);
ex->ee_block = newext->ee_block;
ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
nearex = ex;
goto merge;
2065 | } |
2066 | } |
2067 | |
2068 | depth = ext_depth(inode); |
2069 | eh = path[depth].p_hdr; |
2070 | if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) |
2071 | goto has_space; |
2072 | |
2073 | /* probably next leaf has space for us? */ |
2074 | fex = EXT_LAST_EXTENT(eh); |
2075 | next = EXT_MAX_BLOCKS; |
2076 | if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) |
2077 | next = ext4_ext_next_leaf_block(path); |
if (next != EXT_MAX_BLOCKS) {
ext_debug(inode, "next leaf block - %u\n", next);
BUG_ON(npath != NULL);
npath = ext4_find_extent(inode, next, NULL, gb_flags);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
ext_debug(inode, "next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
path = npath;
goto has_space;
}
ext_debug(inode, "next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2094 | } |
2095 | |
/*
 * There is no free space in the found leaf,
 * so we are going to add a new leaf to the tree.
 */
2100 | if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) |
2101 | mb_flags |= EXT4_MB_USE_RESERVED; |
2102 | err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, |
2103 | ppath, newext); |
2104 | if (err) |
2105 | goto cleanup; |
2106 | depth = ext_depth(inode); |
2107 | eh = path[depth].p_hdr; |
2108 | |
2109 | has_space: |
2110 | nearex = path[depth].p_ext; |
2111 | |
err = ext4_ext_get_access(handle, inode, path + depth);
2113 | if (err) |
2114 | goto cleanup; |
2115 | |
2116 | if (!nearex) { |
2117 | /* there is no extent in this leaf, create first one */ |
2118 | ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n" , |
2119 | le32_to_cpu(newext->ee_block), |
2120 | ext4_ext_pblock(newext), |
2121 | ext4_ext_is_unwritten(newext), |
2122 | ext4_ext_get_actual_len(newext)); |
2123 | nearex = EXT_FIRST_EXTENT(eh); |
2124 | } else { |
2125 | if (le32_to_cpu(newext->ee_block) |
2126 | > le32_to_cpu(nearex->ee_block)) { |
2127 | /* Insert after */ |
2128 | ext_debug(inode, "insert %u:%llu:[%d]%d before: " |
2129 | "nearest %p\n" , |
2130 | le32_to_cpu(newext->ee_block), |
2131 | ext4_ext_pblock(newext), |
2132 | ext4_ext_is_unwritten(newext), |
2133 | ext4_ext_get_actual_len(newext), |
2134 | nearex); |
2135 | nearex++; |
2136 | } else { |
2137 | /* Insert before */ |
2138 | BUG_ON(newext->ee_block == nearex->ee_block); |
2139 | ext_debug(inode, "insert %u:%llu:[%d]%d after: " |
2140 | "nearest %p\n" , |
2141 | le32_to_cpu(newext->ee_block), |
2142 | ext4_ext_pblock(newext), |
2143 | ext4_ext_is_unwritten(newext), |
2144 | ext4_ext_get_actual_len(newext), |
2145 | nearex); |
2146 | } |
2147 | len = EXT_LAST_EXTENT(eh) - nearex + 1; |
2148 | if (len > 0) { |
2149 | ext_debug(inode, "insert %u:%llu:[%d]%d: " |
2150 | "move %d extents from 0x%p to 0x%p\n" , |
2151 | le32_to_cpu(newext->ee_block), |
2152 | ext4_ext_pblock(newext), |
2153 | ext4_ext_is_unwritten(newext), |
2154 | ext4_ext_get_actual_len(newext), |
2155 | len, nearex, nearex + 1); |
2156 | memmove(nearex + 1, nearex, |
2157 | len * sizeof(struct ext4_extent)); |
2158 | } |
2159 | } |
2160 | |
le16_add_cpu(&eh->eh_entries, 1);
path[depth].p_ext = nearex;
nearex->ee_block = newext->ee_block;
ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2165 | nearex->ee_len = newext->ee_len; |
2166 | |
2167 | merge: |
2168 | /* try to merge extents */ |
if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(handle, inode, path, nearex);

2173 | /* time to correct all indexes above */ |
2174 | err = ext4_ext_correct_indexes(handle, inode, path); |
2175 | if (err) |
2176 | goto cleanup; |
2177 | |
2178 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
2179 | |
2180 | cleanup: |
ext4_free_ext_path(npath);
2182 | return err; |
2183 | } |
2184 | |
2185 | static int ext4_fill_es_cache_info(struct inode *inode, |
2186 | ext4_lblk_t block, ext4_lblk_t num, |
2187 | struct fiemap_extent_info *fieinfo) |
2188 | { |
2189 | ext4_lblk_t next, end = block + num - 1; |
2190 | struct extent_status es; |
2191 | unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; |
2192 | unsigned int flags; |
2193 | int err; |
2194 | |
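/*
 * Walk the extent status cache, translating each cached range in
 * [block, end] into a fiemap record until the cache runs out or
 * the caller's buffer is full.
 */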
2195 | while (block <= end) { |
2196 | next = 0; |
2197 | flags = 0; |
if (!ext4_es_lookup_extent(inode, block, &next, &es))
break;
if (ext4_es_is_unwritten(&es))
flags |= FIEMAP_EXTENT_UNWRITTEN;
if (ext4_es_is_delayed(&es))
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
if (ext4_es_is_hole(&es))
flags |= EXT4_FIEMAP_EXTENT_HOLE;
if (next == 0)
flags |= FIEMAP_EXTENT_LAST;
if (flags & (FIEMAP_EXTENT_DELALLOC|
EXT4_FIEMAP_EXTENT_HOLE))
es.es_pblk = 0;
else
es.es_pblk = ext4_es_pblock(&es);
err = fiemap_fill_next_extent(fieinfo,
(__u64)es.es_lblk << blksize_bits,
(__u64)es.es_pblk << blksize_bits,
(__u64)es.es_len << blksize_bits,
flags);
2219 | if (next == 0) |
2220 | break; |
2221 | block = next; |
2222 | if (err < 0) |
2223 | return err; |
2224 | if (err == 1) |
2225 | return 0; |
2226 | } |
2227 | return 0; |
2228 | } |
2229 | |
2230 | |
/*
 * ext4_ext_find_hole - find hole around given block according to the given path
 * @inode: inode we lookup in
 * @path: path in extent tree to @lblk
 * @lblk: pointer to logical block around which we want to determine hole
 *
 * Determine hole length (and start if easily possible) around given logical
 * block. We don't try too hard to find the beginning of the hole, but if
 * @path actually points to the extent before @lblk, we provide it.
 *
 * The function returns the length of a hole starting at @lblk. We update @lblk
 * to the beginning of the hole if we managed to find it.
 */
2244 | static ext4_lblk_t ext4_ext_find_hole(struct inode *inode, |
2245 | struct ext4_ext_path *path, |
2246 | ext4_lblk_t *lblk) |
2247 | { |
2248 | int depth = ext_depth(inode); |
2249 | struct ext4_extent *ex; |
2250 | ext4_lblk_t len; |
2251 | |
2252 | ex = path[depth].p_ext; |
2253 | if (ex == NULL) { |
2254 | /* there is no extent yet, so gap is [0;-] */ |
2255 | *lblk = 0; |
2256 | len = EXT_MAX_BLOCKS; |
} else if (*lblk < le32_to_cpu(ex->ee_block)) {
len = le32_to_cpu(ex->ee_block) - *lblk;
} else if (*lblk >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;

*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2264 | next = ext4_ext_next_allocated_block(path); |
2265 | BUG_ON(next == *lblk); |
2266 | len = next - *lblk; |
2267 | } else { |
2268 | BUG(); |
2269 | } |
2270 | return len; |
2271 | } |
2272 | |
2273 | /* |
2274 | * ext4_ext_rm_idx: |
2275 | * removes index from the index block. |
2276 | */ |
2277 | static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, |
2278 | struct ext4_ext_path *path, int depth) |
2279 | { |
2280 | int err; |
2281 | ext4_fsblk_t leaf; |
2282 | |
2283 | /* free index block */ |
2284 | depth--; |
2285 | path = path + depth; |
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2289 | return -EFSCORRUPTED; |
2290 | } |
2291 | err = ext4_ext_get_access(handle, inode, path); |
2292 | if (err) |
2293 | return err; |
2294 | |
2295 | if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { |
2296 | int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; |
2297 | len *= sizeof(struct ext4_extent_idx); |
2298 | memmove(path->p_idx, path->p_idx + 1, len); |
2299 | } |
2300 | |
le16_add_cpu(&path->p_hdr->eh_entries, -1);
err = ext4_ext_dirty(handle, inode, path);
if (err)
return err;
ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
trace_ext4_ext_rm_idx(inode, leaf);

ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2310 | |
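/*
 * If the removed index was the first entry at this level, the key
 * in the parent index blocks is now stale; walk upwards and copy
 * the new first block into each parent that pointed at it.
 */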
2311 | while (--depth >= 0) { |
2312 | if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) |
2313 | break; |
2314 | path--; |
2315 | err = ext4_ext_get_access(handle, inode, path); |
2316 | if (err) |
2317 | break; |
2318 | path->p_idx->ei_block = (path+1)->p_idx->ei_block; |
2319 | err = ext4_ext_dirty(handle, inode, path); |
2320 | if (err) |
2321 | break; |
2322 | } |
2323 | return err; |
2324 | } |
2325 | |
/*
 * ext4_ext_calc_credits_for_single_extent:
 * This routine returns the max. credits needed to insert an extent
 * into the extent tree.
 * When passing the actual path, the caller should calculate the credits
 * under i_data_sem.
 */
2333 | int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, |
2334 | struct ext4_ext_path *path) |
2335 | { |
2336 | if (path) { |
2337 | int depth = ext_depth(inode); |
2338 | int ret = 0; |
2339 | |
/* probably there is space in the leaf? */
if (le16_to_cpu(path[depth].p_hdr->eh_entries)
< le16_to_cpu(path[depth].p_hdr->eh_max)) {

/*
 * There is some space in the leaf, so there is no
 * need to account for the leaf block credit.
 *
 * Bitmaps, block group descriptor blocks and other
 * metadata blocks still need to be accounted for.
 */
/* 1 bitmap, 1 block group descriptor */
ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
return ret;
}
2356 | } |
2357 | |
2358 | return ext4_chunk_trans_blocks(inode, nrblocks); |
2359 | } |
2360 | |
/*
 * How many index/leaf blocks need to change/allocate to add @extents extents?
 *
 * If we add a single extent, then in the worst case, each tree level
 * index/leaf needs to be changed in case of the tree split.
 *
 * If more extents are inserted, they could cause the whole tree to split more
 * than once, but this is really rare.
 */
2370 | int ext4_ext_index_trans_blocks(struct inode *inode, int extents) |
2371 | { |
2372 | int index; |
2373 | int depth; |
2374 | |
2375 | /* If we are converting the inline data, only one is needed here. */ |
2376 | if (ext4_has_inline_data(inode)) |
2377 | return 1; |
2378 | |
2379 | depth = ext_depth(inode); |
2380 | |
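/*
 * Allow roughly two modified index/leaf blocks per tree level for
 * a single extent (a split can touch one block and allocate a
 * sibling at each level), and three per level when several extents
 * may split the tree more than once.
 */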
2381 | if (extents <= 1) |
2382 | index = depth * 2; |
2383 | else |
2384 | index = depth * 3; |
2385 | |
2386 | return index; |
2387 | } |
2388 | |
2389 | static inline int get_default_free_blocks_flags(struct inode *inode) |
2390 | { |
2391 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || |
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2393 | return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; |
2394 | else if (ext4_should_journal_data(inode)) |
2395 | return EXT4_FREE_BLOCKS_FORGET; |
2396 | return 0; |
2397 | } |
2398 | |
2399 | /* |
2400 | * ext4_rereserve_cluster - increment the reserved cluster count when |
2401 | * freeing a cluster with a pending reservation |
2402 | * |
2403 | * @inode - file containing the cluster |
2404 | * @lblk - logical block in cluster to be reserved |
2405 | * |
2406 | * Increments the reserved cluster count and adjusts quota in a bigalloc |
2407 | * file system when freeing a partial cluster containing at least one |
2408 | * delayed and unwritten block. A partial cluster meeting that |
2409 | * requirement will have a pending reservation. If so, the |
2410 | * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to |
2411 | * defer reserved and allocated space accounting to a subsequent call |
2412 | * to this function. |
2413 | */ |
2414 | static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) |
2415 | { |
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);

dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));

spin_lock(&ei->i_block_reservation_lock);
ei->i_reserved_data_blocks++;
percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
spin_unlock(&ei->i_block_reservation_lock);

percpu_counter_add(&sbi->s_freeclusters_counter, 1);
ext4_remove_pending(inode, lblk);
2428 | } |
2429 | |
2430 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
2431 | struct ext4_extent *ex, |
2432 | struct partial_cluster *partial, |
2433 | ext4_lblk_t from, ext4_lblk_t to) |
2434 | { |
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
2437 | ext4_fsblk_t last_pblk, pblk; |
2438 | ext4_lblk_t num; |
2439 | int flags; |
2440 | |
2441 | /* only extent tail removal is allowed */ |
if (from < le32_to_cpu(ex->ee_block) ||
to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
ext4_error(sbi->s_sb,
"strange request: removal(2) %u-%u from %u:%u",
from, to, le32_to_cpu(ex->ee_block), ee_len);
2447 | return 0; |
2448 | } |
2449 | |
2450 | #ifdef EXTENTS_STATS |
2451 | spin_lock(&sbi->s_ext_stats_lock); |
2452 | sbi->s_ext_blocks += ee_len; |
2453 | sbi->s_ext_extents++; |
2454 | if (ee_len < sbi->s_ext_min) |
2455 | sbi->s_ext_min = ee_len; |
2456 | if (ee_len > sbi->s_ext_max) |
2457 | sbi->s_ext_max = ee_len; |
2458 | if (ext_depth(inode) > sbi->s_depth_max) |
2459 | sbi->s_depth_max = ext_depth(inode); |
2460 | spin_unlock(&sbi->s_ext_stats_lock); |
2461 | #endif |
2462 | |
trace_ext4_remove_blocks(inode, ex, from, to, partial);
2464 | |
2465 | /* |
2466 | * if we have a partial cluster, and it's different from the |
2467 | * cluster of the last block in the extent, we free it |
2468 | */ |
2469 | last_pblk = ext4_ext_pblock(ex) + ee_len - 1; |
2470 | |
if (partial->state != initial &&
partial->pclu != EXT4_B2C(sbi, last_pblk)) {
if (partial->state == tofree) {
flags = get_default_free_blocks_flags(inode);
if (ext4_is_pending(inode, partial->lblk))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, partial->pclu),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, partial->lblk);
}
partial->state = initial;
}
2485 | |
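/*
 * num is the number of blocks being removed from the tail of the
 * extent; pblk is the physical block where that tail begins.
 */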
2486 | num = le32_to_cpu(ex->ee_block) + ee_len - from; |
2487 | pblk = ext4_ext_pblock(ex) + ee_len - num; |
2488 | |
2489 | /* |
2490 | * We free the partial cluster at the end of the extent (if any), |
2491 | * unless the cluster is used by another extent (partial_cluster |
2492 | * state is nofree). If a partial cluster exists here, it must be |
2493 | * shared with the last block in the extent. |
2494 | */ |
2495 | flags = get_default_free_blocks_flags(inode); |
2496 | |
2497 | /* partial, left end cluster aligned, right end unaligned */ |
if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
(EXT4_LBLK_CMASK(sbi, to) >= from) &&
(partial->state != nofree)) {
if (ext4_is_pending(inode, to))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_PBLK_CMASK(sbi, last_pblk),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, to);
partial->state = initial;
flags = get_default_free_blocks_flags(inode);
2510 | } |
2511 | |
2512 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; |
2513 | |
2514 | /* |
2515 | * For bigalloc file systems, we never free a partial cluster |
2516 | * at the beginning of the extent. Instead, we check to see if we |
2517 | * need to free it on a subsequent call to ext4_remove_blocks, |
2518 | * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. |
2519 | */ |
2520 | flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; |
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2522 | |
2523 | /* reset the partial cluster if we've freed past it */ |
2524 | if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) |
2525 | partial->state = initial; |
2526 | |
2527 | /* |
2528 | * If we've freed the entire extent but the beginning is not left |
2529 | * cluster aligned and is not marked as ineligible for freeing we |
2530 | * record the partial cluster at the beginning of the extent. It |
2531 | * wasn't freed by the preceding ext4_free_blocks() call, and we |
2532 | * need to look farther to the left to determine if it's to be freed |
2533 | * (not shared with another extent). Else, reset the partial |
2534 | * cluster - we're either done freeing or the beginning of the |
2535 | * extent is left cluster aligned. |
2536 | */ |
2537 | if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { |
2538 | if (partial->state == initial) { |
2539 | partial->pclu = EXT4_B2C(sbi, pblk); |
2540 | partial->lblk = from; |
2541 | partial->state = tofree; |
2542 | } |
2543 | } else { |
2544 | partial->state = initial; |
2545 | } |
2546 | |
2547 | return 0; |
2548 | } |
2549 | |
/*
 * ext4_ext_rm_leaf() Removes the extents associated with the
 * blocks appearing between "start" and "end". Both "start"
 * and "end" must appear in the same extent or EIO is returned.
 *
 * @handle: The journal handle
 * @inode: The file's inode
 * @path: The path to the leaf
 * @partial: The cluster which we'll have to free if all extents
 *           have been released from it. However, if its state is
 *           nofree, it's a cluster just to the right of the
 *           punched region and it must not be freed.
 * @start: The first block to remove
 * @end: The last block to remove
 */
2565 | static int |
2566 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
2567 | struct ext4_ext_path *path, |
2568 | struct partial_cluster *partial, |
2569 | ext4_lblk_t start, ext4_lblk_t end) |
2570 | { |
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2572 | int err = 0, correct_index = 0; |
2573 | int depth = ext_depth(inode), credits, revoke_credits; |
2574 | struct ext4_extent_header *eh; |
2575 | ext4_lblk_t a, b; |
2576 | unsigned num; |
2577 | ext4_lblk_t ex_ee_block; |
2578 | unsigned short ex_ee_len; |
2579 | unsigned unwritten = 0; |
2580 | struct ext4_extent *ex; |
2581 | ext4_fsblk_t pblk; |
2582 | |
2583 | /* the header must be checked already in ext4_ext_remove_space() */ |
2584 | ext_debug(inode, "truncate since %u in leaf to %u\n" , start, end); |
2585 | if (!path[depth].p_hdr) |
2586 | path[depth].p_hdr = ext_block_hdr(bh: path[depth].p_bh); |
2587 | eh = path[depth].p_hdr; |
2588 | if (unlikely(path[depth].p_hdr == NULL)) { |
2589 | EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL" , depth); |
2590 | return -EFSCORRUPTED; |
2591 | } |
2592 | /* find where to start removing */ |
2593 | ex = path[depth].p_ext; |
2594 | if (!ex) |
2595 | ex = EXT_LAST_EXTENT(eh); |
2596 | |
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);

trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2601 | |
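/*
 * Scan the leaf from the rightmost extent towards the first one,
 * so whole extents can be dropped and partially covered extents
 * can have their tails trimmed in place.
 */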
2602 | while (ex >= EXT_FIRST_EXTENT(eh) && |
2603 | ex_ee_block + ex_ee_len > start) { |
2604 | |
if (ext4_ext_is_unwritten(ex))
unwritten = 1;
else
unwritten = 0;

ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
unwritten, ex_ee_len);
path[depth].p_ext = ex;

a = max(ex_ee_block, start);
b = min(ex_ee_block + ex_ee_len - 1, end);

ext_debug(inode, " border %u:%u\n", a, b);
2618 | |
2619 | /* If this extent is beyond the end of the hole, skip it */ |
2620 | if (end < ex_ee_block) { |
2621 | /* |
2622 | * We're going to skip this extent and move to another, |
2623 | * so note that its first cluster is in use to avoid |
2624 | * freeing it when removing blocks. Eventually, the |
2625 | * right edge of the truncated/punched region will |
2626 | * be just to the left. |
2627 | */ |
2628 | if (sbi->s_cluster_ratio > 1) { |
2629 | pblk = ext4_ext_pblock(ex); |
2630 | partial->pclu = EXT4_B2C(sbi, pblk); |
2631 | partial->state = nofree; |
2632 | } |
2633 | ex--; |
2634 | ex_ee_block = le32_to_cpu(ex->ee_block); |
ex_ee_len = ext4_ext_get_actual_len(ex);
2636 | continue; |
2637 | } else if (b != ex_ee_block + ex_ee_len - 1) { |
2638 | EXT4_ERROR_INODE(inode, |
2639 | "can not handle truncate %u:%u " |
2640 | "on extent %u:%u" , |
2641 | start, end, ex_ee_block, |
2642 | ex_ee_block + ex_ee_len - 1); |
2643 | err = -EFSCORRUPTED; |
2644 | goto out; |
2645 | } else if (a != ex_ee_block) { |
2646 | /* remove tail of the extent */ |
2647 | num = a - ex_ee_block; |
2648 | } else { |
2649 | /* remove whole extent: excellent! */ |
2650 | num = 0; |
2651 | } |
2652 | /* |
2653 | * 3 for leaf, sb, and inode plus 2 (bmap and group |
2654 | * descriptor) for each block group; assume two block |
2655 | * groups plus ex_ee_len/blocks_per_block_group for |
2656 | * the worst case |
2657 | */ |
2658 | credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); |
2659 | if (ex == EXT_FIRST_EXTENT(eh)) { |
2660 | correct_index = 1; |
2661 | credits += (ext_depth(inode)) + 1; |
2662 | } |
2663 | credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); |
2664 | /* |
2665 | * We may end up freeing some index blocks and data from the |
2666 | * punched range. Note that partial clusters are accounted for |
2667 | * by ext4_free_data_revoke_credits(). |
2668 | */ |
revoke_credits =
ext4_free_metadata_revoke_credits(inode->i_sb,
ext_depth(inode)) +
ext4_free_data_revoke_credits(inode, b - a + 1);

err = ext4_datasem_ensure_credits(handle, inode, credits,
credits, revoke_credits);
2676 | if (err) { |
2677 | if (err > 0) |
2678 | err = -EAGAIN; |
2679 | goto out; |
2680 | } |
2681 | |
err = ext4_ext_get_access(handle, inode, path + depth);
2683 | if (err) |
2684 | goto out; |
2685 | |
err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2687 | if (err) |
2688 | goto out; |
2689 | |
if (num == 0)
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);

ex->ee_len = cpu_to_le16(num);
/*
 * Do not mark unwritten if all the blocks in the
 * extent have been removed.
 */
if (unwritten && num)
ext4_ext_mark_unwritten(ex);
2701 | /* |
2702 | * If the extent was completely released, |
2703 | * we need to remove it from the leaf |
2704 | */ |
2705 | if (num == 0) { |
if (end != EXT_MAX_BLOCKS - 1) {
/*
 * For hole punching, we need to scoot all the
 * extents up when an extent is removed so that
 * we don't have blank extents in the middle.
 */
memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
sizeof(struct ext4_extent));

/* Now get rid of the one at the end */
memset(EXT_LAST_EXTENT(eh), 0,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
2720 | } |
2721 | |
2722 | err = ext4_ext_dirty(handle, inode, path + depth); |
2723 | if (err) |
2724 | goto out; |
2725 | |
2726 | ext_debug(inode, "new extent: %u:%u:%llu\n" , ex_ee_block, num, |
2727 | ext4_ext_pblock(ex)); |
2728 | ex--; |
2729 | ex_ee_block = le32_to_cpu(ex->ee_block); |
2730 | ex_ee_len = ext4_ext_get_actual_len(ext: ex); |
2731 | } |
2732 | |
2733 | if (correct_index && eh->eh_entries) |
2734 | err = ext4_ext_correct_indexes(handle, inode, path); |
2735 | |
2736 | /* |
2737 | * If there's a partial cluster and at least one extent remains in |
2738 | * the leaf, free the partial cluster if it isn't shared with the |
2739 | * current extent. If it is shared with the current extent |
2740 | * we reset the partial cluster because we've reached the start of the |
2741 | * truncated/punched region and we're done removing blocks. |
2742 | */ |
2743 | if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { |
2744 | pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; |
2745 | if (partial->pclu != EXT4_B2C(sbi, pblk)) { |
2746 | int flags = get_default_free_blocks_flags(inode); |
2747 | |
if (ext4_is_pending(inode, partial->lblk))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, partial->pclu),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, partial->lblk);
2755 | } |
2756 | partial->state = initial; |
2757 | } |
2758 | |
2759 | /* if this leaf is free, then we should |
2760 | * remove it from index block above */ |
2761 | if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) |
2762 | err = ext4_ext_rm_idx(handle, inode, path, depth); |
2763 | |
2764 | out: |
2765 | return err; |
2766 | } |
2767 | |
2768 | /* |
2769 | * ext4_ext_more_to_rm: |
2770 | * returns 1 if current index has to be freed (even partial) |
2771 | */ |
2772 | static int |
2773 | ext4_ext_more_to_rm(struct ext4_ext_path *path) |
2774 | { |
2775 | BUG_ON(path->p_idx == NULL); |
2776 | |
2777 | if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) |
2778 | return 0; |
2779 | |
/*
 * if a truncate on a deeper level happened, it wasn't partial,
 * so we have to consider the current index for truncation
 */
2784 | if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) |
2785 | return 0; |
2786 | return 1; |
2787 | } |
2788 | |
2789 | int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, |
2790 | ext4_lblk_t end) |
2791 | { |
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2793 | int depth = ext_depth(inode); |
2794 | struct ext4_ext_path *path = NULL; |
2795 | struct partial_cluster partial; |
2796 | handle_t *handle; |
2797 | int i = 0, err = 0; |
2798 | |
2799 | partial.pclu = 0; |
2800 | partial.lblk = 0; |
2801 | partial.state = initial; |
2802 | |
2803 | ext_debug(inode, "truncate since %u to %u\n" , start, end); |
2804 | |
/* probably the first extent we are going to free will be the last in the block */
2806 | handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, |
2807 | depth + 1, |
2808 | ext4_free_metadata_revoke_credits(inode->i_sb, depth)); |
if (IS_ERR(handle))
return PTR_ERR(handle);
2811 | |
2812 | again: |
2813 | trace_ext4_ext_remove_space(inode, start, end, depth); |
2814 | |
2815 | /* |
2816 | * Check if we are removing extents inside the extent tree. If that |
2817 | * is the case, we are going to punch a hole inside the extent tree |
2818 | * so we have to check whether we need to split the extent covering |
2819 | * the last block to remove so we can easily remove the part of it |
2820 | * in ext4_ext_rm_leaf(). |
2821 | */ |
2822 | if (end < EXT_MAX_BLOCKS - 1) { |
2823 | struct ext4_extent *ex; |
2824 | ext4_lblk_t ee_block, ex_end, lblk; |
2825 | ext4_fsblk_t pblk; |
2826 | |
2827 | /* find extent for or closest extent to this block */ |
path = ext4_find_extent(inode, end, NULL,
EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
2833 | } |
2834 | depth = ext_depth(inode); |
/* The leaf may not exist only if the inode has no blocks at all */
2836 | ex = path[depth].p_ext; |
2837 | if (!ex) { |
2838 | if (depth) { |
2839 | EXT4_ERROR_INODE(inode, |
2840 | "path[%d].p_hdr == NULL" , |
2841 | depth); |
2842 | err = -EFSCORRUPTED; |
2843 | } |
2844 | goto out; |
2845 | } |
2846 | |
2847 | ee_block = le32_to_cpu(ex->ee_block); |
ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2849 | |
2850 | /* |
2851 | * See if the last block is inside the extent, if so split |
2852 | * the extent at 'end' block so we can easily remove the |
2853 | * tail of the first part of the split extent in |
2854 | * ext4_ext_rm_leaf(). |
2855 | */ |
2856 | if (end >= ee_block && end < ex_end) { |
2857 | |
2858 | /* |
2859 | * If we're going to split the extent, note that |
2860 | * the cluster containing the block after 'end' is |
2861 | * in use to avoid freeing it when removing blocks. |
2862 | */ |
2863 | if (sbi->s_cluster_ratio > 1) { |
2864 | pblk = ext4_ext_pblock(ex) + end - ee_block + 1; |
2865 | partial.pclu = EXT4_B2C(sbi, pblk); |
2866 | partial.state = nofree; |
2867 | } |
2868 | |
2869 | /* |
2870 | * Split the extent in two so that 'end' is the last |
2871 | * block in the first new extent. Also we should not |
2872 | * fail removing space due to ENOSPC so try to use |
2873 | * reserved block if that happens. |
2874 | */ |
err = ext4_force_split_extent_at(handle, inode, &path,
end + 1, 1);
2877 | if (err < 0) |
2878 | goto out; |
2879 | |
2880 | } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && |
2881 | partial.state == initial) { |
2882 | /* |
2883 | * If we're punching, there's an extent to the right. |
2884 | * If the partial cluster hasn't been set, set it to |
2885 | * that extent's first cluster and its state to nofree |
2886 | * so it won't be freed should it contain blocks to be |
2887 | * removed. If it's already set (tofree/nofree), we're |
2888 | * retrying and keep the original partial cluster info |
2889 | * so a cluster marked tofree as a result of earlier |
2890 | * extent removal is not lost. |
2891 | */ |
2892 | lblk = ex_end + 1; |
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
NULL);
2895 | if (err < 0) |
2896 | goto out; |
2897 | if (pblk) { |
2898 | partial.pclu = EXT4_B2C(sbi, pblk); |
2899 | partial.state = nofree; |
2900 | } |
2901 | } |
2902 | } |
/*
 * We start scanning from the right side, freeing all the blocks
 * after i_size and walking into the tree depth-wise.
 */
2907 | depth = ext_depth(inode); |
2908 | if (path) { |
2909 | int k = i = depth; |
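/*
 * We are reusing an existing path (e.g. after a restart): set
 * p_block past eh_entries at each intermediate level so that
 * ext4_ext_more_to_rm() treats the level as not yet consumed
 * and descends into it again.
 */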
2910 | while (--k > 0) |
2911 | path[k].p_block = |
2912 | le16_to_cpu(path[k].p_hdr->eh_entries)+1; |
2913 | } else { |
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS | __GFP_NOFAIL);
2916 | if (path == NULL) { |
2917 | ext4_journal_stop(handle); |
2918 | return -ENOMEM; |
2919 | } |
2920 | path[0].p_maxdepth = path[0].p_depth = depth; |
2921 | path[0].p_hdr = ext_inode_hdr(inode); |
2922 | i = 0; |
2923 | |
2924 | if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { |
2925 | err = -EFSCORRUPTED; |
2926 | goto out; |
2927 | } |
2928 | } |
2929 | err = 0; |
2930 | |
2931 | while (i >= 0 && err == 0) { |
2932 | if (i == depth) { |
2933 | /* this is leaf block */ |
err = ext4_ext_rm_leaf(handle, inode, path,
&partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
2938 | path[i].p_bh = NULL; |
2939 | i--; |
2940 | continue; |
2941 | } |
2942 | |
2943 | /* this is index block */ |
2944 | if (!path[i].p_hdr) { |
2945 | ext_debug(inode, "initialize header\n" ); |
2946 | path[i].p_hdr = ext_block_hdr(bh: path[i].p_bh); |
2947 | } |
2948 | |
2949 | if (!path[i].p_idx) { |
2950 | /* this level hasn't been touched yet */ |
2951 | path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); |
2952 | path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; |
2953 | ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n" , |
2954 | path[i].p_hdr, |
2955 | le16_to_cpu(path[i].p_hdr->eh_entries)); |
2956 | } else { |
2957 | /* we were already here, see at next index */ |
2958 | path[i].p_idx--; |
2959 | } |
2960 | |
2961 | ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n" , |
2962 | i, EXT_FIRST_INDEX(path[i].p_hdr), |
2963 | path[i].p_idx); |
2964 | if (ext4_ext_more_to_rm(path: path + i)) { |
2965 | struct buffer_head *bh; |
2966 | /* go to the next level */ |
2967 | ext_debug(inode, "move to level %d (block %llu)\n" , |
2968 | i + 1, ext4_idx_pblock(path[i].p_idx)); |
2969 | memset(path + i + 1, 0, sizeof(*path)); |
2970 | bh = read_extent_tree_block(inode, path[i].p_idx, |
2971 | depth - i - 1, |
2972 | EXT4_EX_NOCACHE); |
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
2976 | break; |
2977 | } |
2978 | /* Yield here to deal with large extent trees. |
2979 | * Should be a no-op if we did IO above. */ |
2980 | cond_resched(); |
2981 | if (WARN_ON(i + 1 > depth)) { |
2982 | err = -EFSCORRUPTED; |
2983 | break; |
2984 | } |
2985 | path[i + 1].p_bh = bh; |
2986 | |
2987 | /* save actual number of indexes since this |
2988 | * number is changed at the next iteration */ |
2989 | path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); |
2990 | i++; |
2991 | } else { |
2992 | /* we finished processing this index, go up */ |
2993 | if (path[i].p_hdr->eh_entries == 0 && i > 0) { |
/* index is empty, remove it;
 * handle must be already prepared by
 * ext4_ext_rm_leaf() */
err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
i--;
ext_debug(inode, "return to level %d\n", i);
3004 | } |
3005 | } |
3006 | |
trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
path->p_hdr->eh_entries);
3009 | |
3010 | /* |
3011 | * if there's a partial cluster and we have removed the first extent |
3012 | * in the file, then we also free the partial cluster, if any |
3013 | */ |
3014 | if (partial.state == tofree && err == 0) { |
3015 | int flags = get_default_free_blocks_flags(inode); |
3016 | |
if (ext4_is_pending(inode, partial.lblk))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, partial.pclu),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, partial.lblk);
3024 | partial.state = initial; |
3025 | } |
3026 | |
3027 | /* TODO: flexible tree reduction should be here */ |
3028 | if (path->p_hdr->eh_entries == 0) { |
3029 | /* |
3030 | * truncate to zero freed all the tree, |
3031 | * so we need to correct eh_depth |
3032 | */ |
3033 | err = ext4_ext_get_access(handle, inode, path); |
3034 | if (err == 0) { |
3035 | ext_inode_hdr(inode)->eh_depth = 0; |
3036 | ext_inode_hdr(inode)->eh_max = |
3037 | cpu_to_le16(ext4_ext_space_root(inode, 0)); |
3038 | err = ext4_ext_dirty(handle, inode, path); |
3039 | } |
3040 | } |
3041 | out: |
3042 | ext4_free_ext_path(path); |
3043 | path = NULL; |
3044 | if (err == -EAGAIN) |
3045 | goto again; |
3046 | ext4_journal_stop(handle); |
3047 | |
3048 | return err; |
3049 | } |
3050 | |
3051 | /* |
3052 | * called at mount time |
3053 | */ |
3054 | void ext4_ext_init(struct super_block *sb) |
3055 | { |
3056 | /* |
3057 | * possible initialization would be here |
3058 | */ |
3059 | |
3060 | if (ext4_has_feature_extents(sb)) { |
3061 | #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) |
3062 | printk(KERN_INFO "EXT4-fs: file extents enabled" |
3063 | #ifdef AGGRESSIVE_TEST |
3064 | ", aggressive tests" |
3065 | #endif |
3066 | #ifdef CHECK_BINSEARCH |
3067 | ", check binsearch" |
3068 | #endif |
3069 | #ifdef EXTENTS_STATS |
3070 | ", stats" |
3071 | #endif |
3072 | "\n" ); |
3073 | #endif |
3074 | #ifdef EXTENTS_STATS |
3075 | spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); |
3076 | EXT4_SB(sb)->s_ext_min = 1 << 30; |
3077 | EXT4_SB(sb)->s_ext_max = 0; |
3078 | #endif |
3079 | } |
3080 | } |
3081 | |
3082 | /* |
3083 | * called at umount time |
3084 | */ |
3085 | void ext4_ext_release(struct super_block *sb) |
3086 | { |
3087 | if (!ext4_has_feature_extents(sb)) |
3088 | return; |
3089 | |
3090 | #ifdef EXTENTS_STATS |
3091 | if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { |
3092 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
sbi->s_ext_blocks, sbi->s_ext_extents,
sbi->s_ext_blocks / sbi->s_ext_extents);
printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3098 | } |
3099 | #endif |
3100 | } |
3101 | |
3102 | static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) |
3103 | { |
3104 | ext4_lblk_t ee_block; |
3105 | ext4_fsblk_t ee_pblock; |
3106 | unsigned int ee_len; |
3107 | |
3108 | ee_block = le32_to_cpu(ex->ee_block); |
ee_len = ext4_ext_get_actual_len(ex);
3110 | ee_pblock = ext4_ext_pblock(ex); |
3111 | |
3112 | if (ee_len == 0) |
3113 | return; |
3114 | |
ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
EXTENT_STATUS_WRITTEN);
3117 | } |
3118 | |
3119 | /* FIXME!! we need to try to merge to left or right after zero-out */ |
3120 | static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) |
3121 | { |
3122 | ext4_fsblk_t ee_pblock; |
3123 | unsigned int ee_len; |
3124 | |
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
ee_len);
3129 | } |
3130 | |
/*
 * ext4_split_extent_at() splits an extent at the given block.
 *
 * @handle: the journal handle
 * @inode: the file inode
 * @ppath: the path to the extent
 * @split: the logical block where the extent is split.
 * @split_flag: indicates if the extent could be zeroed out if the split
 *              fails, and the states (initialized or unwritten) of the new
 *              extents.
 * @flags: flags used to insert the new extent into the extent tree.
 *
 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
 * of which are determined by @split_flag.
 *
 * There are two cases:
 *  a> the extent is split into two extents.
 *  b> no split is needed, and we just mark the extent.
 *
 * return 0 on success.
 */
3152 | static int ext4_split_extent_at(handle_t *handle, |
3153 | struct inode *inode, |
3154 | struct ext4_ext_path **ppath, |
3155 | ext4_lblk_t split, |
3156 | int split_flag, |
3157 | int flags) |
3158 | { |
3159 | struct ext4_ext_path *path = *ppath; |
3160 | ext4_fsblk_t newblock; |
3161 | ext4_lblk_t ee_block; |
3162 | struct ext4_extent *ex, newex, orig_ex, zero_ex; |
3163 | struct ext4_extent *ex2 = NULL; |
3164 | unsigned int ee_len, depth; |
3165 | int err = 0; |
3166 | |
3167 | BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == |
3168 | (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); |
3169 | |
3170 | ext_debug(inode, "logical block %llu\n" , (unsigned long long)split); |
3171 | |
3172 | ext4_ext_show_leaf(inode, path); |
3173 | |
3174 | depth = ext_depth(inode); |
3175 | ex = path[depth].p_ext; |
3176 | ee_block = le32_to_cpu(ex->ee_block); |
ee_len = ext4_ext_get_actual_len(ex);
3178 | newblock = split - ee_block + ext4_ext_pblock(ex); |
3179 | |
3180 | BUG_ON(split < ee_block || split >= (ee_block + ee_len)); |
3181 | BUG_ON(!ext4_ext_is_unwritten(ex) && |
3182 | split_flag & (EXT4_EXT_MAY_ZEROOUT | |
3183 | EXT4_EXT_MARK_UNWRIT1 | |
3184 | EXT4_EXT_MARK_UNWRIT2)); |
3185 | |
err = ext4_ext_get_access(handle, inode, path + depth);
3187 | if (err) |
3188 | goto out; |
3189 | |
3190 | if (split == ee_block) { |
/*
 * case b: block @split is the block that the extent begins with,
 * so we just change the state of the extent and no split
 * is needed.
 */
3196 | if (split_flag & EXT4_EXT_MARK_UNWRIT2) |
3197 | ext4_ext_mark_unwritten(ext: ex); |
3198 | else |
3199 | ext4_ext_mark_initialized(ext: ex); |
3200 | |
3201 | if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) |
3202 | ext4_ext_try_to_merge(handle, inode, path, ex); |
3203 | |
3204 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
3205 | goto out; |
3206 | } |
3207 | |
3208 | /* case a */ |
3209 | memcpy(&orig_ex, ex, sizeof(orig_ex)); |
3210 | ex->ee_len = cpu_to_le16(split - ee_block); |
3211 | if (split_flag & EXT4_EXT_MARK_UNWRIT1) |
ext4_ext_mark_unwritten(ex);
3213 | |
/*
 * The path may lead to a new leaf, not to the original leaf
 * any more, after ext4_ext_insert_extent() returns.
 */
3218 | err = ext4_ext_dirty(handle, inode, path + depth); |
3219 | if (err) |
3220 | goto fix_extent_len; |
3221 | |
3222 | ex2 = &newex; |
3223 | ex2->ee_block = cpu_to_le32(split); |
3224 | ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); |
3225 | ext4_ext_store_pblock(ex: ex2, pb: newblock); |
3226 | if (split_flag & EXT4_EXT_MARK_UNWRIT2) |
3227 | ext4_ext_mark_unwritten(ext: ex2); |
3228 | |
3229 | err = ext4_ext_insert_extent(handle, inode, ppath, newext: &newex, gb_flags: flags); |
3230 | if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) |
3231 | goto out; |
3232 | |
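/*
 * Insertion ran out of space, quota or memory; if the caller
 * allows it, recover by zeroing out the half whose data is not
 * yet valid (or the whole original extent) instead of failing
 * the split.
 */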
3233 | if (EXT4_EXT_MAY_ZEROOUT & split_flag) { |
3234 | if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { |
3235 | if (split_flag & EXT4_EXT_DATA_VALID1) { |
3236 | err = ext4_ext_zeroout(inode, ex: ex2); |
3237 | zero_ex.ee_block = ex2->ee_block; |
3238 | zero_ex.ee_len = cpu_to_le16( |
3239 | ext4_ext_get_actual_len(ex2)); |
3240 | ext4_ext_store_pblock(ex: &zero_ex, |
3241 | pb: ext4_ext_pblock(ex: ex2)); |
3242 | } else { |
3243 | err = ext4_ext_zeroout(inode, ex); |
3244 | zero_ex.ee_block = ex->ee_block; |
3245 | zero_ex.ee_len = cpu_to_le16( |
3246 | ext4_ext_get_actual_len(ex)); |
3247 | ext4_ext_store_pblock(ex: &zero_ex, |
3248 | pb: ext4_ext_pblock(ex)); |
3249 | } |
3250 | } else { |
3251 | err = ext4_ext_zeroout(inode, ex: &orig_ex); |
3252 | zero_ex.ee_block = orig_ex.ee_block; |
3253 | zero_ex.ee_len = cpu_to_le16( |
3254 | ext4_ext_get_actual_len(&orig_ex)); |
3255 | ext4_ext_store_pblock(ex: &zero_ex, |
3256 | pb: ext4_ext_pblock(ex: &orig_ex)); |
3257 | } |
3258 | |
3259 | if (!err) { |
3260 | /* update the extent length and mark as initialized */ |
3261 | ex->ee_len = cpu_to_le16(ee_len); |
3262 | ext4_ext_try_to_merge(handle, inode, path, ex); |
3263 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
3264 | if (!err) |
3265 | /* update extent status tree */ |
ext4_zeroout_es(inode, &zero_ex);
3267 | /* If we failed at this point, we don't know in which |
3268 | * state the extent tree exactly is so don't try to fix |
3269 | * length of the original extent as it may do even more |
3270 | * damage. |
3271 | */ |
3272 | goto out; |
3273 | } |
3274 | } |
3275 | |
3276 | fix_extent_len: |
3277 | ex->ee_len = orig_ex.ee_len; |
3278 | /* |
3279 | * Ignore ext4_ext_dirty return value since we are already in error path |
3280 | * and err is a non-zero error code. |
3281 | */ |
3282 | ext4_ext_dirty(handle, inode, path + path->p_depth); |
3283 | return err; |
3284 | out: |
3285 | ext4_ext_show_leaf(inode, path); |
3286 | return err; |
3287 | } |
3288 | |
/*
 * ext4_split_extent() splits an extent and marks the extent which is covered
 * by @map as @split_flag indicates.
 *
 * It may result in splitting the extent into multiple extents (up to three).
 * There are three possibilities:
 * a> There is no split required
 * b> Splits into two extents: Split is happening at either end of the extent
 * c> Splits into three extents: Someone is splitting in the middle of the
 *    extent
 *
 */
3300 | static int ext4_split_extent(handle_t *handle, |
3301 | struct inode *inode, |
3302 | struct ext4_ext_path **ppath, |
3303 | struct ext4_map_blocks *map, |
3304 | int split_flag, |
3305 | int flags) |
3306 | { |
3307 | struct ext4_ext_path *path = *ppath; |
3308 | ext4_lblk_t ee_block; |
3309 | struct ext4_extent *ex; |
3310 | unsigned int ee_len, depth; |
3311 | int err = 0; |
3312 | int unwritten; |
3313 | int split_flag1, flags1; |
3314 | int allocated = map->m_len; |
3315 | |
3316 | depth = ext_depth(inode); |
3317 | ex = path[depth].p_ext; |
3318 | ee_block = le32_to_cpu(ex->ee_block); |
	ee_len = ext4_ext_get_actual_len(ex);
	unwritten = ext4_ext_is_unwritten(ex);
3321 | |
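	/*
	 * Split in at most two passes: first split off the tail beyond
	 * map->m_lblk + map->m_len, then split off the head before
	 * map->m_lblk, leaving a middle extent that covers exactly @map.
	 */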
3322 | if (map->m_lblk + map->m_len < ee_block + ee_len) { |
3323 | split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; |
3324 | flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; |
3325 | if (unwritten) |
3326 | split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | |
3327 | EXT4_EXT_MARK_UNWRIT2; |
3328 | if (split_flag & EXT4_EXT_DATA_VALID2) |
3329 | split_flag1 |= EXT4_EXT_DATA_VALID1; |
		err = ext4_split_extent_at(handle, inode, ppath,
				map->m_lblk + map->m_len, split_flag1, flags1);
3332 | if (err) |
3333 | goto out; |
3334 | } else { |
3335 | allocated = ee_len - (map->m_lblk - ee_block); |
3336 | } |
3337 | /* |
3338 | * Update path is required because previous ext4_split_extent_at() may |
3339 | * result in split of original leaf or extent zeroout. |
3340 | */ |
	path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
	if (IS_ERR(path))
		return PTR_ERR(path);
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (!ex) {
		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
				 (unsigned long) map->m_lblk);
		return -EFSCORRUPTED;
	}
	unwritten = ext4_ext_is_unwritten(ex);
3352 | |
3353 | if (map->m_lblk >= ee_block) { |
3354 | split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; |
3355 | if (unwritten) { |
3356 | split_flag1 |= EXT4_EXT_MARK_UNWRIT1; |
3357 | split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | |
3358 | EXT4_EXT_MARK_UNWRIT2); |
3359 | } |
		err = ext4_split_extent_at(handle, inode, ppath,
				map->m_lblk, split_flag1, flags);
3362 | if (err) |
3363 | goto out; |
3364 | } |
3365 | |
3366 | ext4_ext_show_leaf(inode, path); |
3367 | out: |
3368 | return err ? err : allocated; |
3369 | } |
3370 | |
3371 | /* |
3372 | * This function is called by ext4_ext_map_blocks() if someone tries to write |
3373 | * to an unwritten extent. It may result in splitting the unwritten |
3374 | * extent into multiple extents (up to three - one initialized and two |
3375 | * unwritten). |
3376 | * There are three possibilities: |
3377 | * a> There is no split required: Entire extent should be initialized |
3378 | * b> Splits in two extents: Write is happening at either end of the extent |
 * c> Splits in three extents: Someone is writing in the middle of the extent
3380 | * |
3381 | * Pre-conditions: |
3382 | * - The extent pointed to by 'path' is unwritten. |
3383 | * - The extent pointed to by 'path' contains a superset |
3384 | * of the logical span [map->m_lblk, map->m_lblk + map->m_len). |
3385 | * |
3386 | * Post-conditions on success: |
 *  - the returned value is the number of blocks beyond map->m_lblk
3388 | * that are allocated and initialized. |
3389 | * It is guaranteed to be >= map->m_len. |
3390 | */ |
3391 | static int ext4_ext_convert_to_initialized(handle_t *handle, |
3392 | struct inode *inode, |
3393 | struct ext4_map_blocks *map, |
3394 | struct ext4_ext_path **ppath, |
3395 | int flags) |
3396 | { |
3397 | struct ext4_ext_path *path = *ppath; |
3398 | struct ext4_sb_info *sbi; |
3399 | struct ext4_extent_header *eh; |
3400 | struct ext4_map_blocks split_map; |
3401 | struct ext4_extent zero_ex1, zero_ex2; |
3402 | struct ext4_extent *ex, *abut_ex; |
3403 | ext4_lblk_t ee_block, eof_block; |
3404 | unsigned int ee_len, depth, map_len = map->m_len; |
3405 | int allocated = 0, max_zeroout = 0; |
3406 | int err = 0; |
3407 | int split_flag = EXT4_EXT_DATA_VALID2; |
3408 | |
	ext_debug(inode, "logical block %llu, max_blocks %u\n",
		  (unsigned long long)map->m_lblk, map_len);

	sbi = EXT4_SB(inode->i_sb);
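	/*
	 * Compute the first block beyond on-disk EOF, rounding i_disksize
	 * up to a block boundary: e.g. with 4 KiB blocks, an i_disksize of
	 * 10000 bytes gives eof_block 3. eof_block is then extended to
	 * cover the requested range, so a write extending the file may
	 * still qualify for zeroout below.
	 */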
3413 | eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) |
3414 | >> inode->i_sb->s_blocksize_bits; |
3415 | if (eof_block < map->m_lblk + map_len) |
3416 | eof_block = map->m_lblk + map_len; |
3417 | |
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	zero_ex1.ee_len = 0;
	zero_ex2.ee_len = 0;

	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3427 | |
3428 | /* Pre-conditions */ |
3429 | BUG_ON(!ext4_ext_is_unwritten(ex)); |
3430 | BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); |
3431 | |
3432 | /* |
3433 | * Attempt to transfer newly initialized blocks from the currently |
3434 | * unwritten extent to its neighbor. This is much cheaper |
3435 | * than an insertion followed by a merge as those involve costly |
3436 | * memmove() calls. Transferring to the left is the common case in |
3437 | * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) |
3438 | * followed by append writes. |
3439 | * |
3440 | * Limitations of the current logic: |
3441 | * - L1: we do not deal with writes covering the whole extent. |
3442 | * This would require removing the extent if the transfer |
3443 | * is possible. |
3444 | * - L2: we only attempt to merge with an extent stored in the |
3445 | * same extent tree node. |
3446 | */ |
3447 | if ((map->m_lblk == ee_block) && |
3448 | /* See if we can merge left */ |
3449 | (map_len < ee_len) && /*L1*/ |
3450 | (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ |
3451 | ext4_lblk_t prev_lblk; |
3452 | ext4_fsblk_t prev_pblk, ee_pblk; |
3453 | unsigned int prev_len; |
3454 | |
		abut_ex = ex - 1;
		prev_lblk = le32_to_cpu(abut_ex->ee_block);
		prev_len = ext4_ext_get_actual_len(abut_ex);
		prev_pblk = ext4_ext_pblock(abut_ex);
		ee_pblk = ext4_ext_pblock(ex);
3460 | |
3461 | /* |
3462 | * A transfer of blocks from 'ex' to 'abut_ex' is allowed |
3463 | * upon those conditions: |
3464 | * - C1: abut_ex is initialized, |
3465 | * - C2: abut_ex is logically abutting ex, |
3466 | * - C3: abut_ex is physically abutting ex, |
3467 | * - C4: abut_ex can receive the additional blocks without |
3468 | * overflowing the (initialized) length limit. |
3469 | */ |
		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
		    ((prev_lblk + prev_len) == ee_block) &&		/*C2*/
		    ((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
		    (prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
			err = ext4_ext_get_access(handle, inode, path + depth);
			if (err)
				goto out;

			trace_ext4_ext_convert_to_initialized_fastpath(inode,
				map, ex, abut_ex);

			/* Shift the start of ex by 'map_len' blocks */
			ex->ee_block = cpu_to_le32(ee_block + map_len);
			ext4_ext_store_pblock(ex, ee_pblk + map_len);
			ex->ee_len = cpu_to_le16(ee_len - map_len);
			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3486 | |
3487 | /* Extend abut_ex by 'map_len' blocks */ |
3488 | abut_ex->ee_len = cpu_to_le16(prev_len + map_len); |
3489 | |
3490 | /* Result: number of initialized blocks past m_lblk */ |
3491 | allocated = map_len; |
3492 | } |
3493 | } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && |
3494 | (map_len < ee_len) && /*L1*/ |
3495 | ex < EXT_LAST_EXTENT(eh)) { /*L2*/ |
3496 | /* See if we can merge right */ |
3497 | ext4_lblk_t next_lblk; |
3498 | ext4_fsblk_t next_pblk, ee_pblk; |
3499 | unsigned int next_len; |
3500 | |
		abut_ex = ex + 1;
		next_lblk = le32_to_cpu(abut_ex->ee_block);
		next_len = ext4_ext_get_actual_len(abut_ex);
		next_pblk = ext4_ext_pblock(abut_ex);
		ee_pblk = ext4_ext_pblock(ex);
3506 | |
3507 | /* |
3508 | * A transfer of blocks from 'ex' to 'abut_ex' is allowed |
3509 | * upon those conditions: |
3510 | * - C1: abut_ex is initialized, |
3511 | * - C2: abut_ex is logically abutting ex, |
3512 | * - C3: abut_ex is physically abutting ex, |
3513 | * - C4: abut_ex can receive the additional blocks without |
3514 | * overflowing the (initialized) length limit. |
3515 | */ |
		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
			err = ext4_ext_get_access(handle, inode, path + depth);
			if (err)
				goto out;

			trace_ext4_ext_convert_to_initialized_fastpath(inode,
				map, ex, abut_ex);

			/* Shift the start of abut_ex by 'map_len' blocks */
			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
			ex->ee_len = cpu_to_le16(ee_len - map_len);
			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3532 | |
3533 | /* Extend abut_ex by 'map_len' blocks */ |
3534 | abut_ex->ee_len = cpu_to_le16(next_len + map_len); |
3535 | |
3536 | /* Result: number of initialized blocks past m_lblk */ |
3537 | allocated = map_len; |
3538 | } |
3539 | } |
3540 | if (allocated) { |
3541 | /* Mark the block containing both extents as dirty */ |
3542 | err = ext4_ext_dirty(handle, inode, path + depth); |
3543 | |
3544 | /* Update path to point to the right extent */ |
3545 | path[depth].p_ext = abut_ex; |
3546 | goto out; |
3547 | } else |
3548 | allocated = ee_len - (map->m_lblk - ee_block); |
3549 | |
3550 | WARN_ON(map->m_lblk < ee_block); |
3551 | /* |
3552 | * It is safe to convert extent to initialized via explicit |
3553 | * zeroout only if extent is fully inside i_size or new_size. |
3554 | */ |
3555 | split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; |
3556 | |
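	/*
	 * s_extent_max_zeroout_kb is in KiB; the shift by
	 * (blocksize_bits - 10) converts it to blocks: e.g. a 32 KiB limit
	 * with 4 KiB blocks allows zeroing out up to 8 blocks.
	 */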
3557 | if (EXT4_EXT_MAY_ZEROOUT & split_flag) |
3558 | max_zeroout = sbi->s_extent_max_zeroout_kb >> |
3559 | (inode->i_sb->s_blocksize_bits - 10); |
3560 | |
3561 | /* |
3562 | * five cases: |
3563 | * 1. split the extent into three extents. |
3564 | * 2. split the extent into two extents, zeroout the head of the first |
3565 | * extent. |
3566 | * 3. split the extent into two extents, zeroout the tail of the second |
3567 | * extent. |
 * 4. split the extent into two extents without zeroout.
3569 | * 5. no splitting needed, just possibly zeroout the head and / or the |
3570 | * tail of the extent. |
3571 | */ |
3572 | split_map.m_lblk = map->m_lblk; |
3573 | split_map.m_len = map->m_len; |
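	/*
	 * Worked example, assuming max_zeroout is large enough: for an
	 * unwritten extent [100, 120) and a write to [104, 108), the
	 * case 3/5 branch below zeroes out [108, 120) and the case 2/5
	 * branch zeroes out [100, 104), so split_map grows to cover the
	 * whole extent and no split is needed at all (case 5).
	 */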
3574 | |
3575 | if (max_zeroout && (allocated > split_map.m_len)) { |
3576 | if (allocated <= max_zeroout) { |
3577 | /* case 3 or 5 */ |
3578 | zero_ex1.ee_block = |
3579 | cpu_to_le32(split_map.m_lblk + |
3580 | split_map.m_len); |
3581 | zero_ex1.ee_len = |
3582 | cpu_to_le16(allocated - split_map.m_len); |
			ext4_ext_store_pblock(&zero_ex1,
				ext4_ext_pblock(ex) + split_map.m_lblk +
				split_map.m_len - ee_block);
			err = ext4_ext_zeroout(inode, &zero_ex1);
3587 | if (err) |
3588 | goto fallback; |
3589 | split_map.m_len = allocated; |
3590 | } |
3591 | if (split_map.m_lblk - ee_block + split_map.m_len < |
3592 | max_zeroout) { |
3593 | /* case 2 or 5 */ |
3594 | if (split_map.m_lblk != ee_block) { |
3595 | zero_ex2.ee_block = ex->ee_block; |
3596 | zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - |
3597 | ee_block); |
				ext4_ext_store_pblock(&zero_ex2,
						      ext4_ext_pblock(ex));
				err = ext4_ext_zeroout(inode, &zero_ex2);
3601 | if (err) |
3602 | goto fallback; |
3603 | } |
3604 | |
3605 | split_map.m_len += split_map.m_lblk - ee_block; |
3606 | split_map.m_lblk = ee_block; |
3607 | allocated = map->m_len; |
3608 | } |
3609 | } |
3610 | |
3611 | fallback: |
	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
				flags);
3614 | if (err > 0) |
3615 | err = 0; |
3616 | out: |
3617 | /* If we have gotten a failure, don't zero out status tree */ |
3618 | if (!err) { |
		ext4_zeroout_es(inode, &zero_ex1);
		ext4_zeroout_es(inode, &zero_ex2);
3621 | } |
3622 | return err ? err : allocated; |
3623 | } |
3624 | |
3625 | /* |
 * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when a DIO write targets
 * an unwritten extent.
3629 | * |
3630 | * Writing to an unwritten extent may result in splitting the unwritten |
3631 | * extent into multiple initialized/unwritten extents (up to three) |
3632 | * There are three possibilities: |
3633 | * a> There is no split required: Entire extent should be unwritten |
3634 | * b> Splits in two extents: Write is happening at either end of the extent |
 * c> Splits in three extents: Someone is writing in the middle of the extent
3636 | * |
3637 | * This works the same way in the case of initialized -> unwritten conversion. |
3638 | * |
 * One or more index blocks may be needed if the extent tree grows after
 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
 * completion, we need to split the unwritten extent before the IO is
 * submitted. The unwritten extent will be split into at most three
 * unwritten extents. After IO completes, the part being filled will be
 * converted to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
3646 | * |
3647 | * Returns the size of unwritten extent to be written on success. |
3648 | */ |
3649 | static int ext4_split_convert_extents(handle_t *handle, |
3650 | struct inode *inode, |
3651 | struct ext4_map_blocks *map, |
3652 | struct ext4_ext_path **ppath, |
3653 | int flags) |
3654 | { |
3655 | struct ext4_ext_path *path = *ppath; |
3656 | ext4_lblk_t eof_block; |
3657 | ext4_lblk_t ee_block; |
3658 | struct ext4_extent *ex; |
3659 | unsigned int ee_len; |
3660 | int split_flag = 0, depth; |
3661 | |
	ext_debug(inode, "logical block %llu, max_blocks %u\n",
		  (unsigned long long)map->m_lblk, map->m_len);
3664 | |
3665 | eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) |
3666 | >> inode->i_sb->s_blocksize_bits; |
3667 | if (eof_block < map->m_lblk + map->m_len) |
3668 | eof_block = map->m_lblk + map->m_len; |
3669 | /* |
3670 | * It is safe to convert extent to initialized via explicit |
3671 | * zeroout only if extent is fully inside i_size or new_size. |
3672 | */ |
3673 | depth = ext_depth(inode); |
3674 | ex = path[depth].p_ext; |
3675 | ee_block = le32_to_cpu(ex->ee_block); |
	ee_len = ext4_ext_get_actual_len(ex);
3677 | |
3678 | /* Convert to unwritten */ |
3679 | if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { |
3680 | split_flag |= EXT4_EXT_DATA_VALID1; |
3681 | /* Convert to initialized */ |
3682 | } else if (flags & EXT4_GET_BLOCKS_CONVERT) { |
3683 | split_flag |= ee_block + ee_len <= eof_block ? |
3684 | EXT4_EXT_MAY_ZEROOUT : 0; |
3685 | split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); |
3686 | } |
3687 | flags |= EXT4_GET_BLOCKS_PRE_IO; |
3688 | return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); |
3689 | } |
3690 | |
3691 | static int ext4_convert_unwritten_extents_endio(handle_t *handle, |
3692 | struct inode *inode, |
3693 | struct ext4_map_blocks *map, |
3694 | struct ext4_ext_path **ppath) |
3695 | { |
3696 | struct ext4_ext_path *path = *ppath; |
3697 | struct ext4_extent *ex; |
3698 | ext4_lblk_t ee_block; |
3699 | unsigned int ee_len; |
3700 | int depth; |
3701 | int err = 0; |
3702 | |
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);

	ext_debug(inode, "logical block %llu, max_blocks %u\n",
		  (unsigned long long)ee_block, ee_len);
3710 | |
	/* If the extent is larger than requested, it is a clear sign that we
	 * still have some extent state machine issues left. So extent_split
	 * is still required.
	 * TODO: Once all related issues are fixed, this situation should be
	 * illegal.
	 */
3717 | if (ee_block != map->m_lblk || ee_len > map->m_len) { |
3718 | #ifdef CONFIG_EXT4_DEBUG |
		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
			     " len %u; IO logical block %llu, len %u",
			     inode->i_ino, (unsigned long long)ee_block, ee_len,
			     (unsigned long long)map->m_lblk, map->m_len);
3723 | #endif |
3724 | err = ext4_split_convert_extents(handle, inode, map, ppath, |
3725 | EXT4_GET_BLOCKS_CONVERT); |
3726 | if (err < 0) |
3727 | return err; |
		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
		if (IS_ERR(path))
			return PTR_ERR(path);
3731 | depth = ext_depth(inode); |
3732 | ex = path[depth].p_ext; |
3733 | } |
3734 | |
	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* first mark the extent as initialized */
	ext4_ext_mark_initialized(ex);
3740 | |
3741 | /* note: ext4_ext_correct_indexes() isn't needed here because |
3742 | * borders are not changed |
3743 | */ |
3744 | ext4_ext_try_to_merge(handle, inode, path, ex); |
3745 | |
3746 | /* Mark modified extent as dirty */ |
3747 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
3748 | out: |
3749 | ext4_ext_show_leaf(inode, path); |
3750 | return err; |
3751 | } |
3752 | |
3753 | static int |
3754 | convert_initialized_extent(handle_t *handle, struct inode *inode, |
3755 | struct ext4_map_blocks *map, |
3756 | struct ext4_ext_path **ppath, |
3757 | unsigned int *allocated) |
3758 | { |
3759 | struct ext4_ext_path *path = *ppath; |
3760 | struct ext4_extent *ex; |
3761 | ext4_lblk_t ee_block; |
3762 | unsigned int ee_len; |
3763 | int depth; |
3764 | int err = 0; |
3765 | |
3766 | /* |
3767 | * Make sure that the extent is no bigger than we support with |
3768 | * unwritten extent |
3769 | */ |
3770 | if (map->m_len > EXT_UNWRITTEN_MAX_LEN) |
3771 | map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; |
3772 | |
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);

	ext_debug(inode, "logical block %llu, max_blocks %u\n",
		  (unsigned long long)ee_block, ee_len);
3780 | |
3781 | if (ee_block != map->m_lblk || ee_len > map->m_len) { |
3782 | err = ext4_split_convert_extents(handle, inode, map, ppath, |
3783 | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); |
3784 | if (err < 0) |
3785 | return err; |
		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
		if (IS_ERR(path))
			return PTR_ERR(path);
		depth = ext_depth(inode);
		ex = path[depth].p_ext;
		if (!ex) {
			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
					 (unsigned long) map->m_lblk);
			return -EFSCORRUPTED;
		}
3796 | } |
3797 | |
	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		return err;
	/* first mark the extent as unwritten */
	ext4_ext_mark_unwritten(ex);
3803 | |
3804 | /* note: ext4_ext_correct_indexes() isn't needed here because |
3805 | * borders are not changed |
3806 | */ |
3807 | ext4_ext_try_to_merge(handle, inode, path, ex); |
3808 | |
3809 | /* Mark modified extent as dirty */ |
3810 | err = ext4_ext_dirty(handle, inode, path + path->p_depth); |
3811 | if (err) |
3812 | return err; |
3813 | ext4_ext_show_leaf(inode, path); |
3814 | |
	ext4_update_inode_fsync_trans(handle, inode, 1);
3816 | |
3817 | map->m_flags |= EXT4_MAP_UNWRITTEN; |
3818 | if (*allocated > map->m_len) |
3819 | *allocated = map->m_len; |
3820 | map->m_len = *allocated; |
3821 | return 0; |
3822 | } |
3823 | |
3824 | static int |
3825 | ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, |
3826 | struct ext4_map_blocks *map, |
3827 | struct ext4_ext_path **ppath, int flags, |
3828 | unsigned int allocated, ext4_fsblk_t newblock) |
3829 | { |
3830 | struct ext4_ext_path __maybe_unused *path = *ppath; |
3831 | int ret = 0; |
3832 | int err = 0; |
3833 | |
	ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
		  (unsigned long long)map->m_lblk, map->m_len, flags,
		  allocated);
3837 | ext4_ext_show_leaf(inode, path); |
3838 | |
3839 | /* |
3840 | * When writing into unwritten space, we should not fail to |
3841 | * allocate metadata blocks for the new extent block if needed. |
3842 | */ |
3843 | flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; |
3844 | |
3845 | trace_ext4_ext_handle_unwritten_extents(inode, map, flags, |
3846 | allocated, newblock); |
3847 | |
3848 | /* get_block() before submitting IO, split the extent */ |
3849 | if (flags & EXT4_GET_BLOCKS_PRE_IO) { |
		ret = ext4_split_convert_extents(handle, inode, map, ppath,
					flags | EXT4_GET_BLOCKS_CONVERT);
3852 | if (ret < 0) { |
3853 | err = ret; |
3854 | goto out2; |
3855 | } |
3856 | /* |
3857 | * shouldn't get a 0 return when splitting an extent unless |
3858 | * m_len is 0 (bug) or extent has been corrupted |
3859 | */ |
3860 | if (unlikely(ret == 0)) { |
			EXT4_ERROR_INODE(inode,
					 "unexpected ret == 0, m_len = %u",
					 map->m_len);
3864 | err = -EFSCORRUPTED; |
3865 | goto out2; |
3866 | } |
3867 | map->m_flags |= EXT4_MAP_UNWRITTEN; |
3868 | goto out; |
3869 | } |
3870 | /* IO end_io complete, convert the filled extent to written */ |
3871 | if (flags & EXT4_GET_BLOCKS_CONVERT) { |
3872 | err = ext4_convert_unwritten_extents_endio(handle, inode, map, |
3873 | ppath); |
3874 | if (err < 0) |
3875 | goto out2; |
		ext4_update_inode_fsync_trans(handle, inode, 1);
3877 | goto map_out; |
3878 | } |
3879 | /* buffered IO cases */ |
3880 | /* |
3881 | * repeat fallocate creation request |
3882 | * we already have an unwritten extent |
3883 | */ |
3884 | if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { |
3885 | map->m_flags |= EXT4_MAP_UNWRITTEN; |
3886 | goto map_out; |
3887 | } |
3888 | |
3889 | /* buffered READ or buffered write_begin() lookup */ |
3890 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
3891 | /* |
3892 | * We have blocks reserved already. We |
3893 | * return allocated blocks so that delalloc |
3894 | * won't do block reservation for us. But |
3895 | * the buffer head will be unmapped so that |
3896 | * a read from the block returns 0s. |
3897 | */ |
3898 | map->m_flags |= EXT4_MAP_UNWRITTEN; |
3899 | goto out1; |
3900 | } |
3901 | |
3902 | /* |
3903 | * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. |
3904 | * For buffered writes, at writepage time, etc. Convert a |
3905 | * discovered unwritten extent to written. |
3906 | */ |
3907 | ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); |
3908 | if (ret < 0) { |
3909 | err = ret; |
3910 | goto out2; |
3911 | } |
	ext4_update_inode_fsync_trans(handle, inode, 1);
	/*
	 * shouldn't get a 0 return when converting an unwritten extent
	 * unless m_len is 0 (bug) or extent has been corrupted
	 */
	if (unlikely(ret == 0)) {
		EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
				 map->m_len);
3920 | err = -EFSCORRUPTED; |
3921 | goto out2; |
3922 | } |
3923 | |
3924 | out: |
3925 | allocated = ret; |
3926 | map->m_flags |= EXT4_MAP_NEW; |
3927 | map_out: |
3928 | map->m_flags |= EXT4_MAP_MAPPED; |
3929 | out1: |
3930 | map->m_pblk = newblock; |
3931 | if (allocated > map->m_len) |
3932 | allocated = map->m_len; |
3933 | map->m_len = allocated; |
3934 | ext4_ext_show_leaf(inode, path); |
3935 | out2: |
3936 | return err ? err : allocated; |
3937 | } |
3938 | |
3939 | /* |
3940 | * get_implied_cluster_alloc - check to see if the requested |
3941 | * allocation (in the map structure) overlaps with a cluster already |
3942 | * allocated in an extent. |
3943 | * @sb The filesystem superblock structure |
3944 | * @map The requested lblk->pblk mapping |
3945 | * @ex The extent structure which might contain an implied |
3946 | * cluster allocation |
3947 | * |
3948 | * This function is called by ext4_ext_map_blocks() after we failed to |
3949 | * find blocks that were already in the inode's extent tree. Hence, |
3950 | * we know that the beginning of the requested region cannot overlap |
3951 | * the extent from the inode's extent tree. There are three cases we |
3952 | * want to catch. The first is this case: |
3953 | * |
3954 | * |--- cluster # N--| |
3955 | * |--- extent ---| |---- requested region ---| |
3956 | * |==========| |
3957 | * |
3958 | * The second case that we need to test for is this one: |
3959 | * |
3960 | * |--------- cluster # N ----------------| |
3961 | * |--- requested region --| |------- extent ----| |
3962 | * |=======================| |
3963 | * |
3964 | * The third case is when the requested region lies between two extents |
3965 | * within the same cluster: |
3966 | * |------------- cluster # N-------------| |
3967 | * |----- ex -----| |---- ex_right ----| |
3968 | * |------ requested region ------| |
3969 | * |================| |
3970 | * |
 * In each of the above cases, we need to set the map->m_pblk and
 * map->m_len so that they correspond to the extent labelled "|====|"
 * from cluster #N, since it is already in use for data in
 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
3975 | * signal to ext4_ext_map_blocks() that map->m_pblk should be treated |
3976 | * as a new "allocated" block region. Otherwise, we will return 0 and |
3977 | * ext4_ext_map_blocks() will then allocate one or more new clusters |
3978 | * by calling ext4_mb_new_blocks(). |
3979 | */ |
3980 | static int get_implied_cluster_alloc(struct super_block *sb, |
3981 | struct ext4_map_blocks *map, |
3982 | struct ext4_extent *ex, |
3983 | struct ext4_ext_path *path) |
3984 | { |
3985 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3986 | ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); |
3987 | ext4_lblk_t ex_cluster_start, ex_cluster_end; |
3988 | ext4_lblk_t rr_cluster_start; |
3989 | ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); |
3990 | ext4_fsblk_t ee_start = ext4_ext_pblock(ex); |
	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3992 | |
3993 | /* The extent passed in that we are trying to match */ |
3994 | ex_cluster_start = EXT4_B2C(sbi, ee_block); |
3995 | ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); |
3996 | |
3997 | /* The requested region passed into ext4_map_blocks() */ |
3998 | rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); |
3999 | |
4000 | if ((rr_cluster_start == ex_cluster_end) || |
4001 | (rr_cluster_start == ex_cluster_start)) { |
4002 | if (rr_cluster_start == ex_cluster_end) |
4003 | ee_start += ee_len - 1; |
4004 | map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; |
4005 | map->m_len = min(map->m_len, |
4006 | (unsigned) sbi->s_cluster_ratio - c_offset); |
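		/*
		 * E.g. with a cluster ratio of 16 and map->m_lblk 35,
		 * c_offset is 3: m_pblk becomes the cluster-aligned
		 * physical block plus 3, and m_len is capped at the 13
		 * blocks remaining in the cluster.
		 */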
4007 | /* |
4008 | * Check for and handle this case: |
4009 | * |
4010 | * |--------- cluster # N-------------| |
4011 | * |------- extent ----| |
4012 | * |--- requested region ---| |
4013 | * |===========| |
4014 | */ |
4015 | |
4016 | if (map->m_lblk < ee_block) |
4017 | map->m_len = min(map->m_len, ee_block - map->m_lblk); |
4018 | |
4019 | /* |
4020 | * Check for the case where there is already another allocated |
4021 | * block to the right of 'ex' but before the end of the cluster. |
4022 | * |
4023 | * |------------- cluster # N-------------| |
4024 | * |----- ex -----| |---- ex_right ----| |
4025 | * |------ requested region ------| |
4026 | * |================| |
4027 | */ |
4028 | if (map->m_lblk > ee_block) { |
4029 | ext4_lblk_t next = ext4_ext_next_allocated_block(path); |
4030 | map->m_len = min(map->m_len, next - map->m_lblk); |
4031 | } |
4032 | |
		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4034 | return 1; |
4035 | } |
4036 | |
	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4038 | return 0; |
4039 | } |
4040 | |
4041 | /* |
 * Determine the hole length around the given logical block: first try to
 * locate and expand the hole from the given @path, then shrink it if it
 * is partially or completely covered by delayed extents. If it is indeed
 * a hole, insert it into the extent status tree, and finally return the
 * length of the determined hole.
4047 | */ |
4048 | static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode, |
4049 | struct ext4_ext_path *path, |
4050 | ext4_lblk_t lblk) |
4051 | { |
4052 | ext4_lblk_t hole_start, len; |
4053 | struct extent_status es; |
4054 | |
4055 | hole_start = lblk; |
	len = ext4_ext_find_hole(inode, path, &hole_start);
again:
	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
				  hole_start + len - 1, &es);
4060 | if (!es.es_len) |
4061 | goto insert_hole; |
4062 | |
	/*
	 * There's a delalloc extent in the hole; handle it whether it lies
	 * in front of, behind, or straddles the queried range.
	 */
4067 | if (lblk >= es.es_lblk + es.es_len) { |
4068 | /* |
4069 | * The delalloc extent is in front of the queried range, |
4070 | * find again from the queried start block. |
4071 | */ |
4072 | len -= lblk - hole_start; |
4073 | hole_start = lblk; |
4074 | goto again; |
4075 | } else if (in_range(lblk, es.es_lblk, es.es_len)) { |
		/*
		 * The delalloc extent contains lblk; it must have been added
		 * after ext4_map_blocks() checked the extent status tree, so
		 * we are not holding i_rwsem and the delalloc info is only
		 * stabilized by the i_data_sem we are going to release soon.
		 * Don't modify the extent status tree and report the extent
		 * as a hole; just adjust the length to the part of the
		 * delalloc extent that lies after lblk.
		 */
4085 | len = es.es_lblk + es.es_len - lblk; |
4086 | return len; |
4087 | } else { |
4088 | /* |
4089 | * The delalloc extent is partially or completely behind |
4090 | * the queried range, update hole length until the |
4091 | * beginning of the delalloc extent. |
4092 | */ |
4093 | len = min(es.es_lblk - hole_start, len); |
4094 | } |
4095 | |
4096 | insert_hole: |
4097 | /* Put just found gap into cache to speed up subsequent requests */ |
	ext_debug(inode, " -> %u:%u\n", hole_start, len);
	ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
4100 | |
4101 | /* Update hole_len to reflect hole size after lblk */ |
4102 | if (hole_start != lblk) |
4103 | len -= lblk - hole_start; |
4104 | |
4105 | return len; |
4106 | } |
4107 | |
4108 | /* |
 * Block allocation/map/preallocation routine for extents-based files
 *
 * Needs to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
 * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 *
 * return > 0, number of blocks already mapped/allocated
 *          if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
 *          	buffer head is unmapped
 *          otherwise blocks are mapped
 *
 * return = 0, if plain lookup failed (blocks have not been allocated)
 *          buffer head is unmapped
 *
 * return < 0, error case.
4125 | */ |
4126 | int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, |
4127 | struct ext4_map_blocks *map, int flags) |
4128 | { |
4129 | struct ext4_ext_path *path = NULL; |
4130 | struct ext4_extent newex, *ex, ex2; |
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4132 | ext4_fsblk_t newblock = 0, pblk; |
4133 | int err = 0, depth, ret; |
4134 | unsigned int allocated = 0, offset = 0; |
4135 | unsigned int allocated_clusters = 0; |
4136 | struct ext4_allocation_request ar; |
4137 | ext4_lblk_t cluster_offset; |
4138 | |
	ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4141 | |
4142 | /* find extent for this block */ |
	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
4146 | path = NULL; |
4147 | goto out; |
4148 | } |
4149 | |
4150 | depth = ext_depth(inode); |
4151 | |
4152 | /* |
4153 | * consistent leaf must not be empty; |
4154 | * this situation is possible, though, _during_ tree modification; |
4155 | * this is why assert can't be put in ext4_find_extent() |
4156 | */ |
4157 | if (unlikely(path[depth].p_ext == NULL && depth != 0)) { |
		EXT4_ERROR_INODE(inode, "bad extent address "
				 "lblock: %lu, depth: %d pblock %lld",
4160 | (unsigned long) map->m_lblk, depth, |
4161 | path[depth].p_block); |
4162 | err = -EFSCORRUPTED; |
4163 | goto out; |
4164 | } |
4165 | |
4166 | ex = path[depth].p_ext; |
4167 | if (ex) { |
4168 | ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); |
4169 | ext4_fsblk_t ee_start = ext4_ext_pblock(ex); |
4170 | unsigned short ee_len; |
4171 | |
4172 | |
4173 | /* |
4174 | * unwritten extents are treated as holes, except that |
4175 | * we split out initialized portions during a write. |
4176 | */ |
		ee_len = ext4_ext_get_actual_len(ex);
4178 | |
		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4180 | |
4181 | /* if found extent covers block, simply return it */ |
4182 | if (in_range(map->m_lblk, ee_block, ee_len)) { |
4183 | newblock = map->m_lblk - ee_block + ee_start; |
4184 | /* number of remaining blocks in the extent */ |
4185 | allocated = ee_len - (map->m_lblk - ee_block); |
			ext_debug(inode, "%u fit into %u:%d -> %llu\n",
				  map->m_lblk, ee_block, ee_len, newblock);
4188 | |
4189 | /* |
4190 | * If the extent is initialized check whether the |
4191 | * caller wants to convert it to unwritten. |
4192 | */ |
			if ((!ext4_ext_is_unwritten(ex)) &&
			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
				err = convert_initialized_extent(handle,
					inode, map, &path, &allocated);
				goto out;
			} else if (!ext4_ext_is_unwritten(ex)) {
4199 | map->m_flags |= EXT4_MAP_MAPPED; |
4200 | map->m_pblk = newblock; |
4201 | if (allocated > map->m_len) |
4202 | allocated = map->m_len; |
4203 | map->m_len = allocated; |
4204 | ext4_ext_show_leaf(inode, path); |
4205 | goto out; |
4206 | } |
4207 | |
			ret = ext4_ext_handle_unwritten_extents(
				handle, inode, map, &path, flags,
				allocated, newblock);
4211 | if (ret < 0) |
4212 | err = ret; |
4213 | else |
4214 | allocated = ret; |
4215 | goto out; |
4216 | } |
4217 | } |
4218 | |
4219 | /* |
	 * requested block isn't allocated yet;
	 * we can't try to create blocks if flags doesn't contain EXT4_GET_BLOCKS_CREATE
4222 | */ |
4223 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
4224 | ext4_lblk_t len; |
4225 | |
		len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);
4227 | |
4228 | map->m_pblk = 0; |
4229 | map->m_len = min_t(unsigned int, map->m_len, len); |
4230 | goto out; |
4231 | } |
4232 | |
4233 | /* |
4234 | * Okay, we need to do block allocation. |
4235 | */ |
4236 | newex.ee_block = cpu_to_le32(map->m_lblk); |
4237 | cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); |
4238 | |
4239 | /* |
4240 | * If we are doing bigalloc, check to see if the extent returned |
4241 | * by ext4_find_extent() implies a cluster we can use. |
4242 | */ |
	if (cluster_offset && ex &&
	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4245 | ar.len = allocated = map->m_len; |
4246 | newblock = map->m_pblk; |
4247 | goto got_allocated_blocks; |
4248 | } |
4249 | |
4250 | /* find neighbour allocated blocks */ |
4251 | ar.lleft = map->m_lblk; |
	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4253 | if (err) |
4254 | goto out; |
4255 | ar.lright = map->m_lblk; |
	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4257 | if (err < 0) |
4258 | goto out; |
4259 | |
4260 | /* Check if the extent after searching to the right implies a |
4261 | * cluster we can use. */ |
4262 | if ((sbi->s_cluster_ratio > 1) && err && |
	    get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
4264 | ar.len = allocated = map->m_len; |
4265 | newblock = map->m_pblk; |
4266 | goto got_allocated_blocks; |
4267 | } |
4268 | |
4269 | /* |
4270 | * See if request is beyond maximum number of blocks we can have in |
4271 | * a single extent. For an initialized extent this limit is |
4272 | * EXT_INIT_MAX_LEN and for an unwritten extent this limit is |
4273 | * EXT_UNWRITTEN_MAX_LEN. |
4274 | */ |
4275 | if (map->m_len > EXT_INIT_MAX_LEN && |
4276 | !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) |
4277 | map->m_len = EXT_INIT_MAX_LEN; |
4278 | else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && |
4279 | (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) |
4280 | map->m_len = EXT_UNWRITTEN_MAX_LEN; |
4281 | |
4282 | /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ |
4283 | newex.ee_len = cpu_to_le16(map->m_len); |
	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
	if (err)
		allocated = ext4_ext_get_actual_len(&newex);
4287 | else |
4288 | allocated = map->m_len; |
4289 | |
4290 | /* allocate new block */ |
4291 | ar.inode = inode; |
	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4293 | ar.logical = map->m_lblk; |
4294 | /* |
4295 | * We calculate the offset from the beginning of the cluster |
4296 | * for the logical block number, since when we allocate a |
4297 | * physical cluster, the physical block should start at the |
4298 | * same offset from the beginning of the cluster. This is |
4299 | * needed so that future calls to get_implied_cluster_alloc() |
4300 | * work correctly. |
4301 | */ |
4302 | offset = EXT4_LBLK_COFF(sbi, map->m_lblk); |
4303 | ar.len = EXT4_NUM_B2C(sbi, offset+allocated); |
4304 | ar.goal -= offset; |
4305 | ar.logical -= offset; |
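	/*
	 * E.g. with a cluster ratio of 16, m_lblk 35 and 10 blocks to
	 * allocate: offset is 3, so we request EXT4_NUM_B2C(sbi, 13) = 1
	 * cluster and pull the goal and logical start back by 3 blocks.
	 */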
4306 | if (S_ISREG(inode->i_mode)) |
4307 | ar.flags = EXT4_MB_HINT_DATA; |
4308 | else |
4309 | /* disable in-core preallocation for non-regular files */ |
4310 | ar.flags = 0; |
4311 | if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) |
4312 | ar.flags |= EXT4_MB_HINT_NOPREALLOC; |
4313 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
4314 | ar.flags |= EXT4_MB_DELALLOC_RESERVED; |
4315 | if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) |
4316 | ar.flags |= EXT4_MB_USE_RESERVED; |
4317 | newblock = ext4_mb_new_blocks(handle, &ar, &err); |
4318 | if (!newblock) |
4319 | goto out; |
4320 | allocated_clusters = ar.len; |
4321 | ar.len = EXT4_C2B(sbi, ar.len) - offset; |
	ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
		  ar.goal, newblock, ar.len, allocated);
4324 | if (ar.len > allocated) |
4325 | ar.len = allocated; |
4326 | |
4327 | got_allocated_blocks: |
4328 | /* try to insert new extent into found leaf and return */ |
	pblk = newblock + offset;
	ext4_ext_store_pblock(&newex, pblk);
	newex.ee_len = cpu_to_le16(ar.len);
	/* Mark unwritten */
	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
		ext4_ext_mark_unwritten(&newex);
		map->m_flags |= EXT4_MAP_UNWRITTEN;
	}

	err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
4339 | if (err) { |
4340 | if (allocated_clusters) { |
4341 | int fb_flags = 0; |
4342 | |
			/*
			 * Free the data blocks we just allocated: calling
			 * discard directly here is not ideal, but otherwise
			 * we'd need to call it on every free().
			 */
4348 | ext4_discard_preallocations(inode); |
4349 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) |
4350 | fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; |
			ext4_free_blocks(handle, inode, NULL, newblock,
					 EXT4_C2B(sbi, allocated_clusters),
					 fb_flags);
4354 | } |
4355 | goto out; |
4356 | } |
4357 | |
4358 | /* |
4359 | * Reduce the reserved cluster count to reflect successful deferred |
4360 | * allocation of delayed allocated clusters or direct allocation of |
4361 | * clusters discovered to be delayed allocated. Once allocated, a |
4362 | * cluster is not included in the reserved count. |
4363 | */ |
4364 | if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { |
4365 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { |
4366 | /* |
4367 | * When allocating delayed allocated clusters, simply |
4368 | * reduce the reserved cluster count and claim quota |
4369 | */ |
			ext4_da_update_reserve_space(inode, allocated_clusters,
						     1);
4372 | } else { |
4373 | ext4_lblk_t lblk, len; |
4374 | unsigned int n; |
4375 | |
4376 | /* |
4377 | * When allocating non-delayed allocated clusters |
4378 | * (from fallocate, filemap, DIO, or clusters |
4379 | * allocated when delalloc has been disabled by |
4380 | * ext4_nonda_switch), reduce the reserved cluster |
4381 | * count by the number of allocated clusters that |
4382 | * have previously been delayed allocated. Quota |
4383 | * has been claimed by ext4_mb_new_blocks() above, |
4384 | * so release the quota reservations made for any |
4385 | * previously delayed allocated clusters. |
4386 | */ |
4387 | lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); |
4388 | len = allocated_clusters << sbi->s_cluster_bits; |
4389 | n = ext4_es_delayed_clu(inode, lblk, len); |
4390 | if (n > 0) |
				ext4_da_update_reserve_space(inode, (int) n, 0);
4392 | } |
4393 | } |
4394 | |
4395 | /* |
4396 | * Cache the extent and update transaction to commit on fdatasync only |
4397 | * when it is _not_ an unwritten extent. |
4398 | */ |
	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
	else
		ext4_update_inode_fsync_trans(handle, inode, 0);
4403 | |
4404 | map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); |
4405 | map->m_pblk = pblk; |
4406 | map->m_len = ar.len; |
4407 | allocated = map->m_len; |
4408 | ext4_ext_show_leaf(inode, path); |
4409 | out: |
4410 | ext4_free_ext_path(path); |
4411 | |
	trace_ext4_ext_map_blocks_exit(inode, flags, map,
				       err ? err : allocated);
4414 | return err ? err : allocated; |
4415 | } |
4416 | |
4417 | int ext4_ext_truncate(handle_t *handle, struct inode *inode) |
4418 | { |
4419 | struct super_block *sb = inode->i_sb; |
4420 | ext4_lblk_t last_block; |
4421 | int err = 0; |
4422 | |
4423 | /* |
4424 | * TODO: optimization is possible here. |
4425 | * Probably we need not scan at all, |
4426 | * because page truncation is enough. |
4427 | */ |
4428 | |
4429 | /* we have to know where to truncate from in crash case */ |
4430 | EXT4_I(inode)->i_disksize = inode->i_size; |
4431 | err = ext4_mark_inode_dirty(handle, inode); |
4432 | if (err) |
4433 | return err; |
4434 | |
4435 | last_block = (inode->i_size + sb->s_blocksize - 1) |
4436 | >> EXT4_BLOCK_SIZE_BITS(sb); |
	ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);

retry_remove_space:
	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4441 | if (err == -ENOMEM) { |
4442 | memalloc_retry_wait(GFP_ATOMIC); |
4443 | goto retry_remove_space; |
4444 | } |
4445 | return err; |
4446 | } |
4447 | |
4448 | static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, |
4449 | ext4_lblk_t len, loff_t new_size, |
4450 | int flags) |
4451 | { |
	struct inode *inode = file_inode(file);
4453 | handle_t *handle; |
4454 | int ret = 0, ret2 = 0, ret3 = 0; |
4455 | int retries = 0; |
4456 | int depth = 0; |
4457 | struct ext4_map_blocks map; |
4458 | unsigned int credits; |
4459 | loff_t epos; |
4460 | |
4461 | BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); |
4462 | map.m_lblk = offset; |
4463 | map.m_len = len; |
4464 | /* |
4465 | * Don't normalize the request if it can fit in one extent so |
4466 | * that it doesn't get unnecessarily split into multiple |
4467 | * extents. |
4468 | */ |
4469 | if (len <= EXT_UNWRITTEN_MAX_LEN) |
4470 | flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; |
4471 | |
4472 | /* |
4473 | * credits to insert 1 extent into extent tree |
4474 | */ |
	credits = ext4_chunk_trans_blocks(inode, len);
4476 | depth = ext_depth(inode); |
4477 | |
4478 | retry: |
4479 | while (len) { |
4480 | /* |
4481 | * Recalculate credits when extent tree depth changes. |
4482 | */ |
4483 | if (depth != ext_depth(inode)) { |
			credits = ext4_chunk_trans_blocks(inode, len);
4485 | depth = ext_depth(inode); |
4486 | } |
4487 | |
4488 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, |
4489 | credits); |
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
4492 | break; |
4493 | } |
		ret = ext4_map_blocks(handle, inode, &map, flags);
4495 | if (ret <= 0) { |
			ext4_debug("inode #%lu: block %u: len %u: "
				   "ext4_ext_map_blocks returned %d",
				   inode->i_ino, map.m_lblk,
				   map.m_len, ret);
4500 | ext4_mark_inode_dirty(handle, inode); |
4501 | ext4_journal_stop(handle); |
4502 | break; |
4503 | } |
4504 | /* |
4505 | * allow a full retry cycle for any remaining allocations |
4506 | */ |
4507 | retries = 0; |
4508 | map.m_lblk += ret; |
4509 | map.m_len = len = len - ret; |
4510 | epos = (loff_t)map.m_lblk << inode->i_blkbits; |
4511 | inode_set_ctime_current(inode); |
4512 | if (new_size) { |
4513 | if (epos > new_size) |
4514 | epos = new_size; |
			if (ext4_update_inode_size(inode, epos) & 0x1)
				inode_set_mtime_to_ts(inode,
						      inode_get_ctime(inode));
		}
		ret2 = ext4_mark_inode_dirty(handle, inode);
		ext4_update_inode_fsync_trans(handle, inode, 1);
4521 | ret3 = ext4_journal_stop(handle); |
4522 | ret2 = ret3 ? ret3 : ret2; |
4523 | if (unlikely(ret2)) |
4524 | break; |
4525 | } |
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
4527 | goto retry; |
4528 | |
4529 | return ret > 0 ? ret2 : ret; |
4530 | } |
4531 | |
4532 | static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); |
4533 | |
4534 | static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); |
4535 | |
4536 | static long ext4_zero_range(struct file *file, loff_t offset, |
4537 | loff_t len, int mode) |
4538 | { |
	struct inode *inode = file_inode(file);
4540 | struct address_space *mapping = file->f_mapping; |
4541 | handle_t *handle = NULL; |
4542 | unsigned int max_blocks; |
4543 | loff_t new_size = 0; |
4544 | int ret = 0; |
4545 | int flags; |
4546 | int credits; |
4547 | int partial_begin, partial_end; |
4548 | loff_t start, end; |
4549 | ext4_lblk_t lblk; |
4550 | unsigned int blkbits = inode->i_blkbits; |
4551 | |
4552 | trace_ext4_zero_range(inode, offset, len, mode); |
4553 | |
4554 | /* |
4555 | * Round up offset. This is not fallocate, we need to zero out |
4556 | * blocks, so convert interior block aligned part of the range to |
4557 | * unwritten and possibly manually zero out unaligned parts of the |
4558 | * range. Here, start and partial_begin are inclusive, end and |
4559 | * partial_end are exclusive. |
4560 | */ |
4561 | start = round_up(offset, 1 << blkbits); |
4562 | end = round_down((offset + len), 1 << blkbits); |
4563 | |
4564 | if (start < offset || end > offset + len) |
4565 | return -EINVAL; |
4566 | partial_begin = offset & ((1 << blkbits) - 1); |
4567 | partial_end = (offset + len) & ((1 << blkbits) - 1); |
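	/*
	 * E.g. with 4 KiB blocks, offset 1000 and len 10000: start is
	 * 4096, end is 8192, partial_begin is 1000 and partial_end is
	 * 2808, so only block 1 is converted to unwritten and both
	 * unaligned edges are zeroed out manually below.
	 */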
4568 | |
4569 | lblk = start >> blkbits; |
4570 | max_blocks = (end >> blkbits); |
4571 | if (max_blocks < lblk) |
4572 | max_blocks = 0; |
4573 | else |
4574 | max_blocks -= lblk; |
4575 | |
4576 | inode_lock(inode); |
4577 | |
4578 | /* |
4579 | * Indirect files do not support unwritten extents |
4580 | */ |
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		ret = -EOPNOTSUPP;
		goto out_mutex;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len > inode->i_size ||
	     offset + len > EXT4_I(inode)->i_disksize)) {
		new_size = offset + len;
		ret = inode_newsize_ok(inode, new_size);
4591 | if (ret) |
4592 | goto out_mutex; |
4593 | } |
4594 | |
4595 | flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; |
4596 | |
4597 | /* Wait all existing dio workers, newcomers will block on i_rwsem */ |
4598 | inode_dio_wait(inode); |
4599 | |
4600 | ret = file_modified(file); |
4601 | if (ret) |
4602 | goto out_mutex; |
4603 | |
4604 | /* Preallocate the range including the unaligned edges */ |
4605 | if (partial_begin || partial_end) { |
		ret = ext4_alloc_file_blocks(file,
				round_down(offset, 1 << blkbits) >> blkbits,
				(round_up((offset + len), 1 << blkbits) -
				 round_down(offset, 1 << blkbits)) >> blkbits,
				new_size, flags);
4611 | if (ret) |
4612 | goto out_mutex; |
4613 | |
4614 | } |
4615 | |
4616 | /* Zero range excluding the unaligned edges */ |
4617 | if (max_blocks > 0) { |
4618 | flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | |
4619 | EXT4_EX_NOCACHE); |
4620 | |
4621 | /* |
4622 | * Prevent page faults from reinstantiating pages we have |
4623 | * released from page cache. |
4624 | */ |
4625 | filemap_invalidate_lock(mapping); |
4626 | |
4627 | ret = ext4_break_layouts(inode); |
4628 | if (ret) { |
4629 | filemap_invalidate_unlock(mapping); |
4630 | goto out_mutex; |
4631 | } |
4632 | |
4633 | ret = ext4_update_disksize_before_punch(inode, offset, len); |
4634 | if (ret) { |
4635 | filemap_invalidate_unlock(mapping); |
4636 | goto out_mutex; |
4637 | } |
4638 | |
4639 | /* |
4640 | * For journalled data we need to write (and checkpoint) pages |
		 * before discarding page cache to avoid inconsistent data on
4642 | * disk in case of crash before zeroing trans is committed. |
4643 | */ |
4644 | if (ext4_should_journal_data(inode)) { |
			ret = filemap_write_and_wait_range(mapping, start,
							   end - 1);
4647 | if (ret) { |
4648 | filemap_invalidate_unlock(mapping); |
4649 | goto out_mutex; |
4650 | } |
4651 | } |
4652 | |
4653 | /* Now release the pages and zero block aligned part of pages */ |
		truncate_pagecache_range(inode, start, end - 1);
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
4656 | |
		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
					     flags);
4659 | filemap_invalidate_unlock(mapping); |
4660 | if (ret) |
4661 | goto out_mutex; |
4662 | } |
4663 | if (!partial_begin && !partial_end) |
4664 | goto out_mutex; |
4665 | |
4666 | /* |
4667 | * In worst case we have to writeout two nonadjacent unwritten |
4668 | * blocks and update the inode |
4669 | */ |
4670 | credits = (2 * ext4_ext_index_trans_blocks(inode, extents: 2)) + 1; |
4671 | if (ext4_should_journal_data(inode)) |
4672 | credits += 2; |
4673 | handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); |
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
4676 | ext4_std_error(inode->i_sb, ret); |
4677 | goto out_mutex; |
4678 | } |
4679 | |
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	if (new_size)
		ext4_update_inode_size(inode, new_size);
	ret = ext4_mark_inode_dirty(handle, inode);
	if (unlikely(ret))
		goto out_handle;
	/* Zero out partial block at the edges of the range */
	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
4690 | |
4691 | if (file->f_flags & O_SYNC) |
4692 | ext4_handle_sync(handle); |
4693 | |
4694 | out_handle: |
4695 | ext4_journal_stop(handle); |
4696 | out_mutex: |
4697 | inode_unlock(inode); |
4698 | return ret; |
4699 | } |
4700 | |
4701 | /* |
 * Preallocate space for a file. This implements ext4's fallocate file
 * operation, which gets called from the sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior which is
 * expected for file systems which do not support the fallocate() system call).
4707 | */ |
4708 | long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) |
4709 | { |
	struct inode *inode = file_inode(file);
4711 | loff_t new_size = 0; |
4712 | unsigned int max_blocks; |
4713 | int ret = 0; |
4714 | int flags; |
4715 | ext4_lblk_t lblk; |
4716 | unsigned int blkbits = inode->i_blkbits; |
4717 | |
4718 | /* |
4719 | * Encrypted inodes can't handle collapse range or insert |
4720 | * range since we would need to re-encrypt blocks with a |
4721 | * different IV or XTS tweak (which are based on the logical |
4722 | * block number). |
4723 | */ |
4724 | if (IS_ENCRYPTED(inode) && |
4725 | (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) |
4726 | return -EOPNOTSUPP; |
4727 | |
4728 | /* Return error if mode is not supported */ |
4729 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | |
4730 | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | |
4731 | FALLOC_FL_INSERT_RANGE)) |
4732 | return -EOPNOTSUPP; |
4733 | |
4734 | inode_lock(inode); |
4735 | ret = ext4_convert_inline_data(inode); |
4736 | inode_unlock(inode); |
4737 | if (ret) |
4738 | goto exit; |
4739 | |
4740 | if (mode & FALLOC_FL_PUNCH_HOLE) { |
		ret = ext4_punch_hole(file, offset, len);
4742 | goto exit; |
4743 | } |
4744 | |
4745 | if (mode & FALLOC_FL_COLLAPSE_RANGE) { |
4746 | ret = ext4_collapse_range(file, offset, len); |
4747 | goto exit; |
4748 | } |
4749 | |
4750 | if (mode & FALLOC_FL_INSERT_RANGE) { |
4751 | ret = ext4_insert_range(file, offset, len); |
4752 | goto exit; |
4753 | } |
4754 | |
4755 | if (mode & FALLOC_FL_ZERO_RANGE) { |
4756 | ret = ext4_zero_range(file, offset, len, mode); |
4757 | goto exit; |
4758 | } |
4759 | trace_ext4_fallocate_enter(inode, offset, len, mode); |
4760 | lblk = offset >> blkbits; |
4761 | |
4762 | max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); |
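	/*
	 * EXT4_MAX_BLOCKS() counts the blocks needed to cover
	 * [offset, offset + len): e.g. with 4 KiB blocks, offset 1000
	 * and len 10000 touch blocks 0-2, so max_blocks is 3.
	 */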
4763 | flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; |
4764 | |
4765 | inode_lock(inode); |
4766 | |
	/*
	 * We only support preallocation for extent-based files
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len > inode->i_size ||
	     offset + len > EXT4_I(inode)->i_disksize)) {
		new_size = offset + len;
		ret = inode_newsize_ok(inode, new_size);
4780 | if (ret) |
4781 | goto out; |
4782 | } |
4783 | |
4784 | /* Wait all existing dio workers, newcomers will block on i_rwsem */ |
4785 | inode_dio_wait(inode); |
4786 | |
4787 | ret = file_modified(file); |
4788 | if (ret) |
4789 | goto out; |
4790 | |
	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4792 | if (ret) |
4793 | goto out; |
4794 | |
	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
		ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
				     EXT4_I(inode)->i_sync_tid);
	}
4798 | } |
4799 | out: |
4800 | inode_unlock(inode); |
4801 | trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); |
4802 | exit: |
4803 | return ret; |
4804 | } |
4805 | |
4806 | /* |
 * This function converts a range of blocks to written extents.
 * The caller of this function will pass the start offset and the size;
 * all unwritten extents within this range will be converted to
 * written extents.
4811 | * |
4812 | * This function is called from the direct IO end io call back |
4813 | * function, to convert the fallocated extents after IO is completed. |
4814 | * Returns 0 on success. |
4815 | */ |
4816 | int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, |
4817 | loff_t offset, ssize_t len) |
4818 | { |
4819 | unsigned int max_blocks; |
4820 | int ret = 0, ret2 = 0, ret3 = 0; |
4821 | struct ext4_map_blocks map; |
4822 | unsigned int blkbits = inode->i_blkbits; |
4823 | unsigned int credits = 0; |
4824 | |
4825 | map.m_lblk = offset >> blkbits; |
4826 | max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); |
4827 | |
4828 | if (!handle) { |
4829 | /* |
4830 | * credits to insert 1 extent into extent tree |
4831 | */ |
4832 | credits = ext4_chunk_trans_blocks(inode, nrblocks: max_blocks); |
4833 | } |
4834 | while (ret >= 0 && ret < max_blocks) { |
4835 | map.m_lblk += ret; |
4836 | map.m_len = (max_blocks -= ret); |
4837 | if (credits) { |
4838 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, |
4839 | credits); |
4840 | if (IS_ERR(ptr: handle)) { |
4841 | ret = PTR_ERR(ptr: handle); |
4842 | break; |
4843 | } |
4844 | } |
4845 | ret = ext4_map_blocks(handle, inode, map: &map, |
4846 | EXT4_GET_BLOCKS_IO_CONVERT_EXT); |
4847 | if (ret <= 0) |
4848 | ext4_warning(inode->i_sb, |
4849 | "inode #%lu: block %u: len %u: " |
				"ext4_ext_map_blocks returned %d",
4851 | inode->i_ino, map.m_lblk, |
4852 | map.m_len, ret); |
4853 | ret2 = ext4_mark_inode_dirty(handle, inode); |
4854 | if (credits) { |
4855 | ret3 = ext4_journal_stop(handle); |
4856 | if (unlikely(ret3)) |
4857 | ret2 = ret3; |
4858 | } |
4859 | |
4860 | if (ret <= 0 || ret2) |
4861 | break; |
4862 | } |
4863 | return ret > 0 ? ret2 : ret; |
4864 | } |
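
/*
 * A worked example of the conversion loop above (a sketch; the per-call
 * extent size is an assumption): with 4k blocks, a 1 MiB DIO write at
 * offset 0 gives max_blocks = 256. If ext4_map_blocks() converts a
 * 64-block unwritten extent per call, the loop makes four passes, and
 * when no handle was passed in, each pass runs in its own freshly
 * started transaction.
 */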
4865 | |
4866 | int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) |
4867 | { |
4868 | int ret = 0, err = 0; |
4869 | struct ext4_io_end_vec *io_end_vec; |
4870 | |
	/*
	 * This is somewhat ugly but the idea is clear: when a transaction is
	 * reserved, everything goes into it. Otherwise we rather start several
	 * smaller transactions for conversion of each extent separately.
	 */
4876 | if (handle) { |
4877 | handle = ext4_journal_start_reserved(handle, |
4878 | EXT4_HT_EXT_CONVERT); |
4879 | if (IS_ERR(ptr: handle)) |
4880 | return PTR_ERR(ptr: handle); |
4881 | } |
4882 | |
4883 | list_for_each_entry(io_end_vec, &io_end->list_vec, list) { |
4884 | ret = ext4_convert_unwritten_extents(handle, inode: io_end->inode, |
4885 | offset: io_end_vec->offset, |
4886 | len: io_end_vec->size); |
4887 | if (ret) |
4888 | break; |
4889 | } |
4890 | |
4891 | if (handle) |
4892 | err = ext4_journal_stop(handle); |
4893 | |
4894 | return ret < 0 ? ret : err; |
4895 | } |
4896 | |
4897 | static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) |
4898 | { |
4899 | __u64 physical = 0; |
4900 | __u64 length = 0; |
4901 | int blockbits = inode->i_sb->s_blocksize_bits; |
4902 | int error = 0; |
4903 | u16 iomap_type; |
4904 | |
4905 | /* in-inode? */ |
4906 | if (ext4_test_inode_state(inode, bit: EXT4_STATE_XATTR)) { |
4907 | struct ext4_iloc iloc; |
4908 | int offset; /* offset of xattr in inode */ |
4909 | |
4910 | error = ext4_get_inode_loc(inode, &iloc); |
4911 | if (error) |
4912 | return error; |
4913 | physical = (__u64)iloc.bh->b_blocknr << blockbits; |
4914 | offset = EXT4_GOOD_OLD_INODE_SIZE + |
4915 | EXT4_I(inode)->i_extra_isize; |
4916 | physical += offset; |
4917 | length = EXT4_SB(sb: inode->i_sb)->s_inode_size - offset; |
4918 | brelse(bh: iloc.bh); |
4919 | iomap_type = IOMAP_INLINE; |
4920 | } else if (EXT4_I(inode)->i_file_acl) { /* external block */ |
4921 | physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; |
4922 | length = inode->i_sb->s_blocksize; |
4923 | iomap_type = IOMAP_MAPPED; |
4924 | } else { |
4925 | /* no in-inode or external block for xattr, so return -ENOENT */ |
4926 | error = -ENOENT; |
4927 | goto out; |
4928 | } |
4929 | |
4930 | iomap->addr = physical; |
4931 | iomap->offset = 0; |
4932 | iomap->length = length; |
4933 | iomap->type = iomap_type; |
4934 | iomap->flags = 0; |
4935 | out: |
4936 | return error; |
4937 | } |
4938 | |
4939 | static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, |
4940 | loff_t length, unsigned flags, |
4941 | struct iomap *iomap, struct iomap *srcmap) |
4942 | { |
4943 | int error; |
4944 | |
4945 | error = ext4_iomap_xattr_fiemap(inode, iomap); |
4946 | if (error == 0 && (offset >= iomap->length)) |
4947 | error = -ENOENT; |
4948 | return error; |
4949 | } |
4950 | |
4951 | static const struct iomap_ops ext4_iomap_xattr_ops = { |
4952 | .iomap_begin = ext4_iomap_xattr_begin, |
4953 | }; |
4954 | |
4955 | static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) |
4956 | { |
4957 | u64 maxbytes; |
4958 | |
4959 | if (ext4_test_inode_flag(inode, bit: EXT4_INODE_EXTENTS)) |
4960 | maxbytes = inode->i_sb->s_maxbytes; |
4961 | else |
4962 | maxbytes = EXT4_SB(sb: inode->i_sb)->s_bitmap_maxbytes; |
4963 | |
4964 | if (*len == 0) |
4965 | return -EINVAL; |
4966 | if (start > maxbytes) |
4967 | return -EFBIG; |
4968 | |
4969 | /* |
4970 | * Shrink request scope to what the fs can actually handle. |
4971 | */ |
4972 | if (*len > maxbytes || (maxbytes - *len) < start) |
4973 | *len = maxbytes - start; |
4974 | return 0; |
4975 | } |
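
/*
 * Worked example of the clamping above (illustrative numbers): with
 * maxbytes = 1000, start = 990 and *len = 100, the second condition holds
 * (1000 - 100 = 900 < 990), so *len is trimmed to maxbytes - start = 10,
 * the largest range the filesystem can actually map from that start.
 */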
4976 | |
4977 | int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
4978 | u64 start, u64 len) |
4979 | { |
4980 | int error = 0; |
4981 | |
4982 | if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { |
4983 | error = ext4_ext_precache(inode); |
4984 | if (error) |
4985 | return error; |
4986 | fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; |
4987 | } |
4988 | |
4989 | /* |
4990 | * For bitmap files the maximum size limit could be smaller than |
4991 | * s_maxbytes, so check len here manually instead of just relying on the |
4992 | * generic check. |
4993 | */ |
4994 | error = ext4_fiemap_check_ranges(inode, start, len: &len); |
4995 | if (error) |
4996 | return error; |
4997 | |
4998 | if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { |
4999 | fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; |
5000 | return iomap_fiemap(inode, fieinfo, start, len, |
5001 | ops: &ext4_iomap_xattr_ops); |
5002 | } |
5003 | |
5004 | return iomap_fiemap(inode, fieinfo, start, len, ops: &ext4_iomap_report_ops); |
5005 | } |
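
/*
 * Userspace reaches this function through the FS_IOC_FIEMAP ioctl. A
 * minimal sketch, assuming an already-open fd (error handling omitted):
 *
 *	#include <stdlib.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <linux/fiemap.h>
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;	// map the whole file
 *	fm->fm_extent_count = 32;		// room for 32 extents
 *	ioctl(fd, FS_IOC_FIEMAP, fm);		// fills fm->fm_extents[]
 */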
5006 | |
5007 | int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, |
5008 | __u64 start, __u64 len) |
5009 | { |
5010 | ext4_lblk_t start_blk, len_blks; |
5011 | __u64 last_blk; |
5012 | int error = 0; |
5013 | |
5014 | if (ext4_has_inline_data(inode)) { |
5015 | int has_inline; |
5016 | |
5017 | down_read(sem: &EXT4_I(inode)->xattr_sem); |
5018 | has_inline = ext4_has_inline_data(inode); |
5019 | up_read(sem: &EXT4_I(inode)->xattr_sem); |
5020 | if (has_inline) |
5021 | return 0; |
5022 | } |
5023 | |
5024 | if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { |
5025 | error = ext4_ext_precache(inode); |
5026 | if (error) |
5027 | return error; |
5028 | fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; |
5029 | } |
5030 | |
5031 | error = fiemap_prep(inode, fieinfo, start, len: &len, supported_flags: 0); |
5032 | if (error) |
5033 | return error; |
5034 | |
5035 | error = ext4_fiemap_check_ranges(inode, start, len: &len); |
5036 | if (error) |
5037 | return error; |
5038 | |
5039 | start_blk = start >> inode->i_sb->s_blocksize_bits; |
5040 | last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; |
5041 | if (last_blk >= EXT_MAX_BLOCKS) |
5042 | last_blk = EXT_MAX_BLOCKS-1; |
5043 | len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; |
5044 | |
5045 | /* |
5046 | * Walk the extent tree gathering extent information |
5047 | * and pushing extents back to the user. |
5048 | */ |
5049 | return ext4_fill_es_cache_info(inode, block: start_blk, num: len_blks, fieinfo); |
5050 | } |
5051 | |
/*
 * ext4_ext_shift_path_extents:
 * Shift the extents of a path structure lying between path[depth].p_ext
 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
 * whether it is a right shift or a left shift operation.
 */
5058 | static int |
5059 | ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, |
5060 | struct inode *inode, handle_t *handle, |
5061 | enum SHIFT_DIRECTION SHIFT) |
5062 | { |
5063 | int depth, err = 0; |
5064 | struct ext4_extent *ex_start, *ex_last; |
5065 | bool update = false; |
	int credits, restart_credits;

	depth = path->p_depth;
5068 | |
5069 | while (depth >= 0) { |
5070 | if (depth == path->p_depth) { |
5071 | ex_start = path[depth].p_ext; |
5072 | if (!ex_start) |
5073 | return -EFSCORRUPTED; |
5074 | |
5075 | ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); |
5076 | /* leaf + sb + inode */ |
5077 | credits = 3; |
5078 | if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { |
5079 | update = true; |
5080 | /* extent tree + sb + inode */ |
5081 | credits = depth + 2; |
5082 | } |
5083 | |
5084 | restart_credits = ext4_writepage_trans_blocks(inode); |
5085 | err = ext4_datasem_ensure_credits(handle, inode, check_cred: credits, |
5086 | restart_cred: restart_credits, revoke_cred: 0); |
5087 | if (err) { |
5088 | if (err > 0) |
5089 | err = -EAGAIN; |
5090 | goto out; |
5091 | } |
5092 | |
5093 | err = ext4_ext_get_access(handle, inode, path: path + depth); |
5094 | if (err) |
5095 | goto out; |
5096 | |
5097 | while (ex_start <= ex_last) { |
5098 | if (SHIFT == SHIFT_LEFT) { |
5099 | le32_add_cpu(var: &ex_start->ee_block, |
5100 | val: -shift); |
5101 | /* Try to merge to the left. */ |
5102 | if ((ex_start > |
5103 | EXT_FIRST_EXTENT(path[depth].p_hdr)) |
5104 | && |
5105 | ext4_ext_try_to_merge_right(inode, |
5106 | path, ex: ex_start - 1)) |
5107 | ex_last--; |
5108 | else |
5109 | ex_start++; |
5110 | } else { |
5111 | le32_add_cpu(var: &ex_last->ee_block, val: shift); |
5112 | ext4_ext_try_to_merge_right(inode, path, |
5113 | ex: ex_last); |
5114 | ex_last--; |
5115 | } |
5116 | } |
5117 | err = ext4_ext_dirty(handle, inode, path + depth); |
5118 | if (err) |
5119 | goto out; |
5120 | |
5121 | if (--depth < 0 || !update) |
5122 | break; |
5123 | } |
5124 | |
5125 | /* Update index too */ |
5126 | err = ext4_ext_get_access(handle, inode, path: path + depth); |
5127 | if (err) |
5128 | goto out; |
5129 | |
5130 | if (SHIFT == SHIFT_LEFT) |
5131 | le32_add_cpu(var: &path[depth].p_idx->ei_block, val: -shift); |
5132 | else |
5133 | le32_add_cpu(var: &path[depth].p_idx->ei_block, val: shift); |
5134 | err = ext4_ext_dirty(handle, inode, path + depth); |
5135 | if (err) |
5136 | goto out; |
5137 | |
5138 | /* we are done if current index is not a starting index */ |
5139 | if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) |
5140 | break; |
5141 | |
5142 | depth--; |
5143 | } |
5144 | |
5145 | out: |
5146 | return err; |
5147 | } |
5148 | |
/*
 * ext4_ext_shift_extents:
 * All the extents which lie in the range from @start to the last allocated
 * block for the @inode are shifted either towards the left or the right
 * (depending upon @SHIFT) by @shift blocks.
 * On success, 0 is returned, error otherwise.
 */
5156 | static int |
5157 | ext4_ext_shift_extents(struct inode *inode, handle_t *handle, |
5158 | ext4_lblk_t start, ext4_lblk_t shift, |
5159 | enum SHIFT_DIRECTION SHIFT) |
5160 | { |
5161 | struct ext4_ext_path *path; |
5162 | int ret = 0, depth; |
5163 | struct ext4_extent *extent; |
5164 | ext4_lblk_t stop, *iterator, ex_start, ex_end; |
5165 | ext4_lblk_t tmp = EXT_MAX_BLOCKS; |
5166 | |
5167 | /* Let path point to the last extent */ |
5168 | path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, |
5169 | EXT4_EX_NOCACHE); |
5170 | if (IS_ERR(ptr: path)) |
5171 | return PTR_ERR(ptr: path); |
5172 | |
5173 | depth = path->p_depth; |
5174 | extent = path[depth].p_ext; |
5175 | if (!extent) |
5176 | goto out; |
5177 | |
5178 | stop = le32_to_cpu(extent->ee_block); |
5179 | |
5180 | /* |
5181 | * For left shifts, make sure the hole on the left is big enough to |
5182 | * accommodate the shift. For right shifts, make sure the last extent |
5183 | * won't be shifted beyond EXT_MAX_BLOCKS. |
5184 | */ |
5185 | if (SHIFT == SHIFT_LEFT) { |
5186 | path = ext4_find_extent(inode, block: start - 1, orig_path: &path, |
5187 | EXT4_EX_NOCACHE); |
5188 | if (IS_ERR(ptr: path)) |
5189 | return PTR_ERR(ptr: path); |
5190 | depth = path->p_depth; |
5191 | extent = path[depth].p_ext; |
5192 | if (extent) { |
5193 | ex_start = le32_to_cpu(extent->ee_block); |
5194 | ex_end = le32_to_cpu(extent->ee_block) + |
5195 | ext4_ext_get_actual_len(ext: extent); |
5196 | } else { |
5197 | ex_start = 0; |
5198 | ex_end = 0; |
5199 | } |
5200 | |
5201 | if ((start == ex_start && shift > ex_start) || |
5202 | (shift > start - ex_end)) { |
5203 | ret = -EINVAL; |
5204 | goto out; |
5205 | } |
5206 | } else { |
5207 | if (shift > EXT_MAX_BLOCKS - |
5208 | (stop + ext4_ext_get_actual_len(ext: extent))) { |
5209 | ret = -EINVAL; |
5210 | goto out; |
5211 | } |
5212 | } |
5213 | |
	/*
	 * In case of a left shift, the iterator points to start and is
	 * increased until we reach stop. In case of a right shift, the
	 * iterator points to stop and is decreased until we reach start.
	 */
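	/*
	 * For example (a sketch): collapsing the block range [100, 200)
	 * ends up here with start = 200, shift = 100 and SHIFT_LEFT, so
	 * the iterator begins at block 200 and walks right until it
	 * passes stop, the starting block of the last allocated extent.
	 */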
5219 | again: |
5220 | ret = 0; |
5221 | if (SHIFT == SHIFT_LEFT) |
5222 | iterator = &start; |
5223 | else |
5224 | iterator = &stop; |
5225 | |
5226 | if (tmp != EXT_MAX_BLOCKS) |
5227 | *iterator = tmp; |
5228 | |
	/*
	 * It's safe to start updating extents. start and stop are unsigned,
	 * so in case of a right shift, if an extent with block 0 is reached,
	 * the iterator becomes NULL to indicate the end of the loop.
	 */
5234 | while (iterator && start <= stop) { |
5235 | path = ext4_find_extent(inode, block: *iterator, orig_path: &path, |
5236 | EXT4_EX_NOCACHE); |
5237 | if (IS_ERR(ptr: path)) |
5238 | return PTR_ERR(ptr: path); |
5239 | depth = path->p_depth; |
5240 | extent = path[depth].p_ext; |
5241 | if (!extent) { |
			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5243 | (unsigned long) *iterator); |
5244 | return -EFSCORRUPTED; |
5245 | } |
5246 | if (SHIFT == SHIFT_LEFT && *iterator > |
5247 | le32_to_cpu(extent->ee_block)) { |
5248 | /* Hole, move to the next extent */ |
5249 | if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { |
5250 | path[depth].p_ext++; |
5251 | } else { |
5252 | *iterator = ext4_ext_next_allocated_block(path); |
5253 | continue; |
5254 | } |
5255 | } |
5256 | |
5257 | tmp = *iterator; |
5258 | if (SHIFT == SHIFT_LEFT) { |
5259 | extent = EXT_LAST_EXTENT(path[depth].p_hdr); |
5260 | *iterator = le32_to_cpu(extent->ee_block) + |
5261 | ext4_ext_get_actual_len(ext: extent); |
5262 | } else { |
5263 | extent = EXT_FIRST_EXTENT(path[depth].p_hdr); |
5264 | if (le32_to_cpu(extent->ee_block) > start) |
5265 | *iterator = le32_to_cpu(extent->ee_block) - 1; |
5266 | else if (le32_to_cpu(extent->ee_block) == start) |
5267 | iterator = NULL; |
5268 | else { |
5269 | extent = EXT_LAST_EXTENT(path[depth].p_hdr); |
5270 | while (le32_to_cpu(extent->ee_block) >= start) |
5271 | extent--; |
5272 | |
5273 | if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) |
5274 | break; |
5275 | |
5276 | extent++; |
5277 | iterator = NULL; |
5278 | } |
5279 | path[depth].p_ext = extent; |
5280 | } |
5281 | ret = ext4_ext_shift_path_extents(path, shift, inode, |
5282 | handle, SHIFT); |
5283 | /* iterator can be NULL which means we should break */ |
5284 | if (ret == -EAGAIN) |
5285 | goto again; |
5286 | if (ret) |
5287 | break; |
5288 | } |
5289 | out: |
5290 | ext4_free_ext_path(path); |
5291 | return ret; |
5292 | } |
5293 | |
/*
 * ext4_collapse_range:
 * This implements the fallocate's collapse range functionality for ext4.
 * Returns 0 on success and a non-zero error code on failure.
 */
5299 | static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) |
5300 | { |
5301 | struct inode *inode = file_inode(f: file); |
5302 | struct super_block *sb = inode->i_sb; |
5303 | struct address_space *mapping = inode->i_mapping; |
5304 | ext4_lblk_t punch_start, punch_stop; |
5305 | handle_t *handle; |
5306 | unsigned int credits; |
5307 | loff_t new_size, ioffset; |
5308 | int ret; |
5309 | |
5310 | /* |
5311 | * We need to test this early because xfstests assumes that a |
5312 | * collapse range of (0, 1) will return EOPNOTSUPP if the file |
5313 | * system does not support collapse range. |
5314 | */ |
5315 | if (!ext4_test_inode_flag(inode, bit: EXT4_INODE_EXTENTS)) |
5316 | return -EOPNOTSUPP; |
5317 | |
5318 | /* Collapse range works only on fs cluster size aligned regions. */ |
5319 | if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) |
5320 | return -EINVAL; |
5321 | |
5322 | trace_ext4_collapse_range(inode, offset, len); |
5323 | |
5324 | punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); |
5325 | punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); |
5326 | |
5327 | inode_lock(inode); |
	/*
	 * There is no need to support a collapse range that overlaps EOF,
	 * since that would effectively be a truncate operation.
	 */
5332 | if (offset + len >= inode->i_size) { |
5333 | ret = -EINVAL; |
5334 | goto out_mutex; |
5335 | } |
5336 | |
5337 | /* Currently just for extent based files */ |
5338 | if (!ext4_test_inode_flag(inode, bit: EXT4_INODE_EXTENTS)) { |
5339 | ret = -EOPNOTSUPP; |
5340 | goto out_mutex; |
5341 | } |
5342 | |
5343 | /* Wait for existing dio to complete */ |
5344 | inode_dio_wait(inode); |
5345 | |
5346 | ret = file_modified(file); |
5347 | if (ret) |
5348 | goto out_mutex; |
5349 | |
5350 | /* |
5351 | * Prevent page faults from reinstantiating pages we have released from |
5352 | * page cache. |
5353 | */ |
5354 | filemap_invalidate_lock(mapping); |
5355 | |
5356 | ret = ext4_break_layouts(inode); |
5357 | if (ret) |
5358 | goto out_mmap; |
5359 | |
5360 | /* |
5361 | * Need to round down offset to be aligned with page size boundary |
5362 | * for page size > block size. |
5363 | */ |
5364 | ioffset = round_down(offset, PAGE_SIZE); |
5365 | /* |
5366 | * Write tail of the last page before removed range since it will get |
5367 | * removed from the page cache below. |
5368 | */ |
5369 | ret = filemap_write_and_wait_range(mapping, lstart: ioffset, lend: offset); |
5370 | if (ret) |
5371 | goto out_mmap; |
	/*
	 * Write out the data that will be shifted to preserve it when
	 * discarding the page cache below. We are also protected from pages
	 * becoming dirty by i_rwsem and the invalidate_lock.
	 */
5377 | ret = filemap_write_and_wait_range(mapping, lstart: offset + len, |
5378 | LLONG_MAX); |
5379 | if (ret) |
5380 | goto out_mmap; |
5381 | truncate_pagecache(inode, new: ioffset); |
5382 | |
5383 | credits = ext4_writepage_trans_blocks(inode); |
5384 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); |
5385 | if (IS_ERR(ptr: handle)) { |
5386 | ret = PTR_ERR(ptr: handle); |
5387 | goto out_mmap; |
5388 | } |
5389 | ext4_fc_mark_ineligible(sb, reason: EXT4_FC_REASON_FALLOC_RANGE, handle); |
5390 | |
5391 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
5392 | ext4_discard_preallocations(inode); |
5393 | ext4_es_remove_extent(inode, lblk: punch_start, EXT_MAX_BLOCKS - punch_start); |
5394 | |
5395 | ret = ext4_ext_remove_space(inode, start: punch_start, end: punch_stop - 1); |
5396 | if (ret) { |
5397 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5398 | goto out_stop; |
5399 | } |
5400 | ext4_discard_preallocations(inode); |
5401 | |
5402 | ret = ext4_ext_shift_extents(inode, handle, start: punch_stop, |
5403 | shift: punch_stop - punch_start, SHIFT: SHIFT_LEFT); |
5404 | if (ret) { |
5405 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5406 | goto out_stop; |
5407 | } |
5408 | |
5409 | new_size = inode->i_size - len; |
5410 | i_size_write(inode, i_size: new_size); |
5411 | EXT4_I(inode)->i_disksize = new_size; |
5412 | |
5413 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5414 | if (IS_SYNC(inode)) |
5415 | ext4_handle_sync(handle); |
5416 | inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode)); |
5417 | ret = ext4_mark_inode_dirty(handle, inode); |
5418 | ext4_update_inode_fsync_trans(handle, inode, datasync: 1); |
5419 | |
5420 | out_stop: |
5421 | ext4_journal_stop(handle); |
5422 | out_mmap: |
5423 | filemap_invalidate_unlock(mapping); |
5424 | out_mutex: |
5425 | inode_unlock(inode); |
5426 | return ret; |
5427 | } |
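
/*
 * A hedged userspace sketch of the operation implemented above. Both
 * offset and len must be aligned to the filesystem cluster size, and the
 * range must end before EOF, or the checks above fail:
 *
 *	// Remove [offset, offset + len) and shift the tail left;
 *	// i_size shrinks by len bytes.
 *	fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, offset, len);
 */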
5428 | |
5429 | /* |
5430 | * ext4_insert_range: |
5431 | * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. |
 * The data blocks starting from @offset to the EOF are shifted by @len
 * towards the right to create a hole in the @inode. The inode size is
 * increased by @len bytes.
5435 | * Returns 0 on success, error otherwise. |
5436 | */ |
5437 | static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) |
5438 | { |
5439 | struct inode *inode = file_inode(f: file); |
5440 | struct super_block *sb = inode->i_sb; |
5441 | struct address_space *mapping = inode->i_mapping; |
5442 | handle_t *handle; |
5443 | struct ext4_ext_path *path; |
5444 | struct ext4_extent *extent; |
5445 | ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; |
5446 | unsigned int credits, ee_len; |
5447 | int ret = 0, depth, split_flag = 0; |
5448 | loff_t ioffset; |
5449 | |
5450 | /* |
5451 | * We need to test this early because xfstests assumes that an |
5452 | * insert range of (0, 1) will return EOPNOTSUPP if the file |
5453 | * system does not support insert range. |
5454 | */ |
5455 | if (!ext4_test_inode_flag(inode, bit: EXT4_INODE_EXTENTS)) |
5456 | return -EOPNOTSUPP; |
5457 | |
5458 | /* Insert range works only on fs cluster size aligned regions. */ |
5459 | if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) |
5460 | return -EINVAL; |
5461 | |
5462 | trace_ext4_insert_range(inode, offset, len); |
5463 | |
5464 | offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); |
5465 | len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); |
5466 | |
5467 | inode_lock(inode); |
5468 | /* Currently just for extent based files */ |
5469 | if (!ext4_test_inode_flag(inode, bit: EXT4_INODE_EXTENTS)) { |
5470 | ret = -EOPNOTSUPP; |
5471 | goto out_mutex; |
5472 | } |
5473 | |
5474 | /* Check whether the maximum file size would be exceeded */ |
5475 | if (len > inode->i_sb->s_maxbytes - inode->i_size) { |
5476 | ret = -EFBIG; |
5477 | goto out_mutex; |
5478 | } |
5479 | |
5480 | /* Offset must be less than i_size */ |
5481 | if (offset >= inode->i_size) { |
5482 | ret = -EINVAL; |
5483 | goto out_mutex; |
5484 | } |
5485 | |
5486 | /* Wait for existing dio to complete */ |
5487 | inode_dio_wait(inode); |
5488 | |
5489 | ret = file_modified(file); |
5490 | if (ret) |
5491 | goto out_mutex; |
5492 | |
5493 | /* |
5494 | * Prevent page faults from reinstantiating pages we have released from |
5495 | * page cache. |
5496 | */ |
5497 | filemap_invalidate_lock(mapping); |
5498 | |
5499 | ret = ext4_break_layouts(inode); |
5500 | if (ret) |
5501 | goto out_mmap; |
5502 | |
5503 | /* |
5504 | * Need to round down to align start offset to page size boundary |
5505 | * for page size > block size. |
5506 | */ |
5507 | ioffset = round_down(offset, PAGE_SIZE); |
5508 | /* Write out all dirty pages */ |
5509 | ret = filemap_write_and_wait_range(mapping: inode->i_mapping, lstart: ioffset, |
5510 | LLONG_MAX); |
5511 | if (ret) |
5512 | goto out_mmap; |
5513 | truncate_pagecache(inode, new: ioffset); |
5514 | |
5515 | credits = ext4_writepage_trans_blocks(inode); |
5516 | handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); |
5517 | if (IS_ERR(ptr: handle)) { |
5518 | ret = PTR_ERR(ptr: handle); |
5519 | goto out_mmap; |
5520 | } |
5521 | ext4_fc_mark_ineligible(sb, reason: EXT4_FC_REASON_FALLOC_RANGE, handle); |
5522 | |
	/* Expand the file to avoid data loss if there is an error while shifting */
5524 | inode->i_size += len; |
5525 | EXT4_I(inode)->i_disksize += len; |
5526 | inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode)); |
5527 | ret = ext4_mark_inode_dirty(handle, inode); |
5528 | if (ret) |
5529 | goto out_stop; |
5530 | |
5531 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
5532 | ext4_discard_preallocations(inode); |
5533 | |
	path = ext4_find_extent(inode, block: offset_lblk, NULL, flags: 0);
	if (IS_ERR(ptr: path)) {
		up_write(sem: &EXT4_I(inode)->i_data_sem);
		ret = PTR_ERR(ptr: path);
		goto out_stop;
	}
5539 | |
5540 | depth = ext_depth(inode); |
5541 | extent = path[depth].p_ext; |
5542 | if (extent) { |
5543 | ee_start_lblk = le32_to_cpu(extent->ee_block); |
5544 | ee_len = ext4_ext_get_actual_len(ext: extent); |
5545 | |
		/*
		 * If offset_lblk is not the starting block of the extent,
		 * split the extent at @offset_lblk
		 */
5550 | if ((offset_lblk > ee_start_lblk) && |
5551 | (offset_lblk < (ee_start_lblk + ee_len))) { |
5552 | if (ext4_ext_is_unwritten(ext: extent)) |
5553 | split_flag = EXT4_EXT_MARK_UNWRIT1 | |
5554 | EXT4_EXT_MARK_UNWRIT2; |
5555 | ret = ext4_split_extent_at(handle, inode, ppath: &path, |
5556 | split: offset_lblk, split_flag, |
5557 | EXT4_EX_NOCACHE | |
5558 | EXT4_GET_BLOCKS_PRE_IO | |
5559 | EXT4_GET_BLOCKS_METADATA_NOFAIL); |
5560 | } |
5561 | |
5562 | ext4_free_ext_path(path); |
5563 | if (ret < 0) { |
5564 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5565 | goto out_stop; |
5566 | } |
5567 | } else { |
5568 | ext4_free_ext_path(path); |
5569 | } |
5570 | |
5571 | ext4_es_remove_extent(inode, lblk: offset_lblk, EXT_MAX_BLOCKS - offset_lblk); |
5572 | |
	/*
	 * If offset_lblk lies in a hole which is at the start of the file,
	 * use ee_start_lblk to shift extents
	 */
5577 | ret = ext4_ext_shift_extents(inode, handle, |
5578 | max(ee_start_lblk, offset_lblk), shift: len_lblk, SHIFT: SHIFT_RIGHT); |
5579 | |
5580 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5581 | if (IS_SYNC(inode)) |
5582 | ext4_handle_sync(handle); |
5583 | if (ret >= 0) |
5584 | ext4_update_inode_fsync_trans(handle, inode, datasync: 1); |
5585 | |
5586 | out_stop: |
5587 | ext4_journal_stop(handle); |
5588 | out_mmap: |
5589 | filemap_invalidate_unlock(mapping); |
5590 | out_mutex: |
5591 | inode_unlock(inode); |
5592 | return ret; |
5593 | } |
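
/*
 * The userspace counterpart, sketched under the same caveats: offset and
 * len must be cluster aligned, and offset must lie below i_size:
 *
 *	// Open a hole at [offset, offset + len) and shift the tail right;
 *	// i_size grows by len bytes.
 *	fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len);
 */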
5594 | |
5595 | /** |
5596 | * ext4_swap_extents() - Swap extents between two inodes |
5597 | * @handle: handle for this transaction |
5598 | * @inode1: First inode |
5599 | * @inode2: Second inode |
5600 | * @lblk1: Start block for first inode |
5601 | * @lblk2: Start block for second inode |
5602 | * @count: Number of blocks to swap |
5603 | * @unwritten: Mark second inode's extents as unwritten after swap |
5604 | * @erp: Pointer to save error value |
5605 | * |
 * This helper routine does exactly what it promises: it swaps extents.
 * Everything else, such as page-cache locking consistency, bh mapping
 * consistency and copying of extent data, must be handled by the caller.
5609 | * Locking: |
5610 | * i_rwsem is held for both inodes |
5611 | * i_data_sem is locked for write for both inodes |
5612 | * Assumptions: |
5613 | * All pages from requested range are locked for both inodes |
5614 | */ |
5615 | int |
5616 | ext4_swap_extents(handle_t *handle, struct inode *inode1, |
5617 | struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, |
5618 | ext4_lblk_t count, int unwritten, int *erp) |
5619 | { |
5620 | struct ext4_ext_path *path1 = NULL; |
5621 | struct ext4_ext_path *path2 = NULL; |
5622 | int replaced_count = 0; |
5623 | |
5624 | BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); |
5625 | BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); |
5626 | BUG_ON(!inode_is_locked(inode1)); |
5627 | BUG_ON(!inode_is_locked(inode2)); |
5628 | |
5629 | ext4_es_remove_extent(inode: inode1, lblk: lblk1, len: count); |
5630 | ext4_es_remove_extent(inode: inode2, lblk: lblk2, len: count); |
5631 | |
5632 | while (count) { |
5633 | struct ext4_extent *ex1, *ex2, tmp_ex; |
5634 | ext4_lblk_t e1_blk, e2_blk; |
5635 | int e1_len, e2_len, len; |
5636 | int split = 0; |
5637 | |
5638 | path1 = ext4_find_extent(inode: inode1, block: lblk1, NULL, EXT4_EX_NOCACHE); |
5639 | if (IS_ERR(ptr: path1)) { |
5640 | *erp = PTR_ERR(ptr: path1); |
5641 | path1 = NULL; |
5642 | finish: |
5643 | count = 0; |
5644 | goto repeat; |
5645 | } |
5646 | path2 = ext4_find_extent(inode: inode2, block: lblk2, NULL, EXT4_EX_NOCACHE); |
5647 | if (IS_ERR(ptr: path2)) { |
5648 | *erp = PTR_ERR(ptr: path2); |
5649 | path2 = NULL; |
5650 | goto finish; |
5651 | } |
5652 | ex1 = path1[path1->p_depth].p_ext; |
5653 | ex2 = path2[path2->p_depth].p_ext; |
		/* Do we have something to swap? */
5655 | if (unlikely(!ex2 || !ex1)) |
5656 | goto finish; |
5657 | |
5658 | e1_blk = le32_to_cpu(ex1->ee_block); |
5659 | e2_blk = le32_to_cpu(ex2->ee_block); |
5660 | e1_len = ext4_ext_get_actual_len(ext: ex1); |
5661 | e2_len = ext4_ext_get_actual_len(ext: ex2); |
5662 | |
5663 | /* Hole handling */ |
5664 | if (!in_range(lblk1, e1_blk, e1_len) || |
5665 | !in_range(lblk2, e2_blk, e2_len)) { |
5666 | ext4_lblk_t next1, next2; |
5667 | |
5668 | /* if hole after extent, then go to next extent */ |
5669 | next1 = ext4_ext_next_allocated_block(path: path1); |
5670 | next2 = ext4_ext_next_allocated_block(path: path2); |
5671 | /* If hole before extent, then shift to that extent */ |
5672 | if (e1_blk > lblk1) |
5673 | next1 = e1_blk; |
5674 | if (e2_blk > lblk2) |
5675 | next2 = e2_blk; |
5676 | /* Do we have something to swap */ |
5677 | if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) |
5678 | goto finish; |
			/* Move to the rightmost boundary */
5680 | len = next1 - lblk1; |
5681 | if (len < next2 - lblk2) |
5682 | len = next2 - lblk2; |
5683 | if (len > count) |
5684 | len = count; |
5685 | lblk1 += len; |
5686 | lblk2 += len; |
5687 | count -= len; |
5688 | goto repeat; |
5689 | } |
5690 | |
5691 | /* Prepare left boundary */ |
5692 | if (e1_blk < lblk1) { |
5693 | split = 1; |
5694 | *erp = ext4_force_split_extent_at(handle, inode: inode1, |
5695 | ppath: &path1, lblk: lblk1, nofail: 0); |
5696 | if (unlikely(*erp)) |
5697 | goto finish; |
5698 | } |
5699 | if (e2_blk < lblk2) { |
5700 | split = 1; |
5701 | *erp = ext4_force_split_extent_at(handle, inode: inode2, |
5702 | ppath: &path2, lblk: lblk2, nofail: 0); |
5703 | if (unlikely(*erp)) |
5704 | goto finish; |
5705 | } |
		/* ext4_split_extent_at() may result in a leaf extent split,
		 * so the path must be revalidated. */
5708 | if (split) |
5709 | goto repeat; |
5710 | |
5711 | /* Prepare right boundary */ |
5712 | len = count; |
5713 | if (len > e1_blk + e1_len - lblk1) |
5714 | len = e1_blk + e1_len - lblk1; |
5715 | if (len > e2_blk + e2_len - lblk2) |
5716 | len = e2_blk + e2_len - lblk2; |
5717 | |
5718 | if (len != e1_len) { |
5719 | split = 1; |
5720 | *erp = ext4_force_split_extent_at(handle, inode: inode1, |
5721 | ppath: &path1, lblk: lblk1 + len, nofail: 0); |
5722 | if (unlikely(*erp)) |
5723 | goto finish; |
5724 | } |
5725 | if (len != e2_len) { |
5726 | split = 1; |
5727 | *erp = ext4_force_split_extent_at(handle, inode: inode2, |
5728 | ppath: &path2, lblk: lblk2 + len, nofail: 0); |
5729 | if (*erp) |
5730 | goto finish; |
5731 | } |
		/* ext4_split_extent_at() may result in a leaf extent split,
		 * so the path must be revalidated. */
5734 | if (split) |
5735 | goto repeat; |
5736 | |
5737 | BUG_ON(e2_len != e1_len); |
5738 | *erp = ext4_ext_get_access(handle, inode: inode1, path: path1 + path1->p_depth); |
5739 | if (unlikely(*erp)) |
5740 | goto finish; |
5741 | *erp = ext4_ext_get_access(handle, inode: inode2, path: path2 + path2->p_depth); |
5742 | if (unlikely(*erp)) |
5743 | goto finish; |
5744 | |
		/* Both extents are fully inside boundaries. Swap them now */
5746 | tmp_ex = *ex1; |
5747 | ext4_ext_store_pblock(ex: ex1, pb: ext4_ext_pblock(ex: ex2)); |
5748 | ext4_ext_store_pblock(ex: ex2, pb: ext4_ext_pblock(ex: &tmp_ex)); |
5749 | ex1->ee_len = cpu_to_le16(e2_len); |
5750 | ex2->ee_len = cpu_to_le16(e1_len); |
5751 | if (unwritten) |
5752 | ext4_ext_mark_unwritten(ext: ex2); |
5753 | if (ext4_ext_is_unwritten(ext: &tmp_ex)) |
5754 | ext4_ext_mark_unwritten(ext: ex1); |
5755 | |
5756 | ext4_ext_try_to_merge(handle, inode: inode2, path: path2, ex: ex2); |
5757 | ext4_ext_try_to_merge(handle, inode: inode1, path: path1, ex: ex1); |
5758 | *erp = ext4_ext_dirty(handle, inode2, path2 + |
5759 | path2->p_depth); |
5760 | if (unlikely(*erp)) |
5761 | goto finish; |
5762 | *erp = ext4_ext_dirty(handle, inode1, path1 + |
5763 | path1->p_depth); |
		/*
		 * Looks scary, huh? The second inode already points to the
		 * new blocks, and it was successfully dirtied. But luckily
		 * an error here can only be caused by a journal error, in
		 * which case the full transaction will be aborted anyway.
		 */
5770 | if (unlikely(*erp)) |
5771 | goto finish; |
5772 | lblk1 += len; |
5773 | lblk2 += len; |
5774 | replaced_count += len; |
5775 | count -= len; |
5776 | |
5777 | repeat: |
5778 | ext4_free_ext_path(path: path1); |
5779 | ext4_free_ext_path(path: path2); |
5780 | path1 = path2 = NULL; |
5781 | } |
5782 | return replaced_count; |
5783 | } |
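
/*
 * This is the workhorse behind online defragmentation: the
 * EXT4_IOC_MOVE_EXT ioctl path (see ext4_move_extents() in
 * fs/ext4/move_extent.c) does the page-cache copying and locking
 * described above and then calls this function. A hedged sketch of the
 * ioctl as e4defrag uses it (struct move_extent per fs/ext4/ext4.h):
 *
 *	struct move_extent me = {
 *		.donor_fd    = donor_fd,	// preallocated donor file
 *		.orig_start  = 0,		// first logical block to move
 *		.donor_start = 0,
 *		.len         = blocks_to_move,
 *	};
 *	ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me);
 *	// me.moved_len reports how many blocks were actually swapped
 */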
5784 | |
5785 | /* |
5786 | * ext4_clu_mapped - determine whether any block in a logical cluster has |
5787 | * been mapped to a physical cluster |
5788 | * |
5789 | * @inode - file containing the logical cluster |
5790 | * @lclu - logical cluster of interest |
5791 | * |
5792 | * Returns 1 if any block in the logical cluster is mapped, signifying |
5793 | * that a physical cluster has been allocated for it. Otherwise, |
5794 | * returns 0. Can also return negative error codes. Derived from |
5795 | * ext4_ext_map_blocks(). |
5796 | */ |
5797 | int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) |
5798 | { |
5799 | struct ext4_sb_info *sbi = EXT4_SB(sb: inode->i_sb); |
5800 | struct ext4_ext_path *path; |
5801 | int depth, mapped = 0, err = 0; |
5802 | struct ext4_extent *extent; |
5803 | ext4_lblk_t first_lblk, first_lclu, last_lclu; |
5804 | |
5805 | /* |
5806 | * if data can be stored inline, the logical cluster isn't |
5807 | * mapped - no physical clusters have been allocated, and the |
5808 | * file has no extents |
5809 | */ |
5810 | if (ext4_test_inode_state(inode, bit: EXT4_STATE_MAY_INLINE_DATA) || |
5811 | ext4_has_inline_data(inode)) |
5812 | return 0; |
5813 | |
5814 | /* search for the extent closest to the first block in the cluster */ |
5815 | path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, flags: 0); |
5816 | if (IS_ERR(ptr: path)) { |
5817 | err = PTR_ERR(ptr: path); |
5818 | path = NULL; |
5819 | goto out; |
5820 | } |
5821 | |
5822 | depth = ext_depth(inode); |
5823 | |
5824 | /* |
5825 | * A consistent leaf must not be empty. This situation is possible, |
5826 | * though, _during_ tree modification, and it's why an assert can't |
5827 | * be put in ext4_find_extent(). |
5828 | */ |
5829 | if (unlikely(path[depth].p_ext == NULL && depth != 0)) { |
5830 | EXT4_ERROR_INODE(inode, |
				 "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5832 | (unsigned long) EXT4_C2B(sbi, lclu), |
5833 | depth, path[depth].p_block); |
5834 | err = -EFSCORRUPTED; |
5835 | goto out; |
5836 | } |
5837 | |
5838 | extent = path[depth].p_ext; |
5839 | |
5840 | /* can't be mapped if the extent tree is empty */ |
5841 | if (extent == NULL) |
5842 | goto out; |
5843 | |
5844 | first_lblk = le32_to_cpu(extent->ee_block); |
5845 | first_lclu = EXT4_B2C(sbi, first_lblk); |
5846 | |
5847 | /* |
5848 | * Three possible outcomes at this point - found extent spanning |
5849 | * the target cluster, to the left of the target cluster, or to the |
5850 | * right of the target cluster. The first two cases are handled here. |
5851 | * The last case indicates the target cluster is not mapped. |
5852 | */ |
5853 | if (lclu >= first_lclu) { |
5854 | last_lclu = EXT4_B2C(sbi, first_lblk + |
5855 | ext4_ext_get_actual_len(extent) - 1); |
5856 | if (lclu <= last_lclu) { |
5857 | mapped = 1; |
5858 | } else { |
5859 | first_lblk = ext4_ext_next_allocated_block(path); |
5860 | first_lclu = EXT4_B2C(sbi, first_lblk); |
5861 | if (lclu == first_lclu) |
5862 | mapped = 1; |
5863 | } |
5864 | } |
5865 | |
5866 | out: |
5867 | ext4_free_ext_path(path); |
5868 | |
5869 | return err ? err : mapped; |
5870 | } |
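
/*
 * Worked example of the cluster checks above (a sketch, assuming a
 * bigalloc filesystem with 16 blocks per cluster): for lclu = 3 the
 * lookup targets logical block EXT4_C2B(sbi, 3) = 48. An extent covering
 * blocks [40, 55] gives first_lclu = EXT4_B2C(sbi, 40) = 2 and
 * last_lclu = EXT4_B2C(sbi, 55) = 3, so cluster 3 is reported as mapped.
 */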
5871 | |
/*
 * Updates the physical block address and unwritten status of the extent
 * starting at logical block @start and of length @len. If such an extent
 * doesn't exist, this function splits the extent tree appropriately to
 * create one. This function is called in the fast commit replay path.
 * Returns 0 on success and an error code on failure.
 */
5879 | int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, |
5880 | int len, int unwritten, ext4_fsblk_t pblk) |
5881 | { |
5882 | struct ext4_ext_path *path = NULL, *ppath; |
5883 | struct ext4_extent *ex; |
5884 | int ret; |
5885 | |
5886 | path = ext4_find_extent(inode, block: start, NULL, flags: 0); |
5887 | if (IS_ERR(ptr: path)) |
5888 | return PTR_ERR(ptr: path); |
5889 | ex = path[path->p_depth].p_ext; |
5890 | if (!ex) { |
5891 | ret = -EFSCORRUPTED; |
5892 | goto out; |
5893 | } |
5894 | |
5895 | if (le32_to_cpu(ex->ee_block) != start || |
5896 | ext4_ext_get_actual_len(ext: ex) != len) { |
5897 | /* We need to split this extent to match our extent first */ |
5898 | ppath = path; |
5899 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
5900 | ret = ext4_force_split_extent_at(NULL, inode, ppath: &ppath, lblk: start, nofail: 1); |
5901 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5902 | if (ret) |
5903 | goto out; |
5904 | kfree(objp: path); |
5905 | path = ext4_find_extent(inode, block: start, NULL, flags: 0); |
5906 | if (IS_ERR(ptr: path)) |
			return PTR_ERR(ptr: path);
5908 | ppath = path; |
5909 | ex = path[path->p_depth].p_ext; |
5910 | WARN_ON(le32_to_cpu(ex->ee_block) != start); |
5911 | if (ext4_ext_get_actual_len(ext: ex) != len) { |
5912 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
5913 | ret = ext4_force_split_extent_at(NULL, inode, ppath: &ppath, |
5914 | lblk: start + len, nofail: 1); |
5915 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5916 | if (ret) |
5917 | goto out; |
5918 | kfree(objp: path); |
5919 | path = ext4_find_extent(inode, block: start, NULL, flags: 0); |
5920 | if (IS_ERR(ptr: path)) |
				return PTR_ERR(ptr: path);
5922 | ex = path[path->p_depth].p_ext; |
5923 | } |
5924 | } |
5925 | if (unwritten) |
5926 | ext4_ext_mark_unwritten(ext: ex); |
5927 | else |
5928 | ext4_ext_mark_initialized(ext: ex); |
5929 | ext4_ext_store_pblock(ex, pb: pblk); |
5930 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
5931 | ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); |
5932 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5933 | out: |
5934 | ext4_free_ext_path(path); |
5935 | ext4_mark_inode_dirty(NULL, inode); |
5936 | return ret; |
5937 | } |
5938 | |
5939 | /* Try to shrink the extent tree */ |
5940 | void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) |
5941 | { |
5942 | struct ext4_ext_path *path = NULL; |
5943 | struct ext4_extent *ex; |
5944 | ext4_lblk_t old_cur, cur = 0; |
5945 | |
5946 | while (cur < end) { |
5947 | path = ext4_find_extent(inode, block: cur, NULL, flags: 0); |
5948 | if (IS_ERR(ptr: path)) |
5949 | return; |
5950 | ex = path[path->p_depth].p_ext; |
5951 | if (!ex) { |
5952 | ext4_free_ext_path(path); |
5953 | ext4_mark_inode_dirty(NULL, inode); |
5954 | return; |
5955 | } |
5956 | old_cur = cur; |
5957 | cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ext: ex); |
5958 | if (cur <= old_cur) |
5959 | cur = old_cur + 1; |
5960 | ext4_ext_try_to_merge(NULL, inode, path, ex); |
5961 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
5962 | ext4_ext_dirty(NULL, inode, &path[path->p_depth]); |
5963 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
5964 | ext4_mark_inode_dirty(NULL, inode); |
5965 | ext4_free_ext_path(path); |
5966 | } |
5967 | } |
5968 | |
5969 | /* Check if *cur is a hole and if it is, skip it */ |
5970 | static int skip_hole(struct inode *inode, ext4_lblk_t *cur) |
5971 | { |
5972 | int ret; |
5973 | struct ext4_map_blocks map; |
5974 | |
5975 | map.m_lblk = *cur; |
5976 | map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; |
5977 | |
5978 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
5979 | if (ret < 0) |
5980 | return ret; |
5981 | if (ret != 0) |
5982 | return 0; |
5983 | *cur = *cur + map.m_len; |
5984 | return 0; |
5985 | } |
5986 | |
5987 | /* Count number of blocks used by this inode and update i_blocks */ |
5988 | int ext4_ext_replay_set_iblocks(struct inode *inode) |
5989 | { |
5990 | struct ext4_ext_path *path = NULL, *path2 = NULL; |
5991 | struct ext4_extent *ex; |
5992 | ext4_lblk_t cur = 0, end; |
5993 | int numblks = 0, i, ret = 0; |
5994 | ext4_fsblk_t cmp1, cmp2; |
5995 | struct ext4_map_blocks map; |
5996 | |
	/* Determine the size of the file first */
5998 | path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, |
5999 | EXT4_EX_NOCACHE); |
6000 | if (IS_ERR(ptr: path)) |
6001 | return PTR_ERR(ptr: path); |
6002 | ex = path[path->p_depth].p_ext; |
6003 | if (!ex) { |
6004 | ext4_free_ext_path(path); |
6005 | goto out; |
6006 | } |
6007 | end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ext: ex); |
6008 | ext4_free_ext_path(path); |
6009 | |
6010 | /* Count the number of data blocks */ |
6011 | cur = 0; |
6012 | while (cur < end) { |
6013 | map.m_lblk = cur; |
6014 | map.m_len = end - cur; |
6015 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
6016 | if (ret < 0) |
6017 | break; |
6018 | if (ret > 0) |
6019 | numblks += ret; |
6020 | cur = cur + map.m_len; |
6021 | } |
6022 | |
	/*
	 * Count the number of extent tree blocks. We do it by looking up
	 * two successive extents and determining the difference between
	 * their paths. When the path differs for two successive extents,
	 * we compare the blocks in the path at each level and increment
	 * iblocks by the total number of differences found.
	 */
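	/*
	 * For instance (a sketch): two successive extents in the same leaf
	 * share every buffer in their paths and contribute nothing, while
	 * extents in different leaves under the same index node differ
	 * only at the leaf level and add one block to the count.
	 */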
6030 | cur = 0; |
6031 | ret = skip_hole(inode, cur: &cur); |
6032 | if (ret < 0) |
6033 | goto out; |
6034 | path = ext4_find_extent(inode, block: cur, NULL, flags: 0); |
6035 | if (IS_ERR(ptr: path)) |
6036 | goto out; |
6037 | numblks += path->p_depth; |
6038 | ext4_free_ext_path(path); |
6039 | while (cur < end) { |
6040 | path = ext4_find_extent(inode, block: cur, NULL, flags: 0); |
6041 | if (IS_ERR(ptr: path)) |
6042 | break; |
6043 | ex = path[path->p_depth].p_ext; |
6044 | if (!ex) { |
6045 | ext4_free_ext_path(path); |
6046 | return 0; |
6047 | } |
6048 | cur = max(cur + 1, le32_to_cpu(ex->ee_block) + |
6049 | ext4_ext_get_actual_len(ex)); |
6050 | ret = skip_hole(inode, cur: &cur); |
6051 | if (ret < 0) { |
6052 | ext4_free_ext_path(path); |
6053 | break; |
6054 | } |
6055 | path2 = ext4_find_extent(inode, block: cur, NULL, flags: 0); |
6056 | if (IS_ERR(ptr: path2)) { |
6057 | ext4_free_ext_path(path); |
6058 | break; |
6059 | } |
6060 | for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { |
6061 | cmp1 = cmp2 = 0; |
6062 | if (i <= path->p_depth) |
6063 | cmp1 = path[i].p_bh ? |
6064 | path[i].p_bh->b_blocknr : 0; |
6065 | if (i <= path2->p_depth) |
6066 | cmp2 = path2[i].p_bh ? |
6067 | path2[i].p_bh->b_blocknr : 0; |
6068 | if (cmp1 != cmp2 && cmp2 != 0) |
6069 | numblks++; |
6070 | } |
6071 | ext4_free_ext_path(path); |
6072 | ext4_free_ext_path(path: path2); |
6073 | } |
6074 | |
6075 | out: |
6076 | inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); |
6077 | ext4_mark_inode_dirty(NULL, inode); |
6078 | return 0; |
6079 | } |
6080 | |
6081 | int ext4_ext_clear_bb(struct inode *inode) |
6082 | { |
6083 | struct ext4_ext_path *path = NULL; |
6084 | struct ext4_extent *ex; |
6085 | ext4_lblk_t cur = 0, end; |
6086 | int j, ret = 0; |
6087 | struct ext4_map_blocks map; |
6088 | |
6089 | if (ext4_test_inode_flag(inode, bit: EXT4_INODE_INLINE_DATA)) |
6090 | return 0; |
6091 | |
	/* Determine the size of the file first */
6093 | path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, |
6094 | EXT4_EX_NOCACHE); |
6095 | if (IS_ERR(ptr: path)) |
6096 | return PTR_ERR(ptr: path); |
6097 | ex = path[path->p_depth].p_ext; |
6098 | if (!ex) { |
6099 | ext4_free_ext_path(path); |
6100 | return 0; |
6101 | } |
6102 | end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ext: ex); |
6103 | ext4_free_ext_path(path); |
6104 | |
6105 | cur = 0; |
6106 | while (cur < end) { |
6107 | map.m_lblk = cur; |
6108 | map.m_len = end - cur; |
6109 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
6110 | if (ret < 0) |
6111 | break; |
6112 | if (ret > 0) { |
6113 | path = ext4_find_extent(inode, block: map.m_lblk, NULL, flags: 0); |
6114 | if (!IS_ERR_OR_NULL(ptr: path)) { |
6115 | for (j = 0; j < path->p_depth; j++) { |
6117 | ext4_mb_mark_bb(sb: inode->i_sb, |
6118 | block: path[j].p_block, len: 1, state: false); |
6119 | ext4_fc_record_regions(sb: inode->i_sb, ino: inode->i_ino, |
6120 | lblk: 0, pblk: path[j].p_block, len: 1, replay: 1); |
6121 | } |
6122 | ext4_free_ext_path(path); |
6123 | } |
6124 | ext4_mb_mark_bb(sb: inode->i_sb, block: map.m_pblk, len: map.m_len, state: false); |
6125 | ext4_fc_record_regions(sb: inode->i_sb, ino: inode->i_ino, |
6126 | lblk: map.m_lblk, pblk: map.m_pblk, len: map.m_len, replay: 1); |
6127 | } |
6128 | cur = cur + map.m_len; |
6129 | } |
6130 | |
6131 | return 0; |
6132 | } |
6133 | |