// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (C) 2016-2023 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "internal.h"
#include "trace.h"

#include "../internal.h"

/*
 * Structure allocated for each folio to track per-block uptodate, dirty state
 * and I/O completions.
 */
struct iomap_folio_state {
	spinlock_t		state_lock;
	unsigned int		read_bytes_pending;
	atomic_t		write_bytes_pending;

	/*
	 * Each block has two bits in this bitmap:
	 * Bits [0..blocks_per_folio) hold the uptodate status.
	 * Bits [b_p_f...(2*b_p_f)) hold the dirty status.
	 */
	unsigned long state[];
};

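/*
 * Illustrative example (not used by the code): with a 16k folio and 4k
 * filesystem blocks, i_blocks_per_folio() is 4, so ifs->state needs 8 bits:
 *
 *	bits 0..3	uptodate state of blocks 0..3
 *	bits 4..7	dirty state of blocks 0..3
 *
 * ifs_alloc() below sizes the array as BITS_TO_LONGS(2 * nr_blocks) longs,
 * and ifs_set_range_dirty()/ifs_clear_range_dirty() simply offset into the
 * second half of the bitmap by i_blocks_per_folio() blocks.
 */
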
static inline bool ifs_is_fully_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs)
{
	struct inode *inode = folio->mapping->host;

	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
}

static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
		unsigned int block)
{
	return test_bit(block, ifs->state);
}

static bool ifs_set_range_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	unsigned int first_blk = off >> inode->i_blkbits;
	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
	unsigned int nr_blks = last_blk - first_blk + 1;

	bitmap_set(ifs->state, first_blk, nr_blks);
	return ifs_is_fully_uptodate(folio, ifs);
}

static void iomap_set_range_uptodate(struct folio *folio, size_t off,
		size_t len)
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned long flags;
	bool uptodate = true;

	if (ifs) {
		spin_lock_irqsave(&ifs->state_lock, flags);
		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
		spin_unlock_irqrestore(&ifs->state_lock, flags);
	}

	if (uptodate)
		folio_mark_uptodate(folio);
}

static inline bool ifs_block_is_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, int block)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);

	return test_bit(block + blks_per_folio, ifs->state);
}

static unsigned ifs_find_dirty_range(struct folio *folio,
		struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
{
	struct inode *inode = folio->mapping->host;
	unsigned start_blk =
		offset_in_folio(folio, *range_start) >> inode->i_blkbits;
	unsigned end_blk = min_not_zero(
		offset_in_folio(folio, range_end) >> inode->i_blkbits,
		i_blocks_per_folio(inode, folio));
	unsigned nblks = 1;

	while (!ifs_block_is_dirty(folio, ifs, start_blk))
		if (++start_blk == end_blk)
			return 0;

	while (start_blk + nblks < end_blk) {
		if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
			break;
		nblks++;
	}

	*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
	return nblks << inode->i_blkbits;
}

static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
		u64 range_end)
{
	struct iomap_folio_state *ifs = folio->private;

	if (*range_start >= range_end)
		return 0;

	if (ifs)
		return ifs_find_dirty_range(folio, ifs, range_start, range_end);
	return range_end - *range_start;
}

static void ifs_clear_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
	unsigned int first_blk = (off >> inode->i_blkbits);
	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
	unsigned int nr_blks = last_blk - first_blk + 1;
	unsigned long flags;

	spin_lock_irqsave(&ifs->state_lock, flags);
	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
	spin_unlock_irqrestore(&ifs->state_lock, flags);
}

static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (ifs)
		ifs_clear_range_dirty(folio, ifs, off, len);
}

static void ifs_set_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
	unsigned int first_blk = (off >> inode->i_blkbits);
	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
	unsigned int nr_blks = last_blk - first_blk + 1;
	unsigned long flags;

	spin_lock_irqsave(&ifs->state_lock, flags);
	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
	spin_unlock_irqrestore(&ifs->state_lock, flags);
}

static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (ifs)
		ifs_set_range_dirty(folio, ifs, off, len);
}

static struct iomap_folio_state *ifs_alloc(struct inode *inode,
		struct folio *folio, unsigned int flags)
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
	gfp_t gfp;

	if (ifs || nr_blocks <= 1)
		return ifs;

	if (flags & IOMAP_NOWAIT)
		gfp = GFP_NOWAIT;
	else
		gfp = GFP_NOFS | __GFP_NOFAIL;

	/*
	 * ifs->state tracks two sets of state flags when the
	 * filesystem block size is smaller than the folio size.
	 * The first state tracks per-block uptodate and the
	 * second tracks per-block dirty state.
	 */
	ifs = kzalloc(struct_size(ifs, state,
			BITS_TO_LONGS(2 * nr_blocks)), gfp);
	if (!ifs)
		return ifs;

	spin_lock_init(&ifs->state_lock);
	if (folio_test_uptodate(folio))
		bitmap_set(ifs->state, 0, nr_blocks);
	if (folio_test_dirty(folio))
		bitmap_set(ifs->state, nr_blocks, nr_blocks);
	folio_attach_private(folio, ifs);

	return ifs;
}

static void ifs_free(struct folio *folio)
{
	struct iomap_folio_state *ifs = folio_detach_private(folio);

	if (!ifs)
		return;
	WARN_ON_ONCE(ifs->read_bytes_pending != 0);
	WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
	WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
			folio_test_uptodate(folio));
	kfree(ifs);
}

/*
 * Calculate the range inside the folio that we actually need to read.
 */
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
		loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{
	struct iomap_folio_state *ifs = folio->private;
	loff_t orig_pos = *pos;
	loff_t isize = i_size_read(inode);
	unsigned block_bits = inode->i_blkbits;
	unsigned block_size = (1 << block_bits);
	size_t poff = offset_in_folio(folio, *pos);
	size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
	size_t orig_plen = plen;
	unsigned first = poff >> block_bits;
	unsigned last = (poff + plen - 1) >> block_bits;

	/*
	 * If the block size is smaller than the page size, we need to check the
	 * per-block uptodate status and adjust the offset and length if needed
	 * to avoid reading in already uptodate ranges.
	 */
	if (ifs) {
		unsigned int i;

		/* move forward for each leading block marked uptodate */
		for (i = first; i <= last; i++) {
			if (!ifs_block_is_uptodate(ifs, i))
				break;
			*pos += block_size;
			poff += block_size;
			plen -= block_size;
			first++;
		}

		/* truncate len if we find any trailing uptodate block(s) */
		while (++i <= last) {
			if (ifs_block_is_uptodate(ifs, i)) {
				plen -= (last - i + 1) * block_size;
				last = i - 1;
				break;
			}
		}
	}

	/*
	 * If the extent spans the block that contains the i_size, we need to
	 * handle both halves separately so that we properly zero data in the
	 * page cache for blocks that are entirely outside of i_size.
	 */
	if (orig_pos <= isize && orig_pos + orig_plen > isize) {
		unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;

		if (first <= end && last > end)
			plen -= (last - end) * block_size;
	}

	*offp = poff;
	*lenp = plen;
}

static void iomap_finish_folio_read(struct folio *folio, size_t off,
		size_t len, int error)
{
	struct iomap_folio_state *ifs = folio->private;
	bool uptodate = !error;
	bool finished = true;

	if (ifs) {
		unsigned long flags;

		spin_lock_irqsave(&ifs->state_lock, flags);
		if (!error)
			uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
		ifs->read_bytes_pending -= len;
		finished = !ifs->read_bytes_pending;
		spin_unlock_irqrestore(&ifs->state_lock, flags);
	}

	if (finished)
		folio_end_read(folio, uptodate);
}

static void iomap_read_end_io(struct bio *bio)
{
	int error = blk_status_to_errno(bio->bi_status);
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio)
		iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
	bio_put(bio);
}

struct iomap_readpage_ctx {
	struct folio		*cur_folio;
	bool			cur_folio_in_bio;
	struct bio		*bio;
	struct readahead_control *rac;
};

/**
 * iomap_read_inline_data - copy inline data into the page cache
 * @iter: iteration structure
 * @folio: folio to copy to
 *
 * Copy the inline data in @iter into @folio and zero out the rest of the folio.
 * Only a single IOMAP_INLINE extent is allowed at the end of each file.
 * Returns zero for success to complete the read, or the usual negative errno.
 */
static int iomap_read_inline_data(const struct iomap_iter *iter,
		struct folio *folio)
{
	const struct iomap *iomap = iomap_iter_srcmap(iter);
	size_t size = i_size_read(iter->inode) - iomap->offset;
	size_t offset = offset_in_folio(folio, iomap->offset);

	if (folio_test_uptodate(folio))
		return 0;

	if (WARN_ON_ONCE(size > iomap->length))
		return -EIO;
	if (offset > 0)
		ifs_alloc(iter->inode, folio, iter->flags);

	folio_fill_tail(folio, offset, iomap->inline_data, size);
	iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
	return 0;
}

static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
		loff_t pos)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);

	return srcmap->type != IOMAP_MAPPED ||
		(srcmap->flags & IOMAP_F_NEW) ||
		pos >= i_size_read(iter->inode);
}

static int iomap_readpage_iter(struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	const struct iomap *iomap = &iter->iomap;
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	struct folio *folio = ctx->cur_folio;
	struct iomap_folio_state *ifs;
	size_t poff, plen;
	sector_t sector;
	int ret;

	if (iomap->type == IOMAP_INLINE) {
		ret = iomap_read_inline_data(iter, folio);
		if (ret)
			return ret;
		return iomap_iter_advance(iter, &length);
	}

	/* zero post-eof blocks as the page may be mapped */
	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
	if (plen == 0)
		goto done;

	if (iomap_block_needs_zeroing(iter, pos)) {
		folio_zero_range(folio, poff, plen);
		iomap_set_range_uptodate(folio, poff, plen);
		goto done;
	}

	ctx->cur_folio_in_bio = true;
	if (ifs) {
		spin_lock_irq(&ifs->state_lock);
		ifs->read_bytes_pending += plen;
		spin_unlock_irq(&ifs->state_lock);
	}

	sector = iomap_sector(iomap, pos);
	if (!ctx->bio ||
	    bio_end_sector(ctx->bio) != sector ||
	    !bio_add_folio(ctx->bio, folio, plen, poff)) {
		gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
		gfp_t orig_gfp = gfp;
		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);

		if (ctx->bio)
			submit_bio(ctx->bio);

		if (ctx->rac) /* same as readahead_gfp_mask */
			gfp |= __GFP_NORETRY | __GFP_NOWARN;
		ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
				     REQ_OP_READ, gfp);
		/*
		 * If the bio_alloc fails, try it again for a single page to
		 * avoid having to deal with partial page reads.  This emulates
		 * what do_mpage_read_folio does.
		 */
		if (!ctx->bio) {
			ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
					     orig_gfp);
		}
		if (ctx->rac)
			ctx->bio->bi_opf |= REQ_RAHEAD;
		ctx->bio->bi_iter.bi_sector = sector;
		ctx->bio->bi_end_io = iomap_read_end_io;
		bio_add_folio_nofail(ctx->bio, folio, plen, poff);
	}

done:
	/*
	 * Move the caller beyond our range so that it keeps making progress.
	 * For that, we have to include any leading non-uptodate ranges, but
	 * we can skip trailing ones as they will be handled in the next
	 * iteration.
	 */
	length = pos - iter->pos + plen;
	return iomap_iter_advance(iter, &length);
}

static int iomap_read_folio_iter(struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	int ret;

	while (iomap_length(iter)) {
		ret = iomap_readpage_iter(iter, ctx);
		if (ret)
			return ret;
	}

	return 0;
}

int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= folio->mapping->host,
		.pos		= folio_pos(folio),
		.len		= folio_size(folio),
	};
	struct iomap_readpage_ctx ctx = {
		.cur_folio	= folio,
	};
	int ret;

	trace_iomap_readpage(iter.inode, 1);

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = iomap_read_folio_iter(&iter, &ctx);

	if (ctx.bio) {
		submit_bio(ctx.bio);
		WARN_ON_ONCE(!ctx.cur_folio_in_bio);
	} else {
		WARN_ON_ONCE(ctx.cur_folio_in_bio);
		folio_unlock(folio);
	}

	/*
	 * Just like mpage_readahead and block_read_full_folio, we always
	 * return 0 and just set the folio error flag on errors.  This
	 * should be cleaned up throughout the stack eventually.
	 */
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_read_folio);

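/*
 * Example usage (illustrative only, not part of this file): a filesystem
 * typically wires iomap_read_folio() straight into its
 * address_space_operations, passing its own iomap_ops.  The "myfs_*" names
 * below are placeholders:
 *
 *	static int myfs_read_folio(struct file *file, struct folio *folio)
 *	{
 *		return iomap_read_folio(folio, &myfs_iomap_ops);
 *	}
 *
 *	const struct address_space_operations myfs_aops = {
 *		.read_folio	= myfs_read_folio,
 *		...
 *	};
 */
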
static int iomap_readahead_iter(struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	int ret;

	while (iomap_length(iter)) {
		if (ctx->cur_folio &&
		    offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
			if (!ctx->cur_folio_in_bio)
				folio_unlock(ctx->cur_folio);
			ctx->cur_folio = NULL;
		}
		if (!ctx->cur_folio) {
			ctx->cur_folio = readahead_folio(ctx->rac);
			ctx->cur_folio_in_bio = false;
		}
		ret = iomap_readpage_iter(iter, ctx);
		if (ret)
			return ret;
	}

	return 0;
}

/**
 * iomap_readahead - Attempt to read pages from a file.
 * @rac: Describes the pages to be read.
 * @ops: The operations vector for the filesystem.
 *
 * This function is for filesystems to call to implement their readahead
 * address_space operation.
 *
 * Context: The @ops callbacks may submit I/O (eg to read the addresses of
 * blocks from disc), and may wait for it.  The caller may be trying to
 * access a different page, and so sleeping excessively should be avoided.
 * It may allocate memory, but should avoid costly allocations.  This
 * function is called with memalloc_nofs set, so allocations will not cause
 * the filesystem to be reentered.
 */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= rac->mapping->host,
		.pos	= readahead_pos(rac),
		.len	= readahead_length(rac),
	};
	struct iomap_readpage_ctx ctx = {
		.rac	= rac,
	};

	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));

	while (iomap_iter(&iter, ops) > 0)
		iter.status = iomap_readahead_iter(&iter, &ctx);

	if (ctx.bio)
		submit_bio(ctx.bio);
	if (ctx.cur_folio) {
		if (!ctx.cur_folio_in_bio)
			folio_unlock(ctx.cur_folio);
	}
}
EXPORT_SYMBOL_GPL(iomap_readahead);

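/*
 * Example usage (illustrative only): the readahead side is wired up the same
 * way as ->read_folio above, again with placeholder "myfs_*" names:
 *
 *	static void myfs_readahead(struct readahead_control *rac)
 *	{
 *		iomap_readahead(rac, &myfs_iomap_ops);
 *	}
 */
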
/*
 * iomap_is_partially_uptodate checks whether blocks within a folio are
 * uptodate or not.
 *
 * Returns true if all blocks which correspond to the specified part
 * of the folio are uptodate.
 */
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	unsigned first, last, i;

	if (!ifs)
		return false;

	/* Caller's range may extend past the end of this folio */
	count = min(folio_size(folio) - from, count);

	/* First and last blocks in range within folio */
	first = from >> inode->i_blkbits;
	last = (from + count - 1) >> inode->i_blkbits;

	for (i = first; i <= last; i++)
		if (!ifs_block_is_uptodate(ifs, i))
			return false;
	return true;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);

/**
 * iomap_get_folio - get a folio reference for writing
 * @iter: iteration structure
 * @pos: start offset of write
 * @len: Suggested size of folio to create.
 *
 * Returns a locked reference to the folio at @pos, or an error pointer if the
 * folio could not be obtained.
 */
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
	fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;

	if (iter->flags & IOMAP_NOWAIT)
		fgp |= FGP_NOWAIT;
	if (iter->flags & IOMAP_DONTCACHE)
		fgp |= FGP_DONTCACHE;
	fgp |= fgf_set_order(len);

	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
			fgp, mapping_gfp_mask(iter->inode->i_mapping));
}
EXPORT_SYMBOL_GPL(iomap_get_folio);

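/*
 * Example usage (illustrative only): a filesystem that implements the
 * optional ->get_folio hook in its iomap_folio_ops can do its own setup and
 * then fall back to iomap_get_folio() for the page cache lookup, matching
 * the call shape used by __iomap_get_folio() below.  The "myfs_*" names are
 * placeholders and the exact hook prototype is an assumption here:
 *
 *	static struct folio *myfs_get_folio(struct iomap_iter *iter,
 *			loff_t pos, unsigned len)
 *	{
 *		// e.g. attach filesystem-private per-write state first
 *		return iomap_get_folio(iter, pos, len);
 *	}
 */
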
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
			folio_size(folio));

	/*
	 * If the folio is dirty, we refuse to release our metadata because
	 * it may be partially dirty.  Once we track per-block dirty state,
	 * we can release the metadata if every block is dirty.
	 */
	if (folio_test_dirty(folio))
		return false;
	ifs_free(folio);
	return true;
}
EXPORT_SYMBOL_GPL(iomap_release_folio);

void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
	trace_iomap_invalidate_folio(folio->mapping->host,
			folio_pos(folio) + offset, len);

	/*
	 * If we're invalidating the entire folio, clear the dirty state
	 * from it and release it to avoid unnecessary buildup of the LRU.
	 */
	if (offset == 0 && len == folio_size(folio)) {
		WARN_ON_ONCE(folio_test_writeback(folio));
		folio_cancel_dirty(folio);
		ifs_free(folio);
	}
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);

bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct inode *inode = mapping->host;
	size_t len = folio_size(folio);

	ifs_alloc(inode, folio, 0);
	iomap_set_range_dirty(folio, 0, len);
	return filemap_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);

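/*
 * Example usage (illustrative only): the folio management helpers above are
 * normally plugged into the address_space_operations verbatim.  "myfs_aops"
 * is a placeholder name:
 *
 *	const struct address_space_operations myfs_aops = {
 *		.is_partially_uptodate	= iomap_is_partially_uptodate,
 *		.release_folio		= iomap_release_folio,
 *		.invalidate_folio	= iomap_invalidate_folio,
 *		.dirty_folio		= iomap_dirty_folio,
 *		...
 *	};
 */
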
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size),
					 pos + len - 1);
}

static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
		size_t poff, size_t plen, const struct iomap *iomap)
{
	struct bio_vec bvec;
	struct bio bio;

	bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
	bio_add_folio_nofail(&bio, folio, plen, poff);
	return submit_bio_wait(&bio);
}

static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
		struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct iomap_folio_state *ifs;
	loff_t pos = iter->pos;
	loff_t block_size = i_blocksize(iter->inode);
	loff_t block_start = round_down(pos, block_size);
	loff_t block_end = round_up(pos + len, block_size);
	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
	size_t from = offset_in_folio(folio, pos), to = from + len;
	size_t poff, plen;

	/*
	 * If the write or zeroing completely overlaps the current folio, then
	 * the entire folio will be dirtied so there is no need for
	 * per-block state tracking structures to be attached to this folio.
	 * For the unshare case, we must read in the ondisk contents because we
	 * are not changing pagecache contents.
	 */
	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
	    pos + len >= folio_pos(folio) + folio_size(folio))
		return 0;

	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
		return -EAGAIN;

	if (folio_test_uptodate(folio))
		return 0;

	do {
		iomap_adjust_read_range(iter->inode, folio, &block_start,
				block_end - block_start, &poff, &plen);
		if (plen == 0)
			break;

		if (!(iter->flags & IOMAP_UNSHARE) &&
		    (from <= poff || from >= poff + plen) &&
		    (to <= poff || to >= poff + plen))
			continue;

		if (iomap_block_needs_zeroing(iter, block_start)) {
			if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
				return -EIO;
			folio_zero_segments(folio, poff, from, to, poff + plen);
		} else {
			int status;

			if (iter->flags & IOMAP_NOWAIT)
				return -EAGAIN;

			status = iomap_read_folio_sync(block_start, folio,
					poff, plen, srcmap);
			if (status)
				return status;
		}
		iomap_set_range_uptodate(folio, poff, plen);
	} while ((block_start += plen) < block_end);

	return 0;
}

static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	loff_t pos = iter->pos;

	if (!mapping_large_folio_support(iter->inode->i_mapping))
		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));

	if (folio_ops && folio_ops->get_folio)
		return folio_ops->get_folio(iter, pos, len);
	else
		return iomap_get_folio(iter, pos, len);
}

static void __iomap_put_folio(struct iomap_iter *iter, size_t ret,
		struct folio *folio)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	loff_t pos = iter->pos;

	if (folio_ops && folio_ops->put_folio) {
		folio_ops->put_folio(iter->inode, pos, ret, folio);
	} else {
		folio_unlock(folio);
		folio_put(folio);
	}
}

/* trim pos and bytes to within a given folio */
static loff_t iomap_trim_folio_range(struct iomap_iter *iter,
		struct folio *folio, size_t *offset, u64 *bytes)
{
	loff_t pos = iter->pos;
	size_t fsize = folio_size(folio);

	WARN_ON_ONCE(pos < folio_pos(folio));
	WARN_ON_ONCE(pos >= folio_pos(folio) + fsize);

	*offset = offset_in_folio(folio, pos);
	*bytes = min(*bytes, fsize - *offset);

	return pos;
}

static int iomap_write_begin_inline(const struct iomap_iter *iter,
		struct folio *folio)
{
	/* needs more work for the tailpacking case; disable for now */
	if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
		return -EIO;
	return iomap_read_inline_data(iter, folio);
}

/*
 * Grab and prepare a folio for write based on iter state. Returns the folio,
 * offset, and length. Callers can optionally pass a max length *plen,
 * otherwise init to zero.
 */
static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
		size_t *poffset, u64 *plen)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
	struct folio *folio;
	int status = 0;

	len = min_not_zero(len, *plen);
	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
	if (srcmap != &iter->iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	folio = __iomap_get_folio(iter, len);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/*
	 * Now we have a locked folio, before we do anything with it we need to
	 * check that the iomap we have cached is not stale. The inode extent
	 * mapping can change due to concurrent IO in flight (e.g.
	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
	 * reclaimed a previously partially written page at this index after IO
	 * completion before this write reaches this file offset) and hence we
	 * could do the wrong thing here (zero a page range incorrectly or fail
	 * to zero) and corrupt data.
	 */
	if (folio_ops && folio_ops->iomap_valid) {
		bool iomap_valid = folio_ops->iomap_valid(iter->inode,
							 &iter->iomap);
		if (!iomap_valid) {
			iter->iomap.flags |= IOMAP_F_STALE;
			status = 0;
			goto out_unlock;
		}
	}

	pos = iomap_trim_folio_range(iter, folio, poffset, &len);

	if (srcmap->type == IOMAP_INLINE)
		status = iomap_write_begin_inline(iter, folio);
	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
	else
		status = __iomap_write_begin(iter, len, folio);

	if (unlikely(status))
		goto out_unlock;

	*foliop = folio;
	*plen = len;
	return 0;

out_unlock:
	__iomap_put_folio(iter, 0, folio);

	return status;
}

static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
		size_t copied, struct folio *folio)
{
	flush_dcache_folio(folio);

	/*
	 * The blocks that were entirely written will now be uptodate, so we
	 * don't have to worry about a read_folio reading them and overwriting a
	 * partial write.  However, if we've encountered a short write and only
	 * partially written into a block, it will not be marked uptodate, so a
	 * read_folio might come in and destroy our partial write.
	 *
	 * Do the simplest thing and just treat any short write to a
	 * non-uptodate page as a zero-length write, and force the caller to
	 * redo the whole thing.
	 */
	if (unlikely(copied < len && !folio_test_uptodate(folio)))
		return false;
	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
	filemap_dirty_folio(inode->i_mapping, folio);
	return true;
}

static void iomap_write_end_inline(const struct iomap_iter *iter,
		struct folio *folio, loff_t pos, size_t copied)
{
	const struct iomap *iomap = &iter->iomap;
	void *addr;

	WARN_ON_ONCE(!folio_test_uptodate(folio));
	BUG_ON(!iomap_inline_data_valid(iomap));

	flush_dcache_folio(folio);
	addr = kmap_local_folio(folio, pos);
	memcpy(iomap_inline_data(iomap, pos), addr, copied);
	kunmap_local(addr);

	mark_inode_dirty(iter->inode);
}

/*
 * Returns true if all copied bytes have been written to the pagecache,
 * otherwise return false.
 */
static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
		struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;

	if (srcmap->type == IOMAP_INLINE) {
		iomap_write_end_inline(iter, folio, pos, copied);
		return true;
	}

	if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
		size_t bh_written;

		bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
					len, copied, folio, NULL);
		WARN_ON_ONCE(bh_written != copied && bh_written != 0);
		return bh_written == copied;
	}

	return __iomap_write_end(iter->inode, pos, len, copied, folio);
}

static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
	ssize_t total_written = 0;
	int status = 0;
	struct address_space *mapping = iter->inode->i_mapping;
	size_t chunk = mapping_max_folio_size(mapping);
	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;

	do {
		struct folio *folio;
		loff_t old_size;
		size_t offset;		/* Offset into folio */
		u64 bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		u64 written;		/* Bytes have been written */
		loff_t pos;

		bytes = iov_iter_count(i);
retry:
		offset = iter->pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		status = balance_dirty_pages_ratelimited_flags(mapping,
							       bdp_flags);
		if (unlikely(status))
			break;

		if (bytes > iomap_length(iter))
			bytes = iomap_length(iter);

		/*
		 * Bring in the user page that we'll copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * For async buffered writes the assumption is that the user
		 * page has already been faulted in. This can be optimized by
		 * faulting the user page.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(iter, &folio, &offset, &bytes);
		if (unlikely(status)) {
			iomap_write_failed(iter->inode, iter->pos, bytes);
			break;
		}
		if (iter->iomap.flags & IOMAP_F_STALE)
			break;

		pos = iter->pos;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		written = iomap_write_end(iter, bytes, copied, folio) ?
			  copied : 0;

		/*
		 * Update the in-memory inode size after copying the data into
		 * the page cache.  It's up to the file system to write the
		 * updated size to disk, preferably after I/O completion so that
		 * no stale data is exposed.  Only once that's done can we
		 * unlock and release the folio.
		 */
		old_size = iter->inode->i_size;
		if (pos + written > old_size) {
			i_size_write(iter->inode, pos + written);
			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
		}
		__iomap_put_folio(iter, written, folio);

		if (old_size < pos)
			pagecache_isize_extended(iter->inode, old_size, pos);

		cond_resched();
		if (unlikely(written == 0)) {
			/*
			 * A short copy made iomap_write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			iomap_write_failed(iter->inode, pos, bytes);
			iov_iter_revert(i, copied);

			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				bytes = copied;
				goto retry;
			}
		} else {
			total_written += written;
			iomap_iter_advance(iter, &written);
		}
	} while (iov_iter_count(i) && iomap_length(iter));

	return total_written ? 0 : status;
}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
		const struct iomap_ops *ops, void *private)
{
	struct iomap_iter iter = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(i),
		.flags		= IOMAP_WRITE,
		.private	= private,
	};
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iter.flags |= IOMAP_NOWAIT;
	if (iocb->ki_flags & IOCB_DONTCACHE)
		iter.flags |= IOMAP_DONTCACHE;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = iomap_write_iter(&iter, i);

	if (unlikely(iter.pos == iocb->ki_pos))
		return ret;
	ret = iter.pos - iocb->ki_pos;
	iocb->ki_pos = iter.pos;
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);

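/*
 * Example usage (illustrative only): a typical ->write_iter implementation
 * takes the inode lock, performs its write checks, and then calls
 * iomap_file_buffered_write() for the buffered path.  The "myfs_*" names are
 * placeholders and error handling is elided:
 *
 *	static ssize_t myfs_buffered_write_iter(struct kiocb *iocb,
 *			struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = iomap_file_buffered_write(iocb, from, &myfs_iomap_ops,
 *				NULL);
 *		inode_unlock(inode);
 *		return ret;
 *	}
 */
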
static void iomap_write_delalloc_ifs_punch(struct inode *inode,
		struct folio *folio, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	unsigned int first_blk, last_blk, i;
	loff_t last_byte;
	u8 blkbits = inode->i_blkbits;
	struct iomap_folio_state *ifs;

	/*
	 * When we have per-block dirty tracking, there can be
	 * blocks within a folio which are marked uptodate
	 * but not dirty. In that case it is necessary to punch
	 * out such blocks to avoid leaking any delalloc blocks.
	 */
	ifs = folio->private;
	if (!ifs)
		return;

	last_byte = min_t(loff_t, end_byte - 1,
			folio_pos(folio) + folio_size(folio) - 1);
	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
	for (i = first_blk; i <= last_blk; i++) {
		if (!ifs_block_is_dirty(folio, ifs, i))
			punch(inode, folio_pos(folio) + (i << blkbits),
				    1 << blkbits, iomap);
	}
}

static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	if (!folio_test_dirty(folio))
		return;

	/* if dirty, punch up to offset */
	if (start_byte > *punch_start_byte) {
		punch(inode, *punch_start_byte, start_byte - *punch_start_byte,
				iomap);
	}

	/* Punch non-dirty blocks within folio */
	iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
			iomap, punch);

	/*
	 * Make sure the next punch start is correctly bound to
	 * the end of this data range, not the end of the folio.
	 */
	*punch_start_byte = min_t(loff_t, end_byte,
				folio_pos(folio) + folio_size(folio));
}

/*
 * Scan the data range passed to us for dirty page cache folios. If we find a
 * dirty folio, punch out the preceding range and update the offset from which
 * the next punch will start from.
 *
 * We can punch out storage reservations under clean pages because they either
 * contain data that has been written back - in which case the delalloc punch
 * over that range is a no-op - or they were instantiated by read faults, in
 * which case they contain zeroes and we can remove the delalloc backing range;
 * any new writes to those pages will do the normal hole filling operation.
 *
 * This makes the logic simple: we only need to keep delalloc extents over the
 * dirty ranges of the page cache.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 */
static void iomap_write_delalloc_scan(struct inode *inode,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	while (start_byte < end_byte) {
		struct folio *folio;

		/* grab locked page */
		folio = filemap_lock_folio(inode->i_mapping,
				start_byte >> PAGE_SHIFT);
		if (IS_ERR(folio)) {
			start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
					PAGE_SIZE;
			continue;
		}

		iomap_write_delalloc_punch(inode, folio, punch_start_byte,
				start_byte, end_byte, iomap, punch);

		/* move offset to start of next folio in range */
		start_byte = folio_pos(folio) + folio_size(folio);
		folio_unlock(folio);
		folio_put(folio);
	}
}

/*
 * When a short write occurs, the filesystem might need to use ->iomap_end
 * to remove space reservations created in ->iomap_begin.
 *
 * For filesystems that use delayed allocation, there can be dirty pages over
 * the delalloc extent outside the range of a short write but still within the
 * delalloc extent allocated for this iomap if the write raced with page
 * faults.
 *
 * Punch out all the delalloc blocks in the range given except for those that
 * have dirty data still pending in the page cache - those are going to be
 * written and so must still retain the delalloc backing for writeback.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 *
 * As we are scanning the page cache for data, we don't need to reimplement the
 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
 * start and end of data ranges correctly even for sub-folio block sizes.  This
 * byte range based iteration is especially convenient because it means we
 * don't have to care about variable size folios, nor where the start or end of
 * the data range lies within a folio, whether they lie within the same folio
 * or even if there are multiple discontiguous data ranges within the folio.
 *
 * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
 * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
 * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
 * date. A write page fault can then mark it dirty. If we then fail a write()
 * beyond EOF into that up to date cached range, we allocate a delalloc block
 * beyond EOF and then have to punch it out. Because the range is up to date,
 * mapping_seek_hole_data() will return it, and we will skip the punch because
 * the folio is dirty. This is incorrect - we always need to punch out delalloc
 * beyond EOF in this case as writeback will never write back and convert that
 * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
 * resulting in always punching out the range from the EOF to the end of the
 * range the iomap spans.
 *
 * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
 * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
 * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
 * returns the end of the data range (data_end). Using closed intervals would
 * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
 * the code to subtle off-by-one bugs....
 */
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
		loff_t end_byte, unsigned flags, struct iomap *iomap,
		iomap_punch_t punch)
{
	loff_t punch_start_byte = start_byte;
	loff_t scan_end_byte = min(i_size_read(inode), end_byte);

	/*
	 * The caller must hold invalidate_lock to avoid races with page faults
	 * re-instantiating folios and dirtying them via ->page_mkwrite whilst
	 * we walk the cache and perform delalloc extent removal.  Failing to do
	 * this can leave dirty pages with no space reservation in the cache.
	 */
	lockdep_assert_held_write(&inode->i_mapping->invalidate_lock);

	while (start_byte < scan_end_byte) {
		loff_t data_end;

		start_byte = mapping_seek_hole_data(inode->i_mapping,
				start_byte, scan_end_byte, SEEK_DATA);
		/*
		 * If there is no more data to scan, all that is left is to
		 * punch out the remaining range.
		 *
		 * Note that mapping_seek_hole_data is only supposed to return
		 * either an offset or -ENXIO, so WARN on any other error as
		 * that would be an API change without updating the callers.
		 */
		if (start_byte == -ENXIO || start_byte == scan_end_byte)
			break;
		if (WARN_ON_ONCE(start_byte < 0))
			return;
		WARN_ON_ONCE(start_byte < punch_start_byte);
		WARN_ON_ONCE(start_byte > scan_end_byte);

		/*
		 * We find the end of this contiguous cached data range by
		 * seeking from start_byte to the beginning of the next hole.
		 */
		data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
				scan_end_byte, SEEK_HOLE);
		if (WARN_ON_ONCE(data_end < 0))
			return;

		/*
		 * If we race with post-direct I/O invalidation of the page cache,
		 * there might be no data left at start_byte.
		 */
		if (data_end == start_byte)
			continue;

		WARN_ON_ONCE(data_end < start_byte);
		WARN_ON_ONCE(data_end > scan_end_byte);

		iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte,
				data_end, iomap, punch);

		/* The next data search starts at the end of this one. */
		start_byte = data_end;
	}

	if (punch_start_byte < end_byte)
		punch(inode, punch_start_byte, end_byte - punch_start_byte,
				iomap);
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);

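/*
 * Example punch callback (illustrative only): the iomap_punch_t callback
 * passed to iomap_write_delalloc_release() is invoked as
 * punch(inode, offset, length, iomap) and must only remove delalloc extents
 * within that byte range.  A filesystem might implement it roughly as below;
 * "myfs_punch_delalloc" and the helper it calls are placeholder names:
 *
 *	static void myfs_punch_delalloc(struct inode *inode, loff_t offset,
 *			loff_t length, struct iomap *iomap)
 *	{
 *		myfs_remove_delalloc_blocks(inode, offset, length);
 *	}
 */
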
static int iomap_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	u64 bytes = iomap_length(iter);
	int status;

	if (!iomap_want_unshare_iter(iter))
		return iomap_iter_advance(iter, &bytes);

	do {
		struct folio *folio;
		size_t offset;
		bool ret;

		bytes = min_t(u64, SIZE_MAX, bytes);
		status = iomap_write_begin(iter, &folio, &offset, &bytes);
		if (unlikely(status))
			return status;
		if (iomap->flags & IOMAP_F_STALE)
			break;

		ret = iomap_write_end(iter, bytes, bytes, folio);
		__iomap_put_folio(iter, bytes, folio);
		if (WARN_ON_ONCE(!ret))
			return -EIO;

		cond_resched();

		balance_dirty_pages_ratelimited(iter->inode->i_mapping);

		status = iomap_iter_advance(iter, &bytes);
		if (status)
			break;
	} while (bytes > 0);

	return status;
}

int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.flags		= IOMAP_WRITE | IOMAP_UNSHARE,
	};
	loff_t size = i_size_read(inode);
	int ret;

	if (pos < 0 || pos >= size)
		return 0;

	iter.len = min(len, size - pos);
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = iomap_unshare_iter(&iter);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);

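/*
 * Example usage (illustrative only): iomap_file_unshare() is typically called
 * from a fallocate(FALLOC_FL_UNSHARE_RANGE) handler, after the caller has
 * taken the inode lock and broken any leases.  "myfs_iomap_ops" is a
 * placeholder:
 *
 *	error = iomap_file_unshare(inode, offset, len, &myfs_iomap_ops);
 */
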
/*
 * Flush the remaining range of the iter and mark the current mapping stale.
 * This is used when zero range sees an unwritten mapping that may have had
 * dirty pagecache over it.
 */
static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
{
	struct address_space *mapping = i->inode->i_mapping;
	loff_t end = i->pos + i->len - 1;

	i->iomap.flags |= IOMAP_F_STALE;
	return filemap_write_and_wait_range(mapping, i->pos, end);
}

static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	u64 bytes = iomap_length(iter);
	int status;

	do {
		struct folio *folio;
		size_t offset;
		bool ret;

		bytes = min_t(u64, SIZE_MAX, bytes);
		status = iomap_write_begin(iter, &folio, &offset, &bytes);
		if (status)
			return status;
		if (iter->iomap.flags & IOMAP_F_STALE)
			break;

		/* warn about zeroing folios beyond eof that won't write back */
		WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);

		folio_zero_range(folio, offset, bytes);
		folio_mark_accessed(folio);

		ret = iomap_write_end(iter, bytes, bytes, folio);
		__iomap_put_folio(iter, bytes, folio);
		if (WARN_ON_ONCE(!ret))
			return -EIO;

		status = iomap_iter_advance(iter, &bytes);
		if (status)
			break;
	} while (bytes > 0);

	if (did_zero)
		*did_zero = true;
	return status;
}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops, void *private)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_ZERO,
		.private	= private,
	};
	struct address_space *mapping = inode->i_mapping;
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);
	loff_t plen = min_t(loff_t, len, blocksize - off);
	int ret;
	bool range_dirty;

	/*
	 * Zero range can skip mappings that are zero on disk so long as
	 * pagecache is clean. If pagecache was dirty prior to zero range, the
	 * mapping converts on writeback completion and so must be zeroed.
	 *
	 * The simplest way to deal with this across a range is to flush
	 * pagecache and process the updated mappings. To avoid excessive
	 * flushing on partial eof zeroing, special case it to zero the
	 * unaligned start portion if already dirty in pagecache.
	 */
	if (off &&
	    filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
		iter.len = plen;
		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.status = iomap_zero_iter(&iter, did_zero);

		iter.len = len - (iter.pos - pos);
		if (ret || !iter.len)
			return ret;
	}

	/*
	 * To avoid an unconditional flush, check pagecache state and only flush
	 * if dirty and the fs returns a mapping that might convert on
	 * writeback.
	 */
	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
					iter.pos, iter.pos + iter.len - 1);
	while ((ret = iomap_iter(&iter, ops)) > 0) {
		const struct iomap *srcmap = iomap_iter_srcmap(&iter);

		if (srcmap->type == IOMAP_HOLE ||
		    srcmap->type == IOMAP_UNWRITTEN) {
			s64 status;

			if (range_dirty) {
				range_dirty = false;
				status = iomap_zero_iter_flush_and_stale(&iter);
			} else {
				status = iomap_iter_advance_full(&iter);
			}
			iter.status = status;
			continue;
		}

		iter.status = iomap_zero_iter(&iter, did_zero);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops, void *private)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
			private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);

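/*
 * Example usage (illustrative only): iomap_truncate_page() is normally called
 * from a filesystem's setattr/truncate path to zero the partial block at the
 * new EOF before the page cache beyond it is torn down, e.g.:
 *
 *	error = iomap_truncate_page(inode, newsize, &did_zero,
 *			&myfs_iomap_ops, NULL);
 *
 * "myfs_iomap_ops" is a placeholder for the filesystem's iomap_ops.
 */
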
static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
		struct folio *folio)
{
	loff_t length = iomap_length(iter);
	int ret;

	if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
		ret = __block_write_begin_int(folio, iter->pos, length, NULL,
					      &iter->iomap);
		if (ret)
			return ret;
		block_commit_write(folio, 0, length);
	} else {
		WARN_ON_ONCE(!folio_test_uptodate(folio));
		folio_mark_dirty(folio);
	}

	return iomap_iter_advance(iter, &length);
}

vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
		void *private)
{
	struct iomap_iter iter = {
		.inode		= file_inode(vmf->vma->vm_file),
		.flags		= IOMAP_WRITE | IOMAP_FAULT,
		.private	= private,
	};
	struct folio *folio = page_folio(vmf->page);
	ssize_t ret;

	folio_lock(folio);
	ret = folio_mkwrite_check_truncate(folio, iter.inode);
	if (ret < 0)
		goto out_unlock;
	iter.pos = folio_pos(folio);
	iter.len = ret;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = iomap_folio_mkwrite_iter(&iter, folio);

	if (ret < 0)
		goto out_unlock;
	folio_wait_stable(folio);
	return VM_FAULT_LOCKED;
out_unlock:
	folio_unlock(folio);
	return vmf_fs_error(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);

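/*
 * Example usage (illustrative only): iomap_page_mkwrite() implements the
 * ->page_mkwrite side of a shared writable mapping.  A filesystem usually
 * wraps it so it can take whatever locks its ->iomap_begin needs, with
 * "myfs_*" names as placeholders:
 *
 *	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		return iomap_page_mkwrite(vmf, &myfs_iomap_ops, NULL);
 *	}
 *
 *	static const struct vm_operations_struct myfs_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 */
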
1538 | static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, |
1539 | size_t len) |
1540 | { |
1541 | struct iomap_folio_state *ifs = folio->private; |
1542 | |
1543 | WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); |
1544 | WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); |
1545 | |
1546 | if (!ifs || atomic_sub_and_test(i: len, v: &ifs->write_bytes_pending)) |
1547 | folio_end_writeback(folio); |
1548 | } |
1549 | |
1550 | /* |
1551 | * We're now finished for good with this ioend structure. Update the page |
1552 | * state, release holds on bios, and finally free up memory. Do not use the |
1553 | * ioend after this. |
1554 | */ |
1555 | u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) |
1556 | { |
1557 | struct inode *inode = ioend->io_inode; |
1558 | struct bio *bio = &ioend->io_bio; |
1559 | struct folio_iter fi; |
1560 | u32 folio_count = 0; |
1561 | |
1562 | if (ioend->io_error) { |
1563 | mapping_set_error(mapping: inode->i_mapping, error: ioend->io_error); |
1564 | if (!bio_flagged(bio, bit: BIO_QUIET)) { |
1565 | pr_err_ratelimited( |
1566 | "%s: writeback error on inode %lu, offset %lld, sector %llu", |
1567 | inode->i_sb->s_id, inode->i_ino, |
1568 | ioend->io_offset, ioend->io_sector); |
1569 | } |
1570 | } |
1571 | |
1572 | /* walk all folios in bio, ending page IO on them */ |
1573 | bio_for_each_folio_all(fi, bio) { |
1574 | iomap_finish_folio_write(inode, folio: fi.folio, len: fi.length); |
1575 | folio_count++; |
1576 | } |
1577 | |
1578 | bio_put(bio); /* frees the ioend */ |
1579 | return folio_count; |
1580 | } |
1581 | |
1582 | static void iomap_writepage_end_bio(struct bio *bio) |
1583 | { |
1584 | struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); |
1585 | |
1586 | ioend->io_error = blk_status_to_errno(status: bio->bi_status); |
1587 | iomap_finish_ioend_buffered(ioend); |
1588 | } |
1589 | |
1590 | /* |
1591 | * Submit an ioend. |
1592 | * |
1593 | * If @error is non-zero, it means that we have a situation where some part of |
1594 | * the submission process has failed after we've marked pages for writeback. |
1595 | * We cannot cancel the ioend directly in that case, so call the bio end I/O |
1596 | * handler with the error status here to run the normal I/O completion handler |
1597 | * to clear the writeback bit and let the file system process the errors. |
1598 | */ |
1599 | static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) |
1600 | { |
1601 | if (!wpc->ioend) |
1602 | return error; |
1603 | |
1604 | /* |
1605 | * Let the file systems prepare the I/O submission and hook in an I/O |
1606 | * completion handler. This also needs to happen after a failure so |
1607 | * that the file system end I/O handler gets called to clean up. |
1609 | */ |
1610 | if (wpc->ops->submit_ioend) { |
1611 | error = wpc->ops->submit_ioend(wpc, error); |
1612 | } else { |
1613 | if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) |
1614 | error = -EIO; |
1615 | if (!error) |
1616 | submit_bio(bio: &wpc->ioend->io_bio); |
1617 | } |
1618 | |
1619 | if (error) { |
1620 | wpc->ioend->io_bio.bi_status = errno_to_blk_status(errno: error); |
1621 | bio_endio(&wpc->ioend->io_bio); |
1622 | } |
1623 | |
1624 | wpc->ioend = NULL; |
1625 | return error; |
1626 | } |
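/*
 * A hedged sketch of a ->submit_ioend hook for a filesystem that must finish
 * some ioends in process context (e.g. unwritten extent conversion). The
 * custom bi_end_io is installed before the error check so that it also runs
 * when iomap_submit_ioend() fails the bio with bio_endio().
 * "myfs_writeback_end_bio" is a hypothetical completion handler.
 */
extern void myfs_writeback_end_bio(struct bio *bio);	/* hypothetical */

static int myfs_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
	if (wpc->ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
		wpc->ioend->io_bio.bi_end_io = myfs_writeback_end_bio;

	if (error)
		return error;
	submit_bio(&wpc->ioend->io_bio);
	return 0;
}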
1627 | |
1628 | static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, |
1629 | struct writeback_control *wbc, struct inode *inode, loff_t pos, |
1630 | u16 ioend_flags) |
1631 | { |
1632 | struct bio *bio; |
1633 | |
1634 | bio = bio_alloc_bioset(bdev: wpc->iomap.bdev, BIO_MAX_VECS, |
1635 | opf: REQ_OP_WRITE | wbc_to_write_flags(wbc), |
1636 | GFP_NOFS, bs: &iomap_ioend_bioset); |
1637 | bio->bi_iter.bi_sector = iomap_sector(iomap: &wpc->iomap, pos); |
1638 | bio->bi_end_io = iomap_writepage_end_bio; |
1639 | bio->bi_write_hint = inode->i_write_hint; |
1640 | wbc_init_bio(wbc, bio); |
1641 | wpc->nr_folios = 0; |
1642 | return iomap_init_ioend(inode, bio, file_offset: pos, ioend_flags); |
1643 | } |
1644 | |
1645 | static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, |
1646 | u16 ioend_flags) |
1647 | { |
1648 | if (ioend_flags & IOMAP_IOEND_BOUNDARY) |
1649 | return false; |
1650 | if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != |
1651 | (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) |
1652 | return false; |
1653 | if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) |
1654 | return false; |
1655 | if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && |
1656 | iomap_sector(iomap: &wpc->iomap, pos) != |
1657 | bio_end_sector(&wpc->ioend->io_bio)) |
1658 | return false; |
1659 | /* |
1660 | * Limit ioend bio chain lengths to minimise IO completion latency. This |
1661 | * also prevents long tight loops ending page writeback on all the |
1662 | * folios in the ioend. |
1663 | */ |
1664 | if (wpc->nr_folios >= IOEND_BATCH_SIZE) |
1665 | return false; |
1666 | return true; |
1667 | } |
1668 | |
1669 | /* |
1670 | * Test to see if we have an existing ioend structure that we could append to |
1671 | * first; otherwise finish off the current ioend and start another. |
1672 | * |
1673 | * If a new ioend is created and cached, the old ioend is submitted to the block |
1674 | * layer instantly. Batching optimisations are provided by higher level block |
1675 | * plugging. |
1676 | * |
1677 | * At the end of a writeback pass, there will be a cached ioend remaining on the |
1678 | * writepage context that the caller will need to submit. |
1679 | */ |
1680 | static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, |
1681 | struct writeback_control *wbc, struct folio *folio, |
1682 | struct inode *inode, loff_t pos, loff_t end_pos, |
1683 | unsigned len) |
1684 | { |
1685 | struct iomap_folio_state *ifs = folio->private; |
1686 | size_t poff = offset_in_folio(folio, pos); |
1687 | unsigned int ioend_flags = 0; |
1688 | int error; |
1689 | |
1690 | if (wpc->iomap.type == IOMAP_UNWRITTEN) |
1691 | ioend_flags |= IOMAP_IOEND_UNWRITTEN; |
1692 | if (wpc->iomap.flags & IOMAP_F_SHARED) |
1693 | ioend_flags |= IOMAP_IOEND_SHARED; |
1694 | if (folio_test_dropbehind(folio)) |
1695 | ioend_flags |= IOMAP_IOEND_DONTCACHE; |
1696 | if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) |
1697 | ioend_flags |= IOMAP_IOEND_BOUNDARY; |
1698 | |
1699 | if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { |
1700 | new_ioend: |
1701 | error = iomap_submit_ioend(wpc, error: 0); |
1702 | if (error) |
1703 | return error; |
1704 | wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, |
1705 | ioend_flags); |
1706 | } |
1707 | |
1708 | if (!bio_add_folio(bio: &wpc->ioend->io_bio, folio, len, off: poff)) |
1709 | goto new_ioend; |
1710 | |
1711 | if (ifs) |
1712 | atomic_add(i: len, v: &ifs->write_bytes_pending); |
1713 | |
1714 | /* |
1715 | * Clamp io_offset and io_size to the incore EOF so that ondisk |
1716 | * file size updates in the ioend completion are byte-accurate. |
1717 | * This avoids recovering files with zeroed tail regions when |
1718 | * writeback races with appending writes: |
1719 | * |
1720 | * Thread 1: Thread 2: |
1721 | * ------------ ----------- |
1722 | * write [A, A+B] |
1723 | * update inode size to A+B |
1724 | * submit I/O [A, A+BS] |
1725 | * write [A+B, A+B+C] |
1726 | * update inode size to A+B+C |
1727 | * <I/O completes, updates disk size to min(A+B+C, A+BS)> |
1728 | * <power failure> |
1729 | * |
1730 | * After reboot: |
1731 | * 1) with A+B+C < A+BS, the file has zero padding in range |
1732 | * [A+B, A+B+C] |
1733 | * |
1734 | * |< Block Size (BS) >| |
1735 | * |DDDDDDDDDDDD0000000000000| |
1736 | * ^ ^ ^ |
1737 | * A A+B A+B+C |
1738 | * (EOF) |
1739 | * |
1740 | * 2) with A+B+C > A+BS, the file has zero padding in range |
1741 | * [A+B, A+BS] |
1742 | * |
1743 | * |< Block Size (BS) >|< Block Size (BS) >| |
1744 | * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| |
1745 | * ^ ^ ^ ^ |
1746 | * A A+B A+BS A+B+C |
1747 | * (EOF) |
1748 | * |
1749 | * D = Valid Data |
1750 | * 0 = Zero Padding |
1751 | * |
1752 | * Note that this defeats the ability to chain the ioends of |
1753 | * appending writes. |
1754 | */ |
1755 | wpc->ioend->io_size += len; |
1756 | if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) |
1757 | wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; |
1758 | |
1759 | wbc_account_cgroup_owner(wbc, folio, bytes: len); |
1760 | return 0; |
1761 | } |
1762 | |
1763 | static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, |
1764 | struct writeback_control *wbc, struct folio *folio, |
1765 | struct inode *inode, u64 pos, u64 end_pos, |
1766 | unsigned dirty_len, unsigned *count) |
1767 | { |
1768 | int error; |
1769 | |
1770 | do { |
1771 | unsigned map_len; |
1772 | |
1773 | error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); |
1774 | if (error) |
1775 | break; |
1776 | trace_iomap_writepage_map(inode, pos, dirty_len, iomap: &wpc->iomap); |
1777 | |
1778 | map_len = min_t(u64, dirty_len, |
1779 | wpc->iomap.offset + wpc->iomap.length - pos); |
1780 | WARN_ON_ONCE(!folio->private && map_len < dirty_len); |
1781 | |
1782 | switch (wpc->iomap.type) { |
1783 | case IOMAP_INLINE: |
1784 | WARN_ON_ONCE(1); |
1785 | error = -EIO; |
1786 | break; |
1787 | case IOMAP_HOLE: |
1788 | break; |
1789 | default: |
1790 | error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, |
1791 | end_pos, len: map_len); |
1792 | if (!error) |
1793 | (*count)++; |
1794 | break; |
1795 | } |
1796 | dirty_len -= map_len; |
1797 | pos += map_len; |
1798 | } while (dirty_len && !error); |
1799 | |
1800 | /* |
1801 | * We cannot cancel the ioend directly here on error. We may have |
1802 | * already set other pages under writeback and hence we have to run I/O |
1803 | * completion to mark the error state of the pages under writeback |
1804 | * appropriately. |
1805 | * |
1806 | * Just let the file system know what portion of the folio failed to |
1807 | * map. |
1808 | */ |
1809 | if (error && wpc->ops->discard_folio) |
1810 | wpc->ops->discard_folio(folio, pos); |
1811 | return error; |
1812 | } |
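/*
 * A hedged sketch of the optional ->discard_folio hook mentioned above: it
 * releases whatever the filesystem reserved for the blocks that failed to
 * map, from @pos to the end of the folio. "myfs_punch_delalloc" is a
 * hypothetical helper.
 */
extern void myfs_punch_delalloc(struct inode *inode, loff_t pos, loff_t len);

static void myfs_discard_folio(struct folio *folio, loff_t pos)
{
	struct inode *inode = folio->mapping->host;
	loff_t end = folio_pos(folio) + folio_size(folio);

	myfs_punch_delalloc(inode, pos, end - pos);
}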
1813 | |
1814 | /* |
1815 | * Check interaction of the folio with the file end. |
1816 | * |
1817 | * If the folio is entirely beyond i_size, return false. If it straddles |
1818 | * i_size, adjust end_pos and zero all data beyond i_size. |
1819 | */ |
1820 | static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, |
1821 | u64 *end_pos) |
1822 | { |
1823 | u64 isize = i_size_read(inode); |
1824 | |
1825 | if (*end_pos > isize) { |
1826 | size_t poff = offset_in_folio(folio, isize); |
1827 | pgoff_t end_index = isize >> PAGE_SHIFT; |
1828 | |
1829 | /* |
1830 | * If the folio is entirely outside of i_size, skip it. |
1831 | * |
1832 | * This can happen due to a truncate operation that is in |
1833 | * progress and in that case truncate will finish it off once |
1834 | * we've dropped the folio lock. |
1835 | * |
1836 | * Note that the pgoff_t used for end_index is an unsigned long. |
1837 | * If the given offset is greater than 16TB on a 32-bit system, |
1838 | * then if we checked if the folio is fully outside i_size with |
1839 | * "if (folio->index >= end_index + 1)", "end_index + 1" would |
1840 | * overflow and evaluate to 0. Hence this folio would be |
1841 | * redirtied and written out repeatedly, which would result in |
1842 | * an infinite loop; the user program performing this operation |
1843 | * would hang. Instead, we can detect this situation by |
1844 | * checking if the folio is totally beyond i_size or if its |
1845 | * offset is just equal to the EOF. |
1846 | */ |
1847 | if (folio->index > end_index || |
1848 | (folio->index == end_index && poff == 0)) |
1849 | return false; |
1850 | |
1851 | /* |
1852 | * The folio straddles i_size. |
1853 | * |
1854 | * It must be zeroed out on each and every writepage invocation |
1855 | * because it may be mmapped: |
1856 | * |
1857 | * A file is mapped in multiples of the page size. For a |
1858 | * file that is not a multiple of the page size, the |
1859 | * remaining memory is zeroed when mapped, and writes to that |
1860 | * region are not written out to the file. |
1861 | * |
1862 | * Also adjust the end_pos to the end of file and skip writeback |
1863 | * for all blocks entirely beyond i_size. |
1864 | */ |
1865 | folio_zero_segment(folio, start: poff, xend: folio_size(folio)); |
1866 | *end_pos = isize; |
1867 | } |
1868 | |
1869 | return true; |
1870 | } |
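/*
 * Worked example of the overflow discussed above, with assumed values: on a
 * 32-bit system with 4 KiB pages, an i_size just below 16 TiB gives
 * end_index == 0xffffffff, so "end_index + 1" wraps to 0 and a naive
 * "folio->index >= end_index + 1" test degenerates to "folio->index >= 0",
 * which is true for every folio. The explicit two-part comparison used above
 * avoids the wraparound.
 */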
1871 | |
1872 | static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, |
1873 | struct writeback_control *wbc, struct folio *folio) |
1874 | { |
1875 | struct iomap_folio_state *ifs = folio->private; |
1876 | struct inode *inode = folio->mapping->host; |
1877 | u64 pos = folio_pos(folio); |
1878 | u64 end_pos = pos + folio_size(folio); |
1879 | u64 end_aligned = 0; |
1880 | unsigned count = 0; |
1881 | int error = 0; |
1882 | u32 rlen; |
1883 | |
1884 | WARN_ON_ONCE(!folio_test_locked(folio)); |
1885 | WARN_ON_ONCE(folio_test_dirty(folio)); |
1886 | WARN_ON_ONCE(folio_test_writeback(folio)); |
1887 | |
1888 | trace_iomap_writepage(inode, off: pos, len: folio_size(folio)); |
1889 | |
1890 | if (!iomap_writepage_handle_eof(folio, inode, end_pos: &end_pos)) { |
1891 | folio_unlock(folio); |
1892 | return 0; |
1893 | } |
1894 | WARN_ON_ONCE(end_pos <= pos); |
1895 | |
1896 | if (i_blocks_per_folio(inode, folio) > 1) { |
1897 | if (!ifs) { |
1898 | ifs = ifs_alloc(inode, folio, flags: 0); |
1899 | iomap_set_range_dirty(folio, off: 0, len: end_pos - pos); |
1900 | } |
1901 | |
1902 | /* |
1903 | * Keep the I/O completion handler from clearing the writeback |
1904 | * bit until we have submitted all blocks by adding a bias to |
1905 | * ifs->write_bytes_pending, which is dropped after submitting |
1906 | * all blocks. |
1907 | */ |
1908 | WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); |
1909 | atomic_inc(v: &ifs->write_bytes_pending); |
1910 | } |
1911 | |
1912 | /* |
1913 | * Set the writeback bit ASAP, as the I/O completion for the single |
1914 | * block per folio case can run as soon as we submit the bio. |
1915 | */ |
1916 | folio_start_writeback(folio); |
1917 | |
1918 | /* |
1919 | * Walk through the folio to find dirty areas to write back. |
1920 | */ |
1921 | end_aligned = round_up(end_pos, i_blocksize(inode)); |
1922 | while ((rlen = iomap_find_dirty_range(folio, range_start: &pos, range_end: end_aligned))) { |
1923 | error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, |
1924 | pos, end_pos, dirty_len: rlen, count: &count); |
1925 | if (error) |
1926 | break; |
1927 | pos += rlen; |
1928 | } |
1929 | |
1930 | if (count) |
1931 | wpc->nr_folios++; |
1932 | |
1933 | /* |
1934 | * We can have dirty bits set past the end of file in the page_mkwrite |
1935 | * path while mapping the last partial folio. Hence it's better to clear |
1936 | * all the dirty bits in the folio here. |
1937 | */ |
1938 | iomap_clear_range_dirty(folio, off: 0, len: folio_size(folio)); |
1939 | |
1940 | /* |
1941 | * Usually the writeback bit is cleared by the I/O completion handler. |
1942 | * But we may end up not writing any blocks at all, or (when there are |
1943 | * multiple blocks in a folio) all I/O may already have finished by this |
1944 | * point. In that case we need to clear the writeback bit ourselves |
1945 | * right after unlocking the folio. |
1946 | */ |
1947 | folio_unlock(folio); |
1948 | if (ifs) { |
1949 | if (atomic_dec_and_test(v: &ifs->write_bytes_pending)) |
1950 | folio_end_writeback(folio); |
1951 | } else { |
1952 | if (!count) |
1953 | folio_end_writeback(folio); |
1954 | } |
1955 | mapping_set_error(mapping: inode->i_mapping, error); |
1956 | return error; |
1957 | } |
1958 | |
1959 | int |
1960 | iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, |
1961 | struct iomap_writepage_ctx *wpc, |
1962 | const struct iomap_writeback_ops *ops) |
1963 | { |
1964 | struct folio *folio = NULL; |
1965 | int error; |
1966 | |
1967 | /* |
1968 | * Writeback from reclaim context should never happen except in the case |
1969 | * of a VM regression, so warn about it and refuse to write the data. |
1970 | */ |
1971 | if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == |
1972 | PF_MEMALLOC)) |
1973 | return -EIO; |
1974 | |
1975 | wpc->ops = ops; |
1976 | while ((folio = writeback_iter(mapping, wbc, folio, error: &error))) |
1977 | error = iomap_writepage_map(wpc, wbc, folio); |
1978 | return iomap_submit_ioend(wpc, error); |
1979 | } |
1980 | EXPORT_SYMBOL_GPL(iomap_writepages); |
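/*
 * A minimal sketch of how a filesystem wires up the buffered writeback path,
 * assuming a hypothetical "myfs_map_blocks" that fills wpc->iomap for the
 * dirty range handed to it. Only ->map_blocks is required; ->submit_ioend
 * and ->discard_folio are optional hooks, as the code above shows.
 */
extern int myfs_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t offset, unsigned int len);

static const struct iomap_writeback_ops myfs_writeback_ops = {
	.map_blocks	= myfs_map_blocks,
};

static int myfs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &myfs_writeback_ops);
}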
1981 |