// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (C) 2016-2023 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"

#include "../internal.h"

#define IOEND_BATCH_SIZE	4096

typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
 * Structure allocated for each folio to track per-block uptodate, dirty state
 * and I/O completions.
 */
struct iomap_folio_state {
	spinlock_t		state_lock;
	unsigned int		read_bytes_pending;
	atomic_t		write_bytes_pending;

	/*
	 * Each block has two bits in this bitmap:
	 * Bits [0..blocks_per_folio) hold the uptodate status.
	 * Bits [blocks_per_folio..2 * blocks_per_folio) hold the dirty status.
	 */
	unsigned long		state[];
};
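/*
 * Worked example of the layout above (illustrative numbers only): for a 16K
 * folio with 4K filesystem blocks, i_blocks_per_folio() is 4, so bits 0..3
 * hold the per-block uptodate state and bits 4..7 hold the per-block dirty
 * state. Block 2 of the folio is dirty when bit 6 is set, which is exactly
 * the test_bit(block + blks_per_folio, ifs->state) check used by
 * ifs_block_is_dirty() below.
 */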
43 | |
44 | static struct bio_set iomap_ioend_bioset; |
45 | |
46 | static inline bool ifs_is_fully_uptodate(struct folio *folio, |
47 | struct iomap_folio_state *ifs) |
48 | { |
49 | struct inode *inode = folio->mapping->host; |
50 | |
51 | return bitmap_full(src: ifs->state, nbits: i_blocks_per_folio(inode, folio)); |
52 | } |
53 | |
54 | static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, |
55 | unsigned int block) |
56 | { |
57 | return test_bit(block, ifs->state); |
58 | } |
59 | |
60 | static bool ifs_set_range_uptodate(struct folio *folio, |
61 | struct iomap_folio_state *ifs, size_t off, size_t len) |
62 | { |
63 | struct inode *inode = folio->mapping->host; |
64 | unsigned int first_blk = off >> inode->i_blkbits; |
65 | unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; |
66 | unsigned int nr_blks = last_blk - first_blk + 1; |
67 | |
68 | bitmap_set(map: ifs->state, start: first_blk, nbits: nr_blks); |
69 | return ifs_is_fully_uptodate(folio, ifs); |
70 | } |
71 | |
72 | static void iomap_set_range_uptodate(struct folio *folio, size_t off, |
73 | size_t len) |
74 | { |
75 | struct iomap_folio_state *ifs = folio->private; |
76 | unsigned long flags; |
77 | bool uptodate = true; |
78 | |
79 | if (ifs) { |
80 | spin_lock_irqsave(&ifs->state_lock, flags); |
81 | uptodate = ifs_set_range_uptodate(folio, ifs, off, len); |
82 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
83 | } |
84 | |
85 | if (uptodate) |
86 | folio_mark_uptodate(folio); |
87 | } |
88 | |
89 | static inline bool ifs_block_is_dirty(struct folio *folio, |
90 | struct iomap_folio_state *ifs, int block) |
91 | { |
92 | struct inode *inode = folio->mapping->host; |
93 | unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); |
94 | |
95 | return test_bit(block + blks_per_folio, ifs->state); |
96 | } |
97 | |
98 | static unsigned ifs_find_dirty_range(struct folio *folio, |
99 | struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) |
100 | { |
101 | struct inode *inode = folio->mapping->host; |
102 | unsigned start_blk = |
103 | offset_in_folio(folio, *range_start) >> inode->i_blkbits; |
104 | unsigned end_blk = min_not_zero( |
105 | offset_in_folio(folio, range_end) >> inode->i_blkbits, |
106 | i_blocks_per_folio(inode, folio)); |
107 | unsigned nblks = 1; |
108 | |
109 | while (!ifs_block_is_dirty(folio, ifs, block: start_blk)) |
110 | if (++start_blk == end_blk) |
111 | return 0; |
112 | |
113 | while (start_blk + nblks < end_blk) { |
114 | if (!ifs_block_is_dirty(folio, ifs, block: start_blk + nblks)) |
115 | break; |
116 | nblks++; |
117 | } |
118 | |
119 | *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); |
120 | return nblks << inode->i_blkbits; |
121 | } |
122 | |
123 | static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, |
124 | u64 range_end) |
125 | { |
126 | struct iomap_folio_state *ifs = folio->private; |
127 | |
128 | if (*range_start >= range_end) |
129 | return 0; |
130 | |
131 | if (ifs) |
132 | return ifs_find_dirty_range(folio, ifs, range_start, range_end); |
133 | return range_end - *range_start; |
134 | } |
135 | |
136 | static void ifs_clear_range_dirty(struct folio *folio, |
137 | struct iomap_folio_state *ifs, size_t off, size_t len) |
138 | { |
139 | struct inode *inode = folio->mapping->host; |
140 | unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); |
141 | unsigned int first_blk = (off >> inode->i_blkbits); |
142 | unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; |
143 | unsigned int nr_blks = last_blk - first_blk + 1; |
144 | unsigned long flags; |
145 | |
146 | spin_lock_irqsave(&ifs->state_lock, flags); |
147 | bitmap_clear(map: ifs->state, start: first_blk + blks_per_folio, nbits: nr_blks); |
148 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
149 | } |
150 | |
151 | static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) |
152 | { |
153 | struct iomap_folio_state *ifs = folio->private; |
154 | |
155 | if (ifs) |
156 | ifs_clear_range_dirty(folio, ifs, off, len); |
157 | } |
158 | |
159 | static void ifs_set_range_dirty(struct folio *folio, |
160 | struct iomap_folio_state *ifs, size_t off, size_t len) |
161 | { |
162 | struct inode *inode = folio->mapping->host; |
163 | unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); |
164 | unsigned int first_blk = (off >> inode->i_blkbits); |
165 | unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; |
166 | unsigned int nr_blks = last_blk - first_blk + 1; |
167 | unsigned long flags; |
168 | |
169 | spin_lock_irqsave(&ifs->state_lock, flags); |
170 | bitmap_set(map: ifs->state, start: first_blk + blks_per_folio, nbits: nr_blks); |
171 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
172 | } |
173 | |
174 | static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) |
175 | { |
176 | struct iomap_folio_state *ifs = folio->private; |
177 | |
178 | if (ifs) |
179 | ifs_set_range_dirty(folio, ifs, off, len); |
180 | } |
181 | |
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
		struct folio *folio, unsigned int flags)
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
	gfp_t gfp;

	if (ifs || nr_blocks <= 1)
		return ifs;

	if (flags & IOMAP_NOWAIT)
		gfp = GFP_NOWAIT;
	else
		gfp = GFP_NOFS | __GFP_NOFAIL;

	/*
	 * ifs->state tracks two sets of state flags when the
	 * filesystem block size is smaller than the folio size.
	 * The first state tracks per-block uptodate and the
	 * second tracks per-block dirty state.
	 */
	ifs = kzalloc(struct_size(ifs, state,
			BITS_TO_LONGS(2 * nr_blocks)), gfp);
	if (!ifs)
		return ifs;

	spin_lock_init(&ifs->state_lock);
	if (folio_test_uptodate(folio))
		bitmap_set(ifs->state, 0, nr_blocks);
	if (folio_test_dirty(folio))
		bitmap_set(ifs->state, nr_blocks, nr_blocks);
	folio_attach_private(folio, ifs);

	return ifs;
}
217 | |
218 | static void ifs_free(struct folio *folio) |
219 | { |
220 | struct iomap_folio_state *ifs = folio_detach_private(folio); |
221 | |
222 | if (!ifs) |
223 | return; |
224 | WARN_ON_ONCE(ifs->read_bytes_pending != 0); |
225 | WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); |
226 | WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != |
227 | folio_test_uptodate(folio)); |
228 | kfree(objp: ifs); |
229 | } |
230 | |
231 | /* |
232 | * Calculate the range inside the folio that we actually need to read. |
233 | */ |
234 | static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, |
235 | loff_t *pos, loff_t length, size_t *offp, size_t *lenp) |
236 | { |
237 | struct iomap_folio_state *ifs = folio->private; |
238 | loff_t orig_pos = *pos; |
239 | loff_t isize = i_size_read(inode); |
240 | unsigned block_bits = inode->i_blkbits; |
241 | unsigned block_size = (1 << block_bits); |
242 | size_t poff = offset_in_folio(folio, *pos); |
243 | size_t plen = min_t(loff_t, folio_size(folio) - poff, length); |
244 | unsigned first = poff >> block_bits; |
245 | unsigned last = (poff + plen - 1) >> block_bits; |
246 | |
247 | /* |
248 | * If the block size is smaller than the page size, we need to check the |
249 | * per-block uptodate status and adjust the offset and length if needed |
250 | * to avoid reading in already uptodate ranges. |
251 | */ |
252 | if (ifs) { |
253 | unsigned int i; |
254 | |
255 | /* move forward for each leading block marked uptodate */ |
256 | for (i = first; i <= last; i++) { |
257 | if (!ifs_block_is_uptodate(ifs, block: i)) |
258 | break; |
259 | *pos += block_size; |
260 | poff += block_size; |
261 | plen -= block_size; |
262 | first++; |
263 | } |
264 | |
265 | /* truncate len if we find any trailing uptodate block(s) */ |
266 | for ( ; i <= last; i++) { |
267 | if (ifs_block_is_uptodate(ifs, block: i)) { |
268 | plen -= (last - i + 1) * block_size; |
269 | last = i - 1; |
270 | break; |
271 | } |
272 | } |
273 | } |
274 | |
275 | /* |
276 | * If the extent spans the block that contains the i_size, we need to |
277 | * handle both halves separately so that we properly zero data in the |
278 | * page cache for blocks that are entirely outside of i_size. |
279 | */ |
280 | if (orig_pos <= isize && orig_pos + length > isize) { |
281 | unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; |
282 | |
283 | if (first <= end && last > end) |
284 | plen -= (last - end) * block_size; |
285 | } |
286 | |
287 | *offp = poff; |
288 | *lenp = plen; |
289 | } |
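/*
 * Worked example of the adjustment above (illustrative numbers only): reading
 * a 16K folio backed by 4K blocks where block 0 is already uptodate advances
 * *pos and poff by 4K and shrinks plen from 16K to 12K, so only the
 * not-yet-uptodate tail is read. If i_size ends within block 2, the trailing
 * block that lies entirely beyond EOF is also trimmed from plen and is later
 * zeroed in the page cache instead of being read from disk.
 */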
290 | |
291 | static void iomap_finish_folio_read(struct folio *folio, size_t off, |
292 | size_t len, int error) |
293 | { |
294 | struct iomap_folio_state *ifs = folio->private; |
295 | bool uptodate = !error; |
296 | bool finished = true; |
297 | |
298 | if (ifs) { |
299 | unsigned long flags; |
300 | |
301 | spin_lock_irqsave(&ifs->state_lock, flags); |
302 | if (!error) |
303 | uptodate = ifs_set_range_uptodate(folio, ifs, off, len); |
304 | ifs->read_bytes_pending -= len; |
305 | finished = !ifs->read_bytes_pending; |
306 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
307 | } |
308 | |
309 | if (error) |
310 | folio_set_error(folio); |
311 | if (finished) |
312 | folio_end_read(folio, success: uptodate); |
313 | } |
314 | |
315 | static void iomap_read_end_io(struct bio *bio) |
316 | { |
317 | int error = blk_status_to_errno(status: bio->bi_status); |
318 | struct folio_iter fi; |
319 | |
320 | bio_for_each_folio_all(fi, bio) |
321 | iomap_finish_folio_read(folio: fi.folio, off: fi.offset, len: fi.length, error); |
322 | bio_put(bio); |
323 | } |
324 | |
325 | struct iomap_readpage_ctx { |
326 | struct folio *cur_folio; |
327 | bool cur_folio_in_bio; |
328 | struct bio *bio; |
329 | struct readahead_control *rac; |
330 | }; |
331 | |
332 | /** |
333 | * iomap_read_inline_data - copy inline data into the page cache |
334 | * @iter: iteration structure |
335 | * @folio: folio to copy to |
336 | * |
337 | * Copy the inline data in @iter into @folio and zero out the rest of the folio. |
338 | * Only a single IOMAP_INLINE extent is allowed at the end of each file. |
339 | * Returns zero for success to complete the read, or the usual negative errno. |
340 | */ |
341 | static int iomap_read_inline_data(const struct iomap_iter *iter, |
342 | struct folio *folio) |
343 | { |
344 | const struct iomap *iomap = iomap_iter_srcmap(i: iter); |
345 | size_t size = i_size_read(inode: iter->inode) - iomap->offset; |
346 | size_t offset = offset_in_folio(folio, iomap->offset); |
347 | |
348 | if (folio_test_uptodate(folio)) |
349 | return 0; |
350 | |
351 | if (WARN_ON_ONCE(size > iomap->length)) |
352 | return -EIO; |
353 | if (offset > 0) |
354 | ifs_alloc(inode: iter->inode, folio, flags: iter->flags); |
355 | |
356 | folio_fill_tail(folio, offset, from: iomap->inline_data, len: size); |
357 | iomap_set_range_uptodate(folio, off: offset, len: folio_size(folio) - offset); |
358 | return 0; |
359 | } |
360 | |
361 | static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, |
362 | loff_t pos) |
363 | { |
364 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
365 | |
366 | return srcmap->type != IOMAP_MAPPED || |
367 | (srcmap->flags & IOMAP_F_NEW) || |
368 | pos >= i_size_read(inode: iter->inode); |
369 | } |
370 | |
371 | static loff_t iomap_readpage_iter(const struct iomap_iter *iter, |
372 | struct iomap_readpage_ctx *ctx, loff_t offset) |
373 | { |
374 | const struct iomap *iomap = &iter->iomap; |
375 | loff_t pos = iter->pos + offset; |
376 | loff_t length = iomap_length(iter) - offset; |
377 | struct folio *folio = ctx->cur_folio; |
378 | struct iomap_folio_state *ifs; |
379 | loff_t orig_pos = pos; |
380 | size_t poff, plen; |
381 | sector_t sector; |
382 | |
383 | if (iomap->type == IOMAP_INLINE) |
384 | return iomap_read_inline_data(iter, folio); |
385 | |
386 | /* zero post-eof blocks as the page may be mapped */ |
387 | ifs = ifs_alloc(inode: iter->inode, folio, flags: iter->flags); |
388 | iomap_adjust_read_range(inode: iter->inode, folio, pos: &pos, length, offp: &poff, lenp: &plen); |
389 | if (plen == 0) |
390 | goto done; |
391 | |
392 | if (iomap_block_needs_zeroing(iter, pos)) { |
393 | folio_zero_range(folio, start: poff, length: plen); |
394 | iomap_set_range_uptodate(folio, off: poff, len: plen); |
395 | goto done; |
396 | } |
397 | |
398 | ctx->cur_folio_in_bio = true; |
399 | if (ifs) { |
400 | spin_lock_irq(lock: &ifs->state_lock); |
401 | ifs->read_bytes_pending += plen; |
402 | spin_unlock_irq(lock: &ifs->state_lock); |
403 | } |
404 | |
405 | sector = iomap_sector(iomap, pos); |
406 | if (!ctx->bio || |
407 | bio_end_sector(ctx->bio) != sector || |
408 | !bio_add_folio(bio: ctx->bio, folio, len: plen, off: poff)) { |
409 | gfp_t gfp = mapping_gfp_constraint(mapping: folio->mapping, GFP_KERNEL); |
410 | gfp_t orig_gfp = gfp; |
411 | unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); |
412 | |
413 | if (ctx->bio) |
414 | submit_bio(bio: ctx->bio); |
415 | |
416 | if (ctx->rac) /* same as readahead_gfp_mask */ |
417 | gfp |= __GFP_NORETRY | __GFP_NOWARN; |
418 | ctx->bio = bio_alloc(bdev: iomap->bdev, nr_vecs: bio_max_segs(nr_segs: nr_vecs), |
419 | opf: REQ_OP_READ, gfp_mask: gfp); |
420 | /* |
421 | * If the bio_alloc fails, try it again for a single page to |
422 | * avoid having to deal with partial page reads. This emulates |
423 | * what do_mpage_read_folio does. |
424 | */ |
425 | if (!ctx->bio) { |
426 | ctx->bio = bio_alloc(bdev: iomap->bdev, nr_vecs: 1, opf: REQ_OP_READ, |
427 | gfp_mask: orig_gfp); |
428 | } |
429 | if (ctx->rac) |
430 | ctx->bio->bi_opf |= REQ_RAHEAD; |
431 | ctx->bio->bi_iter.bi_sector = sector; |
432 | ctx->bio->bi_end_io = iomap_read_end_io; |
433 | bio_add_folio_nofail(bio: ctx->bio, folio, len: plen, off: poff); |
434 | } |
435 | |
436 | done: |
437 | /* |
438 | * Move the caller beyond our range so that it keeps making progress. |
439 | * For that, we have to include any leading non-uptodate ranges, but |
440 | * we can skip trailing ones as they will be handled in the next |
441 | * iteration. |
442 | */ |
443 | return pos - orig_pos + plen; |
444 | } |
445 | |
446 | int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) |
447 | { |
448 | struct iomap_iter iter = { |
449 | .inode = folio->mapping->host, |
450 | .pos = folio_pos(folio), |
451 | .len = folio_size(folio), |
452 | }; |
453 | struct iomap_readpage_ctx ctx = { |
454 | .cur_folio = folio, |
455 | }; |
456 | int ret; |
457 | |
458 | trace_iomap_readpage(inode: iter.inode, nr_pages: 1); |
459 | |
460 | while ((ret = iomap_iter(iter: &iter, ops)) > 0) |
461 | iter.processed = iomap_readpage_iter(iter: &iter, ctx: &ctx, offset: 0); |
462 | |
463 | if (ret < 0) |
464 | folio_set_error(folio); |
465 | |
466 | if (ctx.bio) { |
467 | submit_bio(bio: ctx.bio); |
468 | WARN_ON_ONCE(!ctx.cur_folio_in_bio); |
469 | } else { |
470 | WARN_ON_ONCE(ctx.cur_folio_in_bio); |
471 | folio_unlock(folio); |
472 | } |
473 | |
474 | /* |
475 | * Just like mpage_readahead and block_read_full_folio, we always |
476 | * return 0 and just set the folio error flag on errors. This |
477 | * should be cleaned up throughout the stack eventually. |
478 | */ |
479 | return 0; |
480 | } |
481 | EXPORT_SYMBOL_GPL(iomap_read_folio); |
482 | |
483 | static loff_t iomap_readahead_iter(const struct iomap_iter *iter, |
484 | struct iomap_readpage_ctx *ctx) |
485 | { |
486 | loff_t length = iomap_length(iter); |
487 | loff_t done, ret; |
488 | |
489 | for (done = 0; done < length; done += ret) { |
490 | if (ctx->cur_folio && |
491 | offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { |
492 | if (!ctx->cur_folio_in_bio) |
493 | folio_unlock(folio: ctx->cur_folio); |
494 | ctx->cur_folio = NULL; |
495 | } |
496 | if (!ctx->cur_folio) { |
497 | ctx->cur_folio = readahead_folio(ractl: ctx->rac); |
498 | ctx->cur_folio_in_bio = false; |
499 | } |
500 | ret = iomap_readpage_iter(iter, ctx, offset: done); |
501 | if (ret <= 0) |
502 | return ret; |
503 | } |
504 | |
505 | return done; |
506 | } |
507 | |
508 | /** |
509 | * iomap_readahead - Attempt to read pages from a file. |
510 | * @rac: Describes the pages to be read. |
511 | * @ops: The operations vector for the filesystem. |
512 | * |
513 | * This function is for filesystems to call to implement their readahead |
514 | * address_space operation. |
515 | * |
516 | * Context: The @ops callbacks may submit I/O (eg to read the addresses of |
517 | * blocks from disc), and may wait for it. The caller may be trying to |
518 | * access a different page, and so sleeping excessively should be avoided. |
519 | * It may allocate memory, but should avoid costly allocations. This |
520 | * function is called with memalloc_nofs set, so allocations will not cause |
521 | * the filesystem to be reentered. |
522 | */ |
523 | void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) |
524 | { |
525 | struct iomap_iter iter = { |
526 | .inode = rac->mapping->host, |
527 | .pos = readahead_pos(rac), |
528 | .len = readahead_length(rac), |
529 | }; |
530 | struct iomap_readpage_ctx ctx = { |
531 | .rac = rac, |
532 | }; |
533 | |
534 | trace_iomap_readahead(inode: rac->mapping->host, nr_pages: readahead_count(rac)); |
535 | |
536 | while (iomap_iter(iter: &iter, ops) > 0) |
537 | iter.processed = iomap_readahead_iter(iter: &iter, ctx: &ctx); |
538 | |
539 | if (ctx.bio) |
540 | submit_bio(bio: ctx.bio); |
541 | if (ctx.cur_folio) { |
542 | if (!ctx.cur_folio_in_bio) |
543 | folio_unlock(folio: ctx.cur_folio); |
544 | } |
545 | } |
546 | EXPORT_SYMBOL_GPL(iomap_readahead); |
547 | |
548 | /* |
549 | * iomap_is_partially_uptodate checks whether blocks within a folio are |
550 | * uptodate or not. |
551 | * |
552 | * Returns true if all blocks which correspond to the specified part |
553 | * of the folio are uptodate. |
554 | */ |
555 | bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) |
556 | { |
557 | struct iomap_folio_state *ifs = folio->private; |
558 | struct inode *inode = folio->mapping->host; |
559 | unsigned first, last, i; |
560 | |
561 | if (!ifs) |
562 | return false; |
563 | |
564 | /* Caller's range may extend past the end of this folio */ |
565 | count = min(folio_size(folio) - from, count); |
566 | |
567 | /* First and last blocks in range within folio */ |
568 | first = from >> inode->i_blkbits; |
569 | last = (from + count - 1) >> inode->i_blkbits; |
570 | |
571 | for (i = first; i <= last; i++) |
572 | if (!ifs_block_is_uptodate(ifs, block: i)) |
573 | return false; |
574 | return true; |
575 | } |
576 | EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); |
577 | |
578 | /** |
579 | * iomap_get_folio - get a folio reference for writing |
580 | * @iter: iteration structure |
581 | * @pos: start offset of write |
582 | * @len: Suggested size of folio to create. |
583 | * |
584 | * Returns a locked reference to the folio at @pos, or an error pointer if the |
585 | * folio could not be obtained. |
586 | */ |
587 | struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) |
588 | { |
589 | fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS; |
590 | |
591 | if (iter->flags & IOMAP_NOWAIT) |
592 | fgp |= FGP_NOWAIT; |
593 | fgp |= fgf_set_order(size: len); |
594 | |
595 | return __filemap_get_folio(mapping: iter->inode->i_mapping, index: pos >> PAGE_SHIFT, |
596 | fgp_flags: fgp, gfp: mapping_gfp_mask(mapping: iter->inode->i_mapping)); |
597 | } |
598 | EXPORT_SYMBOL_GPL(iomap_get_folio); |
599 | |
600 | bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) |
601 | { |
602 | trace_iomap_release_folio(inode: folio->mapping->host, off: folio_pos(folio), |
603 | len: folio_size(folio)); |
604 | |
605 | /* |
606 | * If the folio is dirty, we refuse to release our metadata because |
607 | * it may be partially dirty. Once we track per-block dirty state, |
608 | * we can release the metadata if every block is dirty. |
609 | */ |
610 | if (folio_test_dirty(folio)) |
611 | return false; |
612 | ifs_free(folio); |
613 | return true; |
614 | } |
615 | EXPORT_SYMBOL_GPL(iomap_release_folio); |
616 | |
617 | void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) |
618 | { |
619 | trace_iomap_invalidate_folio(inode: folio->mapping->host, |
620 | off: folio_pos(folio) + offset, len); |
621 | |
622 | /* |
623 | * If we're invalidating the entire folio, clear the dirty state |
624 | * from it and release it to avoid unnecessary buildup of the LRU. |
625 | */ |
626 | if (offset == 0 && len == folio_size(folio)) { |
627 | WARN_ON_ONCE(folio_test_writeback(folio)); |
628 | folio_cancel_dirty(folio); |
629 | ifs_free(folio); |
630 | } |
631 | } |
632 | EXPORT_SYMBOL_GPL(iomap_invalidate_folio); |
633 | |
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct inode *inode = mapping->host;
	size_t len = folio_size(folio);

	ifs_alloc(inode, folio, 0);
	iomap_set_range_dirty(folio, 0, len);
	return filemap_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);
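/*
 * A minimal wiring sketch (not part of this file; "myfs" and myfs_iomap_ops
 * are hypothetical) showing how a filesystem typically plugs the helpers
 * above into its address_space_operations:
 *
 *	static int myfs_read_folio(struct file *file, struct folio *folio)
 *	{
 *		return iomap_read_folio(folio, &myfs_iomap_ops);
 *	}
 *
 *	static void myfs_readahead(struct readahead_control *rac)
 *	{
 *		iomap_readahead(rac, &myfs_iomap_ops);
 *	}
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio		= myfs_read_folio,
 *		.readahead		= myfs_readahead,
 *		.is_partially_uptodate	= iomap_is_partially_uptodate,
 *		.release_folio		= iomap_release_folio,
 *		.invalidate_folio	= iomap_invalidate_folio,
 *		.dirty_folio		= iomap_dirty_folio,
 *	};
 */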
644 | |
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size),
				pos + len - 1);
}
658 | |
659 | static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, |
660 | size_t poff, size_t plen, const struct iomap *iomap) |
661 | { |
662 | struct bio_vec bvec; |
663 | struct bio bio; |
664 | |
665 | bio_init(bio: &bio, bdev: iomap->bdev, table: &bvec, max_vecs: 1, opf: REQ_OP_READ); |
666 | bio.bi_iter.bi_sector = iomap_sector(iomap, pos: block_start); |
667 | bio_add_folio_nofail(bio: &bio, folio, len: plen, off: poff); |
668 | return submit_bio_wait(bio: &bio); |
669 | } |
670 | |
671 | static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, |
672 | size_t len, struct folio *folio) |
673 | { |
674 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
675 | struct iomap_folio_state *ifs; |
676 | loff_t block_size = i_blocksize(node: iter->inode); |
677 | loff_t block_start = round_down(pos, block_size); |
678 | loff_t block_end = round_up(pos + len, block_size); |
679 | unsigned int nr_blocks = i_blocks_per_folio(inode: iter->inode, folio); |
680 | size_t from = offset_in_folio(folio, pos), to = from + len; |
681 | size_t poff, plen; |
682 | |
	/*
	 * If the write or zeroing completely overlaps the current folio, then
	 * the entire folio will be dirtied so there is no need for
	 * per-block state tracking structures to be attached to this folio.
	 * For the unshare case, we must read in the ondisk contents because we
	 * are not changing pagecache contents.
	 */
	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
	    pos + len >= folio_pos(folio) + folio_size(folio))
		return 0;
693 | |
694 | ifs = ifs_alloc(inode: iter->inode, folio, flags: iter->flags); |
695 | if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1) |
696 | return -EAGAIN; |
697 | |
698 | if (folio_test_uptodate(folio)) |
699 | return 0; |
700 | folio_clear_error(folio); |
701 | |
702 | do { |
703 | iomap_adjust_read_range(inode: iter->inode, folio, pos: &block_start, |
704 | length: block_end - block_start, offp: &poff, lenp: &plen); |
705 | if (plen == 0) |
706 | break; |
707 | |
708 | if (!(iter->flags & IOMAP_UNSHARE) && |
709 | (from <= poff || from >= poff + plen) && |
710 | (to <= poff || to >= poff + plen)) |
711 | continue; |
712 | |
713 | if (iomap_block_needs_zeroing(iter, pos: block_start)) { |
714 | if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) |
715 | return -EIO; |
716 | folio_zero_segments(folio, start1: poff, xend1: from, start2: to, xend2: poff + plen); |
717 | } else { |
718 | int status; |
719 | |
720 | if (iter->flags & IOMAP_NOWAIT) |
721 | return -EAGAIN; |
722 | |
723 | status = iomap_read_folio_sync(block_start, folio, |
724 | poff, plen, iomap: srcmap); |
725 | if (status) |
726 | return status; |
727 | } |
728 | iomap_set_range_uptodate(folio, off: poff, len: plen); |
729 | } while ((block_start += plen) < block_end); |
730 | |
731 | return 0; |
732 | } |
733 | |
734 | static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, |
735 | size_t len) |
736 | { |
737 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
738 | |
739 | if (folio_ops && folio_ops->get_folio) |
740 | return folio_ops->get_folio(iter, pos, len); |
741 | else |
742 | return iomap_get_folio(iter, pos, len); |
743 | } |
744 | |
745 | static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, |
746 | struct folio *folio) |
747 | { |
748 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
749 | |
750 | if (folio_ops && folio_ops->put_folio) { |
751 | folio_ops->put_folio(iter->inode, pos, ret, folio); |
752 | } else { |
753 | folio_unlock(folio); |
754 | folio_put(folio); |
755 | } |
756 | } |
757 | |
758 | static int iomap_write_begin_inline(const struct iomap_iter *iter, |
759 | struct folio *folio) |
760 | { |
761 | /* needs more work for the tailpacking case; disable for now */ |
762 | if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) |
763 | return -EIO; |
764 | return iomap_read_inline_data(iter, folio); |
765 | } |
766 | |
767 | static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, |
768 | size_t len, struct folio **foliop) |
769 | { |
770 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
771 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
772 | struct folio *folio; |
773 | int status = 0; |
774 | |
775 | BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); |
776 | if (srcmap != &iter->iomap) |
777 | BUG_ON(pos + len > srcmap->offset + srcmap->length); |
778 | |
779 | if (fatal_signal_pending(current)) |
780 | return -EINTR; |
781 | |
782 | if (!mapping_large_folio_support(mapping: iter->inode->i_mapping)) |
783 | len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); |
784 | |
785 | folio = __iomap_get_folio(iter, pos, len); |
786 | if (IS_ERR(ptr: folio)) |
787 | return PTR_ERR(ptr: folio); |
788 | |
789 | /* |
790 | * Now we have a locked folio, before we do anything with it we need to |
791 | * check that the iomap we have cached is not stale. The inode extent |
792 | * mapping can change due to concurrent IO in flight (e.g. |
793 | * IOMAP_UNWRITTEN state can change and memory reclaim could have |
794 | * reclaimed a previously partially written page at this index after IO |
795 | * completion before this write reaches this file offset) and hence we |
796 | * could do the wrong thing here (zero a page range incorrectly or fail |
797 | * to zero) and corrupt data. |
798 | */ |
799 | if (folio_ops && folio_ops->iomap_valid) { |
800 | bool iomap_valid = folio_ops->iomap_valid(iter->inode, |
801 | &iter->iomap); |
802 | if (!iomap_valid) { |
803 | iter->iomap.flags |= IOMAP_F_STALE; |
804 | status = 0; |
805 | goto out_unlock; |
806 | } |
807 | } |
808 | |
809 | if (pos + len > folio_pos(folio) + folio_size(folio)) |
810 | len = folio_pos(folio) + folio_size(folio) - pos; |
811 | |
812 | if (srcmap->type == IOMAP_INLINE) |
813 | status = iomap_write_begin_inline(iter, folio); |
814 | else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) |
815 | status = __block_write_begin_int(folio, pos, len, NULL, iomap: srcmap); |
816 | else |
817 | status = __iomap_write_begin(iter, pos, len, folio); |
818 | |
819 | if (unlikely(status)) |
820 | goto out_unlock; |
821 | |
822 | *foliop = folio; |
823 | return 0; |
824 | |
825 | out_unlock: |
826 | __iomap_put_folio(iter, pos, ret: 0, folio); |
827 | iomap_write_failed(inode: iter->inode, pos, len); |
828 | |
829 | return status; |
830 | } |
831 | |
832 | static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, |
833 | size_t copied, struct folio *folio) |
834 | { |
835 | flush_dcache_folio(folio); |
836 | |
837 | /* |
838 | * The blocks that were entirely written will now be uptodate, so we |
839 | * don't have to worry about a read_folio reading them and overwriting a |
840 | * partial write. However, if we've encountered a short write and only |
841 | * partially written into a block, it will not be marked uptodate, so a |
842 | * read_folio might come in and destroy our partial write. |
843 | * |
844 | * Do the simplest thing and just treat any short write to a |
845 | * non-uptodate page as a zero-length write, and force the caller to |
846 | * redo the whole thing. |
847 | */ |
848 | if (unlikely(copied < len && !folio_test_uptodate(folio))) |
849 | return 0; |
850 | iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); |
851 | iomap_set_range_dirty(folio, offset_in_folio(folio, pos), len: copied); |
852 | filemap_dirty_folio(mapping: inode->i_mapping, folio); |
853 | return copied; |
854 | } |
855 | |
856 | static size_t iomap_write_end_inline(const struct iomap_iter *iter, |
857 | struct folio *folio, loff_t pos, size_t copied) |
858 | { |
859 | const struct iomap *iomap = &iter->iomap; |
860 | void *addr; |
861 | |
862 | WARN_ON_ONCE(!folio_test_uptodate(folio)); |
863 | BUG_ON(!iomap_inline_data_valid(iomap)); |
864 | |
865 | flush_dcache_folio(folio); |
866 | addr = kmap_local_folio(folio, offset: pos); |
867 | memcpy(iomap_inline_data(iomap, pos), addr, copied); |
868 | kunmap_local(addr); |
869 | |
870 | mark_inode_dirty(inode: iter->inode); |
871 | return copied; |
872 | } |
873 | |
874 | /* Returns the number of bytes copied. May be 0. Cannot be an errno. */ |
875 | static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, |
876 | size_t copied, struct folio *folio) |
877 | { |
878 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
879 | loff_t old_size = iter->inode->i_size; |
880 | size_t ret; |
881 | |
882 | if (srcmap->type == IOMAP_INLINE) { |
883 | ret = iomap_write_end_inline(iter, folio, pos, copied); |
884 | } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { |
885 | ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, |
886 | copied, &folio->page, NULL); |
887 | } else { |
888 | ret = __iomap_write_end(inode: iter->inode, pos, len, copied, folio); |
889 | } |
890 | |
891 | /* |
892 | * Update the in-memory inode size after copying the data into the page |
893 | * cache. It's up to the file system to write the updated size to disk, |
894 | * preferably after I/O completion so that no stale data is exposed. |
895 | */ |
896 | if (pos + ret > old_size) { |
897 | i_size_write(inode: iter->inode, i_size: pos + ret); |
898 | iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; |
899 | } |
900 | __iomap_put_folio(iter, pos, ret, folio); |
901 | |
902 | if (old_size < pos) |
903 | pagecache_isize_extended(inode: iter->inode, from: old_size, to: pos); |
904 | if (ret < len) |
905 | iomap_write_failed(inode: iter->inode, pos: pos + ret, len: len - ret); |
906 | return ret; |
907 | } |
908 | |
909 | static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) |
910 | { |
911 | loff_t length = iomap_length(iter); |
912 | size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; |
913 | loff_t pos = iter->pos; |
914 | ssize_t written = 0; |
915 | long status = 0; |
916 | struct address_space *mapping = iter->inode->i_mapping; |
917 | unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; |
918 | |
919 | do { |
920 | struct folio *folio; |
921 | size_t offset; /* Offset into folio */ |
922 | size_t bytes; /* Bytes to write to folio */ |
923 | size_t copied; /* Bytes copied from user */ |
924 | |
925 | bytes = iov_iter_count(i); |
926 | retry: |
927 | offset = pos & (chunk - 1); |
928 | bytes = min(chunk - offset, bytes); |
929 | status = balance_dirty_pages_ratelimited_flags(mapping, |
930 | flags: bdp_flags); |
931 | if (unlikely(status)) |
932 | break; |
933 | |
934 | if (bytes > length) |
935 | bytes = length; |
936 | |
937 | /* |
938 | * Bring in the user page that we'll copy from _first_. |
939 | * Otherwise there's a nasty deadlock on copying from the |
940 | * same page as we're writing to, without it being marked |
941 | * up-to-date. |
942 | * |
943 | * For async buffered writes the assumption is that the user |
944 | * page has already been faulted in. This can be optimized by |
945 | * faulting the user page. |
946 | */ |
947 | if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { |
948 | status = -EFAULT; |
949 | break; |
950 | } |
951 | |
952 | status = iomap_write_begin(iter, pos, len: bytes, foliop: &folio); |
953 | if (unlikely(status)) |
954 | break; |
955 | if (iter->iomap.flags & IOMAP_F_STALE) |
956 | break; |
957 | |
958 | offset = offset_in_folio(folio, pos); |
959 | if (bytes > folio_size(folio) - offset) |
960 | bytes = folio_size(folio) - offset; |
961 | |
962 | if (mapping_writably_mapped(mapping)) |
963 | flush_dcache_folio(folio); |
964 | |
965 | copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); |
966 | status = iomap_write_end(iter, pos, len: bytes, copied, folio); |
967 | |
968 | if (unlikely(copied != status)) |
969 | iov_iter_revert(i, bytes: copied - status); |
970 | |
971 | cond_resched(); |
972 | if (unlikely(status == 0)) { |
973 | /* |
974 | * A short copy made iomap_write_end() reject the |
975 | * thing entirely. Might be memory poisoning |
976 | * halfway through, might be a race with munmap, |
977 | * might be severe memory pressure. |
978 | */ |
979 | if (chunk > PAGE_SIZE) |
980 | chunk /= 2; |
981 | if (copied) { |
982 | bytes = copied; |
983 | goto retry; |
984 | } |
985 | } else { |
986 | pos += status; |
987 | written += status; |
988 | length -= status; |
989 | } |
990 | } while (iov_iter_count(i) && length); |
991 | |
992 | if (status == -EAGAIN) { |
993 | iov_iter_revert(i, bytes: written); |
994 | return -EAGAIN; |
995 | } |
996 | return written ? written : status; |
997 | } |
998 | |
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode = iocb->ki_filp->f_mapping->host,
		.pos = iocb->ki_pos,
		.len = iov_iter_count(i),
		.flags = IOMAP_WRITE,
	};
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iter.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_write_iter(&iter, i);

	if (unlikely(iter.pos == iocb->ki_pos))
		return ret;
	ret = iter.pos - iocb->ki_pos;
	iocb->ki_pos = iter.pos;
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
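/*
 * A minimal usage sketch (not part of this file; "myfs" and myfs_iomap_ops
 * are hypothetical): a filesystem's ->write_iter typically takes
 * inode->i_rwsem, runs the generic write checks, and then hands the iterator
 * to iomap_file_buffered_write():
 *
 *	static ssize_t myfs_buffered_write_iter(struct kiocb *iocb,
 *			struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = generic_write_checks(iocb, from);
 *		if (ret > 0)
 *			ret = iomap_file_buffered_write(iocb, from,
 *					&myfs_iomap_ops);
 *		inode_unlock(inode);
 *		if (ret > 0)
 *			ret = generic_write_sync(iocb, ret);
 *		return ret;
 *	}
 */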
1024 | |
1025 | static int iomap_write_delalloc_ifs_punch(struct inode *inode, |
1026 | struct folio *folio, loff_t start_byte, loff_t end_byte, |
1027 | iomap_punch_t punch) |
1028 | { |
1029 | unsigned int first_blk, last_blk, i; |
1030 | loff_t last_byte; |
1031 | u8 blkbits = inode->i_blkbits; |
1032 | struct iomap_folio_state *ifs; |
1033 | int ret = 0; |
1034 | |
1035 | /* |
1036 | * When we have per-block dirty tracking, there can be |
1037 | * blocks within a folio which are marked uptodate |
1038 | * but not dirty. In that case it is necessary to punch |
1039 | * out such blocks to avoid leaking any delalloc blocks. |
1040 | */ |
1041 | ifs = folio->private; |
1042 | if (!ifs) |
1043 | return ret; |
1044 | |
1045 | last_byte = min_t(loff_t, end_byte - 1, |
1046 | folio_pos(folio) + folio_size(folio) - 1); |
1047 | first_blk = offset_in_folio(folio, start_byte) >> blkbits; |
1048 | last_blk = offset_in_folio(folio, last_byte) >> blkbits; |
1049 | for (i = first_blk; i <= last_blk; i++) { |
1050 | if (!ifs_block_is_dirty(folio, ifs, block: i)) { |
1051 | ret = punch(inode, folio_pos(folio) + (i << blkbits), |
1052 | 1 << blkbits); |
1053 | if (ret) |
1054 | return ret; |
1055 | } |
1056 | } |
1057 | |
1058 | return ret; |
1059 | } |
1060 | |
1061 | |
1062 | static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, |
1063 | loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, |
1064 | iomap_punch_t punch) |
1065 | { |
1066 | int ret = 0; |
1067 | |
1068 | if (!folio_test_dirty(folio)) |
1069 | return ret; |
1070 | |
1071 | /* if dirty, punch up to offset */ |
1072 | if (start_byte > *punch_start_byte) { |
1073 | ret = punch(inode, *punch_start_byte, |
1074 | start_byte - *punch_start_byte); |
1075 | if (ret) |
1076 | return ret; |
1077 | } |
1078 | |
1079 | /* Punch non-dirty blocks within folio */ |
1080 | ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, |
1081 | end_byte, punch); |
1082 | if (ret) |
1083 | return ret; |
1084 | |
1085 | /* |
1086 | * Make sure the next punch start is correctly bound to |
1087 | * the end of this data range, not the end of the folio. |
1088 | */ |
1089 | *punch_start_byte = min_t(loff_t, end_byte, |
1090 | folio_pos(folio) + folio_size(folio)); |
1091 | |
1092 | return ret; |
1093 | } |
1094 | |
/*
 * Scan the data range passed to us for dirty page cache folios. If we find a
 * dirty folio, punch out the preceding range and update the offset from which
 * the next punch will start.
 *
 * We can punch out storage reservations under clean pages because they either
 * contain data that has been written back - in which case the delalloc punch
 * over that range is a no-op - or they have been faulted in by a read, in
 * which case they contain zeroes and we can remove the delalloc backing range,
 * and any new writes to those pages will do the normal hole filling operation.
 *
 * This makes the logic simple: we only need to keep the delalloc extents over
 * the dirty ranges of the page cache.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 */
1112 | static int iomap_write_delalloc_scan(struct inode *inode, |
1113 | loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, |
1114 | iomap_punch_t punch) |
1115 | { |
1116 | while (start_byte < end_byte) { |
1117 | struct folio *folio; |
1118 | int ret; |
1119 | |
1120 | /* grab locked page */ |
1121 | folio = filemap_lock_folio(mapping: inode->i_mapping, |
1122 | index: start_byte >> PAGE_SHIFT); |
1123 | if (IS_ERR(ptr: folio)) { |
1124 | start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + |
1125 | PAGE_SIZE; |
1126 | continue; |
1127 | } |
1128 | |
1129 | ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, |
1130 | start_byte, end_byte, punch); |
1131 | if (ret) { |
1132 | folio_unlock(folio); |
1133 | folio_put(folio); |
1134 | return ret; |
1135 | } |
1136 | |
1137 | /* move offset to start of next folio in range */ |
1138 | start_byte = folio_next_index(folio) << PAGE_SHIFT; |
1139 | folio_unlock(folio); |
1140 | folio_put(folio); |
1141 | } |
1142 | return 0; |
1143 | } |
1144 | |
/*
 * Punch out all the delalloc blocks in the range given except for those that
 * have dirty data still pending in the page cache - those are going to be
 * written and so must still retain the delalloc backing for writeback.
 *
 * As we are scanning the page cache for data, we don't need to reimplement the
 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
 * start and end of data ranges correctly even for sub-folio block sizes. This
 * byte range based iteration is especially convenient because it means we
 * don't have to care about variable size folios, nor where the start or end of
 * the data range lies within a folio, whether they lie within the same folio,
 * or even if there are multiple discontiguous data ranges within the folio.
 *
 * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
 * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
 * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
 * date. A write page fault can then mark it dirty. If we then fail a write()
 * beyond EOF into that up to date cached range, we allocate a delalloc block
 * beyond EOF and then have to punch it out. Because the range is up to date,
 * mapping_seek_hole_data() will return it, and we will skip the punch because
 * the folio is dirty. This is incorrect - we always need to punch out delalloc
 * beyond EOF in this case as writeback will never write back and convert that
 * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
 * resulting in always punching out the range from the EOF to the end of the
 * range the iomap spans.
 *
 * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because
 * they match the intervals returned by mapping_seek_hole_data(). i.e.
 * SEEK_DATA returns the start of a data range (start_byte), and
 * SEEK_HOLE(start_byte) returns the end of the data range (data_end). Using
 * closed intervals would require sprinkling this code with magic "+ 1" and
 * "- 1" arithmetic and expose the code to subtle off-by-one bugs.
 */
1178 | static int iomap_write_delalloc_release(struct inode *inode, |
1179 | loff_t start_byte, loff_t end_byte, iomap_punch_t punch) |
1180 | { |
1181 | loff_t punch_start_byte = start_byte; |
1182 | loff_t scan_end_byte = min(i_size_read(inode), end_byte); |
1183 | int error = 0; |
1184 | |
1185 | /* |
1186 | * Lock the mapping to avoid races with page faults re-instantiating |
1187 | * folios and dirtying them via ->page_mkwrite whilst we walk the |
1188 | * cache and perform delalloc extent removal. Failing to do this can |
1189 | * leave dirty pages with no space reservation in the cache. |
1190 | */ |
1191 | filemap_invalidate_lock(mapping: inode->i_mapping); |
1192 | while (start_byte < scan_end_byte) { |
1193 | loff_t data_end; |
1194 | |
1195 | start_byte = mapping_seek_hole_data(inode->i_mapping, |
1196 | start: start_byte, end: scan_end_byte, SEEK_DATA); |
1197 | /* |
1198 | * If there is no more data to scan, all that is left is to |
1199 | * punch out the remaining range. |
1200 | */ |
1201 | if (start_byte == -ENXIO || start_byte == scan_end_byte) |
1202 | break; |
1203 | if (start_byte < 0) { |
1204 | error = start_byte; |
1205 | goto out_unlock; |
1206 | } |
1207 | WARN_ON_ONCE(start_byte < punch_start_byte); |
1208 | WARN_ON_ONCE(start_byte > scan_end_byte); |
1209 | |
1210 | /* |
1211 | * We find the end of this contiguous cached data range by |
1212 | * seeking from start_byte to the beginning of the next hole. |
1213 | */ |
1214 | data_end = mapping_seek_hole_data(inode->i_mapping, start: start_byte, |
1215 | end: scan_end_byte, SEEK_HOLE); |
1216 | if (data_end < 0) { |
1217 | error = data_end; |
1218 | goto out_unlock; |
1219 | } |
1220 | WARN_ON_ONCE(data_end <= start_byte); |
1221 | WARN_ON_ONCE(data_end > scan_end_byte); |
1222 | |
1223 | error = iomap_write_delalloc_scan(inode, punch_start_byte: &punch_start_byte, |
1224 | start_byte, end_byte: data_end, punch); |
1225 | if (error) |
1226 | goto out_unlock; |
1227 | |
1228 | /* The next data search starts at the end of this one. */ |
1229 | start_byte = data_end; |
1230 | } |
1231 | |
1232 | if (punch_start_byte < end_byte) |
1233 | error = punch(inode, punch_start_byte, |
1234 | end_byte - punch_start_byte); |
1235 | out_unlock: |
1236 | filemap_invalidate_unlock(mapping: inode->i_mapping); |
1237 | return error; |
1238 | } |
1239 | |
/*
 * When a short write occurs, the filesystem may need to remove reserved space
 * that was allocated in ->iomap_begin from its ->iomap_end method. For
 * filesystems that use delayed allocation, we need to punch out delalloc
 * extents from the range that are not dirty in the page cache. As the write
 * can race with page faults, there can be dirty pages over the delalloc extent
 * outside the range of a short write but still within the delalloc extent
 * allocated for this iomap.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 */
int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
		struct iomap *iomap, loff_t pos, loff_t length,
		ssize_t written, iomap_punch_t punch)
{
	loff_t start_byte;
	loff_t end_byte;
	unsigned int blocksize = i_blocksize(inode);

	if (iomap->type != IOMAP_DELALLOC)
		return 0;

	/* If we didn't reserve the blocks, we're not allowed to punch them. */
	if (!(iomap->flags & IOMAP_F_NEW))
		return 0;

	/*
	 * start_byte refers to the first unused block after a short write. If
	 * nothing was written, round offset down to point at the first block
	 * in the range.
	 */
	if (unlikely(!written))
		start_byte = round_down(pos, blocksize);
	else
		start_byte = round_up(pos + written, blocksize);
	end_byte = round_up(pos + length, blocksize);

	/* Nothing to do if we've written the entire delalloc extent */
	if (start_byte >= end_byte)
		return 0;

	return iomap_write_delalloc_release(inode, start_byte, end_byte,
					punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
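/*
 * A minimal sketch (not part of this file) of how a delalloc filesystem might
 * call this from its ->iomap_end method; myfs_punch_delalloc is a hypothetical
 * callback that removes only delalloc extents in the given byte range:
 *
 *	static int myfs_buffered_write_iomap_end(struct inode *inode,
 *			loff_t pos, loff_t length, ssize_t written,
 *			unsigned flags, struct iomap *iomap)
 *	{
 *		return iomap_file_buffered_write_punch_delalloc(inode, iomap,
 *				pos, length, written, myfs_punch_delalloc);
 *	}
 */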
1304 | |
1305 | static loff_t iomap_unshare_iter(struct iomap_iter *iter) |
1306 | { |
1307 | struct iomap *iomap = &iter->iomap; |
1308 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
1309 | loff_t pos = iter->pos; |
1310 | loff_t length = iomap_length(iter); |
1311 | loff_t written = 0; |
1312 | |
1313 | /* don't bother with blocks that are not shared to start with */ |
1314 | if (!(iomap->flags & IOMAP_F_SHARED)) |
1315 | return length; |
1316 | /* don't bother with holes or unwritten extents */ |
1317 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
1318 | return length; |
1319 | |
1320 | do { |
1321 | struct folio *folio; |
1322 | int status; |
1323 | size_t offset; |
1324 | size_t bytes = min_t(u64, SIZE_MAX, length); |
1325 | |
1326 | status = iomap_write_begin(iter, pos, len: bytes, foliop: &folio); |
1327 | if (unlikely(status)) |
1328 | return status; |
1329 | if (iomap->flags & IOMAP_F_STALE) |
1330 | break; |
1331 | |
1332 | offset = offset_in_folio(folio, pos); |
1333 | if (bytes > folio_size(folio) - offset) |
1334 | bytes = folio_size(folio) - offset; |
1335 | |
1336 | bytes = iomap_write_end(iter, pos, len: bytes, copied: bytes, folio); |
1337 | if (WARN_ON_ONCE(bytes == 0)) |
1338 | return -EIO; |
1339 | |
1340 | cond_resched(); |
1341 | |
1342 | pos += bytes; |
1343 | written += bytes; |
1344 | length -= bytes; |
1345 | |
1346 | balance_dirty_pages_ratelimited(mapping: iter->inode->i_mapping); |
1347 | } while (length > 0); |
1348 | |
1349 | return written; |
1350 | } |
1351 | |
1352 | int |
1353 | iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, |
1354 | const struct iomap_ops *ops) |
1355 | { |
1356 | struct iomap_iter iter = { |
1357 | .inode = inode, |
1358 | .pos = pos, |
1359 | .len = len, |
1360 | .flags = IOMAP_WRITE | IOMAP_UNSHARE, |
1361 | }; |
1362 | int ret; |
1363 | |
1364 | while ((ret = iomap_iter(iter: &iter, ops)) > 0) |
1365 | iter.processed = iomap_unshare_iter(iter: &iter); |
1366 | return ret; |
1367 | } |
1368 | EXPORT_SYMBOL_GPL(iomap_file_unshare); |
1369 | |
1370 | static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) |
1371 | { |
1372 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
1373 | loff_t pos = iter->pos; |
1374 | loff_t length = iomap_length(iter); |
1375 | loff_t written = 0; |
1376 | |
1377 | /* already zeroed? we're done. */ |
1378 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
1379 | return length; |
1380 | |
1381 | do { |
1382 | struct folio *folio; |
1383 | int status; |
1384 | size_t offset; |
1385 | size_t bytes = min_t(u64, SIZE_MAX, length); |
1386 | |
1387 | status = iomap_write_begin(iter, pos, len: bytes, foliop: &folio); |
1388 | if (status) |
1389 | return status; |
1390 | if (iter->iomap.flags & IOMAP_F_STALE) |
1391 | break; |
1392 | |
1393 | offset = offset_in_folio(folio, pos); |
1394 | if (bytes > folio_size(folio) - offset) |
1395 | bytes = folio_size(folio) - offset; |
1396 | |
1397 | folio_zero_range(folio, start: offset, length: bytes); |
1398 | folio_mark_accessed(folio); |
1399 | |
1400 | bytes = iomap_write_end(iter, pos, len: bytes, copied: bytes, folio); |
1401 | if (WARN_ON_ONCE(bytes == 0)) |
1402 | return -EIO; |
1403 | |
1404 | pos += bytes; |
1405 | length -= bytes; |
1406 | written += bytes; |
1407 | } while (length > 0); |
1408 | |
1409 | if (did_zero) |
1410 | *did_zero = true; |
1411 | return written; |
1412 | } |
1413 | |
1414 | int |
1415 | iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, |
1416 | const struct iomap_ops *ops) |
1417 | { |
1418 | struct iomap_iter iter = { |
1419 | .inode = inode, |
1420 | .pos = pos, |
1421 | .len = len, |
1422 | .flags = IOMAP_ZERO, |
1423 | }; |
1424 | int ret; |
1425 | |
1426 | while ((ret = iomap_iter(iter: &iter, ops)) > 0) |
1427 | iter.processed = iomap_zero_iter(iter: &iter, did_zero); |
1428 | return ret; |
1429 | } |
1430 | EXPORT_SYMBOL_GPL(iomap_zero_range); |
1431 | |
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
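/*
 * Example of the calculation above (illustrative numbers only): with a 4K
 * block size, truncating at pos = 10000 gives off = 10000 & 4095 = 1808, so
 * the remaining 4096 - 1808 = 2288 bytes of that block are zeroed; a
 * block-aligned pos (off == 0) returns immediately with nothing to do.
 */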
1445 | |
1446 | static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, |
1447 | struct folio *folio) |
1448 | { |
1449 | loff_t length = iomap_length(iter); |
1450 | int ret; |
1451 | |
1452 | if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { |
1453 | ret = __block_write_begin_int(folio, pos: iter->pos, len: length, NULL, |
1454 | iomap: &iter->iomap); |
1455 | if (ret) |
1456 | return ret; |
1457 | block_commit_write(page: &folio->page, from: 0, to: length); |
1458 | } else { |
1459 | WARN_ON_ONCE(!folio_test_uptodate(folio)); |
1460 | folio_mark_dirty(folio); |
1461 | } |
1462 | |
1463 | return length; |
1464 | } |
1465 | |
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= file_inode(vmf->vma->vm_file),
		.flags		= IOMAP_WRITE | IOMAP_FAULT,
	};
	struct folio *folio = page_folio(vmf->page);
	ssize_t ret;

	folio_lock(folio);
	ret = folio_mkwrite_check_truncate(folio, iter.inode);
	if (ret < 0)
		goto out_unlock;
	iter.pos = folio_pos(folio);
	iter.len = ret;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_folio_mkwrite_iter(&iter, folio);

	if (ret < 0)
		goto out_unlock;
	folio_wait_stable(folio);
	return VM_FAULT_LOCKED;
out_unlock:
	folio_unlock(folio);
	return vmf_fs_error(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
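/*
 * A minimal wiring sketch (not part of this file; "myfs" and myfs_iomap_ops
 * are hypothetical) for a filesystem's ->page_mkwrite handler:
 *
 *	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		vm_fault_t ret;
 *
 *		sb_start_pagefault(inode->i_sb);
 *		file_update_time(vmf->vma->vm_file);
 *		ret = iomap_page_mkwrite(vmf, &myfs_iomap_ops);
 *		sb_end_pagefault(inode->i_sb);
 *		return ret;
 *	}
 */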
1493 | |
1494 | static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, |
1495 | size_t len) |
1496 | { |
1497 | struct iomap_folio_state *ifs = folio->private; |
1498 | |
1499 | WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); |
1500 | WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); |
1501 | |
1502 | if (!ifs || atomic_sub_and_test(i: len, v: &ifs->write_bytes_pending)) |
1503 | folio_end_writeback(folio); |
1504 | } |
1505 | |
1506 | /* |
1507 | * We're now finished for good with this ioend structure. Update the page |
1508 | * state, release holds on bios, and finally free up memory. Do not use the |
1509 | * ioend after this. |
1510 | */ |
1511 | static u32 |
1512 | iomap_finish_ioend(struct iomap_ioend *ioend, int error) |
1513 | { |
1514 | struct inode *inode = ioend->io_inode; |
1515 | struct bio *bio = &ioend->io_bio; |
1516 | struct folio_iter fi; |
1517 | u32 folio_count = 0; |
1518 | |
1519 | if (error) { |
1520 | mapping_set_error(mapping: inode->i_mapping, error); |
1521 | if (!bio_flagged(bio, bit: BIO_QUIET)) { |
1522 | pr_err_ratelimited( |
1523 | "%s: writeback error on inode %lu, offset %lld, sector %llu", |
1524 | inode->i_sb->s_id, inode->i_ino, |
1525 | ioend->io_offset, ioend->io_sector); |
1526 | } |
1527 | } |
1528 | |
1529 | /* walk all folios in bio, ending page IO on them */ |
1530 | bio_for_each_folio_all(fi, bio) { |
1531 | if (error) |
1532 | folio_set_error(folio: fi.folio); |
1533 | iomap_finish_folio_write(inode, folio: fi.folio, len: fi.length); |
1534 | folio_count++; |
1535 | } |
1536 | |
1537 | bio_put(bio); /* frees the ioend */ |
1538 | return folio_count; |
1539 | } |

/*
 * Ioend completion routine for merged bios. This can only be called from task
 * contexts as merged ioends can be of unbounded length. Hence we have to break
 * up the writeback completions into manageable chunks to avoid long scheduler
 * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
 * good batch processing throughput without creating adverse scheduler latency
 * conditions.
 */
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;
	u32 completions;

	might_sleep();

	list_replace_init(&ioend->io_list, &tmp);
	completions = iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		if (completions > IOEND_BATCH_SIZE * 8) {
			cond_resched();
			completions = 0;
		}
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		completions += iomap_finish_ioend(ioend, error);
	}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * We can merge two adjacent ioends if they have the same set of work to do.
 */
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
		return false;
	if ((ioend->io_flags & IOMAP_F_SHARED) ^
	    (next->io_flags & IOMAP_F_SHARED))
		return false;
	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
	    (next->io_type == IOMAP_UNWRITTEN))
		return false;
	if (ioend->io_offset + ioend->io_size != next->io_offset)
		return false;
	/*
	 * Do not merge physically discontiguous ioends. The filesystem
	 * completion functions will have to iterate the physical
	 * discontiguities even if we merge the ioends at a logical level, so
	 * we don't gain anything by merging physical discontiguities here.
	 *
	 * We cannot use bio->bi_iter.bi_sector here as it is modified during
	 * submission so does not point to the start sector of the bio at
	 * completion.
	 */
	if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
		return false;
	return true;
}

void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
	struct iomap_ioend *next;

	INIT_LIST_HEAD(&ioend->io_list);

	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
			io_list))) {
		if (!iomap_ioend_can_merge(ioend, next))
			break;
		list_move_tail(&next->io_list, &ioend->io_list);
		ioend->io_size += next->io_size;
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

static int
iomap_ioend_compare(void *priv, const struct list_head *a,
		const struct list_head *b)
{
	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);

	if (ia->io_offset < ib->io_offset)
		return -1;
	if (ia->io_offset > ib->io_offset)
		return 1;
	return 0;
}

void
iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
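
/*
 * Example usage (an illustrative sketch; the "myfs" names and locking are
 * hypothetical): a filesystem that defers ioend completion to a workqueue can
 * combine iomap_sort_ioends(), iomap_ioend_try_merge() and
 * iomap_finish_ioends() so that each batch of completed ioends is processed
 * with as few completion calls as possible:
 *
 *	static void myfs_end_io_work(struct work_struct *work)
 *	{
 *		struct myfs_mount *mp = container_of(work, struct myfs_mount,
 *						     ioend_work);
 *		struct iomap_ioend *ioend;
 *		LIST_HEAD(completed);
 *
 *		// Splice the ioends accumulated by the bio end_io handler.
 *		spin_lock_irq(&mp->ioend_lock);
 *		list_splice_init(&mp->ioend_list, &completed);
 *		spin_unlock_irq(&mp->ioend_lock);
 *
 *		iomap_sort_ioends(&completed);
 *		while ((ioend = list_first_entry_or_null(&completed,
 *				struct iomap_ioend, io_list))) {
 *			list_del_init(&ioend->io_list);
 *			iomap_ioend_try_merge(ioend, &completed);
 *			iomap_finish_ioends(ioend,
 *				blk_status_to_errno(ioend->io_bio.bi_status));
 *		}
 *	}
 */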

static void iomap_writepage_end_bio(struct bio *bio)
{
	iomap_finish_ioend(iomap_ioend_from_bio(bio),
			blk_status_to_errno(bio->bi_status));
}

/*
 * Submit the final bio for an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we've marked pages for writeback.
 * We cannot cancel the ioend directly in that case, so call the bio end I/O
 * handler with the error status here to run the normal I/O completion handler
 * to clear the writeback bit and let the file system process the errors.
 */
static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
	if (!wpc->ioend)
		return error;

	/*
	 * Let the file systems prepare the I/O submission and hook in an I/O
	 * completion handler.  This also needs to happen after a failure so
	 * that the file system end I/O handler gets called to clean up.
	 */
	if (wpc->ops->prepare_ioend)
		error = wpc->ops->prepare_ioend(wpc->ioend, error);

	if (error) {
		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
		bio_endio(&wpc->ioend->io_bio);
	} else {
		submit_bio(&wpc->ioend->io_bio);
	}

	wpc->ioend = NULL;
	return error;
}

static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode, loff_t pos)
{
	struct iomap_ioend *ioend;
	struct bio *bio;

	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
			       REQ_OP_WRITE | wbc_to_write_flags(wbc),
			       GFP_NOFS, &iomap_ioend_bioset);
	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
	bio->bi_end_io = iomap_writepage_end_bio;
	wbc_init_bio(wbc, bio);
	bio->bi_write_hint = inode->i_write_hint;

	ioend = iomap_ioend_from_bio(bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = wpc->iomap.type;
	ioend->io_flags = wpc->iomap.flags;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = pos;
	ioend->io_sector = bio->bi_iter.bi_sector;

	wpc->nr_folios = 0;
	return ioend;
}

static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
		return false;
	if (wpc->iomap.type != wpc->ioend->io_type)
		return false;
	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
		return false;
	if (iomap_sector(&wpc->iomap, pos) !=
	    bio_end_sector(&wpc->ioend->io_bio))
		return false;
	/*
	 * Limit ioend bio chain lengths to minimise IO completion latency. This
	 * also prevents long tight loops ending page writeback on all the
	 * folios in the ioend.
	 */
	if (wpc->nr_folios >= IOEND_BATCH_SIZE)
		return false;
	return true;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first; otherwise finish off the current ioend and start another.
 *
 * If a new ioend is created and cached, the old ioend is submitted to the block
 * layer instantly. Batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, loff_t pos, unsigned len)
{
	struct iomap_folio_state *ifs = folio->private;
	size_t poff = offset_in_folio(folio, pos);
	int error;

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
new_ioend:
		error = iomap_submit_ioend(wpc, 0);
		if (error)
			return error;
		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
	}

	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
		goto new_ioend;

	if (ifs)
		atomic_add(len, &ifs->write_bytes_pending);
	wpc->ioend->io_size += len;
	wbc_account_cgroup_owner(wbc, &folio->page, len);
	return 0;
}

static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, u64 pos, unsigned dirty_len,
		unsigned *count)
{
	int error;

	do {
		unsigned map_len;

		error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
		if (error)
			break;
		trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);

		map_len = min_t(u64, dirty_len,
			wpc->iomap.offset + wpc->iomap.length - pos);
		WARN_ON_ONCE(!folio->private && map_len < dirty_len);

		switch (wpc->iomap.type) {
		case IOMAP_INLINE:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		case IOMAP_HOLE:
			break;
		default:
			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
					map_len);
			if (!error)
				(*count)++;
			break;
		}
		dirty_len -= map_len;
		pos += map_len;
	} while (dirty_len && !error);

	/*
	 * We cannot cancel the ioend directly here on error. We may have
	 * already set other pages under writeback and hence we have to run I/O
	 * completion to mark the error state of the pages under writeback
	 * appropriately.
	 *
	 * Just let the file system know what portion of the folio failed to
	 * map.
	 */
	if (error && wpc->ops->discard_folio)
		wpc->ops->discard_folio(folio, pos);
	return error;
}
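
/*
 * Example (an illustrative sketch; the "myfs" names are hypothetical): the
 * ->map_blocks callback invoked above is expected to fill wpc->iomap with a
 * mapping that covers @pos, i.e. wpc->iomap.offset <= pos and
 * pos < wpc->iomap.offset + wpc->iomap.length.  A trivial filesystem whose
 * file blocks sit 1:1 on the block device might implement it roughly as:
 *
 *	static int myfs_map_blocks(struct iomap_writepage_ctx *wpc,
 *			struct inode *inode, loff_t pos, unsigned len)
 *	{
 *		// Reuse the cached mapping if it still covers pos.
 *		if (wpc->iomap.length &&
 *		    pos >= wpc->iomap.offset &&
 *		    pos < wpc->iomap.offset + wpc->iomap.length)
 *			return 0;
 *
 *		wpc->iomap.type = IOMAP_MAPPED;
 *		wpc->iomap.flags = 0;
 *		wpc->iomap.bdev = inode->i_sb->s_bdev;
 *		wpc->iomap.offset = round_down(pos, i_blocksize(inode));
 *		wpc->iomap.addr = wpc->iomap.offset;
 *		wpc->iomap.length = i_blocksize(inode);
 *		return 0;
 *	}
 *
 * Real filesystems look up (and possibly allocate or convert) extents here,
 * usually return mappings spanning many blocks, and must revalidate any
 * cached mapping before reusing it.
 */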

/*
 * Check interaction of the folio with the file end.
 *
 * If the folio is entirely beyond i_size, return false.  If it straddles
 * i_size, adjust end_pos and zero all data beyond i_size.
 */
static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
		u64 *end_pos)
{
	u64 isize = i_size_read(inode);

	if (*end_pos > isize) {
		size_t poff = offset_in_folio(folio, isize);
		pgoff_t end_index = isize >> PAGE_SHIFT;

		/*
		 * If the folio is entirely outside of i_size, skip it.
		 *
		 * This can happen due to a truncate operation that is in
		 * progress and in that case truncate will finish it off once
		 * we've dropped the folio lock.
		 *
		 * Note that the pgoff_t used for end_index is an unsigned long.
		 * If the given offset is greater than 16TB on a 32-bit system,
		 * then if we checked if the folio is fully outside i_size with
		 * "if (folio->index >= end_index + 1)", "end_index + 1" would
		 * overflow and evaluate to 0.  Hence this folio would be
		 * redirtied and written out repeatedly, which would result in
		 * an infinite loop; the user program performing this operation
		 * would hang.  Instead, we can detect this situation by
		 * checking if the folio is totally beyond i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (folio->index > end_index ||
		    (folio->index == end_index && poff == 0))
			return false;

		/*
		 * The folio straddles i_size.
		 *
		 * It must be zeroed out on each and every writepage invocation
		 * because it may be mmapped:
		 *
		 *    A file is mapped in multiples of the page size.  For a
		 *    file that is not a multiple of the page size, the
		 *    remaining memory is zeroed when mapped, and writes to
		 *    that region are not written out to the file.
		 *
		 * Also adjust the writeback range to skip all blocks entirely
		 * beyond i_size.
		 */
		folio_zero_segment(folio, poff, folio_size(folio));
		*end_pos = round_up(isize, i_blocksize(inode));
	}

	return true;
}
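
/*
 * For example (illustrative numbers): with a 4096-byte block size, a 16KiB
 * folio at file offset 0 and i_size == 10000, *end_pos is 16384 on entry.
 * Since that is beyond i_size, bytes 10000..16383 of the folio are zeroed and
 * *end_pos is rounded up to 12288, so only the first three blocks are
 * considered for writeback.
 */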

static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	u64 pos = folio_pos(folio);
	u64 end_pos = pos + folio_size(folio);
	unsigned count = 0;
	int error = 0;
	u32 rlen;

	WARN_ON_ONCE(!folio_test_locked(folio));
	WARN_ON_ONCE(folio_test_dirty(folio));
	WARN_ON_ONCE(folio_test_writeback(folio));

	trace_iomap_writepage(inode, pos, folio_size(folio));

	if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
		folio_unlock(folio);
		return 0;
	}
	WARN_ON_ONCE(end_pos <= pos);

	if (i_blocks_per_folio(inode, folio) > 1) {
		if (!ifs) {
			ifs = ifs_alloc(inode, folio, 0);
			iomap_set_range_dirty(folio, 0, end_pos - pos);
		}

		/*
		 * Keep the I/O completion handler from clearing the writeback
		 * bit until we have submitted all blocks by adding a bias to
		 * ifs->write_bytes_pending, which is dropped after submitting
		 * all blocks.
		 */
		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
		atomic_inc(&ifs->write_bytes_pending);
	}

	/*
	 * Set the writeback bit ASAP, as the I/O completion for the single
	 * block per folio case can happen as soon as we're submitting the bio.
	 */
	folio_start_writeback(folio);

	/*
	 * Walk through the folio to find dirty areas to write back.
	 */
	while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
				pos, rlen, &count);
		if (error)
			break;
		pos += rlen;
	}

	if (count)
		wpc->nr_folios++;

	/*
	 * We can have dirty bits set past end of file in page_mkwrite path
	 * while mapping the last partial folio. Hence it's better to clear
	 * all the dirty bits in the folio here.
	 */
	iomap_clear_range_dirty(folio, 0, folio_size(folio));

	/*
	 * Usually the writeback bit is cleared by the I/O completion handler.
	 * But we may end up either not actually writing any blocks, or (when
	 * there are multiple blocks in a folio) all I/O might have finished
	 * already at this point.  In that case we need to clear the writeback
	 * bit ourselves right after unlocking the folio.
	 */
	folio_unlock(folio);
	if (ifs) {
		if (atomic_dec_and_test(&ifs->write_bytes_pending))
			folio_end_writeback(folio);
	} else {
		if (!count)
			folio_end_writeback(folio);
	}
	mapping_set_error(inode->i_mapping, error);
	return error;
}

static int iomap_do_writepage(struct folio *folio,
		struct writeback_control *wbc, void *data)
{
	return iomap_writepage_map(data, wbc, folio);
}

int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{
	int ret;

	/*
	 * Writeback from reclaim context should never happen except in the
	 * case of a VM regression, so warn about it and refuse to write the
	 * data.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
			PF_MEMALLOC))
		return -EIO;

	wpc->ops = ops;
	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
	return iomap_submit_ioend(wpc, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
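
/*
 * Example usage (an illustrative sketch; the "myfs" names are hypothetical):
 * a filesystem hooks buffered writeback up by providing iomap_writeback_ops
 * with at least ->map_blocks (see the myfs_map_blocks() sketch above) and
 * calling iomap_writepages() from its ->writepages address_space operation
 * with a stack-allocated (or embedded) iomap_writepage_ctx:
 *
 *	static const struct iomap_writeback_ops myfs_writeback_ops = {
 *		.map_blocks	= myfs_map_blocks,
 *	};
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		struct iomap_writepage_ctx wpc = { };
 *
 *		return iomap_writepages(mapping, wbc, &wpc,
 *				&myfs_writeback_ops);
 *	}
 *
 * The optional ->prepare_ioend and ->discard_folio methods let the filesystem
 * override bio completion (e.g. to defer it to a workqueue, as sketched after
 * iomap_sort_ioends() above) and to clean up reservations for ranges that
 * failed to map.
 */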

static int __init iomap_init(void)
{
	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
			   offsetof(struct iomap_ioend, io_bio),
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);