// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (C) 2016-2023 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"

#include "../internal.h"

#define IOEND_BATCH_SIZE	4096

typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
 * Structure allocated for each folio to track per-block uptodate, dirty state
 * and I/O completions.
 */
struct iomap_folio_state {
	spinlock_t		state_lock;
	unsigned int		read_bytes_pending;
	atomic_t		write_bytes_pending;

	/*
	 * Each block has two bits in this bitmap:
	 * Bits [0..blocks_per_folio) hold the uptodate status.
	 * Bits [blocks_per_folio..2 * blocks_per_folio) hold the dirty status.
	 */
	unsigned long		state[];
};
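/*
 * Worked example of the layout above (illustrative numbers only): for a 16K
 * folio with 4K filesystem blocks, i_blocks_per_folio() is 4, so bits 0..3
 * hold the per-block uptodate state and bits 4..7 hold the per-block dirty
 * state. Block 2 of the folio is dirty when bit 6 is set, which is exactly
 * the test_bit(block + blks_per_folio, ifs->state) check used by
 * ifs_block_is_dirty() below.
 */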
43 | |
44 | static struct bio_set iomap_ioend_bioset; |
45 | |
46 | static inline bool ifs_is_fully_uptodate(struct folio *folio, |
47 | struct iomap_folio_state *ifs) |
48 | { |
49 | struct inode *inode = folio->mapping->host; |
50 | |
51 | return bitmap_full(src: ifs->state, nbits: i_blocks_per_folio(inode, folio)); |
52 | } |
53 | |
54 | static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, |
55 | unsigned int block) |
56 | { |
57 | return test_bit(block, ifs->state); |
58 | } |
59 | |
60 | static bool ifs_set_range_uptodate(struct folio *folio, |
61 | struct iomap_folio_state *ifs, size_t off, size_t len) |
62 | { |
63 | struct inode *inode = folio->mapping->host; |
64 | unsigned int first_blk = off >> inode->i_blkbits; |
65 | unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; |
66 | unsigned int nr_blks = last_blk - first_blk + 1; |
67 | |
68 | bitmap_set(map: ifs->state, start: first_blk, nbits: nr_blks); |
69 | return ifs_is_fully_uptodate(folio, ifs); |
70 | } |
71 | |
72 | static void iomap_set_range_uptodate(struct folio *folio, size_t off, |
73 | size_t len) |
74 | { |
75 | struct iomap_folio_state *ifs = folio->private; |
76 | unsigned long flags; |
77 | bool uptodate = true; |
78 | |
79 | if (ifs) { |
80 | spin_lock_irqsave(&ifs->state_lock, flags); |
81 | uptodate = ifs_set_range_uptodate(folio, ifs, off, len); |
82 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
83 | } |
84 | |
85 | if (uptodate) |
86 | folio_mark_uptodate(folio); |
87 | } |
88 | |
89 | static inline bool ifs_block_is_dirty(struct folio *folio, |
90 | struct iomap_folio_state *ifs, int block) |
91 | { |
92 | struct inode *inode = folio->mapping->host; |
93 | unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); |
94 | |
95 | return test_bit(block + blks_per_folio, ifs->state); |
96 | } |
97 | |
98 | static unsigned ifs_find_dirty_range(struct folio *folio, |
99 | struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) |
100 | { |
101 | struct inode *inode = folio->mapping->host; |
102 | unsigned start_blk = |
103 | offset_in_folio(folio, *range_start) >> inode->i_blkbits; |
104 | unsigned end_blk = min_not_zero( |
105 | offset_in_folio(folio, range_end) >> inode->i_blkbits, |
106 | i_blocks_per_folio(inode, folio)); |
107 | unsigned nblks = 1; |
108 | |
109 | while (!ifs_block_is_dirty(folio, ifs, block: start_blk)) |
110 | if (++start_blk == end_blk) |
111 | return 0; |
112 | |
113 | while (start_blk + nblks < end_blk) { |
114 | if (!ifs_block_is_dirty(folio, ifs, block: start_blk + nblks)) |
115 | break; |
116 | nblks++; |
117 | } |
118 | |
119 | *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); |
120 | return nblks << inode->i_blkbits; |
121 | } |
122 | |
123 | static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, |
124 | u64 range_end) |
125 | { |
126 | struct iomap_folio_state *ifs = folio->private; |
127 | |
128 | if (*range_start >= range_end) |
129 | return 0; |
130 | |
131 | if (ifs) |
132 | return ifs_find_dirty_range(folio, ifs, range_start, range_end); |
133 | return range_end - *range_start; |
134 | } |
135 | |
136 | static void ifs_clear_range_dirty(struct folio *folio, |
137 | struct iomap_folio_state *ifs, size_t off, size_t len) |
138 | { |
139 | struct inode *inode = folio->mapping->host; |
140 | unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); |
141 | unsigned int first_blk = (off >> inode->i_blkbits); |
142 | unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; |
143 | unsigned int nr_blks = last_blk - first_blk + 1; |
144 | unsigned long flags; |
145 | |
146 | spin_lock_irqsave(&ifs->state_lock, flags); |
147 | bitmap_clear(map: ifs->state, start: first_blk + blks_per_folio, nbits: nr_blks); |
148 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
149 | } |
150 | |
151 | static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) |
152 | { |
153 | struct iomap_folio_state *ifs = folio->private; |
154 | |
155 | if (ifs) |
156 | ifs_clear_range_dirty(folio, ifs, off, len); |
157 | } |
158 | |
159 | static void ifs_set_range_dirty(struct folio *folio, |
160 | struct iomap_folio_state *ifs, size_t off, size_t len) |
161 | { |
162 | struct inode *inode = folio->mapping->host; |
163 | unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); |
164 | unsigned int first_blk = (off >> inode->i_blkbits); |
165 | unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; |
166 | unsigned int nr_blks = last_blk - first_blk + 1; |
167 | unsigned long flags; |
168 | |
169 | spin_lock_irqsave(&ifs->state_lock, flags); |
170 | bitmap_set(map: ifs->state, start: first_blk + blks_per_folio, nbits: nr_blks); |
171 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
172 | } |
173 | |
174 | static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) |
175 | { |
176 | struct iomap_folio_state *ifs = folio->private; |
177 | |
178 | if (ifs) |
179 | ifs_set_range_dirty(folio, ifs, off, len); |
180 | } |
181 | |
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
		struct folio *folio, unsigned int flags)
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
	gfp_t gfp;

	if (ifs || nr_blocks <= 1)
		return ifs;

	if (flags & IOMAP_NOWAIT)
		gfp = GFP_NOWAIT;
	else
		gfp = GFP_NOFS | __GFP_NOFAIL;

	/*
	 * ifs->state tracks two sets of state flags when the
	 * filesystem block size is smaller than the folio size.
	 * The first state tracks per-block uptodate and the
	 * second tracks per-block dirty state.
	 */
	ifs = kzalloc(struct_size(ifs, state,
			BITS_TO_LONGS(2 * nr_blocks)), gfp);
	if (!ifs)
		return ifs;

	spin_lock_init(&ifs->state_lock);
	if (folio_test_uptodate(folio))
		bitmap_set(ifs->state, 0, nr_blocks);
	if (folio_test_dirty(folio))
		bitmap_set(ifs->state, nr_blocks, nr_blocks);
	folio_attach_private(folio, ifs);

	return ifs;
}
217 | |
218 | static void ifs_free(struct folio *folio) |
219 | { |
220 | struct iomap_folio_state *ifs = folio_detach_private(folio); |
221 | |
222 | if (!ifs) |
223 | return; |
224 | WARN_ON_ONCE(ifs->read_bytes_pending != 0); |
225 | WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); |
226 | WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != |
227 | folio_test_uptodate(folio)); |
228 | kfree(objp: ifs); |
229 | } |
230 | |
231 | /* |
232 | * Calculate the range inside the folio that we actually need to read. |
233 | */ |
234 | static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, |
235 | loff_t *pos, loff_t length, size_t *offp, size_t *lenp) |
236 | { |
237 | struct iomap_folio_state *ifs = folio->private; |
238 | loff_t orig_pos = *pos; |
239 | loff_t isize = i_size_read(inode); |
240 | unsigned block_bits = inode->i_blkbits; |
241 | unsigned block_size = (1 << block_bits); |
242 | size_t poff = offset_in_folio(folio, *pos); |
243 | size_t plen = min_t(loff_t, folio_size(folio) - poff, length); |
244 | unsigned first = poff >> block_bits; |
245 | unsigned last = (poff + plen - 1) >> block_bits; |
246 | |
247 | /* |
248 | * If the block size is smaller than the page size, we need to check the |
249 | * per-block uptodate status and adjust the offset and length if needed |
250 | * to avoid reading in already uptodate ranges. |
251 | */ |
252 | if (ifs) { |
253 | unsigned int i; |
254 | |
255 | /* move forward for each leading block marked uptodate */ |
256 | for (i = first; i <= last; i++) { |
257 | if (!ifs_block_is_uptodate(ifs, block: i)) |
258 | break; |
259 | *pos += block_size; |
260 | poff += block_size; |
261 | plen -= block_size; |
262 | first++; |
263 | } |
264 | |
265 | /* truncate len if we find any trailing uptodate block(s) */ |
266 | for ( ; i <= last; i++) { |
267 | if (ifs_block_is_uptodate(ifs, block: i)) { |
268 | plen -= (last - i + 1) * block_size; |
269 | last = i - 1; |
270 | break; |
271 | } |
272 | } |
273 | } |
274 | |
275 | /* |
276 | * If the extent spans the block that contains the i_size, we need to |
277 | * handle both halves separately so that we properly zero data in the |
278 | * page cache for blocks that are entirely outside of i_size. |
279 | */ |
280 | if (orig_pos <= isize && orig_pos + length > isize) { |
281 | unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; |
282 | |
283 | if (first <= end && last > end) |
284 | plen -= (last - end) * block_size; |
285 | } |
286 | |
287 | *offp = poff; |
288 | *lenp = plen; |
289 | } |
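/*
 * Worked example of the adjustment above (illustrative numbers only): reading
 * a 16K folio backed by 4K blocks where block 0 is already uptodate advances
 * *pos and poff by 4K and shrinks plen from 16K to 12K, so only the
 * not-yet-uptodate tail is read. If i_size ends within block 2, the trailing
 * block that lies entirely beyond EOF is also trimmed from plen and is later
 * zeroed in the page cache instead of being read from disk.
 */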
290 | |
291 | static void iomap_finish_folio_read(struct folio *folio, size_t off, |
292 | size_t len, int error) |
293 | { |
294 | struct iomap_folio_state *ifs = folio->private; |
295 | bool uptodate = !error; |
296 | bool finished = true; |
297 | |
298 | if (ifs) { |
299 | unsigned long flags; |
300 | |
301 | spin_lock_irqsave(&ifs->state_lock, flags); |
302 | if (!error) |
303 | uptodate = ifs_set_range_uptodate(folio, ifs, off, len); |
304 | ifs->read_bytes_pending -= len; |
305 | finished = !ifs->read_bytes_pending; |
306 | spin_unlock_irqrestore(lock: &ifs->state_lock, flags); |
307 | } |
308 | |
309 | if (error) |
310 | folio_set_error(folio); |
311 | if (finished) |
312 | folio_end_read(folio, success: uptodate); |
313 | } |
314 | |
315 | static void iomap_read_end_io(struct bio *bio) |
316 | { |
317 | int error = blk_status_to_errno(status: bio->bi_status); |
318 | struct folio_iter fi; |
319 | |
320 | bio_for_each_folio_all(fi, bio) |
321 | iomap_finish_folio_read(folio: fi.folio, off: fi.offset, len: fi.length, error); |
322 | bio_put(bio); |
323 | } |
324 | |
325 | struct iomap_readpage_ctx { |
326 | struct folio *cur_folio; |
327 | bool cur_folio_in_bio; |
328 | struct bio *bio; |
329 | struct readahead_control *rac; |
330 | }; |
331 | |
332 | /** |
333 | * iomap_read_inline_data - copy inline data into the page cache |
334 | * @iter: iteration structure |
335 | * @folio: folio to copy to |
336 | * |
337 | * Copy the inline data in @iter into @folio and zero out the rest of the folio. |
338 | * Only a single IOMAP_INLINE extent is allowed at the end of each file. |
339 | * Returns zero for success to complete the read, or the usual negative errno. |
340 | */ |
341 | static int iomap_read_inline_data(const struct iomap_iter *iter, |
342 | struct folio *folio) |
343 | { |
344 | const struct iomap *iomap = iomap_iter_srcmap(i: iter); |
345 | size_t size = i_size_read(inode: iter->inode) - iomap->offset; |
346 | size_t offset = offset_in_folio(folio, iomap->offset); |
347 | |
348 | if (folio_test_uptodate(folio)) |
349 | return 0; |
350 | |
351 | if (WARN_ON_ONCE(size > iomap->length)) |
352 | return -EIO; |
353 | if (offset > 0) |
354 | ifs_alloc(inode: iter->inode, folio, flags: iter->flags); |
355 | |
356 | folio_fill_tail(folio, offset, from: iomap->inline_data, len: size); |
357 | iomap_set_range_uptodate(folio, off: offset, len: folio_size(folio) - offset); |
358 | return 0; |
359 | } |
360 | |
361 | static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, |
362 | loff_t pos) |
363 | { |
364 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
365 | |
366 | return srcmap->type != IOMAP_MAPPED || |
367 | (srcmap->flags & IOMAP_F_NEW) || |
368 | pos >= i_size_read(inode: iter->inode); |
369 | } |
370 | |
371 | static loff_t iomap_readpage_iter(const struct iomap_iter *iter, |
372 | struct iomap_readpage_ctx *ctx, loff_t offset) |
373 | { |
374 | const struct iomap *iomap = &iter->iomap; |
375 | loff_t pos = iter->pos + offset; |
376 | loff_t length = iomap_length(iter) - offset; |
377 | struct folio *folio = ctx->cur_folio; |
378 | struct iomap_folio_state *ifs; |
379 | loff_t orig_pos = pos; |
380 | size_t poff, plen; |
381 | sector_t sector; |
382 | |
383 | if (iomap->type == IOMAP_INLINE) |
384 | return iomap_read_inline_data(iter, folio); |
385 | |
386 | /* zero post-eof blocks as the page may be mapped */ |
387 | ifs = ifs_alloc(inode: iter->inode, folio, flags: iter->flags); |
388 | iomap_adjust_read_range(inode: iter->inode, folio, pos: &pos, length, offp: &poff, lenp: &plen); |
389 | if (plen == 0) |
390 | goto done; |
391 | |
392 | if (iomap_block_needs_zeroing(iter, pos)) { |
393 | folio_zero_range(folio, start: poff, length: plen); |
394 | iomap_set_range_uptodate(folio, off: poff, len: plen); |
395 | goto done; |
396 | } |
397 | |
398 | ctx->cur_folio_in_bio = true; |
399 | if (ifs) { |
400 | spin_lock_irq(lock: &ifs->state_lock); |
401 | ifs->read_bytes_pending += plen; |
402 | spin_unlock_irq(lock: &ifs->state_lock); |
403 | } |
404 | |
405 | sector = iomap_sector(iomap, pos); |
406 | if (!ctx->bio || |
407 | bio_end_sector(ctx->bio) != sector || |
408 | !bio_add_folio(bio: ctx->bio, folio, len: plen, off: poff)) { |
409 | gfp_t gfp = mapping_gfp_constraint(mapping: folio->mapping, GFP_KERNEL); |
410 | gfp_t orig_gfp = gfp; |
411 | unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); |
412 | |
413 | if (ctx->bio) |
414 | submit_bio(bio: ctx->bio); |
415 | |
416 | if (ctx->rac) /* same as readahead_gfp_mask */ |
417 | gfp |= __GFP_NORETRY | __GFP_NOWARN; |
418 | ctx->bio = bio_alloc(bdev: iomap->bdev, nr_vecs: bio_max_segs(nr_segs: nr_vecs), |
419 | opf: REQ_OP_READ, gfp_mask: gfp); |
420 | /* |
421 | * If the bio_alloc fails, try it again for a single page to |
422 | * avoid having to deal with partial page reads. This emulates |
423 | * what do_mpage_read_folio does. |
424 | */ |
425 | if (!ctx->bio) { |
426 | ctx->bio = bio_alloc(bdev: iomap->bdev, nr_vecs: 1, opf: REQ_OP_READ, |
427 | gfp_mask: orig_gfp); |
428 | } |
429 | if (ctx->rac) |
430 | ctx->bio->bi_opf |= REQ_RAHEAD; |
431 | ctx->bio->bi_iter.bi_sector = sector; |
432 | ctx->bio->bi_end_io = iomap_read_end_io; |
433 | bio_add_folio_nofail(bio: ctx->bio, folio, len: plen, off: poff); |
434 | } |
435 | |
436 | done: |
437 | /* |
438 | * Move the caller beyond our range so that it keeps making progress. |
439 | * For that, we have to include any leading non-uptodate ranges, but |
440 | * we can skip trailing ones as they will be handled in the next |
441 | * iteration. |
442 | */ |
443 | return pos - orig_pos + plen; |
444 | } |
445 | |
446 | int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) |
447 | { |
448 | struct iomap_iter iter = { |
449 | .inode = folio->mapping->host, |
450 | .pos = folio_pos(folio), |
451 | .len = folio_size(folio), |
452 | }; |
453 | struct iomap_readpage_ctx ctx = { |
454 | .cur_folio = folio, |
455 | }; |
456 | int ret; |
457 | |
458 | trace_iomap_readpage(inode: iter.inode, nr_pages: 1); |
459 | |
460 | while ((ret = iomap_iter(iter: &iter, ops)) > 0) |
461 | iter.processed = iomap_readpage_iter(iter: &iter, ctx: &ctx, offset: 0); |
462 | |
463 | if (ret < 0) |
464 | folio_set_error(folio); |
465 | |
466 | if (ctx.bio) { |
467 | submit_bio(bio: ctx.bio); |
468 | WARN_ON_ONCE(!ctx.cur_folio_in_bio); |
469 | } else { |
470 | WARN_ON_ONCE(ctx.cur_folio_in_bio); |
471 | folio_unlock(folio); |
472 | } |
473 | |
474 | /* |
475 | * Just like mpage_readahead and block_read_full_folio, we always |
476 | * return 0 and just set the folio error flag on errors. This |
477 | * should be cleaned up throughout the stack eventually. |
478 | */ |
479 | return 0; |
480 | } |
481 | EXPORT_SYMBOL_GPL(iomap_read_folio); |
482 | |
483 | static loff_t iomap_readahead_iter(const struct iomap_iter *iter, |
484 | struct iomap_readpage_ctx *ctx) |
485 | { |
486 | loff_t length = iomap_length(iter); |
487 | loff_t done, ret; |
488 | |
489 | for (done = 0; done < length; done += ret) { |
490 | if (ctx->cur_folio && |
491 | offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { |
492 | if (!ctx->cur_folio_in_bio) |
493 | folio_unlock(folio: ctx->cur_folio); |
494 | ctx->cur_folio = NULL; |
495 | } |
496 | if (!ctx->cur_folio) { |
497 | ctx->cur_folio = readahead_folio(ractl: ctx->rac); |
498 | ctx->cur_folio_in_bio = false; |
499 | } |
500 | ret = iomap_readpage_iter(iter, ctx, offset: done); |
501 | if (ret <= 0) |
502 | return ret; |
503 | } |
504 | |
505 | return done; |
506 | } |
507 | |
508 | /** |
509 | * iomap_readahead - Attempt to read pages from a file. |
510 | * @rac: Describes the pages to be read. |
511 | * @ops: The operations vector for the filesystem. |
512 | * |
513 | * This function is for filesystems to call to implement their readahead |
514 | * address_space operation. |
515 | * |
516 | * Context: The @ops callbacks may submit I/O (eg to read the addresses of |
517 | * blocks from disc), and may wait for it. The caller may be trying to |
518 | * access a different page, and so sleeping excessively should be avoided. |
519 | * It may allocate memory, but should avoid costly allocations. This |
520 | * function is called with memalloc_nofs set, so allocations will not cause |
521 | * the filesystem to be reentered. |
522 | */ |
523 | void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) |
524 | { |
525 | struct iomap_iter iter = { |
526 | .inode = rac->mapping->host, |
527 | .pos = readahead_pos(rac), |
528 | .len = readahead_length(rac), |
529 | }; |
530 | struct iomap_readpage_ctx ctx = { |
531 | .rac = rac, |
532 | }; |
533 | |
534 | trace_iomap_readahead(inode: rac->mapping->host, nr_pages: readahead_count(rac)); |
535 | |
536 | while (iomap_iter(iter: &iter, ops) > 0) |
537 | iter.processed = iomap_readahead_iter(iter: &iter, ctx: &ctx); |
538 | |
539 | if (ctx.bio) |
540 | submit_bio(bio: ctx.bio); |
541 | if (ctx.cur_folio) { |
542 | if (!ctx.cur_folio_in_bio) |
543 | folio_unlock(folio: ctx.cur_folio); |
544 | } |
545 | } |
546 | EXPORT_SYMBOL_GPL(iomap_readahead); |
547 | |
548 | /* |
549 | * iomap_is_partially_uptodate checks whether blocks within a folio are |
550 | * uptodate or not. |
551 | * |
552 | * Returns true if all blocks which correspond to the specified part |
553 | * of the folio are uptodate. |
554 | */ |
555 | bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) |
556 | { |
557 | struct iomap_folio_state *ifs = folio->private; |
558 | struct inode *inode = folio->mapping->host; |
559 | unsigned first, last, i; |
560 | |
561 | if (!ifs) |
562 | return false; |
563 | |
564 | /* Caller's range may extend past the end of this folio */ |
565 | count = min(folio_size(folio) - from, count); |
566 | |
567 | /* First and last blocks in range within folio */ |
568 | first = from >> inode->i_blkbits; |
569 | last = (from + count - 1) >> inode->i_blkbits; |
570 | |
571 | for (i = first; i <= last; i++) |
572 | if (!ifs_block_is_uptodate(ifs, block: i)) |
573 | return false; |
574 | return true; |
575 | } |
576 | EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); |
577 | |
578 | /** |
579 | * iomap_get_folio - get a folio reference for writing |
580 | * @iter: iteration structure |
581 | * @pos: start offset of write |
582 | * @len: Suggested size of folio to create. |
583 | * |
584 | * Returns a locked reference to the folio at @pos, or an error pointer if the |
585 | * folio could not be obtained. |
586 | */ |
587 | struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) |
588 | { |
589 | fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS; |
590 | |
591 | if (iter->flags & IOMAP_NOWAIT) |
592 | fgp |= FGP_NOWAIT; |
593 | fgp |= fgf_set_order(size: len); |
594 | |
595 | return __filemap_get_folio(mapping: iter->inode->i_mapping, index: pos >> PAGE_SHIFT, |
596 | fgp_flags: fgp, gfp: mapping_gfp_mask(mapping: iter->inode->i_mapping)); |
597 | } |
598 | EXPORT_SYMBOL_GPL(iomap_get_folio); |
599 | |
600 | bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) |
601 | { |
602 | trace_iomap_release_folio(inode: folio->mapping->host, off: folio_pos(folio), |
603 | len: folio_size(folio)); |
604 | |
605 | /* |
606 | * If the folio is dirty, we refuse to release our metadata because |
607 | * it may be partially dirty. Once we track per-block dirty state, |
608 | * we can release the metadata if every block is dirty. |
609 | */ |
610 | if (folio_test_dirty(folio)) |
611 | return false; |
612 | ifs_free(folio); |
613 | return true; |
614 | } |
615 | EXPORT_SYMBOL_GPL(iomap_release_folio); |
616 | |
617 | void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) |
618 | { |
619 | trace_iomap_invalidate_folio(inode: folio->mapping->host, |
620 | off: folio_pos(folio) + offset, len); |
621 | |
622 | /* |
623 | * If we're invalidating the entire folio, clear the dirty state |
624 | * from it and release it to avoid unnecessary buildup of the LRU. |
625 | */ |
626 | if (offset == 0 && len == folio_size(folio)) { |
627 | WARN_ON_ONCE(folio_test_writeback(folio)); |
628 | folio_cancel_dirty(folio); |
629 | ifs_free(folio); |
630 | } |
631 | } |
632 | EXPORT_SYMBOL_GPL(iomap_invalidate_folio); |
633 | |
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct inode *inode = mapping->host;
	size_t len = folio_size(folio);

	ifs_alloc(inode, folio, 0);
	iomap_set_range_dirty(folio, 0, len);
	return filemap_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);
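/*
 * A minimal wiring sketch (not part of this file; "myfs" and myfs_iomap_ops
 * are hypothetical) showing how a filesystem typically plugs the helpers
 * above into its address_space_operations:
 *
 *	static int myfs_read_folio(struct file *file, struct folio *folio)
 *	{
 *		return iomap_read_folio(folio, &myfs_iomap_ops);
 *	}
 *
 *	static void myfs_readahead(struct readahead_control *rac)
 *	{
 *		iomap_readahead(rac, &myfs_iomap_ops);
 *	}
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio		= myfs_read_folio,
 *		.readahead		= myfs_readahead,
 *		.is_partially_uptodate	= iomap_is_partially_uptodate,
 *		.release_folio		= iomap_release_folio,
 *		.invalidate_folio	= iomap_invalidate_folio,
 *		.dirty_folio		= iomap_dirty_folio,
 *	};
 */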
644 | |
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size),
				pos + len - 1);
}
658 | |
659 | static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, |
660 | size_t poff, size_t plen, const struct iomap *iomap) |
661 | { |
662 | struct bio_vec bvec; |
663 | struct bio bio; |
664 | |
665 | bio_init(bio: &bio, bdev: iomap->bdev, table: &bvec, max_vecs: 1, opf: REQ_OP_READ); |
666 | bio.bi_iter.bi_sector = iomap_sector(iomap, pos: block_start); |
667 | bio_add_folio_nofail(bio: &bio, folio, len: plen, off: poff); |
668 | return submit_bio_wait(bio: &bio); |
669 | } |
670 | |
671 | static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, |
672 | size_t len, struct folio *folio) |
673 | { |
674 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
675 | struct iomap_folio_state *ifs; |
676 | loff_t block_size = i_blocksize(node: iter->inode); |
677 | loff_t block_start = round_down(pos, block_size); |
678 | loff_t block_end = round_up(pos + len, block_size); |
679 | unsigned int nr_blocks = i_blocks_per_folio(inode: iter->inode, folio); |
680 | size_t from = offset_in_folio(folio, pos), to = from + len; |
681 | size_t poff, plen; |
682 | |
	/*
	 * If the write or zeroing completely overlaps the current folio, then
	 * the entire folio will be dirtied so there is no need for
	 * per-block state tracking structures to be attached to this folio.
	 * For the unshare case, we must read in the ondisk contents because we
	 * are not changing pagecache contents.
	 */
	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
	    pos + len >= folio_pos(folio) + folio_size(folio))
		return 0;
693 | |
694 | ifs = ifs_alloc(inode: iter->inode, folio, flags: iter->flags); |
695 | if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1) |
696 | return -EAGAIN; |
697 | |
698 | if (folio_test_uptodate(folio)) |
699 | return 0; |
700 | folio_clear_error(folio); |
701 | |
702 | do { |
703 | iomap_adjust_read_range(inode: iter->inode, folio, pos: &block_start, |
704 | length: block_end - block_start, offp: &poff, lenp: &plen); |
705 | if (plen == 0) |
706 | break; |
707 | |
708 | if (!(iter->flags & IOMAP_UNSHARE) && |
709 | (from <= poff || from >= poff + plen) && |
710 | (to <= poff || to >= poff + plen)) |
711 | continue; |
712 | |
713 | if (iomap_block_needs_zeroing(iter, pos: block_start)) { |
714 | if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) |
715 | return -EIO; |
716 | folio_zero_segments(folio, start1: poff, xend1: from, start2: to, xend2: poff + plen); |
717 | } else { |
718 | int status; |
719 | |
720 | if (iter->flags & IOMAP_NOWAIT) |
721 | return -EAGAIN; |
722 | |
723 | status = iomap_read_folio_sync(block_start, folio, |
724 | poff, plen, iomap: srcmap); |
725 | if (status) |
726 | return status; |
727 | } |
728 | iomap_set_range_uptodate(folio, off: poff, len: plen); |
729 | } while ((block_start += plen) < block_end); |
730 | |
731 | return 0; |
732 | } |
733 | |
734 | static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, |
735 | size_t len) |
736 | { |
737 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
738 | |
739 | if (folio_ops && folio_ops->get_folio) |
740 | return folio_ops->get_folio(iter, pos, len); |
741 | else |
742 | return iomap_get_folio(iter, pos, len); |
743 | } |
744 | |
745 | static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, |
746 | struct folio *folio) |
747 | { |
748 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
749 | |
750 | if (folio_ops && folio_ops->put_folio) { |
751 | folio_ops->put_folio(iter->inode, pos, ret, folio); |
752 | } else { |
753 | folio_unlock(folio); |
754 | folio_put(folio); |
755 | } |
756 | } |
757 | |
758 | static int iomap_write_begin_inline(const struct iomap_iter *iter, |
759 | struct folio *folio) |
760 | { |
761 | /* needs more work for the tailpacking case; disable for now */ |
762 | if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) |
763 | return -EIO; |
764 | return iomap_read_inline_data(iter, folio); |
765 | } |
766 | |
767 | static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, |
768 | size_t len, struct folio **foliop) |
769 | { |
770 | const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; |
771 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
772 | struct folio *folio; |
773 | int status = 0; |
774 | |
775 | BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); |
776 | if (srcmap != &iter->iomap) |
777 | BUG_ON(pos + len > srcmap->offset + srcmap->length); |
778 | |
779 | if (fatal_signal_pending(current)) |
780 | return -EINTR; |
781 | |
782 | if (!mapping_large_folio_support(mapping: iter->inode->i_mapping)) |
783 | len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); |
784 | |
785 | folio = __iomap_get_folio(iter, pos, len); |
786 | if (IS_ERR(ptr: folio)) |
787 | return PTR_ERR(ptr: folio); |
788 | |
789 | /* |
790 | * Now we have a locked folio, before we do anything with it we need to |
791 | * check that the iomap we have cached is not stale. The inode extent |
792 | * mapping can change due to concurrent IO in flight (e.g. |
793 | * IOMAP_UNWRITTEN state can change and memory reclaim could have |
794 | * reclaimed a previously partially written page at this index after IO |
795 | * completion before this write reaches this file offset) and hence we |
796 | * could do the wrong thing here (zero a page range incorrectly or fail |
797 | * to zero) and corrupt data. |
798 | */ |
799 | if (folio_ops && folio_ops->iomap_valid) { |
800 | bool iomap_valid = folio_ops->iomap_valid(iter->inode, |
801 | &iter->iomap); |
802 | if (!iomap_valid) { |
803 | iter->iomap.flags |= IOMAP_F_STALE; |
804 | status = 0; |
805 | goto out_unlock; |
806 | } |
807 | } |
808 | |
809 | if (pos + len > folio_pos(folio) + folio_size(folio)) |
810 | len = folio_pos(folio) + folio_size(folio) - pos; |
811 | |
812 | if (srcmap->type == IOMAP_INLINE) |
813 | status = iomap_write_begin_inline(iter, folio); |
814 | else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) |
815 | status = __block_write_begin_int(folio, pos, len, NULL, iomap: srcmap); |
816 | else |
817 | status = __iomap_write_begin(iter, pos, len, folio); |
818 | |
819 | if (unlikely(status)) |
820 | goto out_unlock; |
821 | |
822 | *foliop = folio; |
823 | return 0; |
824 | |
825 | out_unlock: |
826 | __iomap_put_folio(iter, pos, ret: 0, folio); |
827 | iomap_write_failed(inode: iter->inode, pos, len); |
828 | |
829 | return status; |
830 | } |
831 | |
832 | static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, |
833 | size_t copied, struct folio *folio) |
834 | { |
835 | flush_dcache_folio(folio); |
836 | |
837 | /* |
838 | * The blocks that were entirely written will now be uptodate, so we |
839 | * don't have to worry about a read_folio reading them and overwriting a |
840 | * partial write. However, if we've encountered a short write and only |
841 | * partially written into a block, it will not be marked uptodate, so a |
842 | * read_folio might come in and destroy our partial write. |
843 | * |
844 | * Do the simplest thing and just treat any short write to a |
845 | * non-uptodate page as a zero-length write, and force the caller to |
846 | * redo the whole thing. |
847 | */ |
848 | if (unlikely(copied < len && !folio_test_uptodate(folio))) |
849 | return 0; |
850 | iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); |
851 | iomap_set_range_dirty(folio, offset_in_folio(folio, pos), len: copied); |
852 | filemap_dirty_folio(mapping: inode->i_mapping, folio); |
853 | return copied; |
854 | } |
855 | |
856 | static size_t iomap_write_end_inline(const struct iomap_iter *iter, |
857 | struct folio *folio, loff_t pos, size_t copied) |
858 | { |
859 | const struct iomap *iomap = &iter->iomap; |
860 | void *addr; |
861 | |
862 | WARN_ON_ONCE(!folio_test_uptodate(folio)); |
863 | BUG_ON(!iomap_inline_data_valid(iomap)); |
864 | |
865 | flush_dcache_folio(folio); |
866 | addr = kmap_local_folio(folio, offset: pos); |
867 | memcpy(iomap_inline_data(iomap, pos), addr, copied); |
868 | kunmap_local(addr); |
869 | |
870 | mark_inode_dirty(inode: iter->inode); |
871 | return copied; |
872 | } |
873 | |
874 | /* Returns the number of bytes copied. May be 0. Cannot be an errno. */ |
875 | static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, |
876 | size_t copied, struct folio *folio) |
877 | { |
878 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
879 | loff_t old_size = iter->inode->i_size; |
880 | size_t ret; |
881 | |
882 | if (srcmap->type == IOMAP_INLINE) { |
883 | ret = iomap_write_end_inline(iter, folio, pos, copied); |
884 | } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { |
885 | ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, |
886 | copied, &folio->page, NULL); |
887 | } else { |
888 | ret = __iomap_write_end(inode: iter->inode, pos, len, copied, folio); |
889 | } |
890 | |
891 | /* |
892 | * Update the in-memory inode size after copying the data into the page |
893 | * cache. It's up to the file system to write the updated size to disk, |
894 | * preferably after I/O completion so that no stale data is exposed. |
895 | */ |
896 | if (pos + ret > old_size) { |
897 | i_size_write(inode: iter->inode, i_size: pos + ret); |
898 | iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; |
899 | } |
900 | __iomap_put_folio(iter, pos, ret, folio); |
901 | |
902 | if (old_size < pos) |
903 | pagecache_isize_extended(inode: iter->inode, from: old_size, to: pos); |
904 | if (ret < len) |
905 | iomap_write_failed(inode: iter->inode, pos: pos + ret, len: len - ret); |
906 | return ret; |
907 | } |
908 | |
909 | static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) |
910 | { |
911 | loff_t length = iomap_length(iter); |
912 | size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; |
913 | loff_t pos = iter->pos; |
914 | ssize_t written = 0; |
915 | long status = 0; |
916 | struct address_space *mapping = iter->inode->i_mapping; |
917 | unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; |
918 | |
919 | do { |
920 | struct folio *folio; |
921 | size_t offset; /* Offset into folio */ |
922 | size_t bytes; /* Bytes to write to folio */ |
923 | size_t copied; /* Bytes copied from user */ |
924 | |
925 | bytes = iov_iter_count(i); |
926 | retry: |
927 | offset = pos & (chunk - 1); |
928 | bytes = min(chunk - offset, bytes); |
929 | status = balance_dirty_pages_ratelimited_flags(mapping, |
930 | flags: bdp_flags); |
931 | if (unlikely(status)) |
932 | break; |
933 | |
934 | if (bytes > length) |
935 | bytes = length; |
936 | |
937 | /* |
938 | * Bring in the user page that we'll copy from _first_. |
939 | * Otherwise there's a nasty deadlock on copying from the |
940 | * same page as we're writing to, without it being marked |
941 | * up-to-date. |
942 | * |
943 | * For async buffered writes the assumption is that the user |
944 | * page has already been faulted in. This can be optimized by |
945 | * faulting the user page. |
946 | */ |
947 | if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { |
948 | status = -EFAULT; |
949 | break; |
950 | } |
951 | |
952 | status = iomap_write_begin(iter, pos, len: bytes, foliop: &folio); |
953 | if (unlikely(status)) |
954 | break; |
955 | if (iter->iomap.flags & IOMAP_F_STALE) |
956 | break; |
957 | |
958 | offset = offset_in_folio(folio, pos); |
959 | if (bytes > folio_size(folio) - offset) |
960 | bytes = folio_size(folio) - offset; |
961 | |
962 | if (mapping_writably_mapped(mapping)) |
963 | flush_dcache_folio(folio); |
964 | |
965 | copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); |
966 | status = iomap_write_end(iter, pos, len: bytes, copied, folio); |
967 | |
968 | if (unlikely(copied != status)) |
969 | iov_iter_revert(i, bytes: copied - status); |
970 | |
971 | cond_resched(); |
972 | if (unlikely(status == 0)) { |
973 | /* |
974 | * A short copy made iomap_write_end() reject the |
975 | * thing entirely. Might be memory poisoning |
976 | * halfway through, might be a race with munmap, |
977 | * might be severe memory pressure. |
978 | */ |
979 | if (chunk > PAGE_SIZE) |
980 | chunk /= 2; |
981 | if (copied) { |
982 | bytes = copied; |
983 | goto retry; |
984 | } |
985 | } else { |
986 | pos += status; |
987 | written += status; |
988 | length -= status; |
989 | } |
990 | } while (iov_iter_count(i) && length); |
991 | |
992 | if (status == -EAGAIN) { |
993 | iov_iter_revert(i, bytes: written); |
994 | return -EAGAIN; |
995 | } |
996 | return written ? written : status; |
997 | } |
998 | |
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode = iocb->ki_filp->f_mapping->host,
		.pos = iocb->ki_pos,
		.len = iov_iter_count(i),
		.flags = IOMAP_WRITE,
	};
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iter.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_write_iter(&iter, i);

	if (unlikely(iter.pos == iocb->ki_pos))
		return ret;
	ret = iter.pos - iocb->ki_pos;
	iocb->ki_pos = iter.pos;
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
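/*
 * A minimal usage sketch (not part of this file; "myfs" and myfs_iomap_ops
 * are hypothetical): a filesystem's ->write_iter typically takes
 * inode->i_rwsem, runs the generic write checks, and then hands the iterator
 * to iomap_file_buffered_write():
 *
 *	static ssize_t myfs_buffered_write_iter(struct kiocb *iocb,
 *			struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = generic_write_checks(iocb, from);
 *		if (ret > 0)
 *			ret = iomap_file_buffered_write(iocb, from,
 *					&myfs_iomap_ops);
 *		inode_unlock(inode);
 *		if (ret > 0)
 *			ret = generic_write_sync(iocb, ret);
 *		return ret;
 *	}
 */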
1024 | |
1025 | static int iomap_write_delalloc_ifs_punch(struct inode *inode, |
1026 | struct folio *folio, loff_t start_byte, loff_t end_byte, |
1027 | iomap_punch_t punch) |
1028 | { |
1029 | unsigned int first_blk, last_blk, i; |
1030 | loff_t last_byte; |
1031 | u8 blkbits = inode->i_blkbits; |
1032 | struct iomap_folio_state *ifs; |
1033 | int ret = 0; |
1034 | |
1035 | /* |
1036 | * When we have per-block dirty tracking, there can be |
1037 | * blocks within a folio which are marked uptodate |
1038 | * but not dirty. In that case it is necessary to punch |
1039 | * out such blocks to avoid leaking any delalloc blocks. |
1040 | */ |
1041 | ifs = folio->private; |
1042 | if (!ifs) |
1043 | return ret; |
1044 | |
1045 | last_byte = min_t(loff_t, end_byte - 1, |
1046 | folio_pos(folio) + folio_size(folio) - 1); |
1047 | first_blk = offset_in_folio(folio, start_byte) >> blkbits; |
1048 | last_blk = offset_in_folio(folio, last_byte) >> blkbits; |
1049 | for (i = first_blk; i <= last_blk; i++) { |
1050 | if (!ifs_block_is_dirty(folio, ifs, block: i)) { |
1051 | ret = punch(inode, folio_pos(folio) + (i << blkbits), |
1052 | 1 << blkbits); |
1053 | if (ret) |
1054 | return ret; |
1055 | } |
1056 | } |
1057 | |
1058 | return ret; |
1059 | } |
1060 | |
1061 | |
1062 | static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, |
1063 | loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, |
1064 | iomap_punch_t punch) |
1065 | { |
1066 | int ret = 0; |
1067 | |
1068 | if (!folio_test_dirty(folio)) |
1069 | return ret; |
1070 | |
1071 | /* if dirty, punch up to offset */ |
1072 | if (start_byte > *punch_start_byte) { |
1073 | ret = punch(inode, *punch_start_byte, |
1074 | start_byte - *punch_start_byte); |
1075 | if (ret) |
1076 | return ret; |
1077 | } |
1078 | |
1079 | /* Punch non-dirty blocks within folio */ |
1080 | ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, |
1081 | end_byte, punch); |
1082 | if (ret) |
1083 | return ret; |
1084 | |
1085 | /* |
1086 | * Make sure the next punch start is correctly bound to |
1087 | * the end of this data range, not the end of the folio. |
1088 | */ |
1089 | *punch_start_byte = min_t(loff_t, end_byte, |
1090 | folio_pos(folio) + folio_size(folio)); |
1091 | |
1092 | return ret; |
1093 | } |
1094 | |
/*
 * Scan the data range passed to us for dirty page cache folios. If we find a
 * dirty folio, punch out the preceding range and update the offset from which
 * the next punch will start.
 *
 * We can punch out storage reservations under clean pages because they either
 * contain data that has been written back - in which case the delalloc punch
 * over that range is a no-op - or they have been faulted in by a read, in
 * which case they contain zeroes and we can remove the delalloc backing range,
 * and any new writes to those pages will do the normal hole filling operation.
 *
 * This makes the logic simple: we only need to keep the delalloc extents over
 * the dirty ranges of the page cache.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 */
1112 | static int iomap_write_delalloc_scan(struct inode *inode, |
1113 | loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, |
1114 | iomap_punch_t punch) |
1115 | { |
1116 | while (start_byte < end_byte) { |
1117 | struct folio *folio; |
1118 | int ret; |
1119 | |
1120 | /* grab locked page */ |
1121 | folio = filemap_lock_folio(mapping: inode->i_mapping, |
1122 | index: start_byte >> PAGE_SHIFT); |
1123 | if (IS_ERR(ptr: folio)) { |
1124 | start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + |
1125 | PAGE_SIZE; |
1126 | continue; |
1127 | } |
1128 | |
1129 | ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, |
1130 | start_byte, end_byte, punch); |
1131 | if (ret) { |
1132 | folio_unlock(folio); |
1133 | folio_put(folio); |
1134 | return ret; |
1135 | } |
1136 | |
1137 | /* move offset to start of next folio in range */ |
1138 | start_byte = folio_next_index(folio) << PAGE_SHIFT; |
1139 | folio_unlock(folio); |
1140 | folio_put(folio); |
1141 | } |
1142 | return 0; |
1143 | } |
1144 | |
/*
 * Punch out all the delalloc blocks in the range given except for those that
 * have dirty data still pending in the page cache - those are going to be
 * written and so must still retain the delalloc backing for writeback.
 *
 * As we are scanning the page cache for data, we don't need to reimplement the
 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
 * start and end of data ranges correctly even for sub-folio block sizes. This
 * byte range based iteration is especially convenient because it means we
 * don't have to care about variable size folios, nor where the start or end of
 * the data range lies within a folio, whether they lie within the same folio,
 * or even if there are multiple discontiguous data ranges within the folio.
 *
 * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
 * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
 * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
 * date. A write page fault can then mark it dirty. If we then fail a write()
 * beyond EOF into that up to date cached range, we allocate a delalloc block
 * beyond EOF and then have to punch it out. Because the range is up to date,
 * mapping_seek_hole_data() will return it, and we will skip the punch because
 * the folio is dirty. This is incorrect - we always need to punch out delalloc
 * beyond EOF in this case as writeback will never write back and convert that
 * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
 * resulting in always punching out the range from the EOF to the end of the
 * range the iomap spans.
 *
 * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because
 * they match the intervals returned by mapping_seek_hole_data(). i.e.
 * SEEK_DATA returns the start of a data range (start_byte), and
 * SEEK_HOLE(start_byte) returns the end of the data range (data_end). Using
 * closed intervals would require sprinkling this code with magic "+ 1" and
 * "- 1" arithmetic and expose the code to subtle off-by-one bugs.
 */
1178 | static int iomap_write_delalloc_release(struct inode *inode, |
1179 | loff_t start_byte, loff_t end_byte, iomap_punch_t punch) |
1180 | { |
1181 | loff_t punch_start_byte = start_byte; |
1182 | loff_t scan_end_byte = min(i_size_read(inode), end_byte); |
1183 | int error = 0; |
1184 | |
1185 | /* |
1186 | * Lock the mapping to avoid races with page faults re-instantiating |
1187 | * folios and dirtying them via ->page_mkwrite whilst we walk the |
1188 | * cache and perform delalloc extent removal. Failing to do this can |
1189 | * leave dirty pages with no space reservation in the cache. |
1190 | */ |
1191 | filemap_invalidate_lock(mapping: inode->i_mapping); |
1192 | while (start_byte < scan_end_byte) { |
1193 | loff_t data_end; |
1194 | |
1195 | start_byte = mapping_seek_hole_data(inode->i_mapping, |
1196 | start: start_byte, end: scan_end_byte, SEEK_DATA); |
1197 | /* |
1198 | * If there is no more data to scan, all that is left is to |
1199 | * punch out the remaining range. |
1200 | */ |
1201 | if (start_byte == -ENXIO || start_byte == scan_end_byte) |
1202 | break; |
1203 | if (start_byte < 0) { |
1204 | error = start_byte; |
1205 | goto out_unlock; |
1206 | } |
1207 | WARN_ON_ONCE(start_byte < punch_start_byte); |
1208 | WARN_ON_ONCE(start_byte > scan_end_byte); |
1209 | |
1210 | /* |
1211 | * We find the end of this contiguous cached data range by |
1212 | * seeking from start_byte to the beginning of the next hole. |
1213 | */ |
1214 | data_end = mapping_seek_hole_data(inode->i_mapping, start: start_byte, |
1215 | end: scan_end_byte, SEEK_HOLE); |
1216 | if (data_end < 0) { |
1217 | error = data_end; |
1218 | goto out_unlock; |
1219 | } |
1220 | WARN_ON_ONCE(data_end <= start_byte); |
1221 | WARN_ON_ONCE(data_end > scan_end_byte); |
1222 | |
1223 | error = iomap_write_delalloc_scan(inode, punch_start_byte: &punch_start_byte, |
1224 | start_byte, end_byte: data_end, punch); |
1225 | if (error) |
1226 | goto out_unlock; |
1227 | |
1228 | /* The next data search starts at the end of this one. */ |
1229 | start_byte = data_end; |
1230 | } |
1231 | |
1232 | if (punch_start_byte < end_byte) |
1233 | error = punch(inode, punch_start_byte, |
1234 | end_byte - punch_start_byte); |
1235 | out_unlock: |
1236 | filemap_invalidate_unlock(mapping: inode->i_mapping); |
1237 | return error; |
1238 | } |
1239 | |
/*
 * When a short write occurs, the filesystem may need to remove reserved space
 * that was allocated in ->iomap_begin from its ->iomap_end method. For
 * filesystems that use delayed allocation, we need to punch out delalloc
 * extents from the range that are not dirty in the page cache. As the write
 * can race with page faults, there can be dirty pages over the delalloc extent
 * outside the range of a short write but still within the delalloc extent
 * allocated for this iomap.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 */
int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
		struct iomap *iomap, loff_t pos, loff_t length,
		ssize_t written, iomap_punch_t punch)
{
	loff_t start_byte;
	loff_t end_byte;
	unsigned int blocksize = i_blocksize(inode);

	if (iomap->type != IOMAP_DELALLOC)
		return 0;

	/* If we didn't reserve the blocks, we're not allowed to punch them. */
	if (!(iomap->flags & IOMAP_F_NEW))
		return 0;

	/*
	 * start_byte refers to the first unused block after a short write. If
	 * nothing was written, round offset down to point at the first block
	 * in the range.
	 */
	if (unlikely(!written))
		start_byte = round_down(pos, blocksize);
	else
		start_byte = round_up(pos + written, blocksize);
	end_byte = round_up(pos + length, blocksize);

	/* Nothing to do if we've written the entire delalloc extent */
	if (start_byte >= end_byte)
		return 0;

	return iomap_write_delalloc_release(inode, start_byte, end_byte,
					punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
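/*
 * A minimal sketch (not part of this file) of how a delalloc filesystem might
 * call this from its ->iomap_end method; myfs_punch_delalloc is a hypothetical
 * callback that removes only delalloc extents in the given byte range:
 *
 *	static int myfs_buffered_write_iomap_end(struct inode *inode,
 *			loff_t pos, loff_t length, ssize_t written,
 *			unsigned flags, struct iomap *iomap)
 *	{
 *		return iomap_file_buffered_write_punch_delalloc(inode, iomap,
 *				pos, length, written, myfs_punch_delalloc);
 *	}
 */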
1304 | |
1305 | static loff_t iomap_unshare_iter(struct iomap_iter *iter) |
1306 | { |
1307 | struct iomap *iomap = &iter->iomap; |
1308 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
1309 | loff_t pos = iter->pos; |
1310 | loff_t length = iomap_length(iter); |
1311 | loff_t written = 0; |
1312 | |
1313 | /* don't bother with blocks that are not shared to start with */ |
1314 | if (!(iomap->flags & IOMAP_F_SHARED)) |
1315 | return length; |
1316 | /* don't bother with holes or unwritten extents */ |
1317 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
1318 | return length; |
1319 | |
1320 | do { |
1321 | struct folio *folio; |
1322 | int status; |
1323 | size_t offset; |
1324 | size_t bytes = min_t(u64, SIZE_MAX, length); |
1325 | |
1326 | status = iomap_write_begin(iter, pos, len: bytes, foliop: &folio); |
1327 | if (unlikely(status)) |
1328 | return status; |
1329 | if (iomap->flags & IOMAP_F_STALE) |
1330 | break; |
1331 | |
1332 | offset = offset_in_folio(folio, pos); |
1333 | if (bytes > folio_size(folio) - offset) |
1334 | bytes = folio_size(folio) - offset; |
1335 | |
1336 | bytes = iomap_write_end(iter, pos, len: bytes, copied: bytes, folio); |
1337 | if (WARN_ON_ONCE(bytes == 0)) |
1338 | return -EIO; |
1339 | |
1340 | cond_resched(); |
1341 | |
1342 | pos += bytes; |
1343 | written += bytes; |
1344 | length -= bytes; |
1345 | |
1346 | balance_dirty_pages_ratelimited(mapping: iter->inode->i_mapping); |
1347 | } while (length > 0); |
1348 | |
1349 | return written; |
1350 | } |
1351 | |
1352 | int |
1353 | iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, |
1354 | const struct iomap_ops *ops) |
1355 | { |
1356 | struct iomap_iter iter = { |
1357 | .inode = inode, |
1358 | .pos = pos, |
1359 | .len = len, |
1360 | .flags = IOMAP_WRITE | IOMAP_UNSHARE, |
1361 | }; |
1362 | int ret; |
1363 | |
1364 | while ((ret = iomap_iter(iter: &iter, ops)) > 0) |
1365 | iter.processed = iomap_unshare_iter(iter: &iter); |
1366 | return ret; |
1367 | } |
1368 | EXPORT_SYMBOL_GPL(iomap_file_unshare); |
1369 | |
1370 | static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) |
1371 | { |
1372 | const struct iomap *srcmap = iomap_iter_srcmap(i: iter); |
1373 | loff_t pos = iter->pos; |
1374 | loff_t length = iomap_length(iter); |
1375 | loff_t written = 0; |
1376 | |
1377 | /* already zeroed? we're done. */ |
1378 | if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) |
1379 | return length; |
1380 | |
1381 | do { |
1382 | struct folio *folio; |
1383 | int status; |
1384 | size_t offset; |
1385 | size_t bytes = min_t(u64, SIZE_MAX, length); |
1386 | |
1387 | status = iomap_write_begin(iter, pos, len: bytes, foliop: &folio); |
1388 | if (status) |
1389 | return status; |
1390 | if (iter->iomap.flags & IOMAP_F_STALE) |
1391 | break; |
1392 | |
1393 | offset = offset_in_folio(folio, pos); |
1394 | if (bytes > folio_size(folio) - offset) |
1395 | bytes = folio_size(folio) - offset; |
1396 | |
1397 | folio_zero_range(folio, start: offset, length: bytes); |
1398 | folio_mark_accessed(folio); |
1399 | |
1400 | bytes = iomap_write_end(iter, pos, len: bytes, copied: bytes, folio); |
1401 | if (WARN_ON_ONCE(bytes == 0)) |
1402 | return -EIO; |
1403 | |
1404 | pos += bytes; |
1405 | length -= bytes; |
1406 | written += bytes; |
1407 | } while (length > 0); |
1408 | |
1409 | if (did_zero) |
1410 | *did_zero = true; |
1411 | return written; |
1412 | } |
1413 | |
1414 | int |
1415 | iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, |
1416 | const struct iomap_ops *ops) |
1417 | { |
1418 | struct iomap_iter iter = { |
1419 | .inode = inode, |
1420 | .pos = pos, |
1421 | .len = len, |
1422 | .flags = IOMAP_ZERO, |
1423 | }; |
1424 | int ret; |
1425 | |
1426 | while ((ret = iomap_iter(iter: &iter, ops)) > 0) |
1427 | iter.processed = iomap_zero_iter(iter: &iter, did_zero); |
1428 | return ret; |
1429 | } |
1430 | EXPORT_SYMBOL_GPL(iomap_zero_range); |
1431 | |
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
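/*
 * Example of the calculation above (illustrative numbers only): with a 4K
 * block size, truncating at pos = 10000 gives off = 10000 & 4095 = 1808, so
 * the remaining 4096 - 1808 = 2288 bytes of that block are zeroed; a
 * block-aligned pos (off == 0) returns immediately with nothing to do.
 */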
1445 | |
1446 | static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, |
1447 | struct folio *folio) |
1448 | { |
1449 | loff_t length = iomap_length(iter); |
1450 | int ret; |
1451 | |
1452 | if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { |
1453 | ret = __block_write_begin_int(folio, pos: iter->pos, len: length, NULL, |
1454 | iomap: &iter->iomap); |
1455 | if (ret) |
1456 | return ret; |
1457 | block_commit_write(page: &folio->page, from: 0, to: length); |
1458 | } else { |
1459 | WARN_ON_ONCE(!folio_test_uptodate(folio)); |
1460 | folio_mark_dirty(folio); |
1461 | } |
1462 | |
1463 | return length; |
1464 | } |
1465 | |
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= file_inode(vmf->vma->vm_file),
		.flags		= IOMAP_WRITE | IOMAP_FAULT,
	};
	struct folio *folio = page_folio(vmf->page);
	ssize_t ret;

	folio_lock(folio);
	ret = folio_mkwrite_check_truncate(folio, iter.inode);
	if (ret < 0)
		goto out_unlock;
	iter.pos = folio_pos(folio);
	iter.len = ret;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_folio_mkwrite_iter(&iter, folio);

	if (ret < 0)
		goto out_unlock;
	folio_wait_stable(folio);
	return VM_FAULT_LOCKED;
out_unlock:
	folio_unlock(folio);
	return vmf_fs_error(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
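/*
 * A minimal wiring sketch (not part of this file; "myfs" and myfs_iomap_ops
 * are hypothetical) for a filesystem's ->page_mkwrite handler:
 *
 *	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		vm_fault_t ret;
 *
 *		sb_start_pagefault(inode->i_sb);
 *		file_update_time(vmf->vma->vm_file);
 *		ret = iomap_page_mkwrite(vmf, &myfs_iomap_ops);
 *		sb_end_pagefault(inode->i_sb);
 *		return ret;
 *	}
 */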
1493 | |
1494 | static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, |
1495 | size_t len) |
1496 | { |
1497 | struct iomap_folio_state *ifs = folio->private; |
1498 | |
1499 | WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); |
1500 | WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); |
1501 | |
1502 | if (!ifs || atomic_sub_and_test(i: len, v: &ifs->write_bytes_pending)) |
1503 | folio_end_writeback(folio); |
1504 | } |
1505 | |
1506 | /* |
1507 | * We're now finished for good with this ioend structure. Update the page |
1508 | * state, release holds on bios, and finally free up memory. Do not use the |
1509 | * ioend after this. |
1510 | */ |
1511 | static u32 |
1512 | iomap_finish_ioend(struct iomap_ioend *ioend, int error) |
1513 | { |
1514 | struct inode *inode = ioend->io_inode; |
1515 | struct bio *bio = &ioend->io_bio; |
1516 | struct folio_iter fi; |
1517 | u32 folio_count = 0; |
1518 | |
1519 | if (error) { |
1520 | mapping_set_error(mapping: inode->i_mapping, error); |
1521 | if (!bio_flagged(bio, bit: BIO_QUIET)) { |
1522 | pr_err_ratelimited( |
1523 | "%s: writeback error on inode %lu, offset %lld, sector %llu", |
1524 | inode->i_sb->s_id, inode->i_ino, |
1525 | ioend->io_offset, ioend->io_sector); |
1526 | } |
1527 | } |
1528 | |
1529 | /* walk all folios in bio, ending page IO on them */ |
1530 | bio_for_each_folio_all(fi, bio) { |
1531 | if (error) |
1532 | folio_set_error(folio: fi.folio); |
1533 | iomap_finish_folio_write(inode, folio: fi.folio, len: fi.length); |
1534 | folio_count++; |
1535 | } |
1536 | |
1537 | bio_put(bio); /* frees the ioend */ |
1538 | return folio_count; |
1539 | } |

/*
 * Ioend completion routine for merged bios. This can only be called from task
 * contexts as merged ioends can be of unbounded length. Hence we have to break
 * up the writeback completions into manageable chunks to avoid long scheduler
 * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
 * good batch processing throughput without creating adverse scheduler latency
 * conditions.
 */
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;
	u32 completions;

	might_sleep();

	list_replace_init(&ioend->io_list, &tmp);
	completions = iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		if (completions > IOEND_BATCH_SIZE * 8) {
			cond_resched();
			completions = 0;
		}
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		completions += iomap_finish_ioend(ioend, error);
	}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * We can merge two adjacent ioends if they have the same set of work to do.
 */
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
		return false;
	if ((ioend->io_flags & IOMAP_F_SHARED) ^
	    (next->io_flags & IOMAP_F_SHARED))
		return false;
	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
	    (next->io_type == IOMAP_UNWRITTEN))
		return false;
	if (ioend->io_offset + ioend->io_size != next->io_offset)
		return false;
	/*
	 * Do not merge physically discontiguous ioends. The filesystem
	 * completion functions will have to iterate the physical
	 * discontiguities even if we merge the ioends at a logical level, so
	 * we don't gain anything by merging physical discontiguities here.
	 *
	 * We cannot use bio->bi_iter.bi_sector here as it is modified during
	 * submission so does not point to the start sector of the bio at
	 * completion.
	 */
	if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
		return false;
	return true;
}

void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
	struct iomap_ioend *next;

	INIT_LIST_HEAD(&ioend->io_list);

	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
			io_list))) {
		if (!iomap_ioend_can_merge(ioend, next))
			break;
		list_move_tail(&next->io_list, &ioend->io_list);
		ioend->io_size += next->io_size;
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

static int
iomap_ioend_compare(void *priv, const struct list_head *a,
		const struct list_head *b)
{
	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);

	if (ia->io_offset < ib->io_offset)
		return -1;
	if (ia->io_offset > ib->io_offset)
		return 1;
	return 0;
}

void
iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
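
/*
 * Example usage (an illustrative sketch; the "myfs" names and locking are
 * hypothetical): a filesystem that defers ioend completion to a workqueue can
 * combine iomap_sort_ioends(), iomap_ioend_try_merge() and
 * iomap_finish_ioends() so that each batch of completed ioends is processed
 * with as few completion calls as possible:
 *
 *	static void myfs_end_io_work(struct work_struct *work)
 *	{
 *		struct myfs_mount *mp = container_of(work, struct myfs_mount,
 *						     ioend_work);
 *		struct iomap_ioend *ioend;
 *		LIST_HEAD(completed);
 *
 *		// Splice the ioends accumulated by the bio end_io handler.
 *		spin_lock_irq(&mp->ioend_lock);
 *		list_splice_init(&mp->ioend_list, &completed);
 *		spin_unlock_irq(&mp->ioend_lock);
 *
 *		iomap_sort_ioends(&completed);
 *		while ((ioend = list_first_entry_or_null(&completed,
 *				struct iomap_ioend, io_list))) {
 *			list_del_init(&ioend->io_list);
 *			iomap_ioend_try_merge(ioend, &completed);
 *			iomap_finish_ioends(ioend,
 *				blk_status_to_errno(ioend->io_bio.bi_status));
 *		}
 *	}
 */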

static void iomap_writepage_end_bio(struct bio *bio)
{
	iomap_finish_ioend(iomap_ioend_from_bio(bio),
			blk_status_to_errno(bio->bi_status));
}

/*
 * Submit the final bio for an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we've marked pages for writeback.
 * We cannot cancel the ioend directly in that case, so call the bio end I/O
 * handler with the error status here to run the normal I/O completion handler
 * to clear the writeback bit and let the file system process the errors.
 */
static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
	if (!wpc->ioend)
		return error;

	/*
	 * Let the file systems prepare the I/O submission and hook in an I/O
	 * completion handler.  This also needs to happen after a failure so
	 * that the file system end I/O handler gets called to clean up.
	 */
	if (wpc->ops->prepare_ioend)
		error = wpc->ops->prepare_ioend(wpc->ioend, error);

	if (error) {
		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
		bio_endio(&wpc->ioend->io_bio);
	} else {
		submit_bio(&wpc->ioend->io_bio);
	}

	wpc->ioend = NULL;
	return error;
}

static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode, loff_t pos)
{
	struct iomap_ioend *ioend;
	struct bio *bio;

	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
			       REQ_OP_WRITE | wbc_to_write_flags(wbc),
			       GFP_NOFS, &iomap_ioend_bioset);
	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
	bio->bi_end_io = iomap_writepage_end_bio;
	wbc_init_bio(wbc, bio);
	bio->bi_write_hint = inode->i_write_hint;

	ioend = iomap_ioend_from_bio(bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = wpc->iomap.type;
	ioend->io_flags = wpc->iomap.flags;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = pos;
	ioend->io_sector = bio->bi_iter.bi_sector;

	wpc->nr_folios = 0;
	return ioend;
}

static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
		return false;
	if (wpc->iomap.type != wpc->ioend->io_type)
		return false;
	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
		return false;
	if (iomap_sector(&wpc->iomap, pos) !=
	    bio_end_sector(&wpc->ioend->io_bio))
		return false;
	/*
	 * Limit ioend bio chain lengths to minimise IO completion latency. This
	 * also prevents long tight loops ending page writeback on all the
	 * folios in the ioend.
	 */
	if (wpc->nr_folios >= IOEND_BATCH_SIZE)
		return false;
	return true;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first; otherwise finish off the current ioend and start another.
 *
 * If a new ioend is created and cached, the old ioend is submitted to the block
 * layer instantly. Batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, loff_t pos, unsigned len)
{
	struct iomap_folio_state *ifs = folio->private;
	size_t poff = offset_in_folio(folio, pos);
	int error;

	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
new_ioend:
		error = iomap_submit_ioend(wpc, 0);
		if (error)
			return error;
		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
	}

	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
		goto new_ioend;

	if (ifs)
		atomic_add(len, &ifs->write_bytes_pending);
	wpc->ioend->io_size += len;
	wbc_account_cgroup_owner(wbc, &folio->page, len);
	return 0;
}

static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, u64 pos, unsigned dirty_len,
		unsigned *count)
{
	int error;

	do {
		unsigned map_len;

		error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
		if (error)
			break;
		trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);

		map_len = min_t(u64, dirty_len,
			wpc->iomap.offset + wpc->iomap.length - pos);
		WARN_ON_ONCE(!folio->private && map_len < dirty_len);

		switch (wpc->iomap.type) {
		case IOMAP_INLINE:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		case IOMAP_HOLE:
			break;
		default:
			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
					map_len);
			if (!error)
				(*count)++;
			break;
		}
		dirty_len -= map_len;
		pos += map_len;
	} while (dirty_len && !error);

	/*
	 * We cannot cancel the ioend directly here on error. We may have
	 * already set other pages under writeback and hence we have to run I/O
	 * completion to mark the error state of the pages under writeback
	 * appropriately.
	 *
	 * Just let the file system know what portion of the folio failed to
	 * map.
	 */
	if (error && wpc->ops->discard_folio)
		wpc->ops->discard_folio(folio, pos);
	return error;
}
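
/*
 * Example (an illustrative sketch; the "myfs" names are hypothetical): the
 * ->map_blocks callback invoked above is expected to fill wpc->iomap with a
 * mapping that covers @pos, i.e. wpc->iomap.offset <= pos and
 * pos < wpc->iomap.offset + wpc->iomap.length.  A trivial filesystem whose
 * file blocks sit 1:1 on the block device might implement it roughly as:
 *
 *	static int myfs_map_blocks(struct iomap_writepage_ctx *wpc,
 *			struct inode *inode, loff_t pos, unsigned len)
 *	{
 *		// Reuse the cached mapping if it still covers pos.
 *		if (wpc->iomap.length &&
 *		    pos >= wpc->iomap.offset &&
 *		    pos < wpc->iomap.offset + wpc->iomap.length)
 *			return 0;
 *
 *		wpc->iomap.type = IOMAP_MAPPED;
 *		wpc->iomap.flags = 0;
 *		wpc->iomap.bdev = inode->i_sb->s_bdev;
 *		wpc->iomap.offset = round_down(pos, i_blocksize(inode));
 *		wpc->iomap.addr = wpc->iomap.offset;
 *		wpc->iomap.length = i_blocksize(inode);
 *		return 0;
 *	}
 *
 * Real filesystems look up (and possibly allocate or convert) extents here,
 * usually return mappings spanning many blocks, and must revalidate any
 * cached mapping before reusing it.
 */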

/*
 * Check interaction of the folio with the file end.
 *
 * If the folio is entirely beyond i_size, return false.  If it straddles
 * i_size, adjust end_pos and zero all data beyond i_size.
 */
static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
		u64 *end_pos)
{
	u64 isize = i_size_read(inode);

	if (*end_pos > isize) {
		size_t poff = offset_in_folio(folio, isize);
		pgoff_t end_index = isize >> PAGE_SHIFT;

		/*
		 * If the folio is entirely outside of i_size, skip it.
		 *
		 * This can happen due to a truncate operation that is in
		 * progress and in that case truncate will finish it off once
		 * we've dropped the folio lock.
		 *
		 * Note that the pgoff_t used for end_index is an unsigned long.
		 * If the given offset is greater than 16TB on a 32-bit system,
		 * then if we checked if the folio is fully outside i_size with
		 * "if (folio->index >= end_index + 1)", "end_index + 1" would
		 * overflow and evaluate to 0.  Hence this folio would be
		 * redirtied and written out repeatedly, which would result in
		 * an infinite loop; the user program performing this operation
		 * would hang.  Instead, we can detect this situation by
		 * checking if the folio is totally beyond i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (folio->index > end_index ||
		    (folio->index == end_index && poff == 0))
			return false;

		/*
		 * The folio straddles i_size.
		 *
		 * It must be zeroed out on each and every writepage invocation
		 * because it may be mmapped:
		 *
		 *    A file is mapped in multiples of the page size.  For a
		 *    file that is not a multiple of the page size, the
		 *    remaining memory is zeroed when mapped, and writes to
		 *    that region are not written out to the file.
		 *
		 * Also adjust the writeback range to skip all blocks entirely
		 * beyond i_size.
		 */
		folio_zero_segment(folio, poff, folio_size(folio));
		*end_pos = round_up(isize, i_blocksize(inode));
	}

	return true;
}
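
/*
 * For example (illustrative numbers): with a 4096-byte block size, a 16KiB
 * folio at file offset 0 and i_size == 10000, *end_pos is 16384 on entry.
 * Since that is beyond i_size, bytes 10000..16383 of the folio are zeroed and
 * *end_pos is rounded up to 12288, so only the first three blocks are
 * considered for writeback.
 */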

static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	u64 pos = folio_pos(folio);
	u64 end_pos = pos + folio_size(folio);
	unsigned count = 0;
	int error = 0;
	u32 rlen;

	WARN_ON_ONCE(!folio_test_locked(folio));
	WARN_ON_ONCE(folio_test_dirty(folio));
	WARN_ON_ONCE(folio_test_writeback(folio));

	trace_iomap_writepage(inode, pos, folio_size(folio));

	if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
		folio_unlock(folio);
		return 0;
	}
	WARN_ON_ONCE(end_pos <= pos);

	if (i_blocks_per_folio(inode, folio) > 1) {
		if (!ifs) {
			ifs = ifs_alloc(inode, folio, 0);
			iomap_set_range_dirty(folio, 0, end_pos - pos);
		}

		/*
		 * Keep the I/O completion handler from clearing the writeback
		 * bit until we have submitted all blocks by adding a bias to
		 * ifs->write_bytes_pending, which is dropped after submitting
		 * all blocks.
		 */
		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
		atomic_inc(&ifs->write_bytes_pending);
	}

	/*
	 * Set the writeback bit ASAP, as the I/O completion for the single
	 * block per folio case can happen as soon as we're submitting the bio.
	 */
	folio_start_writeback(folio);

	/*
	 * Walk through the folio to find dirty areas to write back.
	 */
	while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
				pos, rlen, &count);
		if (error)
			break;
		pos += rlen;
	}

	if (count)
		wpc->nr_folios++;

	/*
	 * We can have dirty bits set past end of file in page_mkwrite path
	 * while mapping the last partial folio. Hence it's better to clear
	 * all the dirty bits in the folio here.
	 */
	iomap_clear_range_dirty(folio, 0, folio_size(folio));

	/*
	 * Usually the writeback bit is cleared by the I/O completion handler.
	 * But we may end up either not actually writing any blocks, or (when
	 * there are multiple blocks in a folio) all I/O might have finished
	 * already at this point.  In that case we need to clear the writeback
	 * bit ourselves right after unlocking the folio.
	 */
	folio_unlock(folio);
	if (ifs) {
		if (atomic_dec_and_test(&ifs->write_bytes_pending))
			folio_end_writeback(folio);
	} else {
		if (!count)
			folio_end_writeback(folio);
	}
	mapping_set_error(inode->i_mapping, error);
	return error;
}

static int iomap_do_writepage(struct folio *folio,
		struct writeback_control *wbc, void *data)
{
	return iomap_writepage_map(data, wbc, folio);
}

int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{
	int ret;

	/*
	 * Writeback from reclaim context should never happen except in the
	 * case of a VM regression, so warn about it and refuse to write the
	 * data.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
			PF_MEMALLOC))
		return -EIO;

	wpc->ops = ops;
	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
	return iomap_submit_ioend(wpc, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
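
/*
 * Example usage (an illustrative sketch; the "myfs" names are hypothetical):
 * a filesystem hooks buffered writeback up by providing iomap_writeback_ops
 * with at least ->map_blocks (see the myfs_map_blocks() sketch above) and
 * calling iomap_writepages() from its ->writepages address_space operation
 * with a stack-allocated (or embedded) iomap_writepage_ctx:
 *
 *	static const struct iomap_writeback_ops myfs_writeback_ops = {
 *		.map_blocks	= myfs_map_blocks,
 *	};
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		struct iomap_writepage_ctx wpc = { };
 *
 *		return iomap_writepages(mapping, wbc, &wpc,
 *				&myfs_writeback_ops);
 *	}
 *
 * The optional ->prepare_ioend and ->discard_folio methods let the filesystem
 * override bio completion (e.g. to defer it to a workqueue, as sketched after
 * iomap_sort_ioends() above) and to clean up reservations for ranges that
 * failed to map.
 */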

static int __init iomap_init(void)
{
	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
			   offsetof(struct iomap_ioend, io_bio),
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);