// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2021 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_CALLER_COMP	(1U << 26)
#define IOMAP_DIO_INLINE_COMP	(1U << 27)
#define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
#define IOMAP_DIO_NEED_SYNC	(1U << 29)
#define IOMAP_DIO_WRITE		(1U << 30)
#define IOMAP_DIO_DIRTY		(1U << 31)

struct iomap_dio {
	struct kiocb *iocb;
	const struct iomap_dio_ops *dops;
	loff_t i_size;
	loff_t size;
	atomic_t ref;
	unsigned flags;
	int error;
	size_t done_before;
	bool wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter *iter;
			struct task_struct *waiter;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct work;
		} aio;
	};
};

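/*
 * Allocate a bio for this dio, using the filesystem's bio_set if the
 * dio_ops provide one and falling back to the global bio pool otherwise.
 */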
static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf)
{
	if (dio->dops && dio->dops->bio_set)
		return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
					GFP_KERNEL, dio->dops->bio_set);
	return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
}

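/*
 * Grab a dio reference for the bio, set up polling for HIPRI async iocbs,
 * and submit it either through the filesystem's ->submit_io hook or
 * directly to the block layer.
 */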
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
	struct kiocb *iocb = dio->iocb;

	atomic_inc(&dio->ref);

	/* Sync dio can't be polled reliably */
	if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
		bio_set_polled(bio, iocb);
		WRITE_ONCE(iocb->private, bio);
	}

	if (dio->dops && dio->dops->submit_io)
		dio->dops->submit_io(iter, bio, pos);
	else
		submit_bio(bio);
}

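/*
 * Finish a dio: run the filesystem's ->end_io callback, trim reads that
 * extend past i_size, invalidate the page cache after a successful direct
 * write, advance the iocb position, and handle any O_(D)SYNC requirements
 * before freeing the dio and returning the transfer size or error.
 */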
ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the
	 * source of the write was an mmap'ed region of the file we're writing.
	 * Either one is a pretty crazy thing to do, so we don't support it
	 * 100%.  If this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would
	 * cache zeros from unwritten extents.
	 */
	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
		kiocb_invalidate_post_direct_write(iocb, dio->size);

	inode_dio_end(file_inode(iocb->ki_filp));

	if (ret > 0) {
		iocb->ki_pos += ret;

		/*
		 * If this is a DSYNC write, make sure we push it to stable
		 * storage now that we've written data.
		 */
		if (dio->flags & IOMAP_DIO_NEED_SYNC)
			ret = generic_write_sync(iocb, ret);
		if (ret > 0)
			ret += dio->done_before;
	}
	trace_iomap_dio_complete(iocb, dio->error, ret);
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);

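/*
 * Completion callback assigned to iocb->dio_complete for
 * IOCB_DIO_CALLER_COMP requests; the issuer invokes it from task context
 * to finish the dio.
 */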
static ssize_t iomap_dio_deferred_complete(void *data)
{
	return iomap_dio_complete(data);
}

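/*
 * Workqueue completion handler: finish the dio and invoke the iocb's
 * ->ki_complete handler with the result.
 */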
static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio));
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

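/*
 * Bio completion handler: record any error, and once the last reference to
 * the dio is dropped either wake the waiting task, complete inline, hand
 * the completion back to the issuer, or punt it to a workqueue.
 */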
void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
	struct kiocb *iocb = dio->iocb;

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
	if (!atomic_dec_and_test(&dio->ref))
		goto release_bio;

	/*
	 * Synchronous dio, the submitting task itself will handle any
	 * completion work that is needed after IO.  All we need to do is wake
	 * the task.
	 */
	if (dio->wait_for_completion) {
		struct task_struct *waiter = dio->submit.waiter;

		WRITE_ONCE(dio->submit.waiter, NULL);
		blk_wake_io_task(waiter);
		goto release_bio;
	}

	/*
	 * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline.
	 */
	if (dio->flags & IOMAP_DIO_INLINE_COMP) {
		WRITE_ONCE(iocb->private, NULL);
		iomap_dio_complete_work(&dio->aio.work);
		goto release_bio;
	}

	/*
	 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
	 * our completion that way to avoid an async punt to a workqueue.
	 */
	if (dio->flags & IOMAP_DIO_CALLER_COMP) {
		/* only polled IO cares about private cleared */
		iocb->private = dio;
		iocb->dio_complete = iomap_dio_deferred_complete;

		/*
		 * Invoke ->ki_complete() directly.  We've assigned our
		 * dio_complete callback handler, and since the issuer set
		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
		 * notice ->dio_complete being set and will defer calling that
		 * handler until it can be done from a safe task context.
		 *
		 * Note that the 'res' being passed in here is not important
		 * for this case.  The actual completion value of the request
		 * will be gotten from dio_complete when that is run by the
		 * issuer.
		 */
		iocb->ki_complete(iocb, 0);
		goto release_bio;
	}

	/*
	 * Async DIO completion that requires filesystem level completion work
	 * gets punted to a work queue to complete as the operation may require
	 * more IO to be issued to finalise filesystem metadata changes or
	 * guarantee data integrity.
	 */
	INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
	queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
			&dio->aio.work);
release_bio:
	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);

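/*
 * Submit a zero-filled write for the sub-block head or tail of an extent so
 * that an unaligned or short direct write never exposes stale block contents.
 */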
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
		loff_t pos, unsigned len)
{
	struct inode *inode = file_inode(dio->iocb->ki_filp);
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
				  GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	__bio_add_page(bio, page, len, 0);
	iomap_dio_submit_bio(iter, dio, bio, pos);
}

/*
 * Figure out the bio's operation flags from the dio request, the
 * mapping, and whether or not we want FUA.  Note that we can end up
 * clearing the WRITE_THROUGH flag in the dio request.
 */
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
		const struct iomap *iomap, bool use_fua)
{
	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;

	if (!(dio->flags & IOMAP_DIO_WRITE))
		return REQ_OP_READ;

	opflags |= REQ_OP_WRITE;
	if (use_fua)
		opflags |= REQ_FUA;
	else
		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;

	return opflags;
}

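/*
 * Build and submit bios for one mapped or unwritten extent: check alignment
 * against the device, zero sub-block head/tail ranges where required, pick
 * the bio operation flags, and loop pinning pages from the iter until the
 * extent is consumed or an error occurs.
 */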
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iter->iomap;
	struct inode *inode = iter->inode;
	unsigned int fs_block_size = i_blocksize(inode), pad;
	loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	blk_opf_t bio_opf;
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;
	size_t orig_count;

	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics and this is a
		 * pure data IO that doesn't require any metadata updates
		 * (including after IO completion such as unwritten extent
		 * conversion) and the underlying device either supports FUA or
		 * doesn't have a volatile write cache.  This allows us to
		 * avoid cache flushes on IO completion.  If we can't use
		 * writethrough and need to sync, disable in-task completions
		 * as dio completion will need to call generic_write_sync()
		 * which will do a blocking fsync / cache flush call.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
			use_fua = true;
		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	if (!iov_iter_count(dio->submit.iter))
		goto out;

	/*
	 * We can only do deferred completion for pure overwrites that
	 * don't require additional IO at completion.  This rules out
	 * writes that need zeroing or extent conversion, writes that
	 * extend the file size, and writes that issue journal IO or
	 * cache flushes during completion processing.
	 */
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
		dio->flags &= ~IOMAP_DIO_CALLER_COMP;

	/*
	 * The rules for polled IO completions follow the same guidelines as
	 * the ones we set for inline and deferred completions.  If none of
	 * those are available for this IO, clear the polled flag.
	 */
	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(iter, dio, pos - pad, pad);
	}

	/*
	 * Set the operation flags early so that bio_iov_iter_get_pages
	 * can set up the page vector appropriately for a ZONE_APPEND
	 * operation.
	 */
	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);

	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
	do {
		size_t n;
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
					  GFP_KERNEL);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = inode->i_write_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO.  We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			task_io_account_write(n);
		} else {
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += n;
		copied += n;

		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
						 BIO_MAX_VECS);
		/*
		 * We can only poll for single bio I/Os.
		 */
		if (nr_pages)
			dio->iocb->ki_flags &= ~IOCB_HIPRI;
		iomap_dio_submit_bio(iter, dio, bio, pos);
		pos += n;
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF.  If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return copied;
	return ret;
}

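/*
 * Reads from holes (and from unwritten extents) simply zero-fill the user
 * buffer; no bio is issued.
 */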
static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);

	dio->size += length;
	if (!length)
		return -EFAULT;
	return length;
}

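/*
 * Inline extents are copied directly between the user buffer and the
 * in-inode data; writes beyond the current size zero the gap and update
 * i_size.
 */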
static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iomi->iomap;
	struct iov_iter *iter = dio->submit.iter;
	void *inline_data = iomap_inline_data(iomap, iomi->pos);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	size_t copied;

	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
		return -EIO;

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = iomi->inode->i_size;

		if (pos > size)
			memset(iomap_inline_data(iomap, size), 0, pos - size);
		copied = copy_from_iter(inline_data, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(iomi->inode, pos + copied);
			mark_inode_dirty(iomi->inode);
		}
	} else {
		copied = copy_to_iter(inline_data, length, iter);
	}
	dio->size += copied;
	if (!copied)
		return -EFAULT;
	return copied;
}

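/*
 * Dispatch one iomap extent to the appropriate handler based on its type.
 */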
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	switch (iter->iomap.type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_iter(iter, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_iter(iter, dio);
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_MAPPED:
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_INLINE:
		return iomap_dio_inline_iter(iter, dio);
	case IOMAP_DELALLOC:
		/*
		 * DIO is not serialised against mmap() access at all, and so
		 * if the page_mkwrite occurs between the writeback and the
		 * iomap_iter() call in the DIO path, then it will see the
		 * DELALLOC block that the page-mkwrite allocated.
		 */
		pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
				    dio->iocb->ki_filp, current->comm);
		return -EIO;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the
 * IO is being issued as AIO or not.  This allows us to optimise pure data
 * writes to use REQ_FUA rather than requiring generic_write_sync() to issue
 * a REQ_FLUSH post write.  This is slightly tricky because a single request
 * here can be mapped into multiple disjoint IOs and only a subset of the IOs
 * issued may be pure data writes.  In that case, we still need to do a full
 * data sync completion.
 *
 * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
 * __iomap_dio_rw can return a partial result if it encounters a non-resident
 * page in @iter after preparing a transfer.  In that case, the non-resident
 * pages can be faulted in and the request resumed with @done_before set to
 * the number of bytes previously transferred.  The request will then complete
 * with the correct total number of bytes transferred; this is essential for
 * completing partial requests asynchronously.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.
 * The caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct iomap_iter iomi = {
		.inode		= inode,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DIRECT,
		.private	= private,
	};
	bool wait_for_completion =
		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
	struct blk_plug plug;
	struct iomap_dio *dio;
	loff_t ret = 0;

	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

	if (!iomi.len)
		return NULL;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return ERR_PTR(-ENOMEM);

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;
	dio->done_before = done_before;

	dio->submit.iter = iter;
	dio->submit.waiter = current;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	if (iov_iter_rw(iter) == READ) {
		/* reads can always complete inline */
		dio->flags |= IOMAP_DIO_INLINE_COMP;

		if (iomi.pos >= dio->i_size)
			goto out_free_dio;

		if (user_backed_iter(iter))
			dio->flags |= IOMAP_DIO_DIRTY;

		ret = kiocb_write_and_wait(iocb, iomi.len);
		if (ret)
			goto out_free_dio;
	} else {
		iomi.flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/*
		 * Flag as supporting deferred completions, if the issuer
		 * groks it.  This can avoid a workqueue punt for writes.
		 * We may later clear this flag if we need to do other IO
		 * as part of this IO completion.
		 */
		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
			dio->flags |= IOMAP_DIO_CALLER_COMP;

		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
			ret = -EAGAIN;
			if (iomi.pos >= dio->i_size ||
			    iomi.pos + iomi.len > dio->i_size)
				goto out_free_dio;
			iomi.flags |= IOMAP_OVERWRITE_ONLY;
		}

		/* for data sync or sync, we need sync completion processing */
		if (iocb_is_dsync(iocb)) {
			dio->flags |= IOMAP_DIO_NEED_SYNC;

			/*
			 * For datasync-only writes, we optimistically try
			 * using WRITE_THROUGH for this IO.  This flag requires
			 * either FUA writes through the device's write cache,
			 * or a normal write to a device without a volatile
			 * write cache.  For the former, any non-FUA write that
			 * occurs will clear this flag, hence we know before
			 * completion whether a cache flush is necessary.
			 */
			if (!(iocb->ki_flags & IOCB_SYNC))
				dio->flags |= IOMAP_DIO_WRITE_THROUGH;
		}

		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		ret = kiocb_invalidate_pages(iocb, iomi.len);
		if (ret) {
			if (ret != -EAGAIN) {
				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
								iomi.len);
				ret = -ENOTBLK;
			}
			goto out_free_dio;
		}

		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
			ret = sb_init_dio_done_wq(inode->i_sb);
			if (ret < 0)
				goto out_free_dio;
		}
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	while ((ret = iomap_iter(&iomi, ops)) > 0) {
		iomi.processed = iomap_dio_iter(&iomi, dio);

		/*
		 * We can only poll for single bio I/Os.
		 */
		iocb->ki_flags &= ~IOCB_HIPRI;
	}

	blk_finish_plug(&plug);

	/*
	 * We only report that we've read data up to i_size.
	 * Revert iter to a state corresponding to that as some callers (such
	 * as the splice code) rely on it.
	 */
	if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
		iov_iter_revert(iter, iomi.pos - dio->i_size);

	if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
		if (!(iocb->ki_flags & IOCB_NOWAIT))
			wait_for_completion = true;
		ret = 0;
	}

	/* magic error code to fall back to buffered I/O */
	if (ret == -ENOTBLK) {
		wait_for_completion = true;
		ret = 0;
	}
	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were already written through to the
	 * media, we don't need to flush the cache on IO completion.  Clear the
	 * sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion) {
			trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
			return ERR_PTR(-EIOCBQUEUED);
		}

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			blk_io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return dio;

out_free_dio:
	kfree(dio);
	if (ret)
		return ERR_PTR(ret);
	return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);

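/*
 * Convenience wrapper around __iomap_dio_rw() that completes the dio when
 * the I/O did not go asynchronous, returning the byte count or a negative
 * error.
 */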
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct iomap_dio *dio;

	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
			     done_before);
	if (IS_ERR_OR_NULL(dio))
		return PTR_ERR_OR_ZERO(dio);
	return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);