// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation. We need to set PG_fscache on any
 * folios we're going to write back before we unlock them.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_folio *finfo;
	struct folio *folio;
	pgoff_t start_page = rreq->start / PAGE_SIZE;
	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
	size_t account = 0;
	bool subreq_failed = false;

	XA_STATE(xas, &rreq->mapping->i_pages, start_page);

	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
		}
	}

	/* Walk through the pagecache and the I/O request lists simultaneously.
	 * We may have a mixture of cached and uncached sections and we only
	 * really want to write out the uncached sections. This is slightly
	 * complicated by the possibility that we might have huge pages with a
	 * mixture inside.
	 */
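	/* For instance (illustrative numbers only): a 2MiB large folio might
	 * be covered by several subrequests, some satisfied from the cache
	 * and some freshly downloaded and so marked for writing back to the
	 * cache. Each folio therefore has to be checked against every
	 * subrequest that overlaps it before it can be marked uptodate and
	 * unlocked.
	 */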
	subreq = list_first_entry(&rreq->subrequests,
				  struct netfs_io_subrequest, rreq_link);
	subreq_failed = (subreq->error < 0);

	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_page) {
		loff_t pg_end;
		bool pg_failed = false;
		bool folio_started;

		if (xas_retry(&xas, folio))
			continue;

		pg_end = folio_pos(folio) + folio_size(folio) - 1;

		folio_started = false;
		for (;;) {
			loff_t sreq_end;

			if (!subreq) {
				pg_failed = true;
				break;
			}
			if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_start_fscache(folio);
				folio_started = true;
			}
			pg_failed |= subreq_failed;
			sreq_end = subreq->start + subreq->len - 1;
			if (pg_end < sreq_end)
				break;

			account += subreq->transferred;
			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
				subreq = list_next_entry(subreq, rreq_link);
				subreq_failed = (subreq->error < 0);
			} else {
				subreq = NULL;
				subreq_failed = false;
			}

			if (pg_end == sreq_end)
				break;
		}

		if (!pg_failed) {
			flush_dcache_folio(folio);
			finfo = netfs_folio_info(folio);
			if (finfo) {
				trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				kfree(finfo);
			}
			folio_mark_uptodate(folio);
		}

		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
			if (folio->index == rreq->no_unlock_folio &&
			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
				_debug("no unlock");
			else
				folio_unlock(folio);
		}
	}
	rcu_read_unlock();

	task_io_account_read(account);
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
}

static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
					 loff_t *_start, size_t *_len, loff_t i_size)
{
	struct netfs_cache_resources *cres = &rreq->cache_resources;

	if (cres->ops && cres->ops->expand_readahead)
		cres->ops->expand_readahead(cres, _start, _len, i_size);
}

static void netfs_rreq_expand(struct netfs_io_request *rreq,
			      struct readahead_control *ractl)
{
	/* Give the cache a chance to change the request parameters. The
	 * resultant request must contain the original region.
	 */
	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

	/* Give the netfs a chance to change the request parameters. The
	 * resultant request must contain the original region.
	 */
	if (rreq->netfs_ops->expand_readahead)
		rreq->netfs_ops->expand_readahead(rreq);

	/* Expand the request if the cache wants it to start earlier. Note
	 * that the expansion may get further extended if the VM wishes to
	 * insert THPs and the preferred start and/or end wind up in the middle
	 * of THPs.
	 *
	 * If this is the case, however, the THP size should be an integer
	 * multiple of the cache granule size, so we get a whole number of
	 * granules to deal with.
	 */
	if (rreq->start != readahead_pos(ractl) ||
	    rreq->len != readahead_length(ractl)) {
		readahead_expand(ractl, rreq->start, rreq->len);
		rreq->start = readahead_pos(ractl);
		rreq->len = readahead_length(ractl);

		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
				 netfs_read_trace_expanded);
	}
}
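
/*
 * A worked example of the expansion above (the numbers are illustrative
 * assumptions, not requirements): if the VM asks for 16KiB at file position
 * 300KiB and the cache happens to store data in, say, 256KiB granules, the
 * cache may widen the request to cover 256KiB-512KiB. readahead_expand() is
 * then asked to add the extra folios to the page cache; whatever it actually
 * manages to add becomes the final request, which is why rreq->start and
 * rreq->len are re-read from the ractl afterwards.
 */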

/*
 * Begin an operation, and fetch the stored zero point value from the cookie if
 * available.
 */
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
	return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}

/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
 * requests from different sources will get munged together. If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
	int ret;

	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

	if (readahead_count(ractl) == 0)
		return;

	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
				   readahead_pos(ractl),
				   readahead_length(ractl),
				   NETFS_READAHEAD);
	if (IS_ERR(rreq))
		return;

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto cleanup_free;

	netfs_stat(&netfs_n_rh_readahead);
	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
			 netfs_read_trace_readahead);

	netfs_rreq_expand(rreq, ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
			rreq->start, rreq->len);

	/* Drop the refs on the folios here rather than in the cache or
	 * filesystem. The locks will be dropped in netfs_rreq_unlock_folios().
	 */
	while (readahead_folio(ractl))
		;

	netfs_begin_read(rreq, false);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return;

cleanup_free:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
	return;
}
EXPORT_SYMBOL(netfs_readahead);
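
/*
 * Usage sketch (hypothetical filesystem "myfs", not any particular
 * filesystem): netfs_readahead() is normally plugged straight into the
 * address_space operations, for example:
 *
 *	const struct address_space_operations myfs_aops = {
 *		.readahead	= netfs_readahead,
 *		...
 *	};
 */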

/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	struct folio *sink = NULL;
	int ret;

	_enter("%lx", folio->index);

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READPAGE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto alloc_error;
	}

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto discard;

	netfs_stat(&netfs_n_rh_readpage);
	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

	/* Set up the output buffer */
	if (folio_test_dirty(folio)) {
		/* Handle someone trying to read from an unflushed streaming
		 * write. We fiddle the buffer so that a gap at the beginning
		 * and/or a gap at the end get copied to, but the middle is
		 * discarded.
		 */
		struct netfs_folio *finfo = netfs_folio_info(folio);
		struct bio_vec *bvec;
		unsigned int from = finfo->dirty_offset;
		unsigned int to = from + finfo->dirty_len;
		unsigned int off = 0, i = 0;
		size_t flen = folio_size(folio);
		size_t nr_bvec = flen / PAGE_SIZE + 2;
		size_t part;

		ret = -ENOMEM;
		bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
		if (!bvec)
			goto discard;

		sink = folio_alloc(GFP_KERNEL, 0);
		if (!sink)
			goto discard;

		trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

		rreq->direct_bv = bvec;
		rreq->direct_bv_count = nr_bvec;
		if (from > 0) {
			bvec_set_folio(&bvec[i++], folio, from, 0);
			off = from;
		}
		while (off < to) {
			part = min_t(size_t, to - off, PAGE_SIZE);
			bvec_set_folio(&bvec[i++], sink, part, 0);
			off += part;
		}
		if (to < flen)
			bvec_set_folio(&bvec[i++], folio, flen - to, to);
		iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
	} else {
		iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
				rreq->start, rreq->len);
	}

	ret = netfs_begin_read(rreq, true);
	if (sink)
		folio_put(sink);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret < 0 ? ret : 0;

discard:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
	folio_unlock(folio);
	return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
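
/*
 * Similarly (same hypothetical "myfs"), netfs_read_folio() is intended to be
 * used directly as the ->read_folio() address_space operation:
 *
 *	.read_folio	= netfs_read_folio,
 */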

/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true. Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
				  bool always_fill)
{
	struct inode *inode = folio_inode(folio);
	loff_t i_size = i_size_read(inode);
	size_t offset = offset_in_folio(folio, pos);
	size_t plen = folio_size(folio);

	if (unlikely(always_fill)) {
		if (pos - offset + len <= i_size)
			return false; /* Page entirely before EOF */
		zero_user_segment(&folio->page, 0, plen);
		folio_mark_uptodate(folio);
		return true;
	}

	/* Full folio write */
	if (offset == 0 && len >= plen)
		return true;

	/* Page entirely beyond the end of the file */
	if (pos - offset >= i_size)
		goto zero_out;

	/* Write that covers from the start of the folio to EOF or beyond */
	if (offset == 0 && (pos + len) >= i_size)
		goto zero_out;

	return false;
zero_out:
	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
	return true;
}
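
/*
 * Worked example (illustrative numbers): take a 4KiB folio at file offset
 * 8KiB with i_size at 9KiB. A 1536-byte write at pos 8KiB starts at offset 0
 * of the folio and runs to or beyond EOF, so the third case applies: the tail
 * of the folio (bytes 1536-4095) is zeroed and no read from the cache or the
 * server is needed before the copy-in takes place.
 */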

/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together. If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_read, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked. It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end. It is permitted to sleep. It should return 0 if the request
 * should go ahead or it may return an error. It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
		      struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned int len, struct folio **_folio,
		      void **_fsdata)
{
	struct netfs_io_request *rreq;
	struct folio *folio;
	pgoff_t index = pos >> PAGE_SHIFT;
	int ret;

	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (ctx->ops->check_write_begin) {
		/* Allow the netfs (eg. ceph) to flush conflicts. */
		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
		if (ret < 0) {
			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
			goto error;
		}
		if (!folio)
			goto retry;
	}

	if (folio_test_uptodate(folio))
		goto have_folio;

	/* If the page is beyond the EOF, we want to clear it - unless it's
	 * within the cache granule containing the EOF, in which case we need
	 * to preload the granule.
	 */
	if (!netfs_is_cache_enabled(ctx) &&
	    netfs_skip_folio_read(folio, pos, len, false)) {
		netfs_stat(&netfs_n_rh_write_zskip);
		goto have_folio_no_wait;
	}

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}
	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

	/* Expand the request to meet caching requirements and download
	 * preferences.
	 */
	ractl._nr_pages = folio_nr_pages(folio);
	netfs_rreq_expand(rreq, &ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	/* We hold the folio locks, so we can drop the references */
	folio_get(folio);
	while (readahead_folio(&ractl))
		;

	ret = netfs_begin_read(rreq, true);
	if (ret < 0)
		goto error;
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);

have_folio:
	ret = folio_wait_fscache_killable(folio);
	if (ret < 0)
		goto error;
have_folio_no_wait:
	*_folio = folio;
	_leave(" = 0");
	return 0;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
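
/*
 * Usage sketch for the above (hypothetical "myfs" again; the exact
 * ->write_begin() prototype varies between kernel versions, so treat this as
 * an outline rather than a drop-in implementation):
 *
 *	static int myfs_write_begin(struct file *file,
 *				    struct address_space *mapping,
 *				    loff_t pos, unsigned int len,
 *				    struct page **pagep, void **fsdata)
 *	{
 *		struct folio *folio;
 *		int ret;
 *
 *		ret = netfs_write_begin(netfs_inode(mapping->host), file,
 *					mapping, pos, len, &folio, fsdata);
 *		if (ret == 0)
 *			*pagep = &folio->page;
 *		return ret;
 *	}
 */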

/*
 * Preload the data into a page we're proposing to write into.
 */
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
			     size_t offset, size_t len)
{
	struct netfs_io_request *rreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t flen = folio_size(folio);
	int ret;

	_enter("%zx @%llx", flen, start);

	ret = -ENOMEM;

	rreq = netfs_alloc_request(mapping, file, start, flen,
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}

	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	ret = netfs_begin_read(rreq, true);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
	_leave(" = %d", ret);
	return ret;
}

/**
 * netfs_buffered_read_iter - Filesystem buffered I/O read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead. When no data can be read,
 * -EAGAIN shall be returned. When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
		return -EINVAL;

	ret = netfs_start_io_read(inode);
	if (ret == 0) {
		ret = filemap_read(iocb, iter, 0);
		netfs_end_io_read(inode);
	}
	return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);

/**
 * netfs_file_read_iter - Generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead. When no data can be read,
 * -EAGAIN shall be returned. When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_read_iter(iocb, iter);

	return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);
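
/*
 * Usage sketch (hypothetical "myfs"): netfs_file_read_iter() is meant to sit
 * directly in the file operations so that buffered, unbuffered and direct
 * reads are all routed appropriately, for example:
 *
 *	const struct file_operations myfs_file_operations = {
 *		.read_iter	= netfs_file_read_iter,
 *		...
 *	};
 */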
