1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * linux/fs/nfs/direct.c |
4 | * |
5 | * Copyright (C) 2003 by Chuck Lever <cel@netapp.com> |
6 | * |
7 | * High-performance uncached I/O for the Linux NFS client |
8 | * |
9 | * There are important applications whose performance or correctness |
10 | * depends on uncached access to file data. Database clusters |
11 | * (multiple copies of the same instance running on separate hosts) |
12 | * implement their own cache coherency protocol that subsumes file |
13 | * system cache protocols. Applications that process datasets |
14 | * considerably larger than the client's memory do not always benefit |
15 | * from a local cache. A streaming video server, for instance, has no |
16 | * need to cache the contents of a file. |
17 | * |
18 | * When an application requests uncached I/O, all read and write requests |
19 | * are made directly to the server; data stored or fetched via these |
20 | * requests is not cached in the Linux page cache. The client does not |
21 | * correct unaligned requests from applications. All requested bytes are |
22 | * held on permanent storage before a direct write system call returns to |
23 | * an application. |
24 | * |
25 | * Solaris implements an uncached I/O facility called directio() that |
26 | * is used for backups and sequential I/O to very large files. Solaris |
27 | * also supports uncaching whole NFS partitions with "-o forcedirectio," |
28 | * an undocumented mount option. |
29 | * |
30 | * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with |
31 | * help from Andrew Morton. |
32 | * |
33 | * 18 Dec 2001 Initial implementation for 2.4 --cel |
34 | * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy |
35 | * 08 Jun 2003 Port to 2.5 APIs --cel |
36 | * 31 Mar 2004 Handle direct I/O without VFS support --cel |
37 | * 15 Sep 2004 Parallel async reads --cel |
38 | * 04 May 2005 support O_DIRECT with aio --cel |
39 | * |
40 | */ |
41 | |
42 | #include <linux/errno.h> |
43 | #include <linux/sched.h> |
44 | #include <linux/kernel.h> |
45 | #include <linux/file.h> |
46 | #include <linux/pagemap.h> |
47 | #include <linux/kref.h> |
48 | #include <linux/slab.h> |
49 | #include <linux/task_io_accounting_ops.h> |
50 | #include <linux/module.h> |
51 | |
52 | #include <linux/nfs_fs.h> |
53 | #include <linux/nfs_page.h> |
54 | #include <linux/sunrpc/clnt.h> |
55 | |
56 | #include <linux/uaccess.h> |
57 | #include <linux/atomic.h> |
58 | |
59 | #include "internal.h" |
60 | #include "iostat.h" |
61 | #include "pnfs.h" |
62 | #include "fscache.h" |
63 | #include "nfstrace.h" |
64 | |
65 | #define NFSDBG_FACILITY NFSDBG_VFS |
66 | |
67 | static struct kmem_cache *nfs_direct_cachep; |
68 | |
69 | static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; |
70 | static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; |
71 | static void nfs_direct_write_complete(struct nfs_direct_req *dreq); |
72 | static void nfs_direct_write_schedule_work(struct work_struct *work); |
73 | |
74 | static inline void get_dreq(struct nfs_direct_req *dreq) |
75 | { |
76 | atomic_inc(v: &dreq->io_count); |
77 | } |
78 | |
79 | static inline int put_dreq(struct nfs_direct_req *dreq) |
80 | { |
81 | return atomic_dec_and_test(v: &dreq->io_count); |
82 | } |
83 | |
84 | static void |
85 | nfs_direct_handle_truncated(struct nfs_direct_req *dreq, |
86 | const struct nfs_pgio_header *hdr, |
87 | ssize_t dreq_len) |
88 | { |
89 | if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || |
90 | test_bit(NFS_IOHDR_EOF, &hdr->flags))) |
91 | return; |
92 | if (dreq->max_count >= dreq_len) { |
93 | dreq->max_count = dreq_len; |
94 | if (dreq->count > dreq_len) |
95 | dreq->count = dreq_len; |
96 | } |
97 | |
98 | if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) |
99 | dreq->error = hdr->error; |
100 | } |
101 | |
102 | static void |
103 | nfs_direct_count_bytes(struct nfs_direct_req *dreq, |
104 | const struct nfs_pgio_header *hdr) |
105 | { |
106 | loff_t hdr_end = hdr->io_start + hdr->good_bytes; |
107 | ssize_t dreq_len = 0; |
108 | |
109 | if (hdr_end > dreq->io_start) |
110 | dreq_len = hdr_end - dreq->io_start; |
111 | |
112 | nfs_direct_handle_truncated(dreq, hdr, dreq_len); |
113 | |
114 | if (dreq_len > dreq->max_count) |
115 | dreq_len = dreq->max_count; |
116 | |
117 | if (dreq->count < dreq_len) |
118 | dreq->count = dreq_len; |
119 | } |
120 | |
121 | static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, |
122 | struct nfs_page *req) |
123 | { |
124 | loff_t offs = req_offset(req); |
125 | size_t req_start = (size_t)(offs - dreq->io_start); |
126 | |
127 | if (req_start < dreq->max_count) |
128 | dreq->max_count = req_start; |
129 | if (req_start < dreq->count) |
130 | dreq->count = req_start; |
131 | } |
132 | |
133 | /** |
134 | * nfs_swap_rw - NFS address space operation for swap I/O |
135 | * @iocb: target I/O control block |
136 | * @iter: I/O buffer |
137 | * |
138 | * Perform IO to the swap-file. This is much like direct IO. |
139 | */ |
140 | int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) |
141 | { |
142 | ssize_t ret; |
143 | |
144 | VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); |
145 | |
146 | if (iov_iter_rw(i: iter) == READ) |
147 | ret = nfs_file_direct_read(iocb, iter, swap: true); |
148 | else |
149 | ret = nfs_file_direct_write(iocb, iter, swap: true); |
150 | if (ret < 0) |
151 | return ret; |
152 | return 0; |
153 | } |
154 | |
/* Call put_page() on each of the first @npages entries of @pages. */
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int idx = 0;

	while (idx < npages) {
		put_page(pages[idx]);
		idx++;
	}
}
161 | |
162 | void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, |
163 | struct nfs_direct_req *dreq) |
164 | { |
165 | cinfo->inode = dreq->inode; |
166 | cinfo->mds = &dreq->mds_cinfo; |
167 | cinfo->ds = &dreq->ds_cinfo; |
168 | cinfo->dreq = dreq; |
169 | cinfo->completion_ops = &nfs_direct_commit_completion_ops; |
170 | } |
171 | |
172 | static inline struct nfs_direct_req *nfs_direct_req_alloc(void) |
173 | { |
174 | struct nfs_direct_req *dreq; |
175 | |
176 | dreq = kmem_cache_zalloc(k: nfs_direct_cachep, GFP_KERNEL); |
177 | if (!dreq) |
178 | return NULL; |
179 | |
180 | kref_init(kref: &dreq->kref); |
181 | kref_get(kref: &dreq->kref); |
182 | init_completion(x: &dreq->completion); |
183 | INIT_LIST_HEAD(list: &dreq->mds_cinfo.list); |
184 | pnfs_init_ds_commit_info(fl_cinfo: &dreq->ds_cinfo); |
185 | INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); |
186 | spin_lock_init(&dreq->lock); |
187 | |
188 | return dreq; |
189 | } |
190 | |
191 | static void nfs_direct_req_free(struct kref *kref) |
192 | { |
193 | struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); |
194 | |
195 | pnfs_release_ds_info(fl_cinfo: &dreq->ds_cinfo, inode: dreq->inode); |
196 | if (dreq->l_ctx != NULL) |
197 | nfs_put_lock_context(l_ctx: dreq->l_ctx); |
198 | if (dreq->ctx != NULL) |
199 | put_nfs_open_context(ctx: dreq->ctx); |
200 | kmem_cache_free(s: nfs_direct_cachep, objp: dreq); |
201 | } |
202 | |
203 | static void nfs_direct_req_release(struct nfs_direct_req *dreq) |
204 | { |
205 | kref_put(kref: &dreq->kref, release: nfs_direct_req_free); |
206 | } |
207 | |
208 | ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) |
209 | { |
210 | return dreq->bytes_left; |
211 | } |
212 | EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); |
213 | |
214 | /* |
215 | * Collects and returns the final error value/byte-count. |
216 | */ |
217 | static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) |
218 | { |
219 | ssize_t result = -EIOCBQUEUED; |
220 | |
221 | /* Async requests don't wait here */ |
222 | if (dreq->iocb) |
223 | goto out; |
224 | |
225 | result = wait_for_completion_killable(x: &dreq->completion); |
226 | |
227 | if (!result) { |
228 | result = dreq->count; |
229 | WARN_ON_ONCE(dreq->count < 0); |
230 | } |
231 | if (!result) |
232 | result = dreq->error; |
233 | |
234 | out: |
235 | return (ssize_t) result; |
236 | } |
237 | |
238 | /* |
239 | * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust |
240 | * the iocb is still valid here if this is a synchronous request. |
241 | */ |
242 | static void nfs_direct_complete(struct nfs_direct_req *dreq) |
243 | { |
244 | struct inode *inode = dreq->inode; |
245 | |
246 | inode_dio_end(inode); |
247 | |
248 | if (dreq->iocb) { |
249 | long res = (long) dreq->error; |
250 | if (dreq->count != 0) { |
251 | res = (long) dreq->count; |
252 | WARN_ON_ONCE(dreq->count < 0); |
253 | } |
254 | dreq->iocb->ki_complete(dreq->iocb, res); |
255 | } |
256 | |
257 | complete(&dreq->completion); |
258 | |
259 | nfs_direct_req_release(dreq); |
260 | } |
261 | |
262 | static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) |
263 | { |
264 | unsigned long bytes = 0; |
265 | struct nfs_direct_req *dreq = hdr->dreq; |
266 | |
267 | spin_lock(lock: &dreq->lock); |
268 | if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { |
269 | spin_unlock(lock: &dreq->lock); |
270 | goto out_put; |
271 | } |
272 | |
273 | nfs_direct_count_bytes(dreq, hdr); |
274 | spin_unlock(lock: &dreq->lock); |
275 | |
276 | while (!list_empty(head: &hdr->pages)) { |
277 | struct nfs_page *req = nfs_list_entry(head: hdr->pages.next); |
278 | struct page *page = req->wb_page; |
279 | |
280 | if (!PageCompound(page) && bytes < hdr->good_bytes && |
281 | (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY)) |
282 | set_page_dirty(page); |
283 | bytes += req->wb_bytes; |
284 | nfs_list_remove_request(req); |
285 | nfs_release_request(req); |
286 | } |
287 | out_put: |
288 | if (put_dreq(dreq)) |
289 | nfs_direct_complete(dreq); |
290 | hdr->release(hdr); |
291 | } |
292 | |
293 | static void nfs_read_sync_pgio_error(struct list_head *head, int error) |
294 | { |
295 | struct nfs_page *req; |
296 | |
297 | while (!list_empty(head)) { |
298 | req = nfs_list_entry(head: head->next); |
299 | nfs_list_remove_request(req); |
300 | nfs_release_request(req); |
301 | } |
302 | } |
303 | |
304 | static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) |
305 | { |
306 | get_dreq(dreq: hdr->dreq); |
307 | } |
308 | |
309 | static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { |
310 | .error_cleanup = nfs_read_sync_pgio_error, |
311 | .init_hdr = nfs_direct_pgio_init, |
312 | .completion = nfs_direct_read_completion, |
313 | }; |
314 | |
315 | /* |
316 | * For each rsize'd chunk of the user's buffer, dispatch an NFS READ |
317 | * operation. If nfs_readdata_alloc() or get_user_pages() fails, |
318 | * bail and stop sending more reads. Read length accounting is |
319 | * handled automatically by nfs_direct_read_result(). Otherwise, if |
320 | * no requests have been sent, just return an error. |
321 | */ |
322 | |
323 | static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, |
324 | struct iov_iter *iter, |
325 | loff_t pos) |
326 | { |
327 | struct nfs_pageio_descriptor desc; |
328 | struct inode *inode = dreq->inode; |
329 | ssize_t result = -EINVAL; |
330 | size_t requested_bytes = 0; |
331 | size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); |
332 | |
333 | nfs_pageio_init_read(pgio: &desc, inode: dreq->inode, force_mds: false, |
334 | compl_ops: &nfs_direct_read_completion_ops); |
335 | get_dreq(dreq); |
336 | desc.pg_dreq = dreq; |
337 | inode_dio_begin(inode); |
338 | |
339 | while (iov_iter_count(i: iter)) { |
340 | struct page **pagevec; |
341 | size_t bytes; |
342 | size_t pgbase; |
343 | unsigned npages, i; |
344 | |
345 | result = iov_iter_get_pages_alloc2(i: iter, pages: &pagevec, |
346 | maxsize: rsize, start: &pgbase); |
347 | if (result < 0) |
348 | break; |
349 | |
350 | bytes = result; |
351 | npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; |
352 | for (i = 0; i < npages; i++) { |
353 | struct nfs_page *req; |
354 | unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); |
355 | /* XXX do we need to do the eof zeroing found in async_filler? */ |
356 | req = nfs_page_create_from_page(ctx: dreq->ctx, page: pagevec[i], |
357 | pgbase, offset: pos, count: req_len); |
358 | if (IS_ERR(ptr: req)) { |
359 | result = PTR_ERR(ptr: req); |
360 | break; |
361 | } |
362 | if (!nfs_pageio_add_request(&desc, req)) { |
363 | result = desc.pg_error; |
364 | nfs_release_request(req); |
365 | break; |
366 | } |
367 | pgbase = 0; |
368 | bytes -= req_len; |
369 | requested_bytes += req_len; |
370 | pos += req_len; |
371 | dreq->bytes_left -= req_len; |
372 | } |
373 | nfs_direct_release_pages(pages: pagevec, npages); |
374 | kvfree(addr: pagevec); |
375 | if (result < 0) |
376 | break; |
377 | } |
378 | |
379 | nfs_pageio_complete(desc: &desc); |
380 | |
381 | /* |
382 | * If no bytes were started, return the error, and let the |
383 | * generic layer handle the completion. |
384 | */ |
385 | if (requested_bytes == 0) { |
386 | inode_dio_end(inode); |
387 | nfs_direct_req_release(dreq); |
388 | return result < 0 ? result : -EIO; |
389 | } |
390 | |
391 | if (put_dreq(dreq)) |
392 | nfs_direct_complete(dreq); |
393 | return requested_bytes; |
394 | } |
395 | |
396 | /** |
397 | * nfs_file_direct_read - file direct read operation for NFS files |
398 | * @iocb: target I/O control block |
399 | * @iter: vector of user buffers into which to read data |
400 | * @swap: flag indicating this is swap IO, not O_DIRECT IO |
401 | * |
402 | * We use this function for direct reads instead of calling |
403 | * generic_file_aio_read() in order to avoid gfar's check to see if |
404 | * the request starts before the end of the file. For that check |
405 | * to work, we must generate a GETATTR before each direct read, and |
406 | * even then there is a window between the GETATTR and the subsequent |
407 | * READ where the file size could change. Our preference is simply |
408 | * to do all reads the application wants, and the server will take |
409 | * care of managing the end of file boundary. |
410 | * |
411 | * This function also eliminates unnecessarily updating the file's |
412 | * atime locally, as the NFS server sets the file's atime, and this |
413 | * client must read the updated atime from the server back into its |
414 | * cache. |
415 | */ |
416 | ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, |
417 | bool swap) |
418 | { |
419 | struct file *file = iocb->ki_filp; |
420 | struct address_space *mapping = file->f_mapping; |
421 | struct inode *inode = mapping->host; |
422 | struct nfs_direct_req *dreq; |
423 | struct nfs_lock_context *l_ctx; |
424 | ssize_t result, requested; |
425 | size_t count = iov_iter_count(i: iter); |
426 | nfs_add_stats(inode: mapping->host, stat: NFSIOS_DIRECTREADBYTES, addend: count); |
427 | |
428 | dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n" , |
429 | file, count, (long long) iocb->ki_pos); |
430 | |
431 | result = 0; |
432 | if (!count) |
433 | goto out; |
434 | |
435 | task_io_account_read(bytes: count); |
436 | |
437 | result = -ENOMEM; |
438 | dreq = nfs_direct_req_alloc(); |
439 | if (dreq == NULL) |
440 | goto out; |
441 | |
442 | dreq->inode = inode; |
443 | dreq->bytes_left = dreq->max_count = count; |
444 | dreq->io_start = iocb->ki_pos; |
445 | dreq->ctx = get_nfs_open_context(ctx: nfs_file_open_context(filp: iocb->ki_filp)); |
446 | l_ctx = nfs_get_lock_context(ctx: dreq->ctx); |
447 | if (IS_ERR(ptr: l_ctx)) { |
448 | result = PTR_ERR(ptr: l_ctx); |
449 | nfs_direct_req_release(dreq); |
450 | goto out_release; |
451 | } |
452 | dreq->l_ctx = l_ctx; |
453 | if (!is_sync_kiocb(kiocb: iocb)) |
454 | dreq->iocb = iocb; |
455 | |
456 | if (user_backed_iter(i: iter)) |
457 | dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; |
458 | |
459 | if (!swap) |
460 | nfs_start_io_direct(inode); |
461 | |
462 | NFS_I(inode)->read_io += count; |
463 | requested = nfs_direct_read_schedule_iovec(dreq, iter, pos: iocb->ki_pos); |
464 | |
465 | if (!swap) |
466 | nfs_end_io_direct(inode); |
467 | |
468 | if (requested > 0) { |
469 | result = nfs_direct_wait(dreq); |
470 | if (result > 0) { |
471 | requested -= result; |
472 | iocb->ki_pos += result; |
473 | } |
474 | iov_iter_revert(i: iter, bytes: requested); |
475 | } else { |
476 | result = requested; |
477 | } |
478 | |
479 | out_release: |
480 | nfs_direct_req_release(dreq); |
481 | out: |
482 | return result; |
483 | } |
484 | |
485 | static void nfs_direct_add_page_head(struct list_head *list, |
486 | struct nfs_page *req) |
487 | { |
488 | struct nfs_page *head = req->wb_head; |
489 | |
490 | if (!list_empty(head: &head->wb_list) || !nfs_lock_request(req: head)) |
491 | return; |
492 | if (!list_empty(head: &head->wb_list)) { |
493 | nfs_unlock_request(req: head); |
494 | return; |
495 | } |
496 | list_add(new: &head->wb_list, head: list); |
497 | kref_get(kref: &head->wb_kref); |
498 | kref_get(kref: &head->wb_kref); |
499 | } |
500 | |
501 | static void nfs_direct_join_group(struct list_head *list, |
502 | struct nfs_commit_info *cinfo, |
503 | struct inode *inode) |
504 | { |
505 | struct nfs_page *req, *subreq; |
506 | |
507 | list_for_each_entry(req, list, wb_list) { |
508 | if (req->wb_head != req) { |
509 | nfs_direct_add_page_head(list: &req->wb_list, req); |
510 | continue; |
511 | } |
512 | subreq = req->wb_this_page; |
513 | if (subreq == req) |
514 | continue; |
515 | do { |
516 | /* |
517 | * Remove subrequests from this list before freeing |
518 | * them in the call to nfs_join_page_group(). |
519 | */ |
520 | if (!list_empty(head: &subreq->wb_list)) { |
521 | nfs_list_remove_request(req: subreq); |
522 | nfs_release_request(subreq); |
523 | } |
524 | } while ((subreq = subreq->wb_this_page) != req); |
525 | nfs_join_page_group(head: req, cinfo, inode); |
526 | } |
527 | } |
528 | |
529 | static void |
530 | nfs_direct_write_scan_commit_list(struct inode *inode, |
531 | struct list_head *list, |
532 | struct nfs_commit_info *cinfo) |
533 | { |
534 | mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); |
535 | pnfs_recover_commit_reqs(head: list, cinfo); |
536 | nfs_scan_commit_list(src: &cinfo->mds->list, dst: list, cinfo, max: 0); |
537 | mutex_unlock(lock: &NFS_I(inode: cinfo->inode)->commit_mutex); |
538 | } |
539 | |
540 | static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) |
541 | { |
542 | struct nfs_pageio_descriptor desc; |
543 | struct nfs_page *req; |
544 | LIST_HEAD(reqs); |
545 | struct nfs_commit_info cinfo; |
546 | |
547 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
548 | nfs_direct_write_scan_commit_list(inode: dreq->inode, list: &reqs, cinfo: &cinfo); |
549 | |
550 | nfs_direct_join_group(list: &reqs, cinfo: &cinfo, inode: dreq->inode); |
551 | |
552 | nfs_clear_pnfs_ds_commit_verifiers(cinfo: &dreq->ds_cinfo); |
553 | get_dreq(dreq); |
554 | |
555 | nfs_pageio_init_write(pgio: &desc, inode: dreq->inode, FLUSH_STABLE, force_mds: false, |
556 | compl_ops: &nfs_direct_write_completion_ops); |
557 | desc.pg_dreq = dreq; |
558 | |
559 | while (!list_empty(head: &reqs)) { |
560 | req = nfs_list_entry(head: reqs.next); |
561 | /* Bump the transmission count */ |
562 | req->wb_nio++; |
563 | if (!nfs_pageio_add_request(&desc, req)) { |
564 | spin_lock(lock: &dreq->lock); |
565 | if (dreq->error < 0) { |
566 | desc.pg_error = dreq->error; |
567 | } else if (desc.pg_error != -EAGAIN) { |
568 | dreq->flags = 0; |
569 | if (!desc.pg_error) |
570 | desc.pg_error = -EIO; |
571 | dreq->error = desc.pg_error; |
572 | } else |
573 | dreq->flags = NFS_ODIRECT_RESCHED_WRITES; |
574 | spin_unlock(lock: &dreq->lock); |
575 | break; |
576 | } |
577 | nfs_release_request(req); |
578 | } |
579 | nfs_pageio_complete(desc: &desc); |
580 | |
581 | while (!list_empty(head: &reqs)) { |
582 | req = nfs_list_entry(head: reqs.next); |
583 | nfs_list_remove_request(req); |
584 | nfs_unlock_and_release_request(req); |
585 | if (desc.pg_error == -EAGAIN) { |
586 | nfs_mark_request_commit(req, NULL, cinfo: &cinfo, ds_commit_idx: 0); |
587 | } else { |
588 | spin_lock(lock: &dreq->lock); |
589 | nfs_direct_truncate_request(dreq, req); |
590 | spin_unlock(lock: &dreq->lock); |
591 | nfs_release_request(req); |
592 | } |
593 | } |
594 | |
595 | if (put_dreq(dreq)) |
596 | nfs_direct_write_complete(dreq); |
597 | } |
598 | |
599 | static void nfs_direct_commit_complete(struct nfs_commit_data *data) |
600 | { |
601 | const struct nfs_writeverf *verf = data->res.verf; |
602 | struct nfs_direct_req *dreq = data->dreq; |
603 | struct nfs_commit_info cinfo; |
604 | struct nfs_page *req; |
605 | int status = data->task.tk_status; |
606 | |
607 | trace_nfs_direct_commit_complete(dreq); |
608 | |
609 | if (status < 0) { |
610 | /* Errors in commit are fatal */ |
611 | dreq->error = status; |
612 | dreq->flags = NFS_ODIRECT_DONE; |
613 | } else { |
614 | status = dreq->error; |
615 | } |
616 | |
617 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
618 | |
619 | while (!list_empty(head: &data->pages)) { |
620 | req = nfs_list_entry(head: data->pages.next); |
621 | nfs_list_remove_request(req); |
622 | if (status < 0) { |
623 | spin_lock(lock: &dreq->lock); |
624 | nfs_direct_truncate_request(dreq, req); |
625 | spin_unlock(lock: &dreq->lock); |
626 | nfs_release_request(req); |
627 | } else if (!nfs_write_match_verf(verf, req)) { |
628 | dreq->flags = NFS_ODIRECT_RESCHED_WRITES; |
629 | /* |
630 | * Despite the reboot, the write was successful, |
631 | * so reset wb_nio. |
632 | */ |
633 | req->wb_nio = 0; |
634 | nfs_mark_request_commit(req, NULL, cinfo: &cinfo, ds_commit_idx: 0); |
635 | } else |
636 | nfs_release_request(req); |
637 | nfs_unlock_and_release_request(req); |
638 | } |
639 | |
640 | if (nfs_commit_end(cinfo: cinfo.mds)) |
641 | nfs_direct_write_complete(dreq); |
642 | } |
643 | |
644 | static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, |
645 | struct nfs_page *req) |
646 | { |
647 | struct nfs_direct_req *dreq = cinfo->dreq; |
648 | |
649 | trace_nfs_direct_resched_write(dreq); |
650 | |
651 | spin_lock(lock: &dreq->lock); |
652 | if (dreq->flags != NFS_ODIRECT_DONE) |
653 | dreq->flags = NFS_ODIRECT_RESCHED_WRITES; |
654 | spin_unlock(lock: &dreq->lock); |
655 | nfs_mark_request_commit(req, NULL, cinfo, ds_commit_idx: 0); |
656 | } |
657 | |
658 | static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { |
659 | .completion = nfs_direct_commit_complete, |
660 | .resched_write = nfs_direct_resched_write, |
661 | }; |
662 | |
663 | static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) |
664 | { |
665 | int res; |
666 | struct nfs_commit_info cinfo; |
667 | LIST_HEAD(mds_list); |
668 | |
669 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
670 | nfs_scan_commit(inode: dreq->inode, dst: &mds_list, cinfo: &cinfo); |
671 | res = nfs_generic_commit_list(inode: dreq->inode, head: &mds_list, how: 0, cinfo: &cinfo); |
672 | if (res < 0) /* res == -ENOMEM */ |
673 | nfs_direct_write_reschedule(dreq); |
674 | } |
675 | |
676 | static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) |
677 | { |
678 | struct nfs_commit_info cinfo; |
679 | struct nfs_page *req; |
680 | LIST_HEAD(reqs); |
681 | |
682 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
683 | nfs_direct_write_scan_commit_list(inode: dreq->inode, list: &reqs, cinfo: &cinfo); |
684 | |
685 | while (!list_empty(head: &reqs)) { |
686 | req = nfs_list_entry(head: reqs.next); |
687 | nfs_list_remove_request(req); |
688 | nfs_direct_truncate_request(dreq, req); |
689 | nfs_release_request(req); |
690 | nfs_unlock_and_release_request(req); |
691 | } |
692 | } |
693 | |
694 | static void nfs_direct_write_schedule_work(struct work_struct *work) |
695 | { |
696 | struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); |
697 | int flags = dreq->flags; |
698 | |
699 | dreq->flags = 0; |
700 | switch (flags) { |
701 | case NFS_ODIRECT_DO_COMMIT: |
702 | nfs_direct_commit_schedule(dreq); |
703 | break; |
704 | case NFS_ODIRECT_RESCHED_WRITES: |
705 | nfs_direct_write_reschedule(dreq); |
706 | break; |
707 | default: |
708 | nfs_direct_write_clear_reqs(dreq); |
709 | nfs_zap_mapping(inode: dreq->inode, mapping: dreq->inode->i_mapping); |
710 | nfs_direct_complete(dreq); |
711 | } |
712 | } |
713 | |
714 | static void nfs_direct_write_complete(struct nfs_direct_req *dreq) |
715 | { |
716 | trace_nfs_direct_write_complete(dreq); |
717 | queue_work(wq: nfsiod_workqueue, work: &dreq->work); /* Calls nfs_direct_write_schedule_work */ |
718 | } |
719 | |
720 | static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) |
721 | { |
722 | struct nfs_direct_req *dreq = hdr->dreq; |
723 | struct nfs_commit_info cinfo; |
724 | struct nfs_page *req = nfs_list_entry(head: hdr->pages.next); |
725 | int flags = NFS_ODIRECT_DONE; |
726 | |
727 | trace_nfs_direct_write_completion(dreq); |
728 | |
729 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
730 | |
731 | spin_lock(lock: &dreq->lock); |
732 | if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { |
733 | spin_unlock(lock: &dreq->lock); |
734 | goto out_put; |
735 | } |
736 | |
737 | nfs_direct_count_bytes(dreq, hdr); |
738 | if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && |
739 | !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { |
740 | if (!dreq->flags) |
741 | dreq->flags = NFS_ODIRECT_DO_COMMIT; |
742 | flags = dreq->flags; |
743 | } |
744 | spin_unlock(lock: &dreq->lock); |
745 | |
746 | while (!list_empty(head: &hdr->pages)) { |
747 | |
748 | req = nfs_list_entry(head: hdr->pages.next); |
749 | nfs_list_remove_request(req); |
750 | if (flags == NFS_ODIRECT_DO_COMMIT) { |
751 | kref_get(kref: &req->wb_kref); |
752 | memcpy(&req->wb_verf, &hdr->verf.verifier, |
753 | sizeof(req->wb_verf)); |
754 | nfs_mark_request_commit(req, lseg: hdr->lseg, cinfo: &cinfo, |
755 | ds_commit_idx: hdr->ds_commit_idx); |
756 | } else if (flags == NFS_ODIRECT_RESCHED_WRITES) { |
757 | kref_get(kref: &req->wb_kref); |
758 | nfs_mark_request_commit(req, NULL, cinfo: &cinfo, ds_commit_idx: 0); |
759 | } |
760 | nfs_unlock_and_release_request(req); |
761 | } |
762 | |
763 | out_put: |
764 | if (put_dreq(dreq)) |
765 | nfs_direct_write_complete(dreq); |
766 | hdr->release(hdr); |
767 | } |
768 | |
769 | static void nfs_write_sync_pgio_error(struct list_head *head, int error) |
770 | { |
771 | struct nfs_page *req; |
772 | |
773 | while (!list_empty(head)) { |
774 | req = nfs_list_entry(head: head->next); |
775 | nfs_list_remove_request(req); |
776 | nfs_unlock_and_release_request(req); |
777 | } |
778 | } |
779 | |
780 | static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) |
781 | { |
782 | struct nfs_direct_req *dreq = hdr->dreq; |
783 | struct nfs_page *req; |
784 | struct nfs_commit_info cinfo; |
785 | |
786 | trace_nfs_direct_write_reschedule_io(dreq); |
787 | |
788 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
789 | spin_lock(lock: &dreq->lock); |
790 | if (dreq->error == 0) |
791 | dreq->flags = NFS_ODIRECT_RESCHED_WRITES; |
792 | set_bit(nr: NFS_IOHDR_REDO, addr: &hdr->flags); |
793 | spin_unlock(lock: &dreq->lock); |
794 | while (!list_empty(head: &hdr->pages)) { |
795 | req = nfs_list_entry(head: hdr->pages.next); |
796 | nfs_list_remove_request(req); |
797 | nfs_unlock_request(req); |
798 | nfs_mark_request_commit(req, NULL, cinfo: &cinfo, ds_commit_idx: 0); |
799 | } |
800 | } |
801 | |
802 | static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { |
803 | .error_cleanup = nfs_write_sync_pgio_error, |
804 | .init_hdr = nfs_direct_pgio_init, |
805 | .completion = nfs_direct_write_completion, |
806 | .reschedule_io = nfs_direct_write_reschedule_io, |
807 | }; |
808 | |
809 | |
810 | /* |
811 | * NB: Return the value of the first error return code. Subsequent |
812 | * errors after the first one are ignored. |
813 | */ |
814 | /* |
815 | * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE |
816 | * operation. If nfs_writedata_alloc() or get_user_pages() fails, |
817 | * bail and stop sending more writes. Write length accounting is |
818 | * handled automatically by nfs_direct_write_result(). Otherwise, if |
819 | * no requests have been sent, just return an error. |
820 | */ |
821 | static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, |
822 | struct iov_iter *iter, |
823 | loff_t pos, int ioflags) |
824 | { |
825 | struct nfs_pageio_descriptor desc; |
826 | struct inode *inode = dreq->inode; |
827 | struct nfs_commit_info cinfo; |
828 | ssize_t result = 0; |
829 | size_t requested_bytes = 0; |
830 | size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); |
831 | bool defer = false; |
832 | |
833 | trace_nfs_direct_write_schedule_iovec(dreq); |
834 | |
835 | nfs_pageio_init_write(pgio: &desc, inode, ioflags, force_mds: false, |
836 | compl_ops: &nfs_direct_write_completion_ops); |
837 | desc.pg_dreq = dreq; |
838 | get_dreq(dreq); |
839 | inode_dio_begin(inode); |
840 | |
841 | NFS_I(inode)->write_io += iov_iter_count(i: iter); |
842 | while (iov_iter_count(i: iter)) { |
843 | struct page **pagevec; |
844 | size_t bytes; |
845 | size_t pgbase; |
846 | unsigned npages, i; |
847 | |
848 | result = iov_iter_get_pages_alloc2(i: iter, pages: &pagevec, |
849 | maxsize: wsize, start: &pgbase); |
850 | if (result < 0) |
851 | break; |
852 | |
853 | bytes = result; |
854 | npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; |
855 | for (i = 0; i < npages; i++) { |
856 | struct nfs_page *req; |
857 | unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); |
858 | |
859 | req = nfs_page_create_from_page(ctx: dreq->ctx, page: pagevec[i], |
860 | pgbase, offset: pos, count: req_len); |
861 | if (IS_ERR(ptr: req)) { |
862 | result = PTR_ERR(ptr: req); |
863 | break; |
864 | } |
865 | |
866 | if (desc.pg_error < 0) { |
867 | nfs_free_request(req); |
868 | result = desc.pg_error; |
869 | break; |
870 | } |
871 | |
872 | pgbase = 0; |
873 | bytes -= req_len; |
874 | requested_bytes += req_len; |
875 | pos += req_len; |
876 | dreq->bytes_left -= req_len; |
877 | |
878 | if (defer) { |
879 | nfs_mark_request_commit(req, NULL, cinfo: &cinfo, ds_commit_idx: 0); |
880 | continue; |
881 | } |
882 | |
883 | nfs_lock_request(req); |
884 | if (nfs_pageio_add_request(&desc, req)) |
885 | continue; |
886 | |
887 | /* Exit on hard errors */ |
888 | if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { |
889 | result = desc.pg_error; |
890 | nfs_unlock_and_release_request(req); |
891 | break; |
892 | } |
893 | |
894 | /* If the error is soft, defer remaining requests */ |
895 | nfs_init_cinfo_from_dreq(cinfo: &cinfo, dreq); |
896 | spin_lock(lock: &dreq->lock); |
897 | dreq->flags = NFS_ODIRECT_RESCHED_WRITES; |
898 | spin_unlock(lock: &dreq->lock); |
899 | nfs_unlock_request(req); |
900 | nfs_mark_request_commit(req, NULL, cinfo: &cinfo, ds_commit_idx: 0); |
901 | desc.pg_error = 0; |
902 | defer = true; |
903 | } |
904 | nfs_direct_release_pages(pages: pagevec, npages); |
905 | kvfree(addr: pagevec); |
906 | if (result < 0) |
907 | break; |
908 | } |
909 | nfs_pageio_complete(desc: &desc); |
910 | |
911 | /* |
912 | * If no bytes were started, return the error, and let the |
913 | * generic layer handle the completion. |
914 | */ |
915 | if (requested_bytes == 0) { |
916 | inode_dio_end(inode); |
917 | nfs_direct_req_release(dreq); |
918 | return result < 0 ? result : -EIO; |
919 | } |
920 | |
921 | if (put_dreq(dreq)) |
922 | nfs_direct_write_complete(dreq); |
923 | return requested_bytes; |
924 | } |
925 | |
926 | /** |
927 | * nfs_file_direct_write - file direct write operation for NFS files |
928 | * @iocb: target I/O control block |
929 | * @iter: vector of user buffers from which to write data |
930 | * @swap: flag indicating this is swap IO, not O_DIRECT IO |
931 | * |
932 | * We use this function for direct writes instead of calling |
933 | * generic_file_aio_write() in order to avoid taking the inode |
934 | * semaphore and updating the i_size. The NFS server will set |
935 | * the new i_size and this client must read the updated size |
936 | * back into its cache. We let the server do generic write |
937 | * parameter checking and report problems. |
938 | * |
939 | * We eliminate local atime updates, see direct read above. |
940 | * |
941 | * We avoid unnecessary page cache invalidations for normal cached |
942 | * readers of this file. |
943 | * |
944 | * Note that O_APPEND is not supported for NFS direct writes, as there |
945 | * is no atomic O_APPEND write facility in the NFS protocol. |
946 | */ |
947 | ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, |
948 | bool swap) |
949 | { |
950 | ssize_t result, requested; |
951 | size_t count; |
952 | struct file *file = iocb->ki_filp; |
953 | struct address_space *mapping = file->f_mapping; |
954 | struct inode *inode = mapping->host; |
955 | struct nfs_direct_req *dreq; |
956 | struct nfs_lock_context *l_ctx; |
957 | loff_t pos, end; |
958 | |
959 | dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n" , |
960 | file, iov_iter_count(iter), (long long) iocb->ki_pos); |
961 | |
962 | if (swap) |
963 | /* bypass generic checks */ |
964 | result = iov_iter_count(i: iter); |
965 | else |
966 | result = generic_write_checks(iocb, iter); |
967 | if (result <= 0) |
968 | return result; |
969 | count = result; |
970 | nfs_add_stats(inode: mapping->host, stat: NFSIOS_DIRECTWRITTENBYTES, addend: count); |
971 | |
972 | pos = iocb->ki_pos; |
973 | end = (pos + iov_iter_count(i: iter) - 1) >> PAGE_SHIFT; |
974 | |
975 | task_io_account_write(bytes: count); |
976 | |
977 | result = -ENOMEM; |
978 | dreq = nfs_direct_req_alloc(); |
979 | if (!dreq) |
980 | goto out; |
981 | |
982 | dreq->inode = inode; |
983 | dreq->bytes_left = dreq->max_count = count; |
984 | dreq->io_start = pos; |
985 | dreq->ctx = get_nfs_open_context(ctx: nfs_file_open_context(filp: iocb->ki_filp)); |
986 | l_ctx = nfs_get_lock_context(ctx: dreq->ctx); |
987 | if (IS_ERR(ptr: l_ctx)) { |
988 | result = PTR_ERR(ptr: l_ctx); |
989 | nfs_direct_req_release(dreq); |
990 | goto out_release; |
991 | } |
992 | dreq->l_ctx = l_ctx; |
993 | if (!is_sync_kiocb(kiocb: iocb)) |
994 | dreq->iocb = iocb; |
995 | pnfs_init_ds_commit_info_ops(fl_cinfo: &dreq->ds_cinfo, inode); |
996 | |
997 | if (swap) { |
998 | requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, |
999 | FLUSH_STABLE); |
1000 | } else { |
1001 | nfs_start_io_direct(inode); |
1002 | |
1003 | requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, |
1004 | FLUSH_COND_STABLE); |
1005 | |
1006 | if (mapping->nrpages) { |
1007 | invalidate_inode_pages2_range(mapping, |
1008 | start: pos >> PAGE_SHIFT, end); |
1009 | } |
1010 | |
1011 | nfs_end_io_direct(inode); |
1012 | } |
1013 | |
1014 | if (requested > 0) { |
1015 | result = nfs_direct_wait(dreq); |
1016 | if (result > 0) { |
1017 | requested -= result; |
1018 | iocb->ki_pos = pos + result; |
1019 | /* XXX: should check the generic_write_sync retval */ |
1020 | generic_write_sync(iocb, count: result); |
1021 | } |
1022 | iov_iter_revert(i: iter, bytes: requested); |
1023 | } else { |
1024 | result = requested; |
1025 | } |
1026 | nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE); |
1027 | out_release: |
1028 | nfs_direct_req_release(dreq); |
1029 | out: |
1030 | return result; |
1031 | } |
1032 | |
1033 | /** |
1034 | * nfs_init_directcache - create a slab cache for nfs_direct_req structures |
1035 | * |
1036 | */ |
1037 | int __init nfs_init_directcache(void) |
1038 | { |
1039 | nfs_direct_cachep = kmem_cache_create(name: "nfs_direct_cache" , |
1040 | size: sizeof(struct nfs_direct_req), |
1041 | align: 0, flags: (SLAB_RECLAIM_ACCOUNT| |
1042 | SLAB_MEM_SPREAD), |
1043 | NULL); |
1044 | if (nfs_direct_cachep == NULL) |
1045 | return -ENOMEM; |
1046 | |
1047 | return 0; |
1048 | } |
1049 | |
1050 | /** |
1051 | * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures |
1052 | * |
1053 | */ |
1054 | void nfs_destroy_directcache(void) |
1055 | { |
1056 | kmem_cache_destroy(s: nfs_direct_cachep); |
1057 | } |
1058 | |