// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/direct-io.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * O_DIRECT
 *
 * 04Jul2002	Andrew Morton
 *		Initial version
 * 11Sep2002	janetinc@us.ibm.com
 *		added readv/writev support.
 * 29Oct2002	Andrew Morton
 *		rewrote bio_add_page() support.
 * 30Oct2002	pbadari@us.ibm.com
 *		added support for non-aligned IO.
 * 06Nov2002	pbadari@us.ibm.com
 *		added asynchronous IO support.
 * 21Jul2003	nathans@sgi.com
 *		added IO completion notifier.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>

#include "internal.h"

/*
 * How many user pages to map in one call to iov_iter_extract_pages(). This
 * determines the size of a structure in the slab cache.
 */
#define DIO_PAGES	64
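
/*
 * Sizing note: with 64 entries and 8-byte page pointers (64-bit), the pages[]
 * array embedded in struct dio below is 512 bytes, and one refill can cover
 * up to 64 * PAGE_SIZE bytes of user memory.
 */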

/*
 * Flags for dio_complete()
 */
#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
#define DIO_COMPLETE_INVALIDATE	0x02	/* Can invalidate pages */

/*
 * This code generally works in units of "dio_blocks".  A dio_block is
 * somewhere between the hard sector size and the filesystem block size.  It
 * is determined on a per-invocation basis.  When talking to the filesystem
 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
 * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
 * to dio_block quantities by shifting left by blkfactor.
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
 */
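
/*
 * Example: a 512-byte-aligned request against a filesystem with 4096-byte
 * blocks uses 512-byte dio_blocks and blkfactor = 3; fs_block numbers are
 * dio_block numbers shifted right by 3, and fs-block counts become dio_block
 * counts when shifted left by 3.
 */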

/* dio_state only used in the submission path */

struct dio_submit {
	struct bio *bio;		/* bio under assembly */
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	int reap_counter;		/* rate limit reaping */
	sector_t final_block_in_request;/* doesn't change */
	int boundary;			/* prev block is at a boundary */
	get_block_t *get_block;		/* block mapping function */

	loff_t logical_offset_in_bio;	/* current first logical block in bio */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */
	loff_t cur_page_fs_offset;	/* Offset in file */

	struct iov_iter *iter;
	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	size_t from, to;
};

/* dio_state communicated between submission path and end_io */
struct dio {
	int flags;			/* doesn't change */
	blk_opf_t opf;			/* request operation type and flags */
	struct gendisk *bio_disk;
	struct inode *inode;
	loff_t i_size;			/* i_size when submitted */
	dio_iodone_t *end_io;		/* IO completion function */
	bool is_pinned;			/* T if we have pins on the pages */

	void *private;			/* copy from map_bh.b_private */

	/* BIO completion state */
	spinlock_t bio_lock;		/* protects BIO fields below */
	int page_errors;		/* err from iov_iter_extract_pages() */
	int is_async;			/* is IO async ? */
	bool defer_completion;		/* defer AIO completion to workqueue? */
	bool should_dirty;		/* if pages should be dirtied */
	int io_error;			/* IO error in completion path */
	unsigned long refcount;		/* direct_io_worker() and bios */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	ssize_t result;			/* IO result */

	/*
	 * pages[] (and any fields placed after it) are not zeroed out at
	 * allocation time.  Don't add new fields after pages[] unless you
	 * wish that they not be zeroed.
	 */
	union {
		struct page *pages[DIO_PAGES];	/* page buffer */
		struct work_struct complete_work;/* deferred AIO completion */
	};
} ____cacheline_aligned_in_smp;

static struct kmem_cache *dio_cache __ro_after_init;

/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio_submit *sdio)
{
	return sdio->tail - sdio->head;
}

/*
 * Go grab and pin some userspace pages.  Typically we'll get 64 at a time.
 */
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
	struct page **pages = dio->pages;
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	ssize_t ret;

	ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX,
				     DIO_PAGES, 0, &sdio->from);

	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		dio->pages[0] = ZERO_PAGE(0);
		sdio->head = 0;
		sdio->tail = 1;
		sdio->from = 0;
		sdio->to = PAGE_SIZE;
		return 0;
	}

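	/*
	 * On success, ret is the number of bytes extracted, starting at byte
	 * offset sdio->from within pages[0].  For example, extracting 6000
	 * bytes at from = 1000 with 4KB pages spans two pages: tail becomes 2
	 * and the final page is valid up to to = 2904.
	 */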
	if (ret >= 0) {
		ret += sdio->from;
		sdio->head = 0;
		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
		return 0;
	}
	return ret;
}

/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call iov_iter_extract_pages()
 * against a decent number of pages, less frequently, which provides nicer
 * use of the L1 cache.
 */
static inline struct page *dio_get_page(struct dio *dio,
					struct dio_submit *sdio)
{
	if (dio_pages_present(sdio) == 0) {
		int ret;

		ret = dio_refill_pages(dio, sdio);
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(sdio) == 0);
	}
	return dio->pages[sdio->head];
}

static void dio_pin_page(struct dio *dio, struct page *page)
{
	if (dio->is_pinned)
		folio_add_pin(page_folio(page));
}

static void dio_unpin_page(struct dio *dio, struct page *page)
{
	if (dio->is_pinned)
		unpin_user_page(page);
}

/*
 * dio_complete() - called when all DIO BIO I/O has been completed
 *
 * This drops i_dio_count, lets interested parties know that a DIO operation
 * has completed, and calculates the resulting return code for the operation.
 *
 * It lets the filesystem know if it registered an interest earlier via
 * get_block.  Pass the private field of the map buffer_head so that
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
{
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	loff_t offset = dio->iocb->ki_pos;
	ssize_t transferred = 0;
	int err;

	/*
	 * AIO submission can race with bio completion to get here while
	 * expecting to have the last io completed by bio completion.
	 * In that case -EIOCBQUEUED is in fact not an error we want
	 * to preserve through this call.
	 */
	if (ret == -EIOCBQUEUED)
		ret = 0;

	if (dio->result) {
		transferred = dio->result;

		/* Check for short read case */
		if (dio_op == REQ_OP_READ &&
		    ((offset + transferred) > dio->i_size))
			transferred = dio->i_size - offset;
		/* ignore EFAULT if some IO has been done */
		if (unlikely(ret == -EFAULT) && transferred)
			ret = 0;
	}

	if (ret == 0)
		ret = dio->page_errors;
	if (ret == 0)
		ret = dio->io_error;
	if (ret == 0)
		ret = transferred;

	if (dio->end_io) {
		// XXX: ki_pos??
		err = dio->end_io(dio->iocb, offset, ret, dio->private);
		if (err)
			ret = err;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after dio->end_io(), as
	 * some filesystems convert unwritten extents to real allocations in
	 * end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (flags & DIO_COMPLETE_INVALIDATE &&
	    ret > 0 && dio_op == REQ_OP_WRITE)
		kiocb_invalidate_post_direct_write(dio->iocb, ret);

	inode_dio_end(dio->inode);

	if (flags & DIO_COMPLETE_ASYNC) {
		/*
		 * generic_write_sync expects ki_pos to have been updated
		 * already, but the submission path only does this for
		 * synchronous I/O.
		 */
		dio->iocb->ki_pos += transferred;

		if (ret > 0 && dio_op == REQ_OP_WRITE)
			ret = generic_write_sync(dio->iocb, ret);
		dio->iocb->ki_complete(dio->iocb, ret);
	}

	kmem_cache_free(dio_cache, dio);
	return ret;
}

static void dio_aio_complete_work(struct work_struct *work)
{
	struct dio *dio = container_of(work, struct dio, complete_work);

	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
}

static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);

/*
 * Asynchronous IO callback.
 */
static void dio_bio_end_aio(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	unsigned long remaining;
	unsigned long flags;
	bool defer_completion = false;

	/* cleanup the bio */
	dio_bio_complete(dio, bio);

	spin_lock_irqsave(&dio->bio_lock, flags);
	remaining = --dio->refcount;
	if (remaining == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (remaining == 0) {
		/*
		 * Defer completion when defer_completion is set or
		 * when the inode has pages mapped and this is AIO write.
		 * We need to invalidate those pages because there is a
		 * chance they contain stale data in the case buffered IO
		 * went in between AIO submission and completion into the
		 * same region.
		 */
		if (dio->result)
			defer_completion = dio->defer_completion ||
					   (dio_op == REQ_OP_WRITE &&
					    dio->inode->i_mapping->nrpages);
		if (defer_completion) {
			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
			queue_work(dio->inode->i_sb->s_dio_done_wq,
				   &dio->complete_work);
		} else {
			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
		}
	}
}

/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
static void dio_bio_end_io(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}

static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
	      struct block_device *bdev,
	      sector_t first_sector, int nr_vecs)
{
	struct bio *bio;

	/*
	 * bio_alloc() is guaranteed to return a bio when allowed to sleep and
	 * we request a valid number of vectors.
	 */
	bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
	bio->bi_iter.bi_sector = first_sector;
	if (dio->is_async)
		bio->bi_end_io = dio_bio_end_aio;
	else
		bio->bi_end_io = dio_bio_end_io;
	if (dio->is_pinned)
		bio_set_flag(bio, BIO_PAGE_PINNED);
	sdio->bio = bio;
	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}

/*
 * In the AIO read case we speculatively dirty the pages before starting IO.
 * During IO completion, any of these pages which happen to have been written
 * back will be redirtied by bio_check_pages_dirty().
 *
 * bios hold a dio reference between submit_bio and ->end_io.
 */
static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	struct bio *bio = sdio->bio;
	unsigned long flags;

	bio->bi_private = dio;

	spin_lock_irqsave(&dio->bio_lock, flags);
	dio->refcount++;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty)
		bio_set_pages_dirty(bio);

	dio->bio_disk = bio->bi_bdev->bd_disk;

	submit_bio(bio);

	sdio->bio = NULL;
	sdio->boundary = 0;
	sdio->logical_offset_in_bio = 0;
}

/*
 * Release any resources in case of a failure
 */
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
	if (dio->is_pinned)
		unpin_user_pages(dio->pages + sdio->head,
				 sdio->tail - sdio->head);
	sdio->head = sdio->tail;
}

/*
 * Wait for the next BIO to complete.  Remove it and return it.  NULL is
 * returned once all BIOs have been completed.  This must only be called once
 * all bios have been issued so that dio->refcount can only decrease.  This
 * requires that the caller hold a reference on the dio.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio = NULL;

	spin_lock_irqsave(&dio->bio_lock, flags);

	/*
	 * Wait as long as the list is empty and there are bios in flight.  bio
	 * completion drops the count, maybe adds to the list, and wakes while
	 * holding the bio_lock so we don't need set_current_state()'s barrier
	 * and can call it after testing our condition.
	 */
	while (dio->refcount > 1 && dio->bio_list == NULL) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		dio->waiter = current;
		spin_unlock_irqrestore(&dio->bio_lock, flags);
		blk_io_schedule();
		/* wake up sets us TASK_RUNNING */
		spin_lock_irqsave(&dio->bio_lock, flags);
		dio->waiter = NULL;
	}
	if (dio->bio_list) {
		bio = dio->bio_list;
		dio->bio_list = bio->bi_private;
	}
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return bio;
}

/*
 * Process one completed BIO.  No locks are held.
 */
static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
{
	blk_status_t err = bio->bi_status;
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty;

	if (err) {
		if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
			dio->io_error = -EAGAIN;
		else
			dio->io_error = -EIO;
	}

	if (dio->is_async && should_dirty) {
		bio_check_pages_dirty(bio);	/* transfers ownership */
	} else {
		bio_release_pages(bio, should_dirty);
		bio_put(bio);
	}
	return err;
}

/*
 * Wait on and process all in-flight BIOs.  This must only be called once
 * all bios have been issued so that the refcount can only decrease.
 * This just waits for all bios to make it through dio_bio_complete.  IO
 * errors are propagated through dio->io_error and should be propagated via
 * dio_complete().
 */
static void dio_await_completion(struct dio *dio)
{
	struct bio *bio;
	do {
		bio = dio_await_one(dio);
		if (bio)
			dio_bio_complete(dio, bio);
	} while (bio);
}

/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
{
	int ret = 0;

	if (sdio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			spin_lock_irqsave(&dio->bio_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_lock, flags);
			ret2 = blk_status_to_errno(dio_bio_complete(dio, bio));
			if (ret == 0)
				ret = ret2;
		}
		sdio->reap_counter = 0;
	}
	return ret;
}

static int dio_set_defer_completion(struct dio *dio)
{
	struct super_block *sb = dio->inode->i_sb;

	if (dio->defer_completion)
		return 0;
	dio->defer_completion = true;
	if (!sb->s_dio_done_wq)
		return sb_init_dio_done_wq(sb);
	return 0;
}

/*
 * Call into the fs to map some more disk blocks.  We record the current number
 * of available blocks at sdio->blocks_available.  These are in units of the
 * fs blocksize, i_blocksize(inode).
 *
 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 * it uses the passed inode-relative block number as the file offset, as usual.
 *
 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
 * has remaining to do.  The fs should not map more than this number of blocks.
 *
 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 * indicate how much contiguous disk space has been made available at
 * bh->b_blocknr.
 *
 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 * This isn't very efficient...
 *
 * In the case of filesystem holes: the fs may return an arbitrarily-large
 * hole by returning an appropriate value in b_size and by clearing
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_block() as it walks the hole.
 */
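/*
 * For example, a get_block() implementation that finds 8 contiguous mapped
 * 4096-byte blocks on disk starting at block 1000 would set
 * bh->b_blocknr = 1000 and bh->b_size = 8 * 4096 and mark the buffer_head
 * mapped; if any of those blocks were freshly allocated it would also set
 * buffer_new().
 */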
static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
			   struct buffer_head *map_bh)
{
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	int ret;
	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* Number of filesystem-sized blocks */
	int create;
	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
	loff_t i_size;

	/*
	 * If there was a memory error and we've overwritten all the
	 * mapped blocks then we can now return that memory error
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
		fs_endblk = (sdio->final_block_in_request - 1) >>
					sdio->blkfactor;
		fs_count = fs_endblk - fs_startblk + 1;

		map_bh->b_state = 0;
		map_bh->b_size = fs_count << i_blkbits;

		/*
		 * For writes that could fill holes inside i_size on a
		 * DIO_SKIP_HOLES filesystem we forbid block creations: only
		 * overwrites are permitted.  We will return early to the caller
		 * once we see an unmapped buffer head returned, and the caller
		 * will fall back to buffered I/O.
		 *
		 * Otherwise the decision is left to the get_blocks method,
		 * which may decide to handle it or also return an unmapped
		 * buffer head.
		 */
		create = dio_op == REQ_OP_WRITE;
		if (dio->flags & DIO_SKIP_HOLES) {
			i_size = i_size_read(dio->inode);
			if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
				create = 0;
		}

		ret = (*sdio->get_block)(dio->inode, fs_startblk,
					 map_bh, create);

		/* Store for completion */
		dio->private = map_bh->b_private;

		if (ret == 0 && buffer_defer_completion(map_bh))
			ret = dio_set_defer_completion(dio);
	}
	return ret;
}

/*
 * There is no bio.  Make one now.
 */
static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
			      sector_t start_sector, struct buffer_head *map_bh)
{
	sector_t sector;
	int ret, nr_pages;

	ret = dio_bio_reap(dio, sdio);
	if (ret)
		goto out;
	sector = start_sector << (sdio->blkbits - 9);
	nr_pages = bio_max_segs(sdio->pages_in_io);
	BUG_ON(nr_pages <= 0);
	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
	sdio->boundary = 0;
out:
	return ret;
}

/*
 * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
 * that was successful then update final_block_in_bio and take a ref against
 * the just-added page.
 *
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
{
	int ret;

	ret = bio_add_page(sdio->bio, sdio->cur_page,
			   sdio->cur_page_len, sdio->cur_page_offset);
	if (ret == sdio->cur_page_len) {
		/*
		 * Decrement count only, if we are done with this page
		 */
		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
			sdio->pages_in_io--;
		dio_pin_page(dio, sdio->cur_page);
		sdio->final_block_in_bio = sdio->cur_page_block +
			(sdio->cur_page_len >> sdio->blkbits);
		ret = 0;
	} else {
		ret = 1;
	}
	return ret;
}

/*
 * Put cur_page under IO.  The section of cur_page which is described by
 * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
 * starts on-disk at cur_page_block.
 *
 * We take a ref against the page here (on behalf of its presence in the bio).
 *
 * The caller of this function is responsible for removing cur_page from the
 * dio, and for dropping the refcount which came from that presence.
 */
static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
				    struct buffer_head *map_bh)
{
	int ret = 0;

	if (sdio->bio) {
		loff_t cur_offset = sdio->cur_page_fs_offset;
		loff_t bio_next_offset = sdio->logical_offset_in_bio +
			sdio->bio->bi_iter.bi_size;

		/*
		 * See whether this new request is contiguous with the old.
		 *
		 * Btrfs cannot handle having logically non-contiguous requests
		 * submitted.  For example if you have
		 *
		 * Logical:  [0-4095][HOLE][8192-12287]
		 * Physical: [0-4095]      [4096-8191]
		 *
		 * We cannot submit those pages together as one BIO.  So if our
		 * current logical offset in the file does not equal what would
		 * be the next logical offset in the bio, submit the bio we
		 * have.
		 */
		if (sdio->final_block_in_bio != sdio->cur_page_block ||
		    cur_offset != bio_next_offset)
			dio_bio_submit(dio, sdio);
	}

	if (sdio->bio == NULL) {
		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(dio, sdio) != 0) {
		dio_bio_submit(dio, sdio);
		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
		if (ret == 0) {
			ret = dio_bio_add_page(dio, sdio);
			BUG_ON(ret != 0);
		}
	}
out:
	return ret;
}

/*
 * An autonomous function to put a chunk of a page under deferred IO.
 *
 * The caller doesn't actually know (or care) whether this piece of page is in
 * a BIO, or is under IO or whatever.  We just take care of all possible
 * situations here.  The separation between the logic of do_direct_IO() and
 * that of submit_page_section() is important for clarity.  Please don't break.
 *
 * The chunk of page starts on-disk at blocknr.
 *
 * We perform deferred IO, by recording the last-submitted page inside our
 * private part of the dio structure.  If possible, we just expand the IO
 * across that page here.
 *
 * If that doesn't work out then we put the old page into the bio and add this
 * page to the dio instead.
 */
static inline int
submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
		    unsigned offset, unsigned len, sector_t blocknr,
		    struct buffer_head *map_bh)
{
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	int ret = 0;
	int boundary = sdio->boundary;	/* dio_send_cur_page may clear it */

	if (dio_op == REQ_OP_WRITE) {
		/*
		 * Read accounting is performed in submit_bio()
		 */
		task_io_account_write(len);
	}

	/*
	 * Can we just grow the current page's presence in the dio?
	 */
	if (sdio->cur_page == page &&
	    sdio->cur_page_offset + sdio->cur_page_len == offset &&
	    sdio->cur_page_block +
	    (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
		sdio->cur_page_len += len;
		goto out;
	}

	/*
	 * If there's a deferred page already there then send it.
	 */
	if (sdio->cur_page) {
		ret = dio_send_cur_page(dio, sdio, map_bh);
		dio_unpin_page(dio, sdio->cur_page);
		sdio->cur_page = NULL;
		if (ret)
			return ret;
	}

	dio_pin_page(dio, page);		/* It is in dio */
	sdio->cur_page = page;
	sdio->cur_page_offset = offset;
	sdio->cur_page_len = len;
	sdio->cur_page_block = blocknr;
	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
	/*
	 * If boundary then we want to schedule the IO now to
	 * avoid metadata seeks.
	 */
	if (boundary) {
		ret = dio_send_cur_page(dio, sdio, map_bh);
		if (sdio->bio)
			dio_bio_submit(dio, sdio);
		dio_unpin_page(dio, sdio->cur_page);
		sdio->cur_page = NULL;
	}
	return ret;
}

/*
 * If we are not writing the entire block and get_block() allocated
 * the block for us, we need to fill-in the unused portion of the
 * block with zeros.  This happens only if user-buffer, fileoffset or
 * io length is not filesystem block-size multiple.
 *
 * `end' is zero if we're doing the start of the IO, 1 at the end of the
 * IO.
 */
static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
				  int end, struct buffer_head *map_bh)
{
	unsigned dio_blocks_per_fs_block;
	unsigned this_chunk_blocks;	/* In dio_blocks */
	unsigned this_chunk_bytes;
	struct page *page;

	sdio->start_zero_done = 1;
	if (!sdio->blkfactor || !buffer_new(map_bh))
		return;

	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
	this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);

	if (!this_chunk_blocks)
		return;

	/*
	 * We need to zero out part of an fs block.  It is either at the
	 * beginning or the end of the fs block.
	 */
	if (end)
		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

	this_chunk_bytes = this_chunk_blocks << sdio->blkbits;

	page = ZERO_PAGE(0);
	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
				sdio->next_block_for_io, map_bh))
		return;

	sdio->next_block_for_io += this_chunk_blocks;
}
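
/*
 * For example, with 512-byte dio_blocks inside 4096-byte fs blocks
 * (blkfactor = 3), a write that starts 3 dio_blocks into a newly allocated
 * fs block first zeroes those 3 leading dio_blocks, and one that ends
 * 3 dio_blocks into an fs block zeroes the remaining 5 trailing dio_blocks.
 */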

/*
 * Walk the user pages, and the file, mapping blocks to disk and generating
 * a sequence of (page,offset,len,block) mappings.  These mappings are injected
 * into submit_page_section(), which takes care of the next stage of submission.
 *
 * Direct IO against a blockdev is different from a file, because we can
 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 * blockdev IO be able to have fine alignment and large sizes.
 *
 * So what we do is to permit the ->get_block function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
			struct buffer_head *map_bh)
{
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
	const unsigned blkbits = sdio->blkbits;
	const unsigned i_blkbits = blkbits + sdio->blkfactor;
	int ret = 0;

	while (sdio->block_in_file < sdio->final_block_in_request) {
		struct page *page;
		size_t from, to;

		page = dio_get_page(dio, sdio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		from = sdio->head ? 0 : sdio->from;
		to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
		sdio->head++;

		while (from < to) {
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (sdio->blocks_available == 0) {
				/*
				 * Need to go and map some more disk
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio, sdio, map_bh);
				if (ret) {
					dio_unpin_page(dio, page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				sdio->blocks_available =
						map_bh->b_size >> blkbits;
				sdio->next_block_for_io =
					map_bh->b_blocknr << sdio->blkfactor;
				if (buffer_new(map_bh)) {
					clean_bdev_aliases(
						map_bh->b_bdev,
						map_bh->b_blocknr,
						map_bh->b_size >> i_blkbits);
				}

				if (!sdio->blkfactor)
					goto do_holes;

				blkmask = (1 << sdio->blkfactor) - 1;
				dio_remainder = (sdio->block_in_file & blkmask);

				/*
				 * If we are at the start of IO and that IO
				 * starts partway into a fs-block,
				 * dio_remainder will be non-zero.  If the IO
				 * is a read then we can simply advance the IO
				 * cursor to the first block which is to be
				 * read.  But if the IO is a write and the
				 * block was newly allocated we cannot do that;
				 * the start of the fs block must be zeroed out
				 * on-disk.
				 */
				if (!buffer_new(map_bh))
					sdio->next_block_for_io += dio_remainder;
				sdio->blocks_available -= dio_remainder;
			}
do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* AKPM: eargh, -ENOTBLK is a hack */
				if (dio_op == REQ_OP_WRITE) {
					dio_unpin_page(dio, page);
					return -ENOTBLK;
				}

				/*
				 * Be sure to account for a partial block as the
				 * last block in the file.
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (sdio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					dio_unpin_page(dio, page);
					goto out;
				}
				zero_user(page, from, 1 << blkbits);
				sdio->block_in_file++;
				from += 1 << blkbits;
				dio->result += 1 << blkbits;
				goto next_block;
			}

			/*
			 * If we're performing IO which has an alignment which
			 * is finer than the underlying fs, go check to see if
			 * we must zero out the start of this block.
			 */
			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
				dio_zero_block(dio, sdio, 0, map_bh);

			/*
			 * Work out, in this_chunk_blocks, how much disk we
			 * can add to this page
			 */
			this_chunk_blocks = sdio->blocks_available;
			u = (to - from) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = sdio->final_block_in_request - sdio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			if (this_chunk_blocks == sdio->blocks_available)
				sdio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, sdio, page,
						  from,
						  this_chunk_bytes,
						  sdio->next_block_for_io,
						  map_bh);
			if (ret) {
				dio_unpin_page(dio, page);
				goto out;
			}
			sdio->next_block_for_io += this_chunk_blocks;

			sdio->block_in_file += this_chunk_blocks;
			from += this_chunk_bytes;
			dio->result += this_chunk_bytes;
			sdio->blocks_available -= this_chunk_blocks;
next_block:
			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
			if (sdio->block_in_file == sdio->final_block_in_request)
				break;
		}

		/* Drop the pin which was taken in dio_refill_pages() */
		dio_unpin_page(dio, page);
	}
out:
	return ret;
}

static inline int drop_refcount(struct dio *dio)
{
	int ret2;
	unsigned long flags;

	/*
	 * Sync will always be dropping the final ref and completing the
	 * operation.  AIO can if it was a broken operation described above or
	 * in fact if all the bios race to complete before we get here.  In
	 * that case dio_complete() translates the EIOCBQUEUED into the proper
	 * return code that the caller will hand to ->complete().
	 *
	 * This is managed by the bio_lock instead of being an atomic_t so that
	 * completion paths can drop their ref and use the remaining count to
	 * decide to wake the submission path atomically.
	 */
	spin_lock_irqsave(&dio->bio_lock, flags);
	ret2 = --dio->refcount;
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return ret2;
}

/*
 * This is a library function for use by filesystem drivers.
 *
 * The locking rules are governed by the flags parameter:
 *  - if the flags value contains DIO_LOCKING we use a fancy locking
 *    scheme for dumb filesystems.
 *    For writes this function is called under i_mutex and returns with
 *    i_mutex held; for reads, i_mutex is not held on entry, but it is
 *    taken and dropped again before returning.
 *  - if the flags value does NOT contain DIO_LOCKING we don't use any
 *    internal locking but rather rely on the filesystem to synchronize
 *    direct I/O reads/writes versus each other and truncate.
 *
 * To help with locking against truncate we increment the i_dio_count
 * counter before starting direct I/O, and decrement it once we are done.
 * Truncate can wait for it to reach zero to provide exclusion.  It is
 * expected that filesystems provide exclusion between new direct I/O
 * and truncates.  For DIO_LOCKING filesystems this is done by i_mutex,
 * but other filesystems need to take care of this on their own.
 *
 * NOTE: if you pass "sdio" to anything by pointer make sure that function
 * is always inlined.  Otherwise gcc is unable to split the structure into
 * individual fields and will generate much worse code.  This is important
 * for the whole file.
 */
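/*
 * A typical block-based filesystem ->direct_IO implementation simply wraps
 * this helper, roughly (see the blockdev_direct_IO() wrapper in
 * <linux/fs.h>):
 *
 *	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
 *				    my_fs_get_block, NULL,
 *				    DIO_LOCKING | DIO_SKIP_HOLES);
 *
 * where my_fs_get_block stands in for the filesystem's own get_block_t
 * block-mapping callback.
 */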
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
		struct block_device *bdev, struct iov_iter *iter,
		get_block_t get_block, dio_iodone_t end_io,
		int flags)
{
	unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
	unsigned blkbits = i_blkbits;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	const size_t count = iov_iter_count(iter);
	loff_t offset = iocb->ki_pos;
	const loff_t end = offset + count;
	struct dio *dio;
	struct dio_submit sdio = { 0, };
	struct buffer_head map_bh = { 0, };
	struct blk_plug plug;
	unsigned long align = offset | iov_iter_alignment(iter);

	/*
	 * Avoid references to bdev if not absolutely needed to give
	 * the early prefetch in the caller enough time.
	 */

	/* watch out for a 0 len io from a tricksy fs */
	if (iov_iter_rw(iter) == READ && !count)
		return 0;

	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
	if (!dio)
		return -ENOMEM;
	/*
	 * Believe it or not, zeroing out the page array caused a .5%
	 * performance regression in a database benchmark.  So, we take
	 * care to only zero out what's needed.
	 */
	memset(dio, 0, offsetof(struct dio, pages));

	dio->flags = flags;
	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
		/* will be released by direct_io_worker */
		inode_lock(inode);
	}
	dio->is_pinned = iov_iter_extract_will_pin(iter);

	/* Once we sampled i_size check for reads beyond EOF */
	dio->i_size = i_size_read(inode);
	if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
		retval = 0;
		goto fail_dio;
	}

	if (align & blocksize_mask) {
		if (bdev)
			blkbits = blksize_bits(bdev_logical_block_size(bdev));
		blocksize_mask = (1 << blkbits) - 1;
		if (align & blocksize_mask)
			goto fail_dio;
	}

	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = iocb->ki_filp->f_mapping;

		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
		if (retval)
			goto fail_dio;
	}

	/*
	 * For file extending writes updating i_size before data writeouts
	 * complete can expose uninitialized blocks in dumb filesystems.
	 * In that case we need to wait for I/O completion even if asked
	 * for an asynchronous write.
	 */
	if (is_sync_kiocb(iocb))
		dio->is_async = false;
	else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
		dio->is_async = false;
	else
		dio->is_async = true;

	dio->inode = inode;
	if (iov_iter_rw(iter) == WRITE) {
		dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
		if (iocb->ki_flags & IOCB_NOWAIT)
			dio->opf |= REQ_NOWAIT;
	} else {
		dio->opf = REQ_OP_READ;
	}

	/*
	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
	 * so that we can call ->fsync.
	 */
	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
		retval = 0;
		if (iocb_is_dsync(iocb))
			retval = dio_set_defer_completion(dio);
		else if (!dio->inode->i_sb->s_dio_done_wq) {
			/*
			 * In case of AIO write racing with buffered read we
			 * need to defer completion.  We can't decide this now,
			 * but the workqueue needs to be initialized here.
			 */
			retval = sb_init_dio_done_wq(dio->inode->i_sb);
		}
		if (retval)
			goto fail_dio;
	}

	/*
	 * Will be decremented at I/O completion time.
	 */
	inode_dio_begin(inode);

	retval = 0;
	sdio.blkbits = blkbits;
	sdio.blkfactor = i_blkbits - blkbits;
	sdio.block_in_file = offset >> blkbits;

	sdio.get_block = get_block;
	dio->end_io = end_io;
	sdio.final_block_in_bio = -1;
	sdio.next_block_for_io = -1;

	dio->iocb = iocb;

	spin_lock_init(&dio->bio_lock);
	dio->refcount = 1;

	dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
	sdio.iter = iter;
	sdio.final_block_in_request = end >> blkbits;

	/*
	 * In case of non-aligned buffers, we may need 2 more
	 * pages since we need to zero out first and last block.
	 */
	if (unlikely(sdio.blkfactor))
		sdio.pages_in_io = 2;

	sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);

	blk_start_plug(&plug);

	retval = do_direct_IO(dio, &sdio, &map_bh);
	if (retval)
		dio_cleanup(dio, &sdio);

	if (retval == -ENOTBLK) {
		/*
		 * The remaining part of the request will be
		 * handled by buffered I/O when we return
		 */
		retval = 0;
	}
	/*
	 * There may be some unwritten disk at the end of a part-written
	 * fs-block-sized block.  Go zero that now.
	 */
	dio_zero_block(dio, &sdio, 1, &map_bh);

	if (sdio.cur_page) {
		ssize_t ret2;

		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
		if (retval == 0)
			retval = ret2;
		dio_unpin_page(dio, sdio.cur_page);
		sdio.cur_page = NULL;
	}
	if (sdio.bio)
		dio_bio_submit(dio, &sdio);

	blk_finish_plug(&plug);

	/*
	 * It is possible that we return short IO due to end of file.
	 * In that case, we need to release all the pages we got hold on.
	 */
	dio_cleanup(dio, &sdio);

	/*
	 * All block lookups have been performed.  For READ requests
	 * we can let i_mutex go now that it has achieved its purpose
	 * of protecting us from looking up uninitialized blocks.
	 */
	if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
		inode_unlock(dio->inode);

	/*
	 * The only time we want to leave bios in flight is when a successful
	 * partial aio read or full aio write has been set up.  In that case
	 * bio completion will call aio_complete.  The only time it's safe to
	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
	 * This had *better* be the only place that raises -EIOCBQUEUED.
	 */
	BUG_ON(retval == -EIOCBQUEUED);
	if (dio->is_async && retval == 0 && dio->result &&
	    (iov_iter_rw(iter) == READ || dio->result == count))
		retval = -EIOCBQUEUED;
	else
		dio_await_completion(dio);

	if (drop_refcount(dio) == 0) {
		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
	} else
		BUG_ON(retval != -EIOCBQUEUED);

	return retval;

fail_dio:
	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	kmem_cache_free(dio_cache, dio);
	return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);

static __init int dio_init(void)
{
	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
	return 0;
}
module_init(dio_init)
1329
