file.c source code [linux/fs/zonefs/file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Simple file system for zoned block devices exposing zones as files.
4	*
5	* Copyright (C) 2022 Western Digital Corporation or its affiliates.
6	*/
7	#include <linux/module.h>
8	#include <linux/pagemap.h>
9	#include <linux/iomap.h>
10	#include <linux/init.h>
11	#include <linux/slab.h>
12	#include <linux/blkdev.h>
13	#include <linux/statfs.h>
14	#include <linux/writeback.h>
15	#include <linux/quotaops.h>
16	#include <linux/seq_file.h>
17	#include <linux/parser.h>
18	#include <linux/uio.h>
19	#include <linux/mman.h>
20	#include <linux/sched/mm.h>
21	#include <linux/task_io_accounting_ops.h>
22
23	#include "zonefs.h"
24
25	#include "trace.h"
26
27	static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
28	loff_t length, unsigned int flags,
29	struct iomap iomap, struct* iomap *srcmap)
30	{
31	struct zonefs_inode_info *zi = ZONEFS_I(inode);
32	struct zonefs_zone *z = zonefs_inode_zone(inode);
33	struct super_block *sb = inode->i_sb;
34	loff_t isize;
35
36	/*
37	* All blocks are always mapped below EOF. If reading past EOF,
38	* act as if there is a hole up to the file maximum size.
39	*/
40	mutex_lock(&zi->i_truncate_mutex);
41	iomap->bdev = inode->i_sb->s_bdev;
42	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
43	isize = i_size_read(inode);
44	if (iomap->offset >= isize) {
45	iomap->type = IOMAP_HOLE;
46	iomap->addr = IOMAP_NULL_ADDR;
47	iomap->length = length;
48	} else {
49	iomap->type = IOMAP_MAPPED;
50	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
51	iomap->length = isize - iomap->offset;
52	}
53	mutex_unlock(lock: &zi->i_truncate_mutex);
54
55	trace_zonefs_iomap_begin(inode, iomap);
56
57	return `0`;
58	}
59
60	static const struct iomap_ops zonefs_read_iomap_ops = {
61	.iomap_begin = zonefs_read_iomap_begin,
62	};
63
64	static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
65	loff_t length, unsigned int flags,
66	struct iomap iomap, struct* iomap *srcmap)
67	{
68	struct zonefs_inode_info *zi = ZONEFS_I(inode);
69	struct zonefs_zone *z = zonefs_inode_zone(inode);
70	struct super_block *sb = inode->i_sb;
71	loff_t isize;
72
73	/ All write I/Os should always be within the file maximum size /
74	if (WARN_ON_ONCE(offset + length > z->z_capacity))
75	return -EIO;
76
77	/*
78	* Sequential zones can only accept direct writes. This is already
79	* checked when writes are issued, so warn if we see a page writeback
80	* operation.
81	*/
82	if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
83	return -EIO;
84
85	/*
86	* For conventional zones, all blocks are always mapped. For sequential
87	* zones, all blocks after always mapped below the inode size (zone
88	* write pointer) and unwriten beyond.
89	*/
90	mutex_lock(&zi->i_truncate_mutex);
91	iomap->bdev = inode->i_sb->s_bdev;
92	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
93	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
94	isize = i_size_read(inode);
95	if (iomap->offset >= isize) {
96	iomap->type = IOMAP_UNWRITTEN;
97	iomap->length = z->z_capacity - iomap->offset;
98	} else {
99	iomap->type = IOMAP_MAPPED;
100	iomap->length = isize - iomap->offset;
101	}
102	mutex_unlock(lock: &zi->i_truncate_mutex);
103
104	trace_zonefs_iomap_begin(inode, iomap);
105
106	return `0`;
107	}
108
109	static const struct iomap_ops zonefs_write_iomap_ops = {
110	.iomap_begin = zonefs_write_iomap_begin,
111	};
112
113	static int zonefs_read_folio(struct file unused, struct* folio *folio)
114	{
115	return iomap_read_folio(folio, ops: &zonefs_read_iomap_ops);
116	}
117
118	static void zonefs_readahead(struct readahead_control *rac)
119	{
120	iomap_readahead(rac, ops: &zonefs_read_iomap_ops);
121	}
122
123	/*
124	* Map blocks for page writeback. This is used only on conventional zone files,
125	* which implies that the page range can only be within the fixed inode size.
126	*/
127	static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
128	struct inode *inode, loff_t offset,
129	unsigned int len)
130	{
131	struct zonefs_zone *z = zonefs_inode_zone(inode);
132
133	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
134	return -EIO;
135	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
136	return -EIO;
137
138	/ If the mapping is already OK, nothing needs to be done /
139	if (offset >= wpc->iomap.offset &&
140	offset < wpc->iomap.offset + wpc->iomap.length)
141	return `0`;
142
143	return zonefs_write_iomap_begin(inode, offset,
144	length: z->z_capacity - offset,
145	IOMAP_WRITE, iomap: &wpc->iomap, NULL);
146	}
147
148	static const struct iomap_writeback_ops zonefs_writeback_ops = {
149	.map_blocks = zonefs_write_map_blocks,
150	};
151
152	static int zonefs_writepages(struct address_space *mapping,
153	struct writeback_control *wbc)
154	{
155	struct iomap_writepage_ctx wpc = { };
156
157	return iomap_writepages(mapping, wbc, wpc: &wpc, ops: &zonefs_writeback_ops);
158	}
159
160	static int zonefs_swap_activate(struct swap_info_struct *sis,
161	struct file swap_file, sector_t span)
162	{
163	struct inode *inode = file_inode(f: swap_file);
164
165	if (zonefs_inode_is_seq(inode)) {
166	zonefs_err(inode->i_sb,
167	"swap file: not a conventional zone file\n");
168	return -EINVAL;
169	}
170
171	return iomap_swapfile_activate(sis, swap_file, pagespan: span,
172	ops: &zonefs_read_iomap_ops);
173	}
174
175	const struct address_space_operations zonefs_file_aops = {
176	.read_folio = zonefs_read_folio,
177	.readahead = zonefs_readahead,
178	.writepages = zonefs_writepages,
179	.dirty_folio = iomap_dirty_folio,
180	.release_folio = iomap_release_folio,
181	.invalidate_folio = iomap_invalidate_folio,
182	.migrate_folio = filemap_migrate_folio,
183	.is_partially_uptodate = iomap_is_partially_uptodate,
184	.error_remove_folio = generic_error_remove_folio,
185	.swap_activate = zonefs_swap_activate,
186	};
187
188	int zonefs_file_truncate(struct inode *inode, loff_t isize)
189	{
190	struct zonefs_inode_info *zi = ZONEFS_I(inode);
191	struct zonefs_zone *z = zonefs_inode_zone(inode);
192	loff_t old_isize;
193	enum req_op op;
194	int ret = `0`;
195
196	/*
197	* Only sequential zone files can be truncated and truncation is allowed
198	* only down to a 0 size, which is equivalent to a zone reset, and to
199	* the maximum file size, which is equivalent to a zone finish.
200	*/
201	if (!zonefs_zone_is_seq(z))
202	return -EPERM;
203
204	if (!isize)
205	op = REQ_OP_ZONE_RESET;
206	else if (isize == z->z_capacity)
207	op = REQ_OP_ZONE_FINISH;
208	else
209	return -EPERM;
210
211	inode_dio_wait(inode);
212
213	/ Serialize against page faults /
214	filemap_invalidate_lock(mapping: inode->i_mapping);
215
216	/ Serialize against zonefs_iomap_begin() /
217	mutex_lock(&zi->i_truncate_mutex);
218
219	old_isize = i_size_read(inode);
220	if (isize == old_isize)
221	goto unlock;
222
223	ret = zonefs_inode_zone_mgmt(inode, op);
224	if (ret)
225	goto unlock;
226
227	/*
228	* If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
229	* take care of open zones.
230	*/
231	if (z->z_flags & ZONEFS_ZONE_OPEN) {
232	/*
233	* Truncating a zone to EMPTY or FULL is the equivalent of
234	* closing the zone. For a truncation to 0, we need to
235	* re-open the zone to ensure new writes can be processed.
236	* For a truncation to the maximum file size, the zone is
237	* closed and writes cannot be accepted anymore, so clear
238	* the open flag.
239	*/
240	if (!isize)
241	ret = zonefs_inode_zone_mgmt(inode, op: REQ_OP_ZONE_OPEN);
242	else
243	z->z_flags &= ~ZONEFS_ZONE_OPEN;
244	}
245
246	zonefs_update_stats(inode, new_isize: isize);
247	truncate_setsize(inode, newsize: isize);
248	z->z_wpoffset = isize;
249	zonefs_inode_account_active(inode);
250
251	unlock:
252	mutex_unlock(lock: &zi->i_truncate_mutex);
253	filemap_invalidate_unlock(mapping: inode->i_mapping);
254
255	return ret;
256	}
257
258	static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
259	int datasync)
260	{
261	struct inode *inode = file_inode(f: file);
262	int ret = `0`;
263
264	if (unlikely(IS_IMMUTABLE(inode)))
265	return -EPERM;
266
267	/*
268	* Since only direct writes are allowed in sequential files, page cache
269	* flush is needed only for conventional zone files.
270	*/
271	if (zonefs_inode_is_cnv(inode))
272	ret = file_write_and_wait_range(file, start, end);
273	if (!ret)
274	ret = blkdev_issue_flush(bdev: inode->i_sb->s_bdev);
275
276	if (ret)
277	zonefs_io_error(inode, write: true);
278
279	return ret;
280	}
281
282	static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
283	{
284	struct inode *inode = file_inode(f: vmf->vma->vm_file);
285	vm_fault_t ret;
286
287	if (unlikely(IS_IMMUTABLE(inode)))
288	return VM_FAULT_SIGBUS;
289
290	/*
291	* Sanity check: only conventional zone files can have shared
292	* writeable mappings.
293	*/
294	if (zonefs_inode_is_seq(inode))
295	return VM_FAULT_NOPAGE;
296
297	sb_start_pagefault(sb: inode->i_sb);
298	file_update_time(file: vmf->vma->vm_file);
299
300	/ Serialize against truncates /
301	filemap_invalidate_lock_shared(mapping: inode->i_mapping);
302	ret = iomap_page_mkwrite(vmf, ops: &zonefs_write_iomap_ops);
303	filemap_invalidate_unlock_shared(mapping: inode->i_mapping);
304
305	sb_end_pagefault(sb: inode->i_sb);
306	return ret;
307	}
308
309	static const struct vm_operations_struct zonefs_file_vm_ops = {
310	.fault = filemap_fault,
311	.map_pages = filemap_map_pages,
312	.page_mkwrite = zonefs_filemap_page_mkwrite,
313	};
314
315	static int zonefs_file_mmap(struct file file, struct* vm_area_struct *vma)
316	{
317	/*
318	* Conventional zones accept random writes, so their files can support
319	* shared writable mappings. For sequential zone files, only read
320	* mappings are possible since there are no guarantees for write
321	* ordering between msync() and page cache writeback.
322	*/
323	if (zonefs_inode_is_seq(inode: file_inode(f: file)) &&
324	(vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
325	return -EINVAL;
326
327	file_accessed(file);
328	vma->vm_ops = &zonefs_file_vm_ops;
329
330	return `0`;
331	}
332
333	static loff_t zonefs_file_llseek(struct file file, loff_t offset, int* whence)
334	{
335	loff_t isize = i_size_read(inode: file_inode(f: file));
336
337	/*
338	* Seeks are limited to below the zone size for conventional zones
339	* and below the zone write pointer for sequential zones. In both
340	* cases, this limit is the inode size.
341	*/
342	return generic_file_llseek_size(file, offset, whence, maxsize: isize, eof: isize);
343	}
344
345	static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
346	int error, unsigned int flags)
347	{
348	struct inode *inode = file_inode(f: iocb->ki_filp);
349	struct zonefs_inode_info *zi = ZONEFS_I(inode);
350
351	if (error) {
352	/*
353	* For Sync IOs, error recovery is called from
354	* zonefs_file_dio_write().
355	*/
356	if (!is_sync_kiocb(kiocb: iocb))
357	zonefs_io_error(inode, write: true);
358	return error;
359	}
360
361	if (size && zonefs_inode_is_seq(inode)) {
362	/*
363	* Note that we may be seeing completions out of order,
364	* but that is not a problem since a write completed
365	* successfully necessarily means that all preceding writes
366	* were also successful. So we can safely increase the inode
367	* size to the write end location.
368	*/
369	mutex_lock(&zi->i_truncate_mutex);
370	if (i_size_read(inode) < iocb->ki_pos + size) {
371	zonefs_update_stats(inode, new_isize: iocb->ki_pos + size);
372	zonefs_i_size_write(inode, isize: iocb->ki_pos + size);
373	}
374	mutex_unlock(lock: &zi->i_truncate_mutex);
375	}
376
377	return `0`;
378	}
379
380	static const struct iomap_dio_ops zonefs_write_dio_ops = {
381	.end_io = zonefs_file_write_dio_end_io,
382	};
383
384	/*
385	* Do not exceed the LFS limits nor the file zone size. If pos is under the
386	* limit it becomes a short access. If it exceeds the limit, return -EFBIG.
387	*/
388	static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
389	loff_t count)
390	{
391	struct inode *inode = file_inode(f: file);
392	struct zonefs_zone *z = zonefs_inode_zone(inode);
393	loff_t limit = rlimit(RLIMIT_FSIZE);
394	loff_t max_size = z->z_capacity;
395
396	if (limit != RLIM_INFINITY) {
397	if (pos >= limit) {
398	send_sig(SIGXFSZ, current, `0`);
399	return -EFBIG;
400	}
401	count = min(count, limit - pos);
402	}
403
404	if (!(file->f_flags & O_LARGEFILE))
405	max_size = min_t(loff_t, MAX_NON_LFS, max_size);
406
407	if (unlikely(pos >= max_size))
408	return -EFBIG;
409
410	return min(count, max_size - pos);
411	}
412
413	static ssize_t zonefs_write_checks(struct kiocb iocb, struct* iov_iter *from)
414	{
415	struct file *file = iocb->ki_filp;
416	struct inode *inode = file_inode(f: file);
417	struct zonefs_inode_info *zi = ZONEFS_I(inode);
418	struct zonefs_zone *z = zonefs_inode_zone(inode);
419	loff_t count;
420
421	if (IS_SWAPFILE(inode))
422	return -ETXTBSY;
423
424	if (!iov_iter_count(i: from))
425	return `0`;
426
427	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
428	return -EINVAL;
429
430	if (iocb->ki_flags & IOCB_APPEND) {
431	if (zonefs_zone_is_cnv(z))
432	return -EINVAL;
433	mutex_lock(&zi->i_truncate_mutex);
434	iocb->ki_pos = z->z_wpoffset;
435	mutex_unlock(lock: &zi->i_truncate_mutex);
436	}
437
438	count = zonefs_write_check_limits(file, pos: iocb->ki_pos,
439	count: iov_iter_count(i: from));
440	if (count < `0`)
441	return count;
442
443	iov_iter_truncate(i: from, count);
444	return iov_iter_count(i: from);
445	}
446
447	/*
448	* Handle direct writes. For sequential zone files, this is the only possible
449	* write path. For these files, check that the user is issuing writes
450	* sequentially from the end of the file. This code assumes that the block layer
451	* delivers write requests to the device in sequential order. This is always the
452	* case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
453	* elevator feature is being used (e.g. mq-deadline). The block layer always
454	* automatically select such an elevator for zoned block devices during the
455	* device initialization.
456	*/
457	static ssize_t zonefs_file_dio_write(struct kiocb iocb, struct* iov_iter *from)
458	{
459	struct inode *inode = file_inode(f: iocb->ki_filp);
460	struct zonefs_inode_info *zi = ZONEFS_I(inode);
461	struct zonefs_zone *z = zonefs_inode_zone(inode);
462	struct super_block *sb = inode->i_sb;
463	ssize_t ret, count;
464
465	/*
466	* For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
467	* as this can cause write reordering (e.g. the first aio gets EAGAIN
468	* on the inode lock but the second goes through but is now unaligned).
469	*/
470	if (zonefs_zone_is_seq(z) && !is_sync_kiocb(kiocb: iocb) &&
471	(iocb->ki_flags & IOCB_NOWAIT))
472	return -EOPNOTSUPP;
473
474	if (iocb->ki_flags & IOCB_NOWAIT) {
475	if (!inode_trylock(inode))
476	return -EAGAIN;
477	} else {
478	inode_lock(inode);
479	}
480
481	count = zonefs_write_checks(iocb, from);
482	if (count <= `0`) {
483	ret = count;
484	goto inode_unlock;
485	}
486
487	if ((iocb->ki_pos \| count) & (sb->s_blocksize - `1`)) {
488	ret = -EINVAL;
489	goto inode_unlock;
490	}
491
492	/ Enforce sequential writes (append only) in sequential zones /
493	if (zonefs_zone_is_seq(z)) {
494	mutex_lock(&zi->i_truncate_mutex);
495	if (iocb->ki_pos != z->z_wpoffset) {
496	mutex_unlock(lock: &zi->i_truncate_mutex);
497	ret = -EINVAL;
498	goto inode_unlock;
499	}
500	/*
501	* Advance the zone write pointer offset. This assumes that the
502	* IO will succeed, which is OK to do because we do not allow
503	* partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
504	* fails, the error path will correct the write pointer offset.
505	*/
506	z->z_wpoffset += count;
507	zonefs_inode_account_active(inode);
508	mutex_unlock(lock: &zi->i_truncate_mutex);
509	}
510
511	/*
512	* iomap_dio_rw() may return ENOTBLK if there was an issue with
513	* page invalidation. Overwrite that error code with EBUSY so that
514	* the user can make sense of the error.
515	*/
516	ret = iomap_dio_rw(iocb, iter: from, ops: &zonefs_write_iomap_ops,
517	dops: &zonefs_write_dio_ops, dio_flags: `0`, NULL, done_before: `0`);
518	if (ret == -ENOTBLK)
519	ret = -EBUSY;
520
521	/*
522	* For a failed IO or partial completion, trigger error recovery
523	* to update the zone write pointer offset to a correct value.
524	* For asynchronous IOs, zonefs_file_write_dio_end_io() may already
525	* have executed error recovery if the IO already completed when we
526	* reach here. However, we cannot know that and execute error recovery
527	* again (that will not change anything).
528	*/
529	if (zonefs_zone_is_seq(z)) {
530	if (ret > `0` && ret != count)
531	ret = -EIO;
532	if (ret < `0` && ret != -EIOCBQUEUED)
533	zonefs_io_error(inode, write: true);
534	}
535
536	inode_unlock:
537	inode_unlock(inode);
538
539	return ret;
540	}
541
542	static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
543	struct iov_iter *from)
544	{
545	struct inode *inode = file_inode(f: iocb->ki_filp);
546	ssize_t ret;
547
548	/*
549	* Direct IO writes are mandatory for sequential zone files so that the
550	* write IO issuing order is preserved.
551	*/
552	if (zonefs_inode_is_seq(inode))
553	return -EIO;
554
555	if (iocb->ki_flags & IOCB_NOWAIT) {
556	if (!inode_trylock(inode))
557	return -EAGAIN;
558	} else {
559	inode_lock(inode);
560	}
561
562	ret = zonefs_write_checks(iocb, from);
563	if (ret <= `0`)
564	goto inode_unlock;
565
566	ret = iomap_file_buffered_write(iocb, from, ops: &zonefs_write_iomap_ops);
567	if (ret == -EIO)
568	zonefs_io_error(inode, write: true);
569
570	inode_unlock:
571	inode_unlock(inode);
572	if (ret > `0`)
573	ret = generic_write_sync(iocb, count: ret);
574
575	return ret;
576	}
577
578	static ssize_t zonefs_file_write_iter(struct kiocb iocb, struct* iov_iter *from)
579	{
580	struct inode *inode = file_inode(f: iocb->ki_filp);
581	struct zonefs_zone *z = zonefs_inode_zone(inode);
582
583	if (unlikely(IS_IMMUTABLE(inode)))
584	return -EPERM;
585
586	if (sb_rdonly(sb: inode->i_sb))
587	return -EROFS;
588
589	/ Write operations beyond the zone capacity are not allowed /
590	if (iocb->ki_pos >= z->z_capacity)
591	return -EFBIG;
592
593	if (iocb->ki_flags & IOCB_DIRECT) {
594	ssize_t ret = zonefs_file_dio_write(iocb, from);
595
596	if (ret != -ENOTBLK)
597	return ret;
598	}
599
600	return zonefs_file_buffered_write(iocb, from);
601	}
602
603	static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
604	int error, unsigned int flags)
605	{
606	if (error) {
607	zonefs_io_error(inode: file_inode(f: iocb->ki_filp), write: false);
608	return error;
609	}
610
611	return `0`;
612	}
613
614	static const struct iomap_dio_ops zonefs_read_dio_ops = {
615	.end_io = zonefs_file_read_dio_end_io,
616	};
617
618	static ssize_t zonefs_file_read_iter(struct kiocb iocb, struct* iov_iter *to)
619	{
620	struct inode *inode = file_inode(f: iocb->ki_filp);
621	struct zonefs_inode_info *zi = ZONEFS_I(inode);
622	struct zonefs_zone *z = zonefs_inode_zone(inode);
623	struct super_block *sb = inode->i_sb;
624	loff_t isize;
625	ssize_t ret;
626
627	/ Offline zones cannot be read /
628	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & `0777`)))
629	return -EPERM;
630
631	if (iocb->ki_pos >= z->z_capacity)
632	return `0`;
633
634	if (iocb->ki_flags & IOCB_NOWAIT) {
635	if (!inode_trylock_shared(inode))
636	return -EAGAIN;
637	} else {
638	inode_lock_shared(inode);
639	}
640
641	/ Limit read operations to written data /
642	mutex_lock(&zi->i_truncate_mutex);
643	isize = i_size_read(inode);
644	if (iocb->ki_pos >= isize) {
645	mutex_unlock(lock: &zi->i_truncate_mutex);
646	ret = `0`;
647	goto inode_unlock;
648	}
649	iov_iter_truncate(i: to, count: isize - iocb->ki_pos);
650	mutex_unlock(lock: &zi->i_truncate_mutex);
651
652	if (iocb->ki_flags & IOCB_DIRECT) {
653	size_t count = iov_iter_count(i: to);
654
655	if ((iocb->ki_pos \| count) & (sb->s_blocksize - `1`)) {
656	ret = -EINVAL;
657	goto inode_unlock;
658	}
659	file_accessed(file: iocb->ki_filp);
660	ret = iomap_dio_rw(iocb, iter: to, ops: &zonefs_read_iomap_ops,
661	dops: &zonefs_read_dio_ops, dio_flags: `0`, NULL, done_before: `0`);
662	} else {
663	ret = generic_file_read_iter(iocb, to);
664	if (ret == -EIO)
665	zonefs_io_error(inode, write: false);
666	}
667
668	inode_unlock:
669	inode_unlock_shared(inode);
670
671	return ret;
672	}
673
674	static ssize_t zonefs_file_splice_read(struct file in, loff_t ppos,
675	struct pipe_inode_info *pipe,
676	size_t len, unsigned int flags)
677	{
678	struct inode *inode = file_inode(f: in);
679	struct zonefs_inode_info *zi = ZONEFS_I(inode);
680	struct zonefs_zone *z = zonefs_inode_zone(inode);
681	loff_t isize;
682	ssize_t ret = `0`;
683
684	/ Offline zones cannot be read /
685	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & `0777`)))
686	return -EPERM;
687
688	if (*ppos >= z->z_capacity)
689	return `0`;
690
691	inode_lock_shared(inode);
692
693	/ Limit read operations to written data /
694	mutex_lock(&zi->i_truncate_mutex);
695	isize = i_size_read(inode);
696	if (*ppos >= isize)
697	len = `0`;
698	else
699	len = min_t(loff_t, len, isize - *ppos);
700	mutex_unlock(lock: &zi->i_truncate_mutex);
701
702	if (len > `0`) {
703	ret = filemap_splice_read(in, ppos, pipe, len, flags);
704	if (ret == -EIO)
705	zonefs_io_error(inode, write: false);
706	}
707
708	inode_unlock_shared(inode);
709	return ret;
710	}
711
712	/*
713	* Write open accounting is done only for sequential files.
714	*/
715	static inline bool zonefs_seq_file_need_wro(struct inode *inode,
716	struct file *file)
717	{
718	if (zonefs_inode_is_cnv(inode))
719	return false;
720
721	if (!(file->f_mode & FMODE_WRITE))
722	return false;
723
724	return true;
725	}
726
727	static int zonefs_seq_file_write_open(struct inode *inode)
728	{
729	struct zonefs_inode_info *zi = ZONEFS_I(inode);
730	struct zonefs_zone *z = zonefs_inode_zone(inode);
731	int ret = `0`;
732
733	mutex_lock(&zi->i_truncate_mutex);
734
735	if (!zi->i_wr_refcnt) {
736	struct zonefs_sb_info *sbi = ZONEFS_SB(sb: inode->i_sb);
737	unsigned int wro = atomic_inc_return(v: &sbi->s_wro_seq_files);
738
739	if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
740
741	if (sbi->s_max_wro_seq_files
742	&& wro > sbi->s_max_wro_seq_files) {
743	atomic_dec(v: &sbi->s_wro_seq_files);
744	ret = -EBUSY;
745	goto unlock;
746	}
747
748	if (i_size_read(inode) < z->z_capacity) {
749	ret = zonefs_inode_zone_mgmt(inode,
750	op: REQ_OP_ZONE_OPEN);
751	if (ret) {
752	atomic_dec(v: &sbi->s_wro_seq_files);
753	goto unlock;
754	}
755	z->z_flags \|= ZONEFS_ZONE_OPEN;
756	zonefs_inode_account_active(inode);
757	}
758	}
759	}
760
761	zi->i_wr_refcnt++;
762
763	unlock:
764	mutex_unlock(lock: &zi->i_truncate_mutex);
765
766	return ret;
767	}
768
769	static int zonefs_file_open(struct inode inode, struct* file *file)
770	{
771	int ret;
772
773	file->f_mode \|= FMODE_CAN_ODIRECT;
774	ret = generic_file_open(inode, filp: file);
775	if (ret)
776	return ret;
777
778	if (zonefs_seq_file_need_wro(inode, file))
779	return zonefs_seq_file_write_open(inode);
780
781	return `0`;
782	}
783
784	static void zonefs_seq_file_write_close(struct inode *inode)
785	{
786	struct zonefs_inode_info *zi = ZONEFS_I(inode);
787	struct zonefs_zone *z = zonefs_inode_zone(inode);
788	struct super_block *sb = inode->i_sb;
789	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
790	int ret = `0`;
791
792	mutex_lock(&zi->i_truncate_mutex);
793
794	zi->i_wr_refcnt--;
795	if (zi->i_wr_refcnt)
796	goto unlock;
797
798	/*
799	* The file zone may not be open anymore (e.g. the file was truncated to
800	* its maximum size or it was fully written). For this case, we only
801	* need to decrement the write open count.
802	*/
803	if (z->z_flags & ZONEFS_ZONE_OPEN) {
804	ret = zonefs_inode_zone_mgmt(inode, op: REQ_OP_ZONE_CLOSE);
805	if (ret) {
806	__zonefs_io_error(inode, write: false);
807	/*
808	* Leaving zones explicitly open may lead to a state
809	* where most zones cannot be written (zone resources
810	* exhausted). So take preventive action by remounting
811	* read-only.
812	*/
813	if (z->z_flags & ZONEFS_ZONE_OPEN &&
814	!(sb->s_flags & SB_RDONLY)) {
815	zonefs_warn(sb,
816	"closing zone at %llu failed %d\n",
817	z->z_sector, ret);
818	zonefs_warn(sb,
819	"remounting filesystem read-only\n");
820	sb->s_flags \|= SB_RDONLY;
821	}
822	goto unlock;
823	}
824
825	z->z_flags &= ~ZONEFS_ZONE_OPEN;
826	zonefs_inode_account_active(inode);
827	}
828
829	atomic_dec(v: &sbi->s_wro_seq_files);
830
831	unlock:
832	mutex_unlock(lock: &zi->i_truncate_mutex);
833	}
834
/*
 * ->release file operation: undo the write-open accounting done at open time
 * when it applies to this file. Always returns 0.
 */
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	/*
	 * If we explicitly open a zone we must close it again as well, but the
	 * zone management operation can fail (either due to an IO error or as
	 * the zone has gone offline or read-only). Make sure we don't fail the
	 * close(2) for user-space.
	 */
	if (zonefs_seq_file_need_wro(inode, file))
		zonefs_seq_file_write_close(inode);

	return 0;
}
848
849	const struct file_operations zonefs_file_operations = {
850	.open = zonefs_file_open,
851	.release = zonefs_file_release,
852	.fsync = zonefs_file_fsync,
853	.mmap = zonefs_file_mmap,
854	.llseek = zonefs_file_llseek,
855	.read_iter = zonefs_file_read_iter,
856	.write_iter = zonefs_file_write_iter,
857	.splice_read = zonefs_file_splice_read,
858	.splice_write = iter_file_splice_write,
859	.iopoll = iocb_bio_iopoll,
860	};
861

source code of linux/fs/zonefs/file.c