file.c source code [linux/fs/ocfs2/file.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* file.c
4	*
5	* File open, close, extend, truncate
6	*
7	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
8	*/
9
10	#include <linux/capability.h>
11	#include <linux/fs.h>
12	#include <linux/types.h>
13	#include <linux/slab.h>
14	#include <linux/highmem.h>
15	#include <linux/pagemap.h>
16	#include <linux/uio.h>
17	#include <linux/sched.h>
18	#include <linux/splice.h>
19	#include <linux/mount.h>
20	#include <linux/writeback.h>
21	#include <linux/falloc.h>
22	#include <linux/quotaops.h>
23	#include <linux/blkdev.h>
24	#include <linux/backing-dev.h>
25
26	#include <cluster/masklog.h>
27
28	#include "ocfs2.h"
29
30	#include "alloc.h"
31	#include "aops.h"
32	#include "dir.h"
33	#include "dlmglue.h"
34	#include "extent_map.h"
35	#include "file.h"
36	#include "sysfile.h"
37	#include "inode.h"
38	#include "ioctl.h"
39	#include "journal.h"
40	#include "locks.h"
41	#include "mmap.h"
42	#include "suballoc.h"
43	#include "super.h"
44	#include "xattr.h"
45	#include "acl.h"
46	#include "quota.h"
47	#include "refcounttree.h"
48	#include "ocfs2_trace.h"
49
50	#include "buffer_head_io.h"
51
52	static int ocfs2_init_file_private(struct inode inode, struct* file *file)
53	{
54	struct ocfs2_file_private *fp;
55
56	fp = kzalloc(size: sizeof(struct ocfs2_file_private), GFP_KERNEL);
57	if (!fp)
58	return -ENOMEM;
59
60	fp->fp_file = file;
61	mutex_init(&fp->fp_mutex);
62	ocfs2_file_lock_res_init(lockres: &fp->fp_flock, fp);
63	file->private_data = fp;
64
65	return `0`;
66	}
67
68	static void ocfs2_free_file_private(struct inode inode, struct* file *file)
69	{
70	struct ocfs2_file_private *fp = file->private_data;
71	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
72
73	if (fp) {
74	ocfs2_simple_drop_lockres(osb, lockres: &fp->fp_flock);
75	ocfs2_lock_res_free(res: &fp->fp_flock);
76	kfree(objp: fp);
77	file->private_data = NULL;
78	}
79	}
80
81	static int ocfs2_file_open(struct inode inode, struct* file *file)
82	{
83	int status;
84	int mode = file->f_flags;
85	struct ocfs2_inode_info *oi = OCFS2_I(inode);
86
87	trace_ocfs2_file_open(inode, file, dentry: file->f_path.dentry,
88	ino: (unsigned long long)oi->ip_blkno,
89	d_len: file->f_path.dentry->d_name.len,
90	d_name: file->f_path.dentry->d_name.name, mode);
91
92	if (file->f_mode & FMODE_WRITE) {
93	status = dquot_initialize(inode);
94	if (status)
95	goto leave;
96	}
97
98	spin_lock(lock: &oi->ip_lock);
99
100	/ Check that the inode hasn't been wiped from disk by another*
101	* node. If it hasn't then we're safe as long as we hold the
102	* spin lock until our increment of open count. */
103	if (oi->ip_flags & OCFS2_INODE_DELETED) {
104	spin_unlock(lock: &oi->ip_lock);
105
106	status = -ENOENT;
107	goto leave;
108	}
109
110	if (mode & O_DIRECT)
111	oi->ip_flags \|= OCFS2_INODE_OPEN_DIRECT;
112
113	oi->ip_open_count++;
114	spin_unlock(lock: &oi->ip_lock);
115
116	status = ocfs2_init_file_private(inode, file);
117	if (status) {
118	/*
119	* We want to set open count back if we're failing the
120	* open.
121	*/
122	spin_lock(lock: &oi->ip_lock);
123	oi->ip_open_count--;
124	spin_unlock(lock: &oi->ip_lock);
125	}
126
127	file->f_mode \|= FMODE_NOWAIT;
128
129	leave:
130	return status;
131	}
132
133	static int ocfs2_file_release(struct inode inode, struct* file *file)
134	{
135	struct ocfs2_inode_info *oi = OCFS2_I(inode);
136
137	spin_lock(lock: &oi->ip_lock);
138	if (!--oi->ip_open_count)
139	oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
140
141	trace_ocfs2_file_release(inode, file, dentry: file->f_path.dentry,
142	ino: oi->ip_blkno,
143	d_len: file->f_path.dentry->d_name.len,
144	d_name: file->f_path.dentry->d_name.name,
145	mode: oi->ip_open_count);
146	spin_unlock(lock: &oi->ip_lock);
147
148	ocfs2_free_file_private(inode, file);
149
150	return `0`;
151	}
152
/*
 * Open handler for directories: only attaches the per-open private
 * state.  Returns whatever ocfs2_init_file_private() returns.
 */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}
157
/*
 * Release handler for directories: frees the per-open private state.
 * Always returns 0.
 */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}
163
164	static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
165	int datasync)
166	{
167	int err = `0`;
168	struct inode *inode = file->f_mapping->host;
169	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
170	struct ocfs2_inode_info *oi = OCFS2_I(inode);
171	journal_t *journal = osb->journal->j_journal;
172	int ret;
173	tid_t commit_tid;
174	bool needs_barrier = false;
175
176	trace_ocfs2_sync_file(inode, file, dentry: file->f_path.dentry,
177	ino: oi->ip_blkno,
178	d_len: file->f_path.dentry->d_name.len,
179	d_name: file->f_path.dentry->d_name.name,
180	mode: (unsigned long long)datasync);
181
182	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
183	return -EROFS;
184
185	err = file_write_and_wait_range(file, start, end);
186	if (err)
187	return err;
188
189	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
190	if (journal->j_flags & JBD2_BARRIER &&
191	!jbd2_trans_will_send_data_barrier(journal, tid: commit_tid))
192	needs_barrier = true;
193	err = jbd2_complete_transaction(journal, tid: commit_tid);
194	if (needs_barrier) {
195	ret = blkdev_issue_flush(bdev: inode->i_sb->s_bdev);
196	if (!err)
197	err = ret;
198	}
199
200	if (err)
201	mlog_errno(err);
202
203	return (err < `0`) ? -EIO : `0`;
204	}
205
206	int ocfs2_should_update_atime(struct inode *inode,
207	struct vfsmount *vfsmnt)
208	{
209	struct timespec64 now;
210	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
211
212	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
213	return `0`;
214
215	if ((inode->i_flags & S_NOATIME) \|\|
216	((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
217	return `0`;
218
219	/*
220	* We can be called with no vfsmnt structure - NFSD will
221	* sometimes do this.
222	*
223	* Note that our action here is different than touch_atime() -
224	* if we can't tell whether this is a noatime mount, then we
225	* don't know whether to trust the value of s_atime_quantum.
226	*/
227	if (vfsmnt == NULL)
228	return `0`;
229
230	if ((vfsmnt->mnt_flags & MNT_NOATIME) \|\|
231	((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
232	return `0`;
233
234	if (vfsmnt->mnt_flags & MNT_RELATIME) {
235	struct timespec64 ctime = inode_get_ctime(inode);
236	struct timespec64 atime = inode_get_atime(inode);
237	struct timespec64 mtime = inode_get_mtime(inode);
238
239	if ((timespec64_compare(lhs: &atime, rhs: &mtime) <= `0`) \|\|
240	(timespec64_compare(lhs: &atime, rhs: &ctime) <= `0`))
241	return `1`;
242
243	return `0`;
244	}
245
246	now = current_time(inode);
247	if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
248	return `0`;
249	else
250	return `1`;
251	}
252
253	int ocfs2_update_inode_atime(struct inode *inode,
254	struct buffer_head *bh)
255	{
256	int ret;
257	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
258	handle_t *handle;
259	struct ocfs2_dinode di = (struct* ocfs2_dinode *) bh->b_data;
260
261	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
262	if (IS_ERR(ptr: handle)) {
263	ret = PTR_ERR(ptr: handle);
264	mlog_errno(ret);
265	goto out;
266	}
267
268	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh,
269	OCFS2_JOURNAL_ACCESS_WRITE);
270	if (ret) {
271	mlog_errno(ret);
272	goto out_commit;
273	}
274
275	/*
276	* Don't use ocfs2_mark_inode_dirty() here as we don't always
277	* have i_rwsem to guard against concurrent changes to other
278	* inode fields.
279	*/
280	inode_set_atime_to_ts(inode, ts: current_time(inode));
281	di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
282	di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
283	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `0`);
284	ocfs2_journal_dirty(handle, bh);
285
286	out_commit:
287	ocfs2_commit_trans(osb, handle);
288	out:
289	return ret;
290	}
291
292	int ocfs2_set_inode_size(handle_t *handle,
293	struct inode *inode,
294	struct buffer_head *fe_bh,
295	u64 new_i_size)
296	{
297	int status;
298
299	i_size_write(inode, i_size: new_i_size);
300	inode->i_blocks = ocfs2_inode_sector_count(inode);
301	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
302
303	status = ocfs2_mark_inode_dirty(handle, inode, bh: fe_bh);
304	if (status < `0`) {
305	mlog_errno(status);
306	goto bail;
307	}
308
309	bail:
310	return status;
311	}
312
313	int ocfs2_simple_size_update(struct inode *inode,
314	struct buffer_head *di_bh,
315	u64 new_i_size)
316	{
317	int ret;
318	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
319	handle_t *handle = NULL;
320
321	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
322	if (IS_ERR(ptr: handle)) {
323	ret = PTR_ERR(ptr: handle);
324	mlog_errno(ret);
325	goto out;
326	}
327
328	ret = ocfs2_set_inode_size(handle, inode, fe_bh: di_bh,
329	new_i_size);
330	if (ret < `0`)
331	mlog_errno(ret);
332
333	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `0`);
334	ocfs2_commit_trans(osb, handle);
335	out:
336	return ret;
337	}
338
339	static int ocfs2_cow_file_pos(struct inode *inode,
340	struct buffer_head *fe_bh,
341	u64 offset)
342	{
343	int status;
344	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
345	unsigned int num_clusters = `0`;
346	unsigned int ext_flags = `0`;
347
348	/*
349	* If the new offset is aligned to the range of the cluster, there is
350	* no space for ocfs2_zero_range_for_truncate to fill, so no need to
351	* CoW either.
352	*/
353	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - `1`)) == `0`)
354	return `0`;
355
356	status = ocfs2_get_clusters(inode, v_cluster: cpos, p_cluster: &phys,
357	num_clusters: &num_clusters, extent_flags: &ext_flags);
358	if (status) {
359	mlog_errno(status);
360	goto out;
361	}
362
363	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
364	goto out;
365
366	return ocfs2_refcount_cow(inode, di_bh: fe_bh, cpos, write_len: `1`, max_cpos: cpos+`1`);
367
368	out:
369	return status;
370	}
371
372	static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
373	struct inode *inode,
374	struct buffer_head *fe_bh,
375	u64 new_i_size)
376	{
377	int status;
378	handle_t *handle;
379	struct ocfs2_dinode *di;
380	u64 cluster_bytes;
381
382	/*
383	* We need to CoW the cluster contains the offset if it is reflinked
384	* since we will call ocfs2_zero_range_for_truncate later which will
385	* write "0" from offset to the end of the cluster.
386	*/
387	status = ocfs2_cow_file_pos(inode, fe_bh, offset: new_i_size);
388	if (status) {
389	mlog_errno(status);
390	return status;
391	}
392
393	/ TODO: This needs to actually orphan the inode in this*
394	* transaction. */
395
396	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
397	if (IS_ERR(ptr: handle)) {
398	status = PTR_ERR(ptr: handle);
399	mlog_errno(status);
400	goto out;
401	}
402
403	status = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh: fe_bh,
404	OCFS2_JOURNAL_ACCESS_WRITE);
405	if (status < `0`) {
406	mlog_errno(status);
407	goto out_commit;
408	}
409
410	/*
411	* Do this before setting i_size.
412	*/
413	cluster_bytes = ocfs2_align_bytes_to_clusters(sb: inode->i_sb, bytes: new_i_size);
414	status = ocfs2_zero_range_for_truncate(inode, handle, range_start: new_i_size,
415	range_end: cluster_bytes);
416	if (status) {
417	mlog_errno(status);
418	goto out_commit;
419	}
420
421	i_size_write(inode, i_size: new_i_size);
422	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
423
424	di = (struct ocfs2_dinode *) fe_bh->b_data;
425	di->i_size = cpu_to_le64(new_i_size);
426	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
427	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
428	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `0`);
429
430	ocfs2_journal_dirty(handle, bh: fe_bh);
431
432	out_commit:
433	ocfs2_commit_trans(osb, handle);
434	out:
435	return status;
436	}
437
438	int ocfs2_truncate_file(struct inode *inode,
439	struct buffer_head *di_bh,
440	u64 new_i_size)
441	{
442	int status = `0`;
443	struct ocfs2_dinode *fe = NULL;
444	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
445
446	/ We trust di_bh because it comes from ocfs2_inode_lock(), which*
447	* already validated it */
448	fe = (struct ocfs2_dinode *) di_bh->b_data;
449
450	trace_ocfs2_truncate_file(value1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
451	value2: (unsigned long long)le64_to_cpu(fe->i_size),
452	value3: (unsigned long long)new_i_size);
453
454	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
455	"Inode %llu, inode i_size = %lld != di "
456	"i_size = %llu, i_flags = 0x%x\n",
457	(unsigned long long)OCFS2_I(inode)->ip_blkno,
458	i_size_read(inode),
459	(unsigned long long)le64_to_cpu(fe->i_size),
460	le32_to_cpu(fe->i_flags));
461
462	if (new_i_size > le64_to_cpu(fe->i_size)) {
463	trace_ocfs2_truncate_file_error(
464	val1: (unsigned long long)le64_to_cpu(fe->i_size),
465	val2: (unsigned long long)new_i_size);
466	status = -EINVAL;
467	mlog_errno(status);
468	goto bail;
469	}
470
471	down_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
472
473	ocfs2_resv_discard(resmap: &osb->osb_la_resmap,
474	resv: &OCFS2_I(inode)->ip_la_data_resv);
475
476	/*
477	* The inode lock forced other nodes to sync and drop their
478	* pages, which (correctly) happens even if we have a truncate
479	* without allocation change - ocfs2 cluster sizes can be much
480	* greater than page size, so we have to truncate them
481	* anyway.
482	*/
483
484	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
485	unmap_mapping_range(mapping: inode->i_mapping,
486	holebegin: new_i_size + PAGE_SIZE - `1`, holelen: `0`, even_cows: `1`);
487	truncate_inode_pages(inode->i_mapping, new_i_size);
488	status = ocfs2_truncate_inline(inode, di_bh, start: new_i_size,
489	end: i_size_read(inode), trunc: `1`);
490	if (status)
491	mlog_errno(status);
492
493	goto bail_unlock_sem;
494	}
495
496	/ alright, we're going to need to do a full blown alloc size*
497	* change. Orphan the inode so that recovery can complete the
498	* truncate if necessary. This does the task of marking
499	* i_size. */
500	status = ocfs2_orphan_for_truncate(osb, inode, fe_bh: di_bh, new_i_size);
501	if (status < `0`) {
502	mlog_errno(status);
503	goto bail_unlock_sem;
504	}
505
506	unmap_mapping_range(mapping: inode->i_mapping, holebegin: new_i_size + PAGE_SIZE - `1`, holelen: `0`, even_cows: `1`);
507	truncate_inode_pages(inode->i_mapping, new_i_size);
508
509	status = ocfs2_commit_truncate(osb, inode, di_bh);
510	if (status < `0`) {
511	mlog_errno(status);
512	goto bail_unlock_sem;
513	}
514
515	/ TODO: orphan dir cleanup here. /
516	bail_unlock_sem:
517	up_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
518
519	bail:
520	if (!status && OCFS2_I(inode)->ip_clusters == `0`)
521	status = ocfs2_try_remove_refcount_tree(inode, di_bh);
522
523	return status;
524	}
525
526	/*
527	* extend file allocation only here.
528	* we'll update all the disk stuff, and oip->alloc_size
529	*
530	* expect stuff to be locked, a transaction started and enough data /
531	* metadata reservations in the contexts.
532	*
533	* Will return -EAGAIN, and a reason if a restart is needed.
534	* If passed in, *reason will always be set, even in error.
535	*/
536	int ocfs2_add_inode_data(struct ocfs2_super *osb,
537	struct inode *inode,
538	u32 *logical_offset,
539	u32 clusters_to_add,
540	int mark_unwritten,
541	struct buffer_head *fe_bh,
542	handle_t *handle,
543	struct ocfs2_alloc_context *data_ac,
544	struct ocfs2_alloc_context *meta_ac,
545	enum ocfs2_alloc_restarted *reason_ret)
546	{
547	struct ocfs2_extent_tree et;
548
549	ocfs2_init_dinode_extent_tree(et: &et, ci: INODE_CACHE(inode), bh: fe_bh);
550	return ocfs2_add_clusters_in_btree(handle, et: &et, logical_offset,
551	clusters_to_add, mark_unwritten,
552	data_ac, meta_ac, reason_ret);
553	}
554
555	static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
556	u32 clusters_to_add, int mark_unwritten)
557	{
558	int status = `0`;
559	int restart_func = `0`;
560	int credits;
561	u32 prev_clusters;
562	struct buffer_head *bh = NULL;
563	struct ocfs2_dinode *fe = NULL;
564	handle_t *handle = NULL;
565	struct ocfs2_alloc_context *data_ac = NULL;
566	struct ocfs2_alloc_context *meta_ac = NULL;
567	enum ocfs2_alloc_restarted why = RESTART_NONE;
568	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
569	struct ocfs2_extent_tree et;
570	int did_quota = `0`;
571
572	/*
573	* Unwritten extent only exists for file systems which
574	* support holes.
575	*/
576	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
577
578	status = ocfs2_read_inode_block(inode, bh: &bh);
579	if (status < `0`) {
580	mlog_errno(status);
581	goto leave;
582	}
583	fe = (struct ocfs2_dinode *) bh->b_data;
584
585	restart_all:
586	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
587
588	ocfs2_init_dinode_extent_tree(et: &et, ci: INODE_CACHE(inode), bh);
589	status = ocfs2_lock_allocators(inode, et: &et, clusters_to_add, extents_to_split: `0`,
590	data_ac: &data_ac, meta_ac: &meta_ac);
591	if (status) {
592	mlog_errno(status);
593	goto leave;
594	}
595
596	credits = ocfs2_calc_extend_credits(sb: osb->sb, root_el: &fe->id2.i_list);
597	handle = ocfs2_start_trans(osb, max_buffs: credits);
598	if (IS_ERR(ptr: handle)) {
599	status = PTR_ERR(ptr: handle);
600	handle = NULL;
601	mlog_errno(status);
602	goto leave;
603	}
604
605	restarted_transaction:
606	trace_ocfs2_extend_allocation(
607	ip_blkno: (unsigned long long)OCFS2_I(inode)->ip_blkno,
608	size: (unsigned long long)i_size_read(inode),
609	le32_to_cpu(fe->i_clusters), clusters_to_add,
610	why, restart_func);
611
612	status = dquot_alloc_space_nodirty(inode,
613	nr: ocfs2_clusters_to_bytes(sb: osb->sb, clusters: clusters_to_add));
614	if (status)
615	goto leave;
616	did_quota = `1`;
617
618	/ reserve a write to the file entry early on - that we if we*
619	* run out of credits in the allocation path, we can still
620	* update i_size. */
621	status = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh,
622	OCFS2_JOURNAL_ACCESS_WRITE);
623	if (status < `0`) {
624	mlog_errno(status);
625	goto leave;
626	}
627
628	prev_clusters = OCFS2_I(inode)->ip_clusters;
629
630	status = ocfs2_add_inode_data(osb,
631	inode,
632	logical_offset: &logical_start,
633	clusters_to_add,
634	mark_unwritten,
635	fe_bh: bh,
636	handle,
637	data_ac,
638	meta_ac,
639	reason_ret: &why);
640	if ((status < `0`) && (status != -EAGAIN)) {
641	if (status != -ENOSPC)
642	mlog_errno(status);
643	goto leave;
644	}
645	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `1`);
646	ocfs2_journal_dirty(handle, bh);
647
648	spin_lock(lock: &OCFS2_I(inode)->ip_lock);
649	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
650	spin_unlock(lock: &OCFS2_I(inode)->ip_lock);
651	/ Release unused quota reservation /
652	dquot_free_space(inode,
653	nr: ocfs2_clusters_to_bytes(sb: osb->sb, clusters: clusters_to_add));
654	did_quota = `0`;
655
656	if (why != RESTART_NONE && clusters_to_add) {
657	if (why == RESTART_META) {
658	restart_func = `1`;
659	status = `0`;
660	} else {
661	BUG_ON(why != RESTART_TRANS);
662
663	status = ocfs2_allocate_extend_trans(handle, thresh: `1`);
664	if (status < `0`) {
665	/ handle still has to be committed at*
666	* this point. */
667	status = -ENOMEM;
668	mlog_errno(status);
669	goto leave;
670	}
671	goto restarted_transaction;
672	}
673	}
674
675	trace_ocfs2_extend_allocation_end(ino: OCFS2_I(inode)->ip_blkno,
676	le32_to_cpu(fe->i_clusters),
677	di_size: (unsigned long long)le64_to_cpu(fe->i_size),
678	ip_clusters: OCFS2_I(inode)->ip_clusters,
679	i_size: (unsigned long long)i_size_read(inode));
680
681	leave:
682	if (status < `0` && did_quota)
683	dquot_free_space(inode,
684	nr: ocfs2_clusters_to_bytes(sb: osb->sb, clusters: clusters_to_add));
685	if (handle) {
686	ocfs2_commit_trans(osb, handle);
687	handle = NULL;
688	}
689	if (data_ac) {
690	ocfs2_free_alloc_context(ac: data_ac);
691	data_ac = NULL;
692	}
693	if (meta_ac) {
694	ocfs2_free_alloc_context(ac: meta_ac);
695	meta_ac = NULL;
696	}
697	if ((!status) && restart_func) {
698	restart_func = `0`;
699	goto restart_all;
700	}
701	brelse(bh);
702	bh = NULL;
703
704	return status;
705	}
706
707	/*
708	* While a write will already be ordering the data, a truncate will not.
709	* Thus, we need to explicitly order the zeroed pages.
710	*/
711	static handle_t ocfs2_zero_start_ordered_transaction(struct* inode *inode,
712	struct buffer_head *di_bh,
713	loff_t start_byte,
714	loff_t length)
715	{
716	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
717	handle_t *handle = NULL;
718	int ret = `0`;
719
720	if (!ocfs2_should_order_data(inode))
721	goto out;
722
723	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
724	if (IS_ERR(ptr: handle)) {
725	ret = -ENOMEM;
726	mlog_errno(ret);
727	goto out;
728	}
729
730	ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
731	if (ret < `0`) {
732	mlog_errno(ret);
733	goto out;
734	}
735
736	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh: di_bh,
737	OCFS2_JOURNAL_ACCESS_WRITE);
738	if (ret)
739	mlog_errno(ret);
740	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `1`);
741
742	out:
743	if (ret) {
744	if (!IS_ERR(ptr: handle))
745	ocfs2_commit_trans(osb, handle);
746	handle = ERR_PTR(error: ret);
747	}
748	return handle;
749	}
750
751	/ Some parts of this taken from generic_cont_expand, which turned out*
752	* to be too fragile to do exactly what we need without us having to
753	* worry about recursive locking in ->write_begin() and ->write_end(). */
754	static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
755	u64 abs_to, struct buffer_head *di_bh)
756	{
757	struct address_space *mapping = inode->i_mapping;
758	struct page *page;
759	unsigned long index = abs_from >> PAGE_SHIFT;
760	handle_t *handle;
761	int ret = `0`;
762	unsigned zero_from, zero_to, block_start, block_end;
763	struct ocfs2_dinode di = (struct* ocfs2_dinode *)di_bh->b_data;
764
765	BUG_ON(abs_from >= abs_to);
766	BUG_ON(abs_to > (((u64)index + `1`) << PAGE_SHIFT));
767	BUG_ON(abs_from & (inode->i_blkbits - `1`));
768
769	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
770	start_byte: abs_from,
771	length: abs_to - abs_from);
772	if (IS_ERR(ptr: handle)) {
773	ret = PTR_ERR(ptr: handle);
774	goto out;
775	}
776
777	page = find_or_create_page(mapping, index, GFP_NOFS);
778	if (!page) {
779	ret = -ENOMEM;
780	mlog_errno(ret);
781	goto out_commit_trans;
782	}
783
784	/ Get the offsets within the page that we want to zero /
785	zero_from = abs_from & (PAGE_SIZE - `1`);
786	zero_to = abs_to & (PAGE_SIZE - `1`);
787	if (!zero_to)
788	zero_to = PAGE_SIZE;
789
790	trace_ocfs2_write_zero_page(
791	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
792	abs_from: (unsigned long long)abs_from,
793	abs_to: (unsigned long long)abs_to,
794	index, zero_from, zero_to);
795
796	/ We know that zero_from is block aligned /
797	for (block_start = zero_from; block_start < zero_to;
798	block_start = block_end) {
799	block_end = block_start + i_blocksize(node: inode);
800
801	/*
802	* block_start is block-aligned. Bump it by one to force
803	* __block_write_begin and block_commit_write to zero the
804	* whole block.
805	*/
806	ret = __block_write_begin(page, pos: block_start + `1`, len: `0`,
807	get_block: ocfs2_get_block);
808	if (ret < `0`) {
809	mlog_errno(ret);
810	goto out_unlock;
811	}
812
813
814	/ must not update i_size! /
815	block_commit_write(page, from: block_start + `1`, to: block_start + `1`);
816	}
817
818	/*
819	* fs-writeback will release the dirty pages without page lock
820	* whose offset are over inode size, the release happens at
821	* block_write_full_folio().
822	*/
823	i_size_write(inode, i_size: abs_to);
824	inode->i_blocks = ocfs2_inode_sector_count(inode);
825	di->i_size = cpu_to_le64((u64)i_size_read(inode));
826	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
827	di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
828	di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
829	di->i_mtime_nsec = di->i_ctime_nsec;
830	if (handle) {
831	ocfs2_journal_dirty(handle, bh: di_bh);
832	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `1`);
833	}
834
835	out_unlock:
836	unlock_page(page);
837	put_page(page);
838	out_commit_trans:
839	if (handle)
840	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
841	out:
842	return ret;
843	}
844
845	/*
846	* Find the next range to zero. We do this in terms of bytes because
847	* that's what ocfs2_zero_extend() wants, and it is dealing with the
848	* pagecache. We may return multiple extents.
849	*
850	* zero_start and zero_end are ocfs2_zero_extend()s current idea of what
851	* needs to be zeroed. range_start and range_end return the next zeroing
852	* range. A subsequent call should pass the previous range_end as its
853	* zero_start. If range_end is 0, there's nothing to do.
854	*
855	* Unwritten extents are skipped over. Refcounted extents are CoWd.
856	*/
857	static int ocfs2_zero_extend_get_range(struct inode *inode,
858	struct buffer_head *di_bh,
859	u64 zero_start, u64 zero_end,
860	u64 range_start, u64 range_end)
861	{
862	int rc = `0`, needs_cow = `0`;
863	u32 p_cpos, zero_clusters = `0`;
864	u32 zero_cpos =
865	zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
866	u32 last_cpos = ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: zero_end);
867	unsigned int num_clusters = `0`;
868	unsigned int ext_flags = `0`;
869
870	while (zero_cpos < last_cpos) {
871	rc = ocfs2_get_clusters(inode, v_cluster: zero_cpos, p_cluster: &p_cpos,
872	num_clusters: &num_clusters, extent_flags: &ext_flags);
873	if (rc) {
874	mlog_errno(rc);
875	goto out;
876	}
877
878	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
879	zero_clusters = num_clusters;
880	if (ext_flags & OCFS2_EXT_REFCOUNTED)
881	needs_cow = `1`;
882	break;
883	}
884
885	zero_cpos += num_clusters;
886	}
887	if (!zero_clusters) {
888	*range_end = `0`;
889	goto out;
890	}
891
892	while ((zero_cpos + zero_clusters) < last_cpos) {
893	rc = ocfs2_get_clusters(inode, v_cluster: zero_cpos + zero_clusters,
894	p_cluster: &p_cpos, num_clusters: &num_clusters,
895	extent_flags: &ext_flags);
896	if (rc) {
897	mlog_errno(rc);
898	goto out;
899	}
900
901	if (!p_cpos \|\| (ext_flags & OCFS2_EXT_UNWRITTEN))
902	break;
903	if (ext_flags & OCFS2_EXT_REFCOUNTED)
904	needs_cow = `1`;
905	zero_clusters += num_clusters;
906	}
907	if ((zero_cpos + zero_clusters) > last_cpos)
908	zero_clusters = last_cpos - zero_cpos;
909
910	if (needs_cow) {
911	rc = ocfs2_refcount_cow(inode, di_bh, cpos: zero_cpos,
912	write_len: zero_clusters, UINT_MAX);
913	if (rc) {
914	mlog_errno(rc);
915	goto out;
916	}
917	}
918
919	*range_start = ocfs2_clusters_to_bytes(sb: inode->i_sb, clusters: zero_cpos);
920	*range_end = ocfs2_clusters_to_bytes(sb: inode->i_sb,
921	clusters: zero_cpos + zero_clusters);
922
923	out:
924	return rc;
925	}
926
927	/*
928	* Zero one range returned from ocfs2_zero_extend_get_range(). The caller
929	* has made sure that the entire range needs zeroing.
930	*/
931	static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
932	u64 range_end, struct buffer_head *di_bh)
933	{
934	int rc = `0`;
935	u64 next_pos;
936	u64 zero_pos = range_start;
937
938	trace_ocfs2_zero_extend_range(
939	value1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
940	value2: (unsigned long long)range_start,
941	value3: (unsigned long long)range_end);
942	BUG_ON(range_start >= range_end);
943
944	while (zero_pos < range_end) {
945	next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
946	if (next_pos > range_end)
947	next_pos = range_end;
948	rc = ocfs2_write_zero_page(inode, abs_from: zero_pos, abs_to: next_pos, di_bh);
949	if (rc < `0`) {
950	mlog_errno(rc);
951	break;
952	}
953	zero_pos = next_pos;
954
955	/*
956	* Very large extends have the potential to lock up
957	* the cpu for extended periods of time.
958	*/
959	cond_resched();
960	}
961
962	return rc;
963	}
964
965	int ocfs2_zero_extend(struct inode inode, struct* buffer_head *di_bh,
966	loff_t zero_to_size)
967	{
968	int ret = `0`;
969	u64 zero_start, range_start = `0`, range_end = `0`;
970	struct super_block *sb = inode->i_sb;
971
972	zero_start = ocfs2_align_bytes_to_blocks(sb, bytes: i_size_read(inode));
973	trace_ocfs2_zero_extend(value1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
974	value2: (unsigned long long)zero_start,
975	value3: (unsigned long long)i_size_read(inode));
976	while (zero_start < zero_to_size) {
977	ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
978	zero_end: zero_to_size,
979	range_start: &range_start,
980	range_end: &range_end);
981	if (ret) {
982	mlog_errno(ret);
983	break;
984	}
985	if (!range_end)
986	break;
987	/ Trim the ends /
988	if (range_start < zero_start)
989	range_start = zero_start;
990	if (range_end > zero_to_size)
991	range_end = zero_to_size;
992
993	ret = ocfs2_zero_extend_range(inode, range_start,
994	range_end, di_bh);
995	if (ret) {
996	mlog_errno(ret);
997	break;
998	}
999	zero_start = range_end;
1000	}
1001
1002	return ret;
1003	}
1004
1005	int ocfs2_extend_no_holes(struct inode inode, struct* buffer_head *di_bh,
1006	u64 new_i_size, u64 zero_to)
1007	{
1008	int ret;
1009	u32 clusters_to_add;
1010	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1011
1012	/*
1013	* Only quota files call this without a bh, and they can't be
1014	* refcounted.
1015	*/
1016	BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
1017	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1018
1019	clusters_to_add = ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: new_i_size);
1020	if (clusters_to_add < oi->ip_clusters)
1021	clusters_to_add = `0`;
1022	else
1023	clusters_to_add -= oi->ip_clusters;
1024
1025	if (clusters_to_add) {
1026	ret = ocfs2_extend_allocation(inode, logical_start: oi->ip_clusters,
1027	clusters_to_add, mark_unwritten: `0`);
1028	if (ret) {
1029	mlog_errno(ret);
1030	goto out;
1031	}
1032	}
1033
1034	/*
1035	* Call this even if we don't add any clusters to the tree. We
1036	* still need to zero the area between the old i_size and the
1037	* new i_size.
1038	*/
1039	ret = ocfs2_zero_extend(inode, di_bh, zero_to_size: zero_to);
1040	if (ret < `0`)
1041	mlog_errno(ret);
1042
1043	out:
1044	return ret;
1045	}
1046
1047	static int ocfs2_extend_file(struct inode *inode,
1048	struct buffer_head *di_bh,
1049	u64 new_i_size)
1050	{
1051	int ret = `0`;
1052	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1053
1054	BUG_ON(!di_bh);
1055
1056	/ setattr sometimes calls us like this. /
1057	if (new_i_size == `0`)
1058	goto out;
1059
1060	if (i_size_read(inode) == new_i_size)
1061	goto out;
1062	BUG_ON(new_i_size < i_size_read(inode));
1063
1064	/*
1065	* The alloc sem blocks people in read/write from reading our
1066	* allocation until we're done changing it. We depend on
1067	* i_rwsem to block other extend/truncate calls while we're
1068	* here. We even have to hold it for sparse files because there
1069	* might be some tail zeroing.
1070	*/
1071	down_write(sem: &oi->ip_alloc_sem);
1072
1073	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1074	/*
1075	* We can optimize small extends by keeping the inodes
1076	* inline data.
1077	*/
1078	if (ocfs2_size_fits_inline_data(di_bh, new_size: new_i_size)) {
1079	up_write(sem: &oi->ip_alloc_sem);
1080	goto out_update_size;
1081	}
1082
1083	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1084	if (ret) {
1085	up_write(sem: &oi->ip_alloc_sem);
1086	mlog_errno(ret);
1087	goto out;
1088	}
1089	}
1090
1091	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1092	ret = ocfs2_zero_extend(inode, di_bh, zero_to_size: new_i_size);
1093	else
1094	ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1095	zero_to: new_i_size);
1096
1097	up_write(sem: &oi->ip_alloc_sem);
1098
1099	if (ret < `0`) {
1100	mlog_errno(ret);
1101	goto out;
1102	}
1103
1104	out_update_size:
1105	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1106	if (ret < `0`)
1107	mlog_errno(ret);
1108
1109	out:
1110	return ret;
1111	}
1112
1113	int ocfs2_setattr(struct mnt_idmap idmap, struct* dentry *dentry,
1114	struct iattr *attr)
1115	{
1116	int status = `0`, size_change;
1117	int inode_locked = `0`;
1118	struct inode *inode = d_inode(dentry);
1119	struct super_block *sb = inode->i_sb;
1120	struct ocfs2_super *osb = OCFS2_SB(sb);
1121	struct buffer_head *bh = NULL;
1122	handle_t *handle = NULL;
1123	struct dquot *transfer_to[MAXQUOTAS] = { };
1124	int qtype;
1125	int had_lock;
1126	struct ocfs2_lock_holder oh;
1127
1128	trace_ocfs2_setattr(inode, dentry,
1129	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
1130	d_len: dentry->d_name.len, d_name: dentry->d_name.name,
1131	ia_valid: attr->ia_valid, ia_mode: attr->ia_mode,
1132	ia_uid: from_kuid(to: &init_user_ns, uid: attr->ia_uid),
1133	ia_gid: from_kgid(to: &init_user_ns, gid: attr->ia_gid));
1134
1135	/ ensuring we don't even attempt to truncate a symlink /
1136	if (S_ISLNK(inode->i_mode))
1137	attr->ia_valid &= ~ATTR_SIZE;
1138
1139	#define OCFS2_VALID_ATTRS (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME \| ATTR_SIZE \
1140	\| ATTR_GID \| ATTR_UID \| ATTR_MODE)
1141	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1142	return `0`;
1143
1144	status = setattr_prepare(&nop_mnt_idmap, dentry, attr);
1145	if (status)
1146	return status;
1147
1148	if (is_quota_modification(idmap: &nop_mnt_idmap, inode, ia: attr)) {
1149	status = dquot_initialize(inode);
1150	if (status)
1151	return status;
1152	}
1153	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1154	if (size_change) {
1155	/*
1156	* Here we should wait dio to finish before inode lock
1157	* to avoid a deadlock between ocfs2_setattr() and
1158	* ocfs2_dio_end_io_write()
1159	*/
1160	inode_dio_wait(inode);
1161
1162	status = ocfs2_rw_lock(inode, write: `1`);
1163	if (status < `0`) {
1164	mlog_errno(status);
1165	goto bail;
1166	}
1167	}
1168
1169	had_lock = ocfs2_inode_lock_tracker(inode, ret_bh: &bh, ex: `1`, oh: &oh);
1170	if (had_lock < `0`) {
1171	status = had_lock;
1172	goto bail_unlock_rw;
1173	} else if (had_lock) {
1174	/*
1175	* As far as we know, ocfs2_setattr() could only be the first
1176	* VFS entry point in the call chain of recursive cluster
1177	* locking issue.
1178	*
1179	* For instance:
1180	* chmod_common()
1181	* notify_change()
1182	* ocfs2_setattr()
1183	* posix_acl_chmod()
1184	* ocfs2_iop_get_acl()
1185	*
1186	* But, we're not 100% sure if it's always true, because the
1187	* ordering of the VFS entry points in the call chain is out
1188	* of our control. So, we'd better dump the stack here to
1189	* catch the other cases of recursive locking.
1190	*/
1191	mlog(ML_ERROR, "Another case of recursive locking:\n");
1192	dump_stack();
1193	}
1194	inode_locked = `1`;
1195
1196	if (size_change) {
1197	status = inode_newsize_ok(inode, offset: attr->ia_size);
1198	if (status)
1199	goto bail_unlock;
1200
1201	if (i_size_read(inode) >= attr->ia_size) {
1202	if (ocfs2_should_order_data(inode)) {
1203	status = ocfs2_begin_ordered_truncate(inode,
1204	new_size: attr->ia_size);
1205	if (status)
1206	goto bail_unlock;
1207	}
1208	status = ocfs2_truncate_file(inode, di_bh: bh, new_i_size: attr->ia_size);
1209	} else
1210	status = ocfs2_extend_file(inode, di_bh: bh, new_i_size: attr->ia_size);
1211	if (status < `0`) {
1212	if (status != -ENOSPC)
1213	mlog_errno(status);
1214	status = -ENOSPC;
1215	goto bail_unlock;
1216	}
1217	}
1218
1219	if ((attr->ia_valid & ATTR_UID && !uid_eq(left: attr->ia_uid, right: inode->i_uid)) \|\|
1220	(attr->ia_valid & ATTR_GID && !gid_eq(left: attr->ia_gid, right: inode->i_gid))) {
1221	/*
1222	* Gather pointers to quota structures so that allocation /
1223	* freeing of quota structures happens here and not inside
1224	* dquot_transfer() where we have problems with lock ordering
1225	*/
1226	if (attr->ia_valid & ATTR_UID && !uid_eq(left: attr->ia_uid, right: inode->i_uid)
1227	&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1228	OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1229	transfer_to[USRQUOTA] = dqget(sb, qid: make_kqid_uid(uid: attr->ia_uid));
1230	if (IS_ERR(ptr: transfer_to[USRQUOTA])) {
1231	status = PTR_ERR(ptr: transfer_to[USRQUOTA]);
1232	transfer_to[USRQUOTA] = NULL;
1233	goto bail_unlock;
1234	}
1235	}
1236	if (attr->ia_valid & ATTR_GID && !gid_eq(left: attr->ia_gid, right: inode->i_gid)
1237	&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1238	OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1239	transfer_to[GRPQUOTA] = dqget(sb, qid: make_kqid_gid(gid: attr->ia_gid));
1240	if (IS_ERR(ptr: transfer_to[GRPQUOTA])) {
1241	status = PTR_ERR(ptr: transfer_to[GRPQUOTA]);
1242	transfer_to[GRPQUOTA] = NULL;
1243	goto bail_unlock;
1244	}
1245	}
1246	down_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
1247	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1248	`2` * ocfs2_quota_trans_credits(sb));
1249	if (IS_ERR(ptr: handle)) {
1250	status = PTR_ERR(ptr: handle);
1251	mlog_errno(status);
1252	goto bail_unlock_alloc;
1253	}
1254	status = __dquot_transfer(inode, transfer_to);
1255	if (status < `0`)
1256	goto bail_commit;
1257	} else {
1258	down_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
1259	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1260	if (IS_ERR(ptr: handle)) {
1261	status = PTR_ERR(ptr: handle);
1262	mlog_errno(status);
1263	goto bail_unlock_alloc;
1264	}
1265	}
1266
1267	setattr_copy(&nop_mnt_idmap, inode, attr);
1268	mark_inode_dirty(inode);
1269
1270	status = ocfs2_mark_inode_dirty(handle, inode, bh);
1271	if (status < `0`)
1272	mlog_errno(status);
1273
1274	bail_commit:
1275	ocfs2_commit_trans(osb, handle);
1276	bail_unlock_alloc:
1277	up_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
1278	bail_unlock:
1279	if (status && inode_locked) {
1280	ocfs2_inode_unlock_tracker(inode, ex: `1`, oh: &oh, had_lock);
1281	inode_locked = `0`;
1282	}
1283	bail_unlock_rw:
1284	if (size_change)
1285	ocfs2_rw_unlock(inode, write: `1`);
1286	bail:
1287
1288	/ Release quota pointers in case we acquired them /
1289	for (qtype = `0`; qtype < OCFS2_MAXQUOTAS; qtype++)
1290	dqput(dquot: transfer_to[qtype]);
1291
1292	if (!status && attr->ia_valid & ATTR_MODE) {
1293	status = ocfs2_acl_chmod(inode, bh);
1294	if (status < `0`)
1295	mlog_errno(status);
1296	}
1297	if (inode_locked)
1298	ocfs2_inode_unlock_tracker(inode, ex: `1`, oh: &oh, had_lock);
1299
1300	brelse(bh);
1301	return status;
1302	}
1303
1304	int ocfs2_getattr(struct mnt_idmap idmap, const* struct path *path,
1305	struct kstat stat, u32 request_mask, unsigned* int flags)
1306	{
1307	struct inode *inode = d_inode(dentry: path->dentry);
1308	struct super_block *sb = path->dentry->d_sb;
1309	struct ocfs2_super *osb = sb->s_fs_info;
1310	int err;
1311
1312	err = ocfs2_inode_revalidate(dentry: path->dentry);
1313	if (err) {
1314	if (err != -ENOENT)
1315	mlog_errno(err);
1316	goto bail;
1317	}
1318
1319	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
1320	/*
1321	* If there is inline data in the inode, the inode will normally not
1322	* have data blocks allocated (it may have an external xattr block).
1323	* Report at least one sector for such files, so tools like tar, rsync,
1324	* others don't incorrectly think the file is completely sparse.
1325	*/
1326	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1327	stat->blocks += (stat->size + `511`)>>`9`;
1328
1329	/ We set the blksize from the cluster size for performance /
1330	stat->blksize = osb->s_clustersize;
1331
1332	bail:
1333	return err;
1334	}
1335
1336	int ocfs2_permission(struct mnt_idmap idmap, struct* inode *inode,
1337	int mask)
1338	{
1339	int ret, had_lock;
1340	struct ocfs2_lock_holder oh;
1341
1342	if (mask & MAY_NOT_BLOCK)
1343	return -ECHILD;
1344
1345	had_lock = ocfs2_inode_lock_tracker(inode, NULL, ex: `0`, oh: &oh);
1346	if (had_lock < `0`) {
1347	ret = had_lock;
1348	goto out;
1349	} else if (had_lock) {
1350	/ See comments in ocfs2_setattr() for details.*
1351	* The call chain of this case could be:
1352	* do_sys_open()
1353	* may_open()
1354	* inode_permission()
1355	* ocfs2_permission()
1356	* ocfs2_iop_get_acl()
1357	*/
1358	mlog(ML_ERROR, "Another case of recursive locking:\n");
1359	dump_stack();
1360	}
1361
1362	ret = generic_permission(&nop_mnt_idmap, inode, mask);
1363
1364	ocfs2_inode_unlock_tracker(inode, ex: `0`, oh: &oh, had_lock);
1365	out:
1366	return ret;
1367	}
1368
1369	static int __ocfs2_write_remove_suid(struct inode *inode,
1370	struct buffer_head *bh)
1371	{
1372	int ret;
1373	handle_t *handle;
1374	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1375	struct ocfs2_dinode *di;
1376
1377	trace_ocfs2_write_remove_suid(
1378	val1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
1379	val2: inode->i_mode);
1380
1381	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1382	if (IS_ERR(ptr: handle)) {
1383	ret = PTR_ERR(ptr: handle);
1384	mlog_errno(ret);
1385	goto out;
1386	}
1387
1388	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh,
1389	OCFS2_JOURNAL_ACCESS_WRITE);
1390	if (ret < `0`) {
1391	mlog_errno(ret);
1392	goto out_trans;
1393	}
1394
1395	inode->i_mode &= ~S_ISUID;
1396	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1397	inode->i_mode &= ~S_ISGID;
1398
1399	di = (struct ocfs2_dinode *) bh->b_data;
1400	di->i_mode = cpu_to_le16(inode->i_mode);
1401	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `0`);
1402
1403	ocfs2_journal_dirty(handle, bh);
1404
1405	out_trans:
1406	ocfs2_commit_trans(osb, handle);
1407	out:
1408	return ret;
1409	}
1410
1411	static int ocfs2_write_remove_suid(struct inode *inode)
1412	{
1413	int ret;
1414	struct buffer_head *bh = NULL;
1415
1416	ret = ocfs2_read_inode_block(inode, bh: &bh);
1417	if (ret < `0`) {
1418	mlog_errno(ret);
1419	goto out;
1420	}
1421
1422	ret = __ocfs2_write_remove_suid(inode, bh);
1423	out:
1424	brelse(bh);
1425	return ret;
1426	}
1427
1428	/*
1429	* Allocate enough extents to cover the region starting at byte offset
1430	* start for len bytes. Existing extents are skipped, any extents
1431	* added are marked as "unwritten".
1432	*/
1433	static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1434	u64 start, u64 len)
1435	{
1436	int ret;
1437	u32 cpos, phys_cpos, clusters, alloc_size;
1438	u64 end = start + len;
1439	struct buffer_head *di_bh = NULL;
1440
1441	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1442	ret = ocfs2_read_inode_block(inode, bh: &di_bh);
1443	if (ret) {
1444	mlog_errno(ret);
1445	goto out;
1446	}
1447
1448	/*
1449	* Nothing to do if the requested reservation range
1450	* fits within the inode.
1451	*/
1452	if (ocfs2_size_fits_inline_data(di_bh, new_size: end))
1453	goto out;
1454
1455	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1456	if (ret) {
1457	mlog_errno(ret);
1458	goto out;
1459	}
1460	}
1461
1462	/*
1463	* We consider both start and len to be inclusive.
1464	*/
1465	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1466	clusters = ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: start + len);
1467	clusters -= cpos;
1468
1469	while (clusters) {
1470	ret = ocfs2_get_clusters(inode, v_cluster: cpos, p_cluster: &phys_cpos,
1471	num_clusters: &alloc_size, NULL);
1472	if (ret) {
1473	mlog_errno(ret);
1474	goto out;
1475	}
1476
1477	/*
1478	* Hole or existing extent len can be arbitrary, so
1479	* cap it to our own allocation request.
1480	*/
1481	if (alloc_size > clusters)
1482	alloc_size = clusters;
1483
1484	if (phys_cpos) {
1485	/*
1486	* We already have an allocation at this
1487	* region so we can safely skip it.
1488	*/
1489	goto next;
1490	}
1491
1492	ret = ocfs2_extend_allocation(inode, logical_start: cpos, clusters_to_add: alloc_size, mark_unwritten: `1`);
1493	if (ret) {
1494	if (ret != -ENOSPC)
1495	mlog_errno(ret);
1496	goto out;
1497	}
1498
1499	next:
1500	cpos += alloc_size;
1501	clusters -= alloc_size;
1502	}
1503
1504	ret = `0`;
1505	out:
1506
1507	brelse(bh: di_bh);
1508	return ret;
1509	}
1510
1511	/*
1512	* Truncate a byte range, avoiding pages within partial clusters. This
1513	* preserves those pages for the zeroing code to write to.
1514	*/
1515	static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1516	u64 byte_len)
1517	{
1518	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1519	loff_t start, end;
1520	struct address_space *mapping = inode->i_mapping;
1521
1522	start = (loff_t)ocfs2_align_bytes_to_clusters(sb: inode->i_sb, bytes: byte_start);
1523	end = byte_start + byte_len;
1524	end = end & ~(osb->s_clustersize - `1`);
1525
1526	if (start < end) {
1527	unmap_mapping_range(mapping, holebegin: start, holelen: end - start, even_cows: `0`);
1528	truncate_inode_pages_range(mapping, lstart: start, lend: end - `1`);
1529	}
1530	}
1531
1532	/*
1533	* zero out partial blocks of one cluster.
1534	*
1535	* start: file offset where zero starts, will be made upper block aligned.
1536	* len: it will be trimmed to the end of current cluster if "start + len"
1537	* is bigger than it.
1538	*/
1539	static int ocfs2_zeroout_partial_cluster(struct inode *inode,
1540	u64 start, u64 len)
1541	{
1542	int ret;
1543	u64 start_block, end_block, nr_blocks;
1544	u64 p_block, offset;
1545	u32 cluster, p_cluster, nr_clusters;
1546	struct super_block *sb = inode->i_sb;
1547	u64 end = ocfs2_align_bytes_to_clusters(sb, bytes: start);
1548
1549	if (start + len < end)
1550	end = start + len;
1551
1552	start_block = ocfs2_blocks_for_bytes(sb, bytes: start);
1553	end_block = ocfs2_blocks_for_bytes(sb, bytes: end);
1554	nr_blocks = end_block - start_block;
1555	if (!nr_blocks)
1556	return `0`;
1557
1558	cluster = ocfs2_bytes_to_clusters(sb, bytes: start);
1559	ret = ocfs2_get_clusters(inode, v_cluster: cluster, p_cluster: &p_cluster,
1560	num_clusters: &nr_clusters, NULL);
1561	if (ret)
1562	return ret;
1563	if (!p_cluster)
1564	return `0`;
1565
1566	offset = start_block - ocfs2_clusters_to_blocks(sb, clusters: cluster);
1567	p_block = ocfs2_clusters_to_blocks(sb, clusters: p_cluster) + offset;
1568	return sb_issue_zeroout(sb, block: p_block, nr_blocks, GFP_NOFS);
1569	}
1570
1571	static int ocfs2_zero_partial_clusters(struct inode *inode,
1572	u64 start, u64 len)
1573	{
1574	int ret = `0`;
1575	u64 tmpend = `0`;
1576	u64 end = start + len;
1577	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578	unsigned int csize = osb->s_clustersize;
1579	handle_t *handle;
1580	loff_t isize = i_size_read(inode);
1581
1582	/*
1583	* The "start" and "end" values are NOT necessarily part of
1584	* the range whose allocation is being deleted. Rather, this
1585	* is what the user passed in with the request. We must zero
1586	* partial clusters here. There's no need to worry about
1587	* physical allocation - the zeroing code knows to skip holes.
1588	*/
1589	trace_ocfs2_zero_partial_clusters(
1590	value1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
1591	value2: (unsigned long long)start, value3: (unsigned long long)end);
1592
1593	/*
1594	* If both edges are on a cluster boundary then there's no
1595	* zeroing required as the region is part of the allocation to
1596	* be truncated.
1597	*/
1598	if ((start & (csize - `1`)) == `0` && (end & (csize - `1`)) == `0`)
1599	goto out;
1600
1601	/ No page cache for EOF blocks, issue zero out to disk. /
1602	if (end > isize) {
1603	/*
1604	* zeroout eof blocks in last cluster starting from
1605	* "isize" even "start" > "isize" because it is
1606	* complicated to zeroout just at "start" as "start"
1607	* may be not aligned with block size, buffer write
1608	* would be required to do that, but out of eof buffer
1609	* write is not supported.
1610	*/
1611	ret = ocfs2_zeroout_partial_cluster(inode, start: isize,
1612	len: end - isize);
1613	if (ret) {
1614	mlog_errno(ret);
1615	goto out;
1616	}
1617	if (start >= isize)
1618	goto out;
1619	end = isize;
1620	}
1621	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1622	if (IS_ERR(ptr: handle)) {
1623	ret = PTR_ERR(ptr: handle);
1624	mlog_errno(ret);
1625	goto out;
1626	}
1627
1628	/*
1629	* If start is on a cluster boundary and end is somewhere in another
1630	* cluster, we have not COWed the cluster starting at start, unless
1631	* end is also within the same cluster. So, in this case, we skip this
1632	* first call to ocfs2_zero_range_for_truncate() truncate and move on
1633	* to the next one.
1634	*/
1635	if ((start & (csize - `1`)) != `0`) {
1636	/*
1637	* We want to get the byte offset of the end of the 1st
1638	* cluster.
1639	*/
1640	tmpend = (u64)osb->s_clustersize +
1641	(start & ~(osb->s_clustersize - `1`));
1642	if (tmpend > end)
1643	tmpend = end;
1644
1645	trace_ocfs2_zero_partial_clusters_range1(
1646	val1: (unsigned long long)start,
1647	val2: (unsigned long long)tmpend);
1648
1649	ret = ocfs2_zero_range_for_truncate(inode, handle, range_start: start,
1650	range_end: tmpend);
1651	if (ret)
1652	mlog_errno(ret);
1653	}
1654
1655	if (tmpend < end) {
1656	/*
1657	* This may make start and end equal, but the zeroing
1658	* code will skip any work in that case so there's no
1659	* need to catch it up here.
1660	*/
1661	start = end & ~(osb->s_clustersize - `1`);
1662
1663	trace_ocfs2_zero_partial_clusters_range2(
1664	val1: (unsigned long long)start, val2: (unsigned long long)end);
1665
1666	ret = ocfs2_zero_range_for_truncate(inode, handle, range_start: start, range_end: end);
1667	if (ret)
1668	mlog_errno(ret);
1669	}
1670	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `1`);
1671
1672	ocfs2_commit_trans(osb, handle);
1673	out:
1674	return ret;
1675	}
1676
1677	static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1678	{
1679	int i;
1680	struct ocfs2_extent_rec *rec = NULL;
1681
1682	for (i = le16_to_cpu(el->l_next_free_rec) - `1`; i >= `0`; i--) {
1683
1684	rec = &el->l_recs[i];
1685
1686	if (le32_to_cpu(rec->e_cpos) < pos)
1687	break;
1688	}
1689
1690	return i;
1691	}
1692
1693	/*
1694	* Helper to calculate the punching pos and length in one run, we handle the
1695	* following three cases in order:
1696	*
1697	* - remove the entire record
1698	* - remove a partial record
1699	* - no record needs to be removed (hole-punching completed)
1700	*/
1701	static void ocfs2_calc_trunc_pos(struct inode *inode,
1702	struct ocfs2_extent_list *el,
1703	struct ocfs2_extent_rec *rec,
1704	u32 trunc_start, u32 *trunc_cpos,
1705	u32 trunc_len, u32 trunc_end,
1706	u64 blkno, int* *done)
1707	{
1708	int ret = `0`;
1709	u32 coff, range;
1710
1711	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1712
1713	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1714	/*
1715	* remove an entire extent record.
1716	*/
1717	*trunc_cpos = le32_to_cpu(rec->e_cpos);
1718	/*
1719	* Skip holes if any.
1720	*/
1721	if (range < *trunc_end)
1722	*trunc_end = range;
1723	trunc_len = trunc_end - le32_to_cpu(rec->e_cpos);
1724	*blkno = le64_to_cpu(rec->e_blkno);
1725	*trunc_end = le32_to_cpu(rec->e_cpos);
1726	} else if (range > trunc_start) {
1727	/*
1728	* remove a partial extent record, which means we're
1729	* removing the last extent record.
1730	*/
1731	*trunc_cpos = trunc_start;
1732	/*
1733	* skip hole if any.
1734	*/
1735	if (range < *trunc_end)
1736	*trunc_end = range;
1737	trunc_len = trunc_end - trunc_start;
1738	coff = trunc_start - le32_to_cpu(rec->e_cpos);
1739	*blkno = le64_to_cpu(rec->e_blkno) +
1740	ocfs2_clusters_to_blocks(sb: inode->i_sb, clusters: coff);
1741	*trunc_end = trunc_start;
1742	} else {
1743	/*
1744	* It may have two following possibilities:
1745	*
1746	* - last record has been removed
1747	* - trunc_start was within a hole
1748	*
1749	* both two cases mean the completion of hole punching.
1750	*/
1751	ret = `1`;
1752	}
1753
1754	*done = ret;
1755	}
1756
1757	int ocfs2_remove_inode_range(struct inode *inode,
1758	struct buffer_head *di_bh, u64 byte_start,
1759	u64 byte_len)
1760	{
1761	int ret = `0`, flags = `0`, done = `0`, i;
1762	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1763	u32 cluster_in_el;
1764	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1765	struct ocfs2_cached_dealloc_ctxt dealloc;
1766	struct address_space *mapping = inode->i_mapping;
1767	struct ocfs2_extent_tree et;
1768	struct ocfs2_path *path = NULL;
1769	struct ocfs2_extent_list *el = NULL;
1770	struct ocfs2_extent_rec *rec = NULL;
1771	struct ocfs2_dinode di = (struct* ocfs2_dinode *)di_bh->b_data;
1772	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1773
1774	ocfs2_init_dinode_extent_tree(et: &et, ci: INODE_CACHE(inode), bh: di_bh);
1775	ocfs2_init_dealloc_ctxt(c: &dealloc);
1776
1777	trace_ocfs2_remove_inode_range(
1778	value1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
1779	value2: (unsigned long long)byte_start,
1780	value3: (unsigned long long)byte_len);
1781
1782	if (byte_len == `0`)
1783	return `0`;
1784
1785	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1786	ret = ocfs2_truncate_inline(inode, di_bh, start: byte_start,
1787	end: byte_start + byte_len, trunc: `0`);
1788	if (ret) {
1789	mlog_errno(ret);
1790	goto out;
1791	}
1792	/*
1793	* There's no need to get fancy with the page cache
1794	* truncate of an inline-data inode. We're talking
1795	* about less than a page here, which will be cached
1796	* in the dinode buffer anyway.
1797	*/
1798	unmap_mapping_range(mapping, holebegin: `0`, holelen: `0`, even_cows: `0`);
1799	truncate_inode_pages(mapping, `0`);
1800	goto out;
1801	}
1802
1803	/*
1804	* For reflinks, we may need to CoW 2 clusters which might be
1805	* partially zero'd later, if hole's start and end offset were
1806	* within one cluster(means is not exactly aligned to clustersize).
1807	*/
1808
1809	if (ocfs2_is_refcount_inode(inode)) {
1810	ret = ocfs2_cow_file_pos(inode, fe_bh: di_bh, offset: byte_start);
1811	if (ret) {
1812	mlog_errno(ret);
1813	goto out;
1814	}
1815
1816	ret = ocfs2_cow_file_pos(inode, fe_bh: di_bh, offset: byte_start + byte_len);
1817	if (ret) {
1818	mlog_errno(ret);
1819	goto out;
1820	}
1821	}
1822
1823	trunc_start = ocfs2_clusters_for_bytes(sb: osb->sb, bytes: byte_start);
1824	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1825	cluster_in_el = trunc_end;
1826
1827	ret = ocfs2_zero_partial_clusters(inode, start: byte_start, len: byte_len);
1828	if (ret) {
1829	mlog_errno(ret);
1830	goto out;
1831	}
1832
1833	path = ocfs2_new_path_from_et(et: &et);
1834	if (!path) {
1835	ret = -ENOMEM;
1836	mlog_errno(ret);
1837	goto out;
1838	}
1839
1840	while (trunc_end > trunc_start) {
1841
1842	ret = ocfs2_find_path(ci: INODE_CACHE(inode), path,
1843	cpos: cluster_in_el);
1844	if (ret) {
1845	mlog_errno(ret);
1846	goto out;
1847	}
1848
1849	el = path_leaf_el(path);
1850
1851	i = ocfs2_find_rec(el, pos: trunc_end);
1852	/*
1853	* Need to go to previous extent block.
1854	*/
1855	if (i < `0`) {
1856	if (path->p_tree_depth == `0`)
1857	break;
1858
1859	ret = ocfs2_find_cpos_for_left_leaf(sb: inode->i_sb,
1860	path,
1861	cpos: &cluster_in_el);
1862	if (ret) {
1863	mlog_errno(ret);
1864	goto out;
1865	}
1866
1867	/*
1868	* We've reached the leftmost extent block,
1869	* it's safe to leave.
1870	*/
1871	if (cluster_in_el == `0`)
1872	break;
1873
1874	/*
1875	* The 'pos' searched for previous extent block is
1876	* always one cluster less than actual trunc_end.
1877	*/
1878	trunc_end = cluster_in_el + `1`;
1879
1880	ocfs2_reinit_path(path, keep_root: `1`);
1881
1882	continue;
1883
1884	} else
1885	rec = &el->l_recs[i];
1886
1887	ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, trunc_cpos: &trunc_cpos,
1888	trunc_len: &trunc_len, trunc_end: &trunc_end, blkno: &blkno, done: &done);
1889	if (done)
1890	break;
1891
1892	flags = rec->e_flags;
1893	phys_cpos = ocfs2_blocks_to_clusters(sb: inode->i_sb, blocks: blkno);
1894
1895	ret = ocfs2_remove_btree_range(inode, et: &et, cpos: trunc_cpos,
1896	phys_cpos, len: trunc_len, flags,
1897	dealloc: &dealloc, refcount_loc, refcount_tree_locked: false);
1898	if (ret < `0`) {
1899	mlog_errno(ret);
1900	goto out;
1901	}
1902
1903	cluster_in_el = trunc_end;
1904
1905	ocfs2_reinit_path(path, keep_root: `1`);
1906	}
1907
1908	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1909
1910	out:
1911	ocfs2_free_path(path);
1912	ocfs2_schedule_truncate_log_flush(osb, cancel: `1`);
1913	ocfs2_run_deallocs(osb, ctxt: &dealloc);
1914
1915	return ret;
1916	}
1917
1918	/*
1919	* Parts of this function taken from xfs_change_file_space()
1920	*/
1921	static int __ocfs2_change_file_space(struct file file, struct* inode *inode,
1922	loff_t f_pos, unsigned int cmd,
1923	struct ocfs2_space_resv *sr,
1924	int change_size)
1925	{
1926	int ret;
1927	s64 llen;
1928	loff_t size, orig_isize;
1929	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1930	struct buffer_head *di_bh = NULL;
1931	handle_t *handle;
1932	unsigned long long max_off = inode->i_sb->s_maxbytes;
1933
1934	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
1935	return -EROFS;
1936
1937	inode_lock(inode);
1938
1939	/*
1940	* This prevents concurrent writes on other nodes
1941	*/
1942	ret = ocfs2_rw_lock(inode, write: `1`);
1943	if (ret) {
1944	mlog_errno(ret);
1945	goto out;
1946	}
1947
1948	ret = ocfs2_inode_lock(inode, &di_bh, `1`);
1949	if (ret) {
1950	mlog_errno(ret);
1951	goto out_rw_unlock;
1952	}
1953
1954	if (inode->i_flags & (S_IMMUTABLE\|S_APPEND)) {
1955	ret = -EPERM;
1956	goto out_inode_unlock;
1957	}
1958
1959	switch (sr->l_whence) {
1960	case `0`: /SEEK_SET/
1961	break;
1962	case `1`: /SEEK_CUR/
1963	sr->l_start += f_pos;
1964	break;
1965	case `2`: /SEEK_END/
1966	sr->l_start += i_size_read(inode);
1967	break;
1968	default:
1969	ret = -EINVAL;
1970	goto out_inode_unlock;
1971	}
1972	sr->l_whence = `0`;
1973
1974	llen = sr->l_len > `0` ? sr->l_len - `1` : sr->l_len;
1975
1976	if (sr->l_start < `0`
1977	\|\| sr->l_start > max_off
1978	\|\| (sr->l_start + llen) < `0`
1979	\|\| (sr->l_start + llen) > max_off) {
1980	ret = -EINVAL;
1981	goto out_inode_unlock;
1982	}
1983	size = sr->l_start + sr->l_len;
1984
1985	if (cmd == OCFS2_IOC_RESVSP \|\| cmd == OCFS2_IOC_RESVSP64 \|\|
1986	cmd == OCFS2_IOC_UNRESVSP \|\| cmd == OCFS2_IOC_UNRESVSP64) {
1987	if (sr->l_len <= `0`) {
1988	ret = -EINVAL;
1989	goto out_inode_unlock;
1990	}
1991	}
1992
1993	if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(f: file))) {
1994	ret = __ocfs2_write_remove_suid(inode, bh: di_bh);
1995	if (ret) {
1996	mlog_errno(ret);
1997	goto out_inode_unlock;
1998	}
1999	}
2000
2001	down_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
2002	switch (cmd) {
2003	case OCFS2_IOC_RESVSP:
2004	case OCFS2_IOC_RESVSP64:
2005	/*
2006	* This takes unsigned offsets, but the signed ones we
2007	* pass have been checked against overflow above.
2008	*/
2009	ret = ocfs2_allocate_unwritten_extents(inode, start: sr->l_start,
2010	len: sr->l_len);
2011	break;
2012	case OCFS2_IOC_UNRESVSP:
2013	case OCFS2_IOC_UNRESVSP64:
2014	ret = ocfs2_remove_inode_range(inode, di_bh, byte_start: sr->l_start,
2015	byte_len: sr->l_len);
2016	break;
2017	default:
2018	ret = -EINVAL;
2019	}
2020
2021	orig_isize = i_size_read(inode);
2022	/ zeroout eof blocks in the cluster. /
2023	if (!ret && change_size && orig_isize < size) {
2024	ret = ocfs2_zeroout_partial_cluster(inode, start: orig_isize,
2025	len: size - orig_isize);
2026	if (!ret)
2027	i_size_write(inode, i_size: size);
2028	}
2029	up_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
2030	if (ret) {
2031	mlog_errno(ret);
2032	goto out_inode_unlock;
2033	}
2034
2035	/*
2036	* We update c/mtime for these changes
2037	*/
2038	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
2039	if (IS_ERR(ptr: handle)) {
2040	ret = PTR_ERR(ptr: handle);
2041	mlog_errno(ret);
2042	goto out_inode_unlock;
2043	}
2044
2045	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
2046	ret = ocfs2_mark_inode_dirty(handle, inode, bh: di_bh);
2047	if (ret < `0`)
2048	mlog_errno(ret);
2049
2050	if (file && (file->f_flags & O_SYNC))
2051	handle->h_sync = `1`;
2052
2053	ocfs2_commit_trans(osb, handle);
2054
2055	out_inode_unlock:
2056	brelse(bh: di_bh);
2057	ocfs2_inode_unlock(inode, ex: `1`);
2058	out_rw_unlock:
2059	ocfs2_rw_unlock(inode, write: `1`);
2060
2061	out:
2062	inode_unlock(inode);
2063	return ret;
2064	}
2065
2066	int ocfs2_change_file_space(struct file file, unsigned* int cmd,
2067	struct ocfs2_space_resv *sr)
2068	{
2069	struct inode *inode = file_inode(f: file);
2070	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2071	int ret;
2072
2073	if ((cmd == OCFS2_IOC_RESVSP \|\| cmd == OCFS2_IOC_RESVSP64) &&
2074	!ocfs2_writes_unwritten_extents(osb))
2075	return -ENOTTY;
2076	else if ((cmd == OCFS2_IOC_UNRESVSP \|\| cmd == OCFS2_IOC_UNRESVSP64) &&
2077	!ocfs2_sparse_alloc(osb))
2078	return -ENOTTY;
2079
2080	if (!S_ISREG(inode->i_mode))
2081	return -EINVAL;
2082
2083	if (!(file->f_mode & FMODE_WRITE))
2084	return -EBADF;
2085
2086	ret = mnt_want_write_file(file);
2087	if (ret)
2088	return ret;
2089	ret = __ocfs2_change_file_space(file, inode, f_pos: file->f_pos, cmd, sr, change_size: `0`);
2090	mnt_drop_write_file(file);
2091	return ret;
2092	}
2093
2094	static long ocfs2_fallocate(struct file file, int* mode, loff_t offset,
2095	loff_t len)
2096	{
2097	struct inode *inode = file_inode(f: file);
2098	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2099	struct ocfs2_space_resv sr;
2100	int change_size = `1`;
2101	int cmd = OCFS2_IOC_RESVSP64;
2102	int ret = `0`;
2103
2104	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
2105	return -EOPNOTSUPP;
2106	if (!ocfs2_writes_unwritten_extents(osb))
2107	return -EOPNOTSUPP;
2108
2109	if (mode & FALLOC_FL_KEEP_SIZE) {
2110	change_size = `0`;
2111	} else {
2112	ret = inode_newsize_ok(inode, offset: offset + len);
2113	if (ret)
2114	return ret;
2115	}
2116
2117	if (mode & FALLOC_FL_PUNCH_HOLE)
2118	cmd = OCFS2_IOC_UNRESVSP64;
2119
2120	sr.l_whence = `0`;
2121	sr.l_start = (s64)offset;
2122	sr.l_len = (s64)len;
2123
2124	return __ocfs2_change_file_space(NULL, inode, f_pos: offset, cmd, sr: &sr,
2125	change_size);
2126	}
2127
2128	int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2129	size_t count)
2130	{
2131	int ret = `0`;
2132	unsigned int extent_flags;
2133	u32 cpos, clusters, extent_len, phys_cpos;
2134	struct super_block *sb = inode->i_sb;
2135
2136	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) \|\|
2137	!ocfs2_is_refcount_inode(inode) \|\|
2138	OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2139	return `0`;
2140
2141	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2142	clusters = ocfs2_clusters_for_bytes(sb, bytes: pos + count) - cpos;
2143
2144	while (clusters) {
2145	ret = ocfs2_get_clusters(inode, v_cluster: cpos, p_cluster: &phys_cpos, num_clusters: &extent_len,
2146	extent_flags: &extent_flags);
2147	if (ret < `0`) {
2148	mlog_errno(ret);
2149	goto out;
2150	}
2151
2152	if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2153	ret = `1`;
2154	break;
2155	}
2156
2157	if (extent_len > clusters)
2158	extent_len = clusters;
2159
2160	clusters -= extent_len;
2161	cpos += extent_len;
2162	}
2163	out:
2164	return ret;
2165	}
2166
2167	static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2168	{
2169	int blockmask = inode->i_sb->s_blocksize - `1`;
2170	loff_t final_size = pos + count;
2171
2172	if ((pos & blockmask) \|\| (final_size & blockmask))
2173	return `1`;
2174	return `0`;
2175	}
2176
2177	static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2178	struct buffer_head **di_bh,
2179	int meta_level,
2180	int write_sem,
2181	int wait)
2182	{
2183	int ret = `0`;
2184
2185	if (wait)
2186	ret = ocfs2_inode_lock(inode, di_bh, meta_level);
2187	else
2188	ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
2189	if (ret < `0`)
2190	goto out;
2191
2192	if (wait) {
2193	if (write_sem)
2194	down_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
2195	else
2196	down_read(sem: &OCFS2_I(inode)->ip_alloc_sem);
2197	} else {
2198	if (write_sem)
2199	ret = down_write_trylock(sem: &OCFS2_I(inode)->ip_alloc_sem);
2200	else
2201	ret = down_read_trylock(sem: &OCFS2_I(inode)->ip_alloc_sem);
2202
2203	if (!ret) {
2204	ret = -EAGAIN;
2205	goto out_unlock;
2206	}
2207	}
2208
2209	return ret;
2210
2211	out_unlock:
2212	brelse(bh: *di_bh);
2213	*di_bh = NULL;
2214	ocfs2_inode_unlock(inode, ex: meta_level);
2215	out:
2216	return ret;
2217	}
2218
2219	static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2220	struct buffer_head **di_bh,
2221	int meta_level,
2222	int write_sem)
2223	{
2224	if (write_sem)
2225	up_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
2226	else
2227	up_read(sem: &OCFS2_I(inode)->ip_alloc_sem);
2228
2229	brelse(bh: *di_bh);
2230	*di_bh = NULL;
2231
2232	if (meta_level >= `0`)
2233	ocfs2_inode_unlock(inode, ex: meta_level);
2234	}
2235
2236	static int ocfs2_prepare_inode_for_write(struct file *file,
2237	loff_t pos, size_t count, int wait)
2238	{
2239	int ret = `0`, meta_level = `0`, overwrite_io = `0`;
2240	int write_sem = `0`;
2241	struct dentry *dentry = file->f_path.dentry;
2242	struct inode *inode = d_inode(dentry);
2243	struct buffer_head *di_bh = NULL;
2244	u32 cpos;
2245	u32 clusters;
2246
2247	/*
2248	* We start with a read level meta lock and only jump to an ex
2249	* if we need to make modifications here.
2250	*/
2251	for(;;) {
2252	ret = ocfs2_inode_lock_for_extent_tree(inode,
2253	di_bh: &di_bh,
2254	meta_level,
2255	write_sem,
2256	wait);
2257	if (ret < `0`) {
2258	if (ret != -EAGAIN)
2259	mlog_errno(ret);
2260	goto out;
2261	}
2262
2263	/*
2264	* Check if IO will overwrite allocated blocks in case
2265	* IOCB_NOWAIT flag is set.
2266	*/
2267	if (!wait && !overwrite_io) {
2268	overwrite_io = `1`;
2269
2270	ret = ocfs2_overwrite_io(inode, di_bh, map_start: pos, map_len: count);
2271	if (ret < `0`) {
2272	if (ret != -EAGAIN)
2273	mlog_errno(ret);
2274	goto out_unlock;
2275	}
2276	}
2277
2278	/ Clear suid / sgid if necessary. We do this here*
2279	* instead of later in the write path because
2280	* remove_suid() calls ->setattr without any hint that
2281	* we may have already done our cluster locking. Since
2282	* ocfs2_setattr() must take cluster locks to
2283	* proceed, this will lead us to recursively lock the
2284	* inode. There's also the dinode i_size state which
2285	* can be lost via setattr during extending writes (we
2286	* set inode->i_size at the end of a write. */
2287	if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) {
2288	if (meta_level == `0`) {
2289	ocfs2_inode_unlock_for_extent_tree(inode,
2290	di_bh: &di_bh,
2291	meta_level,
2292	write_sem);
2293	meta_level = `1`;
2294	continue;
2295	}
2296
2297	ret = ocfs2_write_remove_suid(inode);
2298	if (ret < `0`) {
2299	mlog_errno(ret);
2300	goto out_unlock;
2301	}
2302	}
2303
2304	ret = ocfs2_check_range_for_refcount(inode, pos, count);
2305	if (ret == `1`) {
2306	ocfs2_inode_unlock_for_extent_tree(inode,
2307	di_bh: &di_bh,
2308	meta_level,
2309	write_sem);
2310	meta_level = `1`;
2311	write_sem = `1`;
2312	ret = ocfs2_inode_lock_for_extent_tree(inode,
2313	di_bh: &di_bh,
2314	meta_level,
2315	write_sem,
2316	wait);
2317	if (ret < `0`) {
2318	if (ret != -EAGAIN)
2319	mlog_errno(ret);
2320	goto out;
2321	}
2322
2323	cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2324	clusters =
2325	ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: pos + count) - cpos;
2326	ret = ocfs2_refcount_cow(inode, di_bh, cpos, write_len: clusters, UINT_MAX);
2327	}
2328
2329	if (ret < `0`) {
2330	if (ret != -EAGAIN)
2331	mlog_errno(ret);
2332	goto out_unlock;
2333	}
2334
2335	break;
2336	}
2337
2338	out_unlock:
2339	trace_ocfs2_prepare_inode_for_write(ino: OCFS2_I(inode)->ip_blkno,
2340	saved_pos: pos, count, wait);
2341
2342	ocfs2_inode_unlock_for_extent_tree(inode,
2343	di_bh: &di_bh,
2344	meta_level,
2345	write_sem);
2346
2347	out:
2348	return ret;
2349	}
2350
2351	static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2352	struct iov_iter *from)
2353	{
2354	int rw_level;
2355	ssize_t written = `0`;
2356	ssize_t ret;
2357	size_t count = iov_iter_count(i: from);
2358	struct file *file = iocb->ki_filp;
2359	struct inode *inode = file_inode(f: file);
2360	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2361	int full_coherency = !(osb->s_mount_opt &
2362	OCFS2_MOUNT_COHERENCY_BUFFERED);
2363	void *saved_ki_complete = NULL;
2364	int append_write = ((iocb->ki_pos + count) >=
2365	i_size_read(inode) ? `1` : `0`);
2366	int direct_io = iocb->ki_flags & IOCB_DIRECT ? `1` : `0`;
2367	int nowait = iocb->ki_flags & IOCB_NOWAIT ? `1` : `0`;
2368
2369	trace_ocfs2_file_write_iter(inode, file, dentry: file->f_path.dentry,
2370	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
2371	d_len: file->f_path.dentry->d_name.len,
2372	d_name: file->f_path.dentry->d_name.name,
2373	mode: (unsigned int)from->nr_segs); / GRRRRR /
2374
2375	if (!direct_io && nowait)
2376	return -EOPNOTSUPP;
2377
2378	if (count == `0`)
2379	return `0`;
2380
2381	if (nowait) {
2382	if (!inode_trylock(inode))
2383	return -EAGAIN;
2384	} else
2385	inode_lock(inode);
2386
2387	/*
2388	* Concurrent O_DIRECT writes are allowed with
2389	* mount_option "coherency=buffered".
2390	* For append write, we must take rw EX.
2391	*/
2392	rw_level = (!direct_io \|\| full_coherency \|\| append_write);
2393
2394	if (nowait)
2395	ret = ocfs2_try_rw_lock(inode, write: rw_level);
2396	else
2397	ret = ocfs2_rw_lock(inode, write: rw_level);
2398	if (ret < `0`) {
2399	if (ret != -EAGAIN)
2400	mlog_errno(ret);
2401	goto out_mutex;
2402	}
2403
2404	/*
2405	* O_DIRECT writes with "coherency=full" need to take EX cluster
2406	* inode_lock to guarantee coherency.
2407	*/
2408	if (direct_io && full_coherency) {
2409	/*
2410	* We need to take and drop the inode lock to force
2411	* other nodes to drop their caches. Buffered I/O
2412	* already does this in write_begin().
2413	*/
2414	if (nowait)
2415	ret = ocfs2_try_inode_lock(inode, NULL, `1`);
2416	else
2417	ret = ocfs2_inode_lock(inode, NULL, `1`);
2418	if (ret < `0`) {
2419	if (ret != -EAGAIN)
2420	mlog_errno(ret);
2421	goto out;
2422	}
2423
2424	ocfs2_inode_unlock(inode, ex: `1`);
2425	}
2426
2427	ret = generic_write_checks(iocb, from);
2428	if (ret <= `0`) {
2429	if (ret)
2430	mlog_errno(ret);
2431	goto out;
2432	}
2433	count = ret;
2434
2435	ret = ocfs2_prepare_inode_for_write(file, pos: iocb->ki_pos, count, wait: !nowait);
2436	if (ret < `0`) {
2437	if (ret != -EAGAIN)
2438	mlog_errno(ret);
2439	goto out;
2440	}
2441
2442	if (direct_io && !is_sync_kiocb(kiocb: iocb) &&
2443	ocfs2_is_io_unaligned(inode, count, pos: iocb->ki_pos)) {
2444	/*
2445	* Make it a sync io if it's an unaligned aio.
2446	*/
2447	saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2448	}
2449
2450	/ communicate with ocfs2_dio_end_io /
2451	ocfs2_iocb_set_rw_locked(iocb, level: rw_level);
2452
2453	written = __generic_file_write_iter(iocb, from);
2454	/ buffered aio wouldn't have proper lock coverage today /
2455	BUG_ON(written == -EIOCBQUEUED && !direct_io);
2456
2457	/*
2458	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2459	* function pointer which is called when o_direct io completes so that
2460	* it can unlock our rw lock.
2461	* Unfortunately there are error cases which call end_io and others
2462	* that don't. so we don't have to unlock the rw_lock if either an
2463	* async dio is going to do it in the future or an end_io after an
2464	* error has already done it.
2465	*/
2466	if ((written == -EIOCBQUEUED) \|\| (!ocfs2_iocb_is_rw_locked(iocb))) {
2467	rw_level = -`1`;
2468	}
2469
2470	if (unlikely(written <= `0`))
2471	goto out;
2472
2473	if (((file->f_flags & O_DSYNC) && !direct_io) \|\|
2474	IS_SYNC(inode)) {
2475	ret = filemap_fdatawrite_range(mapping: file->f_mapping,
2476	start: iocb->ki_pos - written,
2477	end: iocb->ki_pos - `1`);
2478	if (ret < `0`)
2479	written = ret;
2480
2481	if (!ret) {
2482	ret = jbd2_journal_force_commit(osb->journal->j_journal);
2483	if (ret < `0`)
2484	written = ret;
2485	}
2486
2487	if (!ret)
2488	ret = filemap_fdatawait_range(file->f_mapping,
2489	lstart: iocb->ki_pos - written,
2490	lend: iocb->ki_pos - `1`);
2491	}
2492
2493	out:
2494	if (saved_ki_complete)
2495	xchg(&iocb->ki_complete, saved_ki_complete);
2496
2497	if (rw_level != -`1`)
2498	ocfs2_rw_unlock(inode, write: rw_level);
2499
2500	out_mutex:
2501	inode_unlock(inode);
2502
2503	if (written)
2504	ret = written;
2505	return ret;
2506	}
2507
2508	static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2509	struct iov_iter *to)
2510	{
2511	int ret = `0`, rw_level = -`1`, lock_level = `0`;
2512	struct file *filp = iocb->ki_filp;
2513	struct inode *inode = file_inode(f: filp);
2514	int direct_io = iocb->ki_flags & IOCB_DIRECT ? `1` : `0`;
2515	int nowait = iocb->ki_flags & IOCB_NOWAIT ? `1` : `0`;
2516
2517	trace_ocfs2_file_read_iter(inode, file: filp, dentry: filp->f_path.dentry,
2518	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
2519	d_len: filp->f_path.dentry->d_name.len,
2520	d_name: filp->f_path.dentry->d_name.name,
2521	mode: to->nr_segs); / GRRRRR /
2522
2523
2524	if (!inode) {
2525	ret = -EINVAL;
2526	mlog_errno(ret);
2527	goto bail;
2528	}
2529
2530	if (!direct_io && nowait)
2531	return -EOPNOTSUPP;
2532
2533	/*
2534	* buffered reads protect themselves in ->read_folio(). O_DIRECT reads
2535	* need locks to protect pending reads from racing with truncate.
2536	*/
2537	if (direct_io) {
2538	if (nowait)
2539	ret = ocfs2_try_rw_lock(inode, write: `0`);
2540	else
2541	ret = ocfs2_rw_lock(inode, write: `0`);
2542
2543	if (ret < `0`) {
2544	if (ret != -EAGAIN)
2545	mlog_errno(ret);
2546	goto bail;
2547	}
2548	rw_level = `0`;
2549	/ communicate with ocfs2_dio_end_io /
2550	ocfs2_iocb_set_rw_locked(iocb, level: rw_level);
2551	}
2552
2553	/*
2554	* We're fine letting folks race truncates and extending
2555	* writes with read across the cluster, just like they can
2556	* locally. Hence no rw_lock during read.
2557	*
2558	* Take and drop the meta data lock to update inode fields
2559	* like i_size. This allows the checks down below
2560	* copy_splice_read() a chance of actually working.
2561	*/
2562	ret = ocfs2_inode_lock_atime(inode, vfsmnt: filp->f_path.mnt, level: &lock_level,
2563	wait: !nowait);
2564	if (ret < `0`) {
2565	if (ret != -EAGAIN)
2566	mlog_errno(ret);
2567	goto bail;
2568	}
2569	ocfs2_inode_unlock(inode, ex: lock_level);
2570
2571	ret = generic_file_read_iter(iocb, to);
2572	trace_generic_file_read_iter_ret(num: ret);
2573
2574	/ buffered aio wouldn't have proper lock coverage today /
2575	BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2576
2577	/ see ocfs2_file_write_iter /
2578	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
2579	rw_level = -`1`;
2580	}
2581
2582	bail:
2583	if (rw_level != -`1`)
2584	ocfs2_rw_unlock(inode, write: rw_level);
2585
2586	return ret;
2587	}
2588
2589	static ssize_t ocfs2_file_splice_read(struct file in, loff_t ppos,
2590	struct pipe_inode_info *pipe,
2591	size_t len, unsigned int flags)
2592	{
2593	struct inode *inode = file_inode(f: in);
2594	ssize_t ret = `0`;
2595	int lock_level = `0`;
2596
2597	trace_ocfs2_file_splice_read(inode, file: in, dentry: in->f_path.dentry,
2598	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
2599	d_len: in->f_path.dentry->d_name.len,
2600	d_name: in->f_path.dentry->d_name.name,
2601	mode: flags);
2602
2603	/*
2604	* We're fine letting folks race truncates and extending writes with
2605	* read across the cluster, just like they can locally. Hence no
2606	* rw_lock during read.
2607	*
2608	* Take and drop the meta data lock to update inode fields like i_size.
2609	* This allows the checks down below filemap_splice_read() a chance of
2610	* actually working.
2611	*/
2612	ret = ocfs2_inode_lock_atime(inode, vfsmnt: in->f_path.mnt, level: &lock_level, wait: `1`);
2613	if (ret < `0`) {
2614	if (ret != -EAGAIN)
2615	mlog_errno(ret);
2616	goto bail;
2617	}
2618	ocfs2_inode_unlock(inode, ex: lock_level);
2619
2620	ret = filemap_splice_read(in, ppos, pipe, len, flags);
2621	trace_filemap_splice_read_ret(num: ret);
2622	bail:
2623	return ret;
2624	}
2625
2626	/ Refer generic_file_llseek_unlocked() /
2627	static loff_t ocfs2_file_llseek(struct file file, loff_t offset, int* whence)
2628	{
2629	struct inode *inode = file->f_mapping->host;
2630	int ret = `0`;
2631
2632	inode_lock(inode);
2633
2634	switch (whence) {
2635	case SEEK_SET:
2636	break;
2637	case SEEK_END:
2638	/ SEEK_END requires the OCFS2 inode lock for the file*
2639	* because it references the file's size.
2640	*/
2641	ret = ocfs2_inode_lock(inode, NULL, `0`);
2642	if (ret < `0`) {
2643	mlog_errno(ret);
2644	goto out;
2645	}
2646	offset += i_size_read(inode);
2647	ocfs2_inode_unlock(inode, ex: `0`);
2648	break;
2649	case SEEK_CUR:
2650	if (offset == `0`) {
2651	offset = file->f_pos;
2652	goto out;
2653	}
2654	offset += file->f_pos;
2655	break;
2656	case SEEK_DATA:
2657	case SEEK_HOLE:
2658	ret = ocfs2_seek_data_hole_offset(file, offset: &offset, origin: whence);
2659	if (ret)
2660	goto out;
2661	break;
2662	default:
2663	ret = -EINVAL;
2664	goto out;
2665	}
2666
2667	offset = vfs_setpos(file, offset, maxsize: inode->i_sb->s_maxbytes);
2668
2669	out:
2670	inode_unlock(inode);
2671	if (ret)
2672	return ret;
2673	return offset;
2674	}
2675
2676	static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
2677	struct file *file_out, loff_t pos_out,
2678	loff_t len, unsigned int remap_flags)
2679	{
2680	struct inode *inode_in = file_inode(f: file_in);
2681	struct inode *inode_out = file_inode(f: file_out);
2682	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
2683	struct buffer_head in_bh = NULL, out_bh = NULL;
2684	bool same_inode = (inode_in == inode_out);
2685	loff_t remapped = `0`;
2686	ssize_t ret;
2687
2688	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
2689	return -EINVAL;
2690	if (!ocfs2_refcount_tree(osb))
2691	return -EOPNOTSUPP;
2692	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
2693	return -EROFS;
2694
2695	/ Lock both files against IO /
2696	ret = ocfs2_reflink_inodes_lock(s_inode: inode_in, bh1: &in_bh, t_inode: inode_out, bh2: &out_bh);
2697	if (ret)
2698	return ret;
2699
2700	/ Check file eligibility and prepare for block sharing. /
2701	ret = -EINVAL;
2702	if ((OCFS2_I(inode: inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) \|\|
2703	(OCFS2_I(inode: inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
2704	goto out_unlock;
2705
2706	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
2707	count: &len, remap_flags);
2708	if (ret < `0` \|\| len == `0`)
2709	goto out_unlock;
2710
2711	/ Lock out changes to the allocation maps and remap. /
2712	down_write(sem: &OCFS2_I(inode: inode_in)->ip_alloc_sem);
2713	if (!same_inode)
2714	down_write_nested(sem: &OCFS2_I(inode: inode_out)->ip_alloc_sem,
2715	SINGLE_DEPTH_NESTING);
2716
2717	/ Zap any page cache for the destination file's range. /
2718	truncate_inode_pages_range(&inode_out->i_data,
2719	round_down(pos_out, PAGE_SIZE),
2720	round_up(pos_out + len, PAGE_SIZE) - `1`);
2721
2722	remapped = ocfs2_reflink_remap_blocks(s_inode: inode_in, s_bh: in_bh, pos_in,
2723	t_inode: inode_out, t_bh: out_bh, pos_out, len);
2724	up_write(sem: &OCFS2_I(inode: inode_in)->ip_alloc_sem);
2725	if (!same_inode)
2726	up_write(sem: &OCFS2_I(inode: inode_out)->ip_alloc_sem);
2727	if (remapped < `0`) {
2728	ret = remapped;
2729	mlog_errno(ret);
2730	goto out_unlock;
2731	}
2732
2733	/*
2734	* Empty the extent map so that we may get the right extent
2735	* record from the disk.
2736	*/
2737	ocfs2_extent_map_trunc(inode: inode_in, cluster: `0`);
2738	ocfs2_extent_map_trunc(inode: inode_out, cluster: `0`);
2739
2740	ret = ocfs2_reflink_update_dest(dest: inode_out, d_bh: out_bh, newlen: pos_out + len);
2741	if (ret) {
2742	mlog_errno(ret);
2743	goto out_unlock;
2744	}
2745
2746	out_unlock:
2747	ocfs2_reflink_inodes_unlock(s_inode: inode_in, s_bh: in_bh, t_inode: inode_out, t_bh: out_bh);
2748	return remapped > `0` ? remapped : ret;
2749	}
2750
2751	const struct inode_operations ocfs2_file_iops = {
2752	.setattr = ocfs2_setattr,
2753	.getattr = ocfs2_getattr,
2754	.permission = ocfs2_permission,
2755	.listxattr = ocfs2_listxattr,
2756	.fiemap = ocfs2_fiemap,
2757	.get_inode_acl = ocfs2_iop_get_acl,
2758	.set_acl = ocfs2_iop_set_acl,
2759	.fileattr_get = ocfs2_fileattr_get,
2760	.fileattr_set = ocfs2_fileattr_set,
2761	};
2762
2763	const struct inode_operations ocfs2_special_file_iops = {
2764	.setattr = ocfs2_setattr,
2765	.getattr = ocfs2_getattr,
2766	.listxattr = ocfs2_listxattr,
2767	.permission = ocfs2_permission,
2768	.get_inode_acl = ocfs2_iop_get_acl,
2769	.set_acl = ocfs2_iop_set_acl,
2770	};
2771
2772	/*
2773	* Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2774	* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2775	*/
2776	const struct file_operations ocfs2_fops = {
2777	.llseek = ocfs2_file_llseek,
2778	.mmap = ocfs2_mmap,
2779	.fsync = ocfs2_sync_file,
2780	.release = ocfs2_file_release,
2781	.open = ocfs2_file_open,
2782	.read_iter = ocfs2_file_read_iter,
2783	.write_iter = ocfs2_file_write_iter,
2784	.unlocked_ioctl = ocfs2_ioctl,
2785	#ifdef CONFIG_COMPAT
2786	.compat_ioctl = ocfs2_compat_ioctl,
2787	#endif
2788	.lock = ocfs2_lock,
2789	.flock = ocfs2_flock,
2790	.splice_read = ocfs2_file_splice_read,
2791	.splice_write = iter_file_splice_write,
2792	.fallocate = ocfs2_fallocate,
2793	.remap_file_range = ocfs2_remap_file_range,
2794	};
2795
2796	WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
2797	const struct file_operations ocfs2_dops = {
2798	.llseek = generic_file_llseek,
2799	.read = generic_read_dir,
2800	.iterate_shared = shared_ocfs2_readdir,
2801	.fsync = ocfs2_sync_file,
2802	.release = ocfs2_dir_release,
2803	.open = ocfs2_dir_open,
2804	.unlocked_ioctl = ocfs2_ioctl,
2805	#ifdef CONFIG_COMPAT
2806	.compat_ioctl = ocfs2_compat_ioctl,
2807	#endif
2808	.lock = ocfs2_lock,
2809	.flock = ocfs2_flock,
2810	};
2811
2812	/*
2813	* POSIX-lockless variants of our file_operations.
2814	*
2815	* These will be used if the underlying cluster stack does not support
2816	* posix file locking, if the user passes the "localflocks" mount
2817	* option, or if we have a local-only fs.
2818	*
2819	* ocfs2_flock is in here because all stacks handle UNIX file locks,
2820	* so we still want it in the case of no stack support for
2821	* plocks. Internally, it will do the right thing when asked to ignore
2822	* the cluster.
2823	*/
2824	const struct file_operations ocfs2_fops_no_plocks = {
2825	.llseek = ocfs2_file_llseek,
2826	.mmap = ocfs2_mmap,
2827	.fsync = ocfs2_sync_file,
2828	.release = ocfs2_file_release,
2829	.open = ocfs2_file_open,
2830	.read_iter = ocfs2_file_read_iter,
2831	.write_iter = ocfs2_file_write_iter,
2832	.unlocked_ioctl = ocfs2_ioctl,
2833	#ifdef CONFIG_COMPAT
2834	.compat_ioctl = ocfs2_compat_ioctl,
2835	#endif
2836	.flock = ocfs2_flock,
2837	.splice_read = filemap_splice_read,
2838	.splice_write = iter_file_splice_write,
2839	.fallocate = ocfs2_fallocate,
2840	.remap_file_range = ocfs2_remap_file_range,
2841	};
2842
2843	const struct file_operations ocfs2_dops_no_plocks = {
2844	.llseek = generic_file_llseek,
2845	.read = generic_read_dir,
2846	.iterate_shared = shared_ocfs2_readdir,
2847	.fsync = ocfs2_sync_file,
2848	.release = ocfs2_dir_release,
2849	.open = ocfs2_dir_open,
2850	.unlocked_ioctl = ocfs2_ioctl,
2851	#ifdef CONFIG_COMPAT
2852	.compat_ioctl = ocfs2_compat_ioctl,
2853	#endif
2854	.flock = ocfs2_flock,
2855	};
2856

/* Source code of linux/fs/ocfs2/file.c */