sync.c source code [linux/fs/sync.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* High-level sync()-related operations
4	*/
5
6	#include <linux/blkdev.h>
7	#include <linux/kernel.h>
8	#include <linux/file.h>
9	#include <linux/fs.h>
10	#include <linux/slab.h>
11	#include <linux/export.h>
12	#include <linux/namei.h>
13	#include <linux/sched.h>
14	#include <linux/writeback.h>
15	#include <linux/syscalls.h>
16	#include <linux/linkage.h>
17	#include <linux/pagemap.h>
18	#include <linux/quotaops.h>
19	#include <linux/backing-dev.h>
20	#include "internal.h"
21
/* Every flag bit sync_file_range() accepts; any other bit is rejected there. */
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | \
		     SYNC_FILE_RANGE_WAIT_AFTER)
24
25	/*
26	* Write out and wait upon all dirty data associated with this
27	* superblock. Filesystem data as well as the underlying block
28	* device. Takes the superblock lock.
29	*/
30	int sync_filesystem(struct super_block *sb)
31	{
32	int ret = `0`;
33
34	/*
35	* We need to be protected against the filesystem going from
36	* r/o to r/w or vice versa.
37	*/
38	WARN_ON(!rwsem_is_locked(&sb->s_umount));
39
40	/*
41	* No point in syncing out anything if the filesystem is read-only.
42	*/
43	if (sb_rdonly(sb))
44	return `0`;
45
46	/*
47	* Do the filesystem syncing work. For simple filesystems
48	* writeback_inodes_sb(sb) just dirties buffers with inodes so we have
49	* to submit I/O for these buffers via sync_blockdev(). This also
50	* speeds up the wait == 1 case since in that case write_inode()
51	* methods call sync_dirty_buffer() and thus effectively write one block
52	* at a time.
53	*/
54	writeback_inodes_sb(sb, reason: WB_REASON_SYNC);
55	if (sb->s_op->sync_fs) {
56	ret = sb->s_op->sync_fs(sb, `0`);
57	if (ret)
58	return ret;
59	}
60	ret = sync_blockdev_nowait(bdev: sb->s_bdev);
61	if (ret)
62	return ret;
63
64	sync_inodes_sb(sb);
65	if (sb->s_op->sync_fs) {
66	ret = sb->s_op->sync_fs(sb, `1`);
67	if (ret)
68	return ret;
69	}
70	return sync_blockdev(bdev: sb->s_bdev);
71	}
72	EXPORT_SYMBOL(sync_filesystem);
73
/*
 * iterate_supers() callback: sync the inodes of one superblock unless it is
 * read-only.  @arg is not used.
 */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (!sb_rdonly(sb))
		sync_inodes_sb(sb);
}
79
80	static void sync_fs_one_sb(struct super_block sb, void* *arg)
81	{
82	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
83	sb->s_op->sync_fs)
84	sb->s_op->sync_fs(sb, (int* *)arg);
85	}
86
87	/*
88	* Sync everything. We start by waking flusher threads so that most of
89	* writeback runs on all devices in parallel. Then we sync all inodes reliably
90	* which effectively also waits for all flusher threads to finish doing
91	* writeback. At this point all data is on disk so metadata should be stable
92	* and we tell filesystems to sync their metadata via ->sync_fs() calls.
93	* Finally, we writeout all block devices because some filesystems (e.g. ext2)
94	* just write metadata (such as inodes or bitmaps) to block device page cache
95	* and do not sync it on their own in ->sync_fs().
96	*/
97	void ksys_sync(void)
98	{
99	int nowait = `0`, wait = `1`;
100
101	wakeup_flusher_threads(reason: WB_REASON_SYNC);
102	iterate_supers(sync_inodes_one_sb, NULL);
103	iterate_supers(sync_fs_one_sb, &nowait);
104	iterate_supers(sync_fs_one_sb, &wait);
105	sync_bdevs(wait: false);
106	sync_bdevs(wait: true);
107	if (unlikely(laptop_mode))
108	laptop_sync_completion();
109	}
110
/* sync(2): sync everything via ksys_sync(); always reports success. */
SYSCALL_DEFINE0(sync)
{
	ksys_sync();
	return 0;
}
116
117	static void do_sync_work(struct work_struct *work)
118	{
119	int nowait = `0`;
120
121	/*
122	* Sync twice to reduce the possibility we skipped some inodes / pages
123	* because they were temporarily locked
124	*/
125	iterate_supers(sync_inodes_one_sb, &nowait);
126	iterate_supers(sync_fs_one_sb, &nowait);
127	sync_bdevs(wait: false);
128	iterate_supers(sync_inodes_one_sb, &nowait);
129	iterate_supers(sync_fs_one_sb, &nowait);
130	sync_bdevs(wait: false);
131	printk("Emergency Sync complete\n");
132	kfree(objp: work);
133	}
134
135	void emergency_sync(void)
136	{
137	struct work_struct *work;
138
139	work = kmalloc(size: sizeof(*work), GFP_ATOMIC);
140	if (work) {
141	INIT_WORK(work, do_sync_work);
142	schedule_work(work);
143	}
144	}
145
146	/*
147	* sync a single super
148	*/
149	SYSCALL_DEFINE1(syncfs, int, fd)
150	{
151	struct fd f = fdget(fd);
152	struct super_block *sb;
153	int ret, ret2;
154
155	if (!f.file)
156	return -EBADF;
157	sb = f.file->f_path.dentry->d_sb;
158
159	down_read(sem: &sb->s_umount);
160	ret = sync_filesystem(sb);
161	up_read(sem: &sb->s_umount);
162
163	ret2 = errseq_check_and_advance(eseq: &sb->s_wb_err, since: &f.file->f_sb_err);
164
165	fdput(fd: f);
166	return ret ? ret : ret2;
167	}
168
169	/**
170	* vfs_fsync_range - helper to sync a range of data & metadata to disk
171	* @file: file to sync
172	* @start: offset in bytes of the beginning of data range to sync
173	* @end: offset in bytes of the end of data range (inclusive)
174	* @datasync: perform only datasync
175	*
176	* Write back data in range @start..@end and metadata for @file to disk. If
177	* @datasync is set only metadata needed to access modified file data is
178	* written.
179	*/
180	int vfs_fsync_range(struct file file, loff_t start, loff_t end, int* datasync)
181	{
182	struct inode *inode = file->f_mapping->host;
183
184	if (!file->f_op->fsync)
185	return -EINVAL;
186	if (!datasync && (inode->i_state & I_DIRTY_TIME))
187	mark_inode_dirty_sync(inode);
188	return file->f_op->fsync(file, start, end, datasync);
189	}
190	EXPORT_SYMBOL(vfs_fsync_range);
191
192	/**
193	* vfs_fsync - perform a fsync or fdatasync on a file
194	* @file: file to sync
195	* @datasync: only perform a fdatasync operation
196	*
197	* Write back data and metadata for @file to disk. If @datasync is
198	* set only metadata needed to access modified file data is written.
199	*/
200	int vfs_fsync(struct file file, int* datasync)
201	{
202	return vfs_fsync_range(file, `0`, LLONG_MAX, datasync);
203	}
204	EXPORT_SYMBOL(vfs_fsync);
205
206	static int do_fsync(unsigned int fd, int datasync)
207	{
208	struct fd f = fdget(fd);
209	int ret = -EBADF;
210
211	if (f.file) {
212	ret = vfs_fsync(f.file, datasync);
213	fdput(fd: f);
214	}
215	return ret;
216	}
217
218	SYSCALL_DEFINE1(fsync, unsigned int, fd)
219	{
220	return do_fsync(fd, datasync: `0`);
221	}
222
223	SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
224	{
225	return do_fsync(fd, datasync: `1`);
226	}
227
228	int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
229	unsigned int flags)
230	{
231	int ret;
232	struct address_space *mapping;
233	loff_t endbyte; / inclusive /
234	umode_t i_mode;
235
236	ret = -EINVAL;
237	if (flags & ~VALID_FLAGS)
238	goto out;
239
240	endbyte = offset + nbytes;
241
242	if ((s64)offset < `0`)
243	goto out;
244	if ((s64)endbyte < `0`)
245	goto out;
246	if (endbyte < offset)
247	goto out;
248
249	if (sizeof(pgoff_t) == `4`) {
250	if (offset >= (`0x100000000ULL` << PAGE_SHIFT)) {
251	/*
252	* The range starts outside a 32 bit machine's
253	* pagecache addressing capabilities. Let it "succeed"
254	*/
255	ret = `0`;
256	goto out;
257	}
258	if (endbyte >= (`0x100000000ULL` << PAGE_SHIFT)) {
259	/*
260	* Out to EOF
261	*/
262	nbytes = `0`;
263	}
264	}
265
266	if (nbytes == `0`)
267	endbyte = LLONG_MAX;
268	else
269	endbyte--; / inclusive /
270
271	i_mode = file_inode(f: file)->i_mode;
272	ret = -ESPIPE;
273	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
274	!S_ISLNK(i_mode))
275	goto out;
276
277	mapping = file->f_mapping;
278	ret = `0`;
279	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
280	ret = file_fdatawait_range(file, lstart: offset, lend: endbyte);
281	if (ret < `0`)
282	goto out;
283	}
284
285	if (flags & SYNC_FILE_RANGE_WRITE) {
286	int sync_mode = WB_SYNC_NONE;
287
288	if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
289	SYNC_FILE_RANGE_WRITE_AND_WAIT)
290	sync_mode = WB_SYNC_ALL;
291
292	ret = __filemap_fdatawrite_range(mapping, start: offset, end: endbyte,
293	sync_mode);
294	if (ret < `0`)
295	goto out;
296	}
297
298	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
299	ret = file_fdatawait_range(file, lstart: offset, lend: endbyte);
300
301	out:
302	return ret;
303	}
304
305	/*
306	* ksys_sync_file_range() permits finely controlled syncing over a segment of
307	* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
308	* zero then ksys_sync_file_range() will operate from offset out to EOF.
309	*
310	* The flag bits are:
311	*
312	* SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
313	* before performing the write.
314	*
315	* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
316	* range which are not presently under writeback. Note that this may block for
317	* significant periods due to exhaustion of disk request structures.
318	*
319	* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
320	* after performing the write.
321	*
322	* Useful combinations of the flag bits are:
323	*
324	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE: ensures that all pages
325	* in the range which were dirty on entry to ksys_sync_file_range() are placed
326	* under writeout. This is a start-write-for-data-integrity operation.
327	*
328	* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
329	* are not presently under writeout. This is an asynchronous flush-to-disk
330	* operation. Not suitable for data integrity operations.
331	*
332	* SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
333	* completion of writeout of all pages in the range. This will be used after an
334	* earlier SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE operation to wait
335	* for that operation to complete and to return the result.
336	*
337	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\|SYNC_FILE_RANGE_WAIT_AFTER
338	* (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
339	* a traditional sync() operation. This is a write-for-data-integrity operation
340	* which will ensure that all pages in the range which were dirty on entry to
341	* ksys_sync_file_range() are written to disk. It should be noted that disk
342	* caches are not flushed by this call, so there are no guarantees here that the
343	* data will be available on disk after a crash.
344	*
345	*
346	* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
347	* I/O errors or ENOSPC conditions and will return those to the caller, after
348	* clearing the EIO and ENOSPC flags in the address_space.
349	*
350	* It should be noted that none of these operations write out the file's
351	* metadata. So unless the application is strictly performing overwrites of
352	* already-instantiated disk blocks, there are no guarantees here that the data
353	* will be available after a crash.
354	*/
355	int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
356	unsigned int flags)
357	{
358	int ret;
359	struct fd f;
360
361	ret = -EBADF;
362	f = fdget(fd);
363	if (f.file)
364	ret = sync_file_range(file: f.file, offset, nbytes, flags);
365
366	fdput(fd: f);
367	return ret;
368	}
369
370	SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
371	unsigned int, flags)
372	{
373	return ksys_sync_file_range(fd, offset, nbytes, flags);
374	}
375
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
/*
 * Compat entry point for sync_file_range.  The 64-bit offset and nbytes are
 * declared with compat_arg_u64_dual() and reassembled with
 * compat_arg_u64_glue() before being passed to ksys_sync_file_range().
 */
COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
		       compat_arg_u64_dual(nbytes), unsigned int, flags)
{
	return ksys_sync_file_range(fd, compat_arg_u64_glue(offset),
				    compat_arg_u64_glue(nbytes), flags);
}
#endif
384
385	/ It would be nice if people remember that not all the world's an i386*
386	when they introduce new system calls /*
387	SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
388	loff_t, offset, loff_t, nbytes)
389	{
390	return ksys_sync_file_range(fd, offset, nbytes, flags);
391	}
392

source code of linux/fs/sync.c