// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/file_table.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/* sysctl tunables... */
static struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;

static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/* Container for backing file with optional user path */
struct backing_file {
	struct file file;
	struct path user_path;
};

static inline struct backing_file *backing_file(struct file *f)
{
	return container_of(f, struct backing_file, file);
}

struct path *backing_file_user_path(struct file *f)
{
	return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

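/*
 * Tear down the file object itself: run the LSM free hook, drop the
 * nr_files accounting (unless the file was allocated with
 * FMODE_NOACCOUNT), drop the cred reference and free the memory.
 * Backing files live in a separately kmalloc'ed container and so are
 * not returned to filp_cachep.
 */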
static inline void file_free(struct file *f)
{
	security_file_free(f);
	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
		percpu_counter_dec(&nr_files);
	put_cred(f->f_cred);
	if (unlikely(f->f_mode & FMODE_BACKING)) {
		path_put(backing_file_user_path(f));
		kfree(backing_file(f));
	} else {
		kmem_cache_free(filp_cachep, f);
	}
}

/*
 * Return the total number of open files in the system
 */
static long get_nr_files(void)
{
	return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 */
unsigned long get_max_files(void)
{
	return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 */
static int proc_nr_files(struct ctl_table *table, int write, void *buffer,
			 size_t *lenp, loff_t *ppos)
{
	files_stat.nr_files = get_nr_files();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

static struct ctl_table fs_stat_sysctls[] = {
	{
		.procname	= "file-nr",
		.data		= &files_stat,
		.maxlen		= sizeof(files_stat),
		.mode		= 0444,
		.proc_handler	= proc_nr_files,
	},
	{
		.procname	= "file-max",
		.data		= &files_stat.max_files,
		.maxlen		= sizeof(files_stat.max_files),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= SYSCTL_LONG_ZERO,
		.extra2		= SYSCTL_LONG_MAX,
	},
	{
		.procname	= "nr_open",
		.data		= &sysctl_nr_open,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &sysctl_nr_open_min,
		.extra2		= &sysctl_nr_open_max,
	},
};

static int __init init_fs_stat_sysctls(void)
{
	register_sysctl_init("fs", fs_stat_sysctls);
	if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
		struct ctl_table_header *hdr;

		hdr = register_sysctl_mount_point("fs/binfmt_misc");
		kmemleak_not_leak(hdr);
	}
	return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

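/*
 * Common initialization for a freshly allocated file: take a reference
 * on @cred, run the LSM allocation hook and set up the locks, flags
 * and mode.  f_count is deliberately written last; see the comment in
 * the body for the SLAB_TYPESAFE_BY_RCU ordering rule.
 */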
static int init_file(struct file *f, int flags, const struct cred *cred)
{
	int error;

	f->f_cred = get_cred(cred);
	error = security_file_alloc(f);
	if (unlikely(error)) {
		put_cred(f->f_cred);
		return error;
	}

	rwlock_init(&f->f_owner.lock);
	spin_lock_init(&f->f_lock);
	mutex_init(&f->f_pos_lock);
	f->f_flags = flags;
	f->f_mode = OPEN_FMODE(flags);
	/* f->f_version: 0 */

	/*
	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
	 * fget-rcu pattern users need to be able to handle spurious
	 * refcount bumps we should reinitialize the reused file first.
	 */
	atomic_long_set(&f->f_count, 1);
	return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened, e.g. we are over the
 * file structures limit, ran out of memory or the operation is not
 * permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance the mount's writer count
 * and get a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
	static long old_max;
	struct file *f;
	int error;

	/*
	 * Privileged users can go above max_files
	 */
	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate. Do an expensive check before
		 * we go and fail.
		 */
		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
			goto over;
	}

	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	percpu_counter_inc(&nr_files);

	return f;

over:
	/* Ran out of filps - report that */
	if (get_nr_files() > old_max) {
		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
		old_max = get_nr_files();
	}
	return ERR_PTR(-ENFILE);
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not
 * be installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
	struct file *f;
	int error;

	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	f->f_mode |= FMODE_NOACCOUNT;

	return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not
 * be installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
	struct backing_file *ff;
	int error;

	ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
	if (unlikely(!ff))
		return ERR_PTR(-ENOMEM);

	error = init_file(&ff->file, flags, cred);
	if (unlikely(error)) {
		kfree(ff);
		return ERR_PTR(error);
	}

	ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
	return &ff->file;
}

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 */
static void file_init_path(struct file *file, const struct path *path,
			   const struct file_operations *fop)
{
	file->f_path = *path;
	file->f_inode = path->dentry->d_inode;
	file->f_mapping = path->dentry->d_inode->i_mapping;
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
	file->f_sb_err = file_sample_sb_err(file);
	if (fop->llseek)
		file->f_mode |= FMODE_LSEEK;
	if ((file->f_mode & FMODE_READ) &&
	    likely(fop->read || fop->read_iter))
		file->f_mode |= FMODE_CAN_READ;
	if ((file->f_mode & FMODE_WRITE) &&
	    likely(fop->write || fop->write_iter))
		file->f_mode |= FMODE_CAN_WRITE;
	file->f_iocb_flags = iocb_flags(file);
	file->f_mode |= FMODE_OPENED;
	file->f_op = fop;
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 */
static struct file *alloc_file(const struct path *path, int flags,
			       const struct file_operations *fop)
{
	struct file *file;

	file = alloc_empty_file(flags, current_cred());
	if (!IS_ERR(file))
		file_init_path(file, path, fop);
	return file;
}

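/*
 * Build the (dentry, vfsmount) pair for a pseudo file: allocate an
 * anonymous dentry on @mnt's superblock, pin the mount and bind @inode
 * to the new dentry.  On success the resulting path owns a mount
 * reference and the dentry's hold on the inode; path_put() releases
 * both.
 */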
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
				    struct vfsmount *mnt, struct path *path)
{
	struct qstr this = QSTR_INIT(name, strlen(name));

	path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
	if (!path->dentry)
		return -ENOMEM;
	path->mnt = mntget(mnt);
	d_instantiate(path->dentry, inode);
	return 0;
}
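
/*
 * Illustrative use of alloc_file_pseudo() below - a sketch, not code
 * from this file; "[example]" and example_fops are hypothetical and
 * stand for objects the caller has already set up:
 *
 *	file = alloc_file_pseudo(inode, mnt, "[example]", O_RDWR,
 *				 &example_fops);
 *	if (IS_ERR(file))
 *		iput(inode);
 *
 * On failure the ihold()/path_put() pair below hands the inode
 * reference back to the caller, which therefore still has to drop it.
 */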
337 | |
338 | struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, |
339 | const char *name, int flags, |
340 | const struct file_operations *fops) |
341 | { |
342 | int ret; |
343 | struct path path; |
344 | struct file *file; |
345 | |
346 | ret = alloc_path_pseudo(name, inode, mnt, path: &path); |
347 | if (ret) |
348 | return ERR_PTR(error: ret); |
349 | |
350 | file = alloc_file(path: &path, flags, fop: fops); |
351 | if (IS_ERR(ptr: file)) { |
352 | ihold(inode); |
353 | path_put(&path); |
354 | } |
355 | return file; |
356 | } |
357 | EXPORT_SYMBOL(alloc_file_pseudo); |
358 | |
359 | struct file *alloc_file_pseudo_noaccount(struct inode *inode, |
360 | struct vfsmount *mnt, const char *name, |
361 | int flags, |
362 | const struct file_operations *fops) |
363 | { |
364 | int ret; |
365 | struct path path; |
366 | struct file *file; |
367 | |
368 | ret = alloc_path_pseudo(name, inode, mnt, path: &path); |
369 | if (ret) |
370 | return ERR_PTR(error: ret); |
371 | |
372 | file = alloc_empty_file_noaccount(flags, current_cred()); |
373 | if (IS_ERR(ptr: file)) { |
374 | ihold(inode); |
375 | path_put(&path); |
376 | return file; |
377 | } |
378 | file_init_path(file, path: &path, fop: fops); |
379 | return file; |
380 | } |
381 | EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); |
382 | |
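/*
 * Allocate a file that opens the same (dentry, vfsmount) pair as @base,
 * possibly with different flags and file operations.  The extra
 * path_get() pins the path for the clone, and the clone shares the
 * address space of @base.
 */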
struct file *alloc_file_clone(struct file *base, int flags,
			      const struct file_operations *fops)
{
	struct file *f = alloc_file(&base->f_path, flags, fops);
	if (!IS_ERR(f)) {
		path_get(&f->f_path);
		f->f_mapping = base->f_mapping;
	}
	return f;
}

/* the real guts of fput() - releasing the last reference to file
 */
static void __fput(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_file(file);

	security_file_release(file);
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	if (file->f_op->release)
		file->f_op->release(inode, file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	put_pid(file->f_owner.pid);
	put_file_access(file);
	dput(dentry);
	if (unlikely(mode & FMODE_NEED_UNMOUNT))
		dissolve_on_fput(mnt);
	mntput(mnt);
out:
	file_free(file);
}

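/*
 * A final fput() from interrupt context or from a kernel thread cannot
 * run __fput() directly (it may sleep), so such files are queued on a
 * lockless list and released later from workqueue context by
 * delayed_fput().
 */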
static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_fput_list);
	struct file *f, *t;

	llist_for_each_entry_safe(f, t, node, f_llist)
		__fput(f);
}

static void ____fput(struct callback_head *work)
{
	__fput(container_of(work, struct file, f_task_work));
}

/*
 * If a kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs have
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 */
void flush_delayed_fput(void)
{
	delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

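/*
 * Drop a reference to @file.  When the last reference goes away the
 * real cleanup is deferred: ordinary process context schedules __fput()
 * via task_work, to run on return to userspace; interrupt and
 * kernel-thread callers fall back to the delayed fput list above.
 */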
void fput(struct file *file)
{
	if (atomic_long_dec_and_test(&file->f_count)) {
		struct task_struct *task = current;

		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
			file_free(file);
			return;
		}
		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
			init_task_work(&file->f_task_work, ____fput);
			if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
				return;
			/*
			 * After this task has run exit_task_work(),
			 * task_work_add() will fail.  Fall through to delayed
			 * fput to avoid leaking *file.
			 */
		}

		if (llist_add(&file->f_llist, &delayed_fput_list))
			schedule_delayed_work(&delayed_fput_work, 1);
	}
}

/*
 * Synchronous analog of fput(); for kernel threads that might be needed
 * in some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks) and that need to wait for completion of __fput(),
 * knowing that for this specific struct file it won't involve anything
 * that would need them.  Use only if you really need it - at the very
 * least, don't blindly convert fput() by kernel thread to that.
 */
void __fput_sync(struct file *file)
{
	if (atomic_long_dec_and_test(&file->f_count))
		__fput(file);
}

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

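/*
 * Create the SLAB cache for struct file and the per-cpu counter of open
 * files.  SLAB_TYPESAFE_BY_RCU means a freed object may be reused while
 * RCU readers can still reach the old one, which is why init_file()
 * writes f_count last.
 */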
void __init files_init(void)
{
	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
				SLAB_PANIC | SLAB_ACCOUNT, NULL);
	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 */
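/*
 * Worked example with illustrative numbers: with 4 KiB pages and 16 GiB
 * of RAM, nr_pages is about 4M; ignoring the reserve, n is roughly
 * 4M * (4096/1024) / 10, i.e. about 1.6M files - around 1.6 GiB at
 * ~1K apiece, matching the 10% target above.
 */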
void __init files_maxfiles_init(void)
{
	unsigned long n;
	unsigned long nr_pages = totalram_pages();
	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;

	memreserve = min(memreserve, nr_pages - 1);
	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}