1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * linux/fs/file.c |
4 | * |
5 | * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes |
6 | * |
7 | * Manage the dynamic fd arrays in the process files_struct. |
8 | */ |
9 | |
10 | #include <linux/syscalls.h> |
11 | #include <linux/export.h> |
12 | #include <linux/fs.h> |
13 | #include <linux/kernel.h> |
14 | #include <linux/mm.h> |
15 | #include <linux/sched/signal.h> |
16 | #include <linux/slab.h> |
17 | #include <linux/file.h> |
18 | #include <linux/fdtable.h> |
19 | #include <linux/bitops.h> |
20 | #include <linux/spinlock.h> |
21 | #include <linux/rcupdate.h> |
22 | #include <linux/close_range.h> |
23 | #include <linux/file_ref.h> |
24 | #include <net/sock.h> |
25 | #include <linux/init_task.h> |
26 | |
27 | #include "internal.h" |
28 | |
29 | static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) |
30 | { |
31 | /* |
32 | * If the reference count was already in the dead zone, then this |
33 | * put() operation is imbalanced. Warn, put the reference count back to |
34 | * DEAD and tell the caller to not deconstruct the object. |
35 | */ |
36 | if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { |
37 | atomic_long_set(v: &ref->refcnt, FILE_REF_DEAD); |
38 | return false; |
39 | } |
40 | |
41 | /* |
42 | * This is a put() operation on a saturated refcount. Restore the |
43 | * mean saturation value and tell the caller to not deconstruct the |
44 | * object. |
45 | */ |
46 | if (cnt > FILE_REF_MAXREF) |
47 | atomic_long_set(v: &ref->refcnt, FILE_REF_SATURATED); |
48 | return false; |
49 | } |
50 | |
51 | /** |
52 | * __file_ref_put - Slowpath of file_ref_put() |
53 | * @ref: Pointer to the reference count |
54 | * @cnt: Current reference count |
55 | * |
56 | * Invoked when the reference count is outside of the valid zone. |
57 | * |
58 | * Return: |
59 | * True if this was the last reference with no future references |
60 | * possible. This signals the caller that it can safely schedule the |
61 | * object, which is protected by the reference counter, for |
62 | * deconstruction. |
63 | * |
64 | * False if there are still active references or the put() raced |
65 | * with a concurrent get()/put() pair. Caller is not allowed to |
66 | * deconstruct the protected object. |
67 | */ |
68 | bool __file_ref_put(file_ref_t *ref, unsigned long cnt) |
69 | { |
70 | /* Did this drop the last reference? */ |
71 | if (likely(cnt == FILE_REF_NOREF)) { |
72 | /* |
73 | * Carefully try to set the reference count to FILE_REF_DEAD. |
74 | * |
75 | * This can fail if a concurrent get() operation has |
76 | * elevated it again or the corresponding put() even marked |
77 | * it dead already. Both are valid situations and do not |
78 | * require a retry. If this fails the caller is not |
79 | * allowed to deconstruct the object. |
80 | */ |
81 | if (!atomic_long_try_cmpxchg_release(v: &ref->refcnt, old: &cnt, FILE_REF_DEAD)) |
82 | return false; |
83 | |
84 | /* |
85 | * The caller can safely schedule the object for |
86 | * deconstruction. Provide acquire ordering. |
87 | */ |
88 | smp_acquire__after_ctrl_dep(); |
89 | return true; |
90 | } |
91 | |
92 | return __file_ref_put_badval(ref, cnt); |
93 | } |
94 | EXPORT_SYMBOL_GPL(__file_ref_put); |
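The slowpath above only runs when a put sees a count outside the valid zone. A minimal sketch of the fast-path caller contract, under the assumption of a hypothetical object (`struct foo`, `foo_put()` are placeholders, not part of this file):

```c
#include <linux/file_ref.h>
#include <linux/slab.h>

/* Hypothetical object protected by a file_ref_t counter. */
struct foo {
	file_ref_t ref;
	/* ... payload ... */
};

static void foo_put(struct foo *p)
{
	/*
	 * file_ref_put() returns true only for the final put; on the
	 * slowpath it ends up in __file_ref_put() above, which decides
	 * whether the object may really be torn down.
	 */
	if (file_ref_put(&p->ref))
		kfree(p);
}
```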
95 | |
96 | unsigned int sysctl_nr_open __read_mostly = 1024*1024; |
97 | unsigned int sysctl_nr_open_min = BITS_PER_LONG; |
98 | /* our min() is unusable in constant expressions ;-/ */ |
99 | #define __const_min(x, y) ((x) < (y) ? (x) : (y)) |
100 | unsigned int sysctl_nr_open_max = |
101 | __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; |
102 | |
103 | static void __free_fdtable(struct fdtable *fdt) |
104 | { |
105 | kvfree(addr: fdt->fd); |
106 | kvfree(addr: fdt->open_fds); |
107 | kfree(objp: fdt); |
108 | } |
109 | |
110 | static void free_fdtable_rcu(struct rcu_head *rcu) |
111 | { |
112 | __free_fdtable(container_of(rcu, struct fdtable, rcu)); |
113 | } |
114 | |
115 | #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) |
116 | #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) |
117 | |
118 | #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds |
119 | /* |
120 | * Copy 'copy_words' words of the fd bitmaps from the old table to the new |
121 | * table and clear the extra space, if any. This does not copy the file |
122 | * pointers. Called with the files spinlock held for write. |
123 | */ |
124 | static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, |
125 | unsigned int copy_words) |
126 | { |
127 | unsigned int nwords = fdt_words(nfdt); |
128 | |
129 | bitmap_copy_and_extend(to: nfdt->open_fds, from: ofdt->open_fds, |
130 | count: copy_words * BITS_PER_LONG, size: nwords * BITS_PER_LONG); |
131 | bitmap_copy_and_extend(to: nfdt->close_on_exec, from: ofdt->close_on_exec, |
132 | count: copy_words * BITS_PER_LONG, size: nwords * BITS_PER_LONG); |
133 | bitmap_copy_and_extend(to: nfdt->full_fds_bits, from: ofdt->full_fds_bits, |
134 | count: copy_words, size: nwords); |
135 | } |
136 | |
137 | /* |
138 | * Copy all file descriptors from the old table to the new, expanded table and |
139 | * clear the extra space. Called with the files spinlock held for write. |
140 | */ |
141 | static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) |
142 | { |
143 | size_t cpy, set; |
144 | |
145 | BUG_ON(nfdt->max_fds < ofdt->max_fds); |
146 | |
147 | cpy = ofdt->max_fds * sizeof(struct file *); |
148 | set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); |
149 | memcpy(nfdt->fd, ofdt->fd, cpy); |
150 | memset((char *)nfdt->fd + cpy, 0, set); |
151 | |
152 | copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); |
153 | } |
154 | |
155 | /* |
156 | * Note how the fdtable bitmap allocations very much have to be a multiple of |
157 | * BITS_PER_LONG. This is not only because we walk those things in chunks of |
158 | * 'unsigned long' in some places, but simply because that is how the Linux |
159 | * kernel bitmaps are defined to work: they are not "bits in an array of bytes", |
160 | * they are very much "bits in an array of unsigned long". |
161 | */ |
162 | static struct fdtable *alloc_fdtable(unsigned int slots_wanted) |
163 | { |
164 | struct fdtable *fdt; |
165 | unsigned int nr; |
166 | void *data; |
167 | |
168 | /* |
169 | * Figure out how many fds we actually want to support in this fdtable. |
170 | * Allocation steps are keyed to the size of the fdarray, since it |
171 | * grows far faster than any of the other dynamic data. We try to fit |
172 | * the fdarray into comfortable page-tuned chunks: starting at 1024B |
173 | * and growing in powers of two from there on. Since we are only called |
174 | * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab |
175 | * already gives BITS_PER_LONG slots), the above boils down to |
176 | * 1. use the smallest power of two large enough to give us that many |
177 | * slots. |
178 | * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is |
179 | * 256 slots (i.e. 1Kb fd array). |
180 | * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there |
181 | * and we are never going to be asked for 64 or less. |
182 | */ |
183 | if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) |
184 | nr = 256; |
185 | else |
186 | nr = roundup_pow_of_two(slots_wanted); |
187 | /* |
188 | * Note that this can drive nr *below* what we had passed if sysctl_nr_open |
189 | * had been set lower between the check in expand_files() and here. |
190 | * |
191 | * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise |
192 | * bitmaps handling below becomes unpleasant, to put it mildly... |
193 | */ |
194 | if (unlikely(nr > sysctl_nr_open)) { |
195 | nr = round_down(sysctl_nr_open, BITS_PER_LONG); |
196 | if (nr < slots_wanted) |
197 | return ERR_PTR(error: -EMFILE); |
198 | } |
199 | |
200 | fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); |
201 | if (!fdt) |
202 | goto out; |
203 | fdt->max_fds = nr; |
204 | data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); |
205 | if (!data) |
206 | goto out_fdt; |
207 | fdt->fd = data; |
208 | |
209 | data = kvmalloc(max_t(size_t, |
210 | 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), |
211 | GFP_KERNEL_ACCOUNT); |
212 | if (!data) |
213 | goto out_arr; |
214 | fdt->open_fds = data; |
215 | data += nr / BITS_PER_BYTE; |
216 | fdt->close_on_exec = data; |
217 | data += nr / BITS_PER_BYTE; |
218 | fdt->full_fds_bits = data; |
219 | |
220 | return fdt; |
221 | |
222 | out_arr: |
223 | kvfree(addr: fdt->fd); |
224 | out_fdt: |
225 | kfree(objp: fdt); |
226 | out: |
227 | return ERR_PTR(error: -ENOMEM); |
228 | } |
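To make the sizing rules above concrete, here is a small worked example (the numbers are illustrative, assuming a 64-bit kernel with BITS_PER_LONG == 64 and L1_CACHE_BYTES == 64):

```c
/*
 * slots_wanted = 300
 *   nr = roundup_pow_of_two(300)            = 512 slots
 *   fd array        = 512 * sizeof(void *)  = 4096 bytes (one page)
 *   open_fds        = 512 / 8               =   64 bytes
 *   close_on_exec   = 512 / 8               =   64 bytes
 *   full_fds_bits   = BITBIT_SIZE(512)      =    8 bytes
 *   bitmap chunk    = max(64 + 64 + 8, L1_CACHE_BYTES) = 136 bytes,
 *   carved up so that open_fds, close_on_exec and full_fds_bits are
 *   consecutive regions of a single kvmalloc() allocation.
 */
```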
229 | |
230 | /* |
231 | * Expand the file descriptor table. |
232 | * This function will allocate a new fdtable and both fd array and fdset, of |
233 | * the given size. |
234 | * Return <0 error code on error; 0 on successful completion. |
235 | * The files->file_lock should be held on entry, and will be held on exit. |
236 | */ |
237 | static int expand_fdtable(struct files_struct *files, unsigned int nr) |
238 | __releases(files->file_lock) |
239 | __acquires(files->file_lock) |
240 | { |
241 | struct fdtable *new_fdt, *cur_fdt; |
242 | |
243 | spin_unlock(lock: &files->file_lock); |
244 | new_fdt = alloc_fdtable(slots_wanted: nr + 1); |
245 | |
246 | /* make sure all fd_install() callers have seen resize_in_progress |
247 | * or have finished their rcu_read_lock_sched() section. |
248 | */ |
249 | if (atomic_read(v: &files->count) > 1) |
250 | synchronize_rcu(); |
251 | |
252 | spin_lock(lock: &files->file_lock); |
253 | if (IS_ERR(ptr: new_fdt)) |
254 | return PTR_ERR(ptr: new_fdt); |
255 | cur_fdt = files_fdtable(files); |
256 | BUG_ON(nr < cur_fdt->max_fds); |
257 | copy_fdtable(nfdt: new_fdt, ofdt: cur_fdt); |
258 | rcu_assign_pointer(files->fdt, new_fdt); |
259 | if (cur_fdt != &files->fdtab) |
260 | call_rcu(head: &cur_fdt->rcu, func: free_fdtable_rcu); |
261 | /* coupled with smp_rmb() in fd_install() */ |
262 | smp_wmb(); |
263 | return 0; |
264 | } |
265 | |
266 | /* |
267 | * Expand files. |
268 | * This function will expand the file structures, if the requested size exceeds |
269 | * the current capacity and there is room for expansion. |
270 | * Return <0 error code on error; 0 on success. |
271 | * The files->file_lock should be held on entry, and will be held on exit. |
272 | */ |
273 | static int expand_files(struct files_struct *files, unsigned int nr) |
274 | __releases(files->file_lock) |
275 | __acquires(files->file_lock) |
276 | { |
277 | struct fdtable *fdt; |
278 | int error; |
279 | |
280 | repeat: |
281 | fdt = files_fdtable(files); |
282 | |
283 | /* Do we need to expand? */ |
284 | if (nr < fdt->max_fds) |
285 | return 0; |
286 | |
287 | if (unlikely(files->resize_in_progress)) { |
288 | spin_unlock(lock: &files->file_lock); |
289 | wait_event(files->resize_wait, !files->resize_in_progress); |
290 | spin_lock(lock: &files->file_lock); |
291 | goto repeat; |
292 | } |
293 | |
294 | /* Can we expand? */ |
295 | if (unlikely(nr >= sysctl_nr_open)) |
296 | return -EMFILE; |
297 | |
298 | /* All good, so we try */ |
299 | files->resize_in_progress = true; |
300 | error = expand_fdtable(files, nr); |
301 | files->resize_in_progress = false; |
302 | |
303 | wake_up_all(&files->resize_wait); |
304 | return error; |
305 | } |
306 | |
307 | static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, |
308 | bool set) |
309 | { |
310 | if (set) { |
311 | __set_bit(fd, fdt->close_on_exec); |
312 | } else { |
313 | if (test_bit(fd, fdt->close_on_exec)) |
314 | __clear_bit(fd, fdt->close_on_exec); |
315 | } |
316 | } |
317 | |
318 | static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) |
319 | { |
320 | __set_bit(fd, fdt->open_fds); |
321 | __set_close_on_exec(fd, fdt, set); |
322 | fd /= BITS_PER_LONG; |
323 | if (!~fdt->open_fds[fd]) |
324 | __set_bit(fd, fdt->full_fds_bits); |
325 | } |
326 | |
327 | static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) |
328 | { |
329 | __clear_bit(fd, fdt->open_fds); |
330 | fd /= BITS_PER_LONG; |
331 | if (test_bit(fd, fdt->full_fds_bits)) |
332 | __clear_bit(fd, fdt->full_fds_bits); |
333 | } |
334 | |
335 | static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) |
336 | { |
337 | return test_bit(fd, fdt->open_fds); |
338 | } |
339 | |
340 | /* |
341 | * Note that a sane fdtable size always has to be a multiple of |
342 | * BITS_PER_LONG, since we have bitmaps that are sized by this. |
343 | * |
344 | * punch_hole is optional - when close_range() is asked to unshare |
345 | * and close, we don't need to copy descriptors in that range, so |
346 | * a smaller cloned descriptor table might suffice if the last |
347 | * currently opened descriptor falls into that range. |
348 | */ |
349 | static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) |
350 | { |
351 | unsigned int last = find_last_bit(addr: fdt->open_fds, size: fdt->max_fds); |
352 | |
353 | if (last == fdt->max_fds) |
354 | return NR_OPEN_DEFAULT; |
355 | if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { |
356 | last = find_last_bit(addr: fdt->open_fds, size: punch_hole->from); |
357 | if (last == punch_hole->from) |
358 | return NR_OPEN_DEFAULT; |
359 | } |
360 | return ALIGN(last + 1, BITS_PER_LONG); |
361 | } |
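A worked example of the hole-punching logic above (illustrative numbers, assuming BITS_PER_LONG == 64):

```c
/*
 * Example: the old table has max_fds == 256, the highest open fd is 200,
 * and close_range(100, 255, CLOSE_RANGE_UNSHARE) asked for punch_hole =
 * {.from = 100, .to = 255}. Since 100 <= 200 <= 255, the first
 * find_last_bit() result (200) lies inside the hole, so we search again
 * below bit 100. If the highest open fd under 100 is 37, the clone only
 * needs ALIGN(38, 64) == 64 slots instead of ALIGN(201, 64) == 256.
 */
```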
362 | |
363 | /* |
364 | * Allocate a new descriptor table and copy contents from the passed in |
365 | * instance. Returns a pointer to cloned table on success, ERR_PTR() |
366 | * on failure. For 'punch_hole' see sane_fdtable_size(). |
367 | */ |
368 | struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) |
369 | { |
370 | struct files_struct *newf; |
371 | struct file **old_fds, **new_fds; |
372 | unsigned int open_files, i; |
373 | struct fdtable *old_fdt, *new_fdt; |
374 | |
375 | newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); |
376 | if (!newf) |
377 | return ERR_PTR(error: -ENOMEM); |
378 | |
379 | atomic_set(v: &newf->count, i: 1); |
380 | |
381 | spin_lock_init(&newf->file_lock); |
382 | newf->resize_in_progress = false; |
383 | init_waitqueue_head(&newf->resize_wait); |
384 | newf->next_fd = 0; |
385 | new_fdt = &newf->fdtab; |
386 | new_fdt->max_fds = NR_OPEN_DEFAULT; |
387 | new_fdt->close_on_exec = newf->close_on_exec_init; |
388 | new_fdt->open_fds = newf->open_fds_init; |
389 | new_fdt->full_fds_bits = newf->full_fds_bits_init; |
390 | new_fdt->fd = &newf->fd_array[0]; |
391 | |
392 | spin_lock(lock: &oldf->file_lock); |
393 | old_fdt = files_fdtable(oldf); |
394 | open_files = sane_fdtable_size(fdt: old_fdt, punch_hole); |
395 | |
396 | /* |
397 | * Check whether we need to allocate a larger fd array and fd set. |
398 | */ |
399 | while (unlikely(open_files > new_fdt->max_fds)) { |
400 | spin_unlock(lock: &oldf->file_lock); |
401 | |
402 | if (new_fdt != &newf->fdtab) |
403 | __free_fdtable(fdt: new_fdt); |
404 | |
405 | new_fdt = alloc_fdtable(slots_wanted: open_files); |
406 | if (IS_ERR(ptr: new_fdt)) { |
407 | kmem_cache_free(s: files_cachep, objp: newf); |
408 | return ERR_CAST(ptr: new_fdt); |
409 | } |
410 | |
411 | /* |
412 | * Reacquire the oldf lock and a fresh pointer to its fd table: |
413 | * it may have grown a new, bigger fd table in the meantime, so |
414 | * we need the latest pointer. |
415 | */ |
416 | spin_lock(lock: &oldf->file_lock); |
417 | old_fdt = files_fdtable(oldf); |
418 | open_files = sane_fdtable_size(fdt: old_fdt, punch_hole); |
419 | } |
420 | |
421 | copy_fd_bitmaps(nfdt: new_fdt, ofdt: old_fdt, copy_words: open_files / BITS_PER_LONG); |
422 | |
423 | old_fds = old_fdt->fd; |
424 | new_fds = new_fdt->fd; |
425 | |
426 | /* |
427 | * We may be racing against fd allocation from other threads using this |
428 | * files_struct, despite holding ->file_lock. |
429 | * |
430 | * alloc_fd() might have already claimed a slot, while fd_install() |
431 | * did not populate it yet. Note the latter operates locklessly, so |
432 | * the file can show up as we are walking the array below. |
433 | * |
434 | * At the same time we know no files will disappear as all other |
435 | * operations take the lock. |
436 | * |
437 | * Instead of trying to placate userspace racing with itself, we |
438 | * ref the file if we see it and mark the fd slot as unused otherwise. |
439 | */ |
440 | for (i = open_files; i != 0; i--) { |
441 | struct file *f = rcu_dereference_raw(*old_fds++); |
442 | if (f) { |
443 | get_file(f); |
444 | } else { |
445 | __clear_open_fd(fd: open_files - i, fdt: new_fdt); |
446 | } |
447 | rcu_assign_pointer(*new_fds++, f); |
448 | } |
449 | spin_unlock(lock: &oldf->file_lock); |
450 | |
451 | /* clear the remainder */ |
452 | memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); |
453 | |
454 | rcu_assign_pointer(newf->fdt, new_fdt); |
455 | |
456 | return newf; |
457 | } |
458 | |
459 | static struct fdtable *close_files(struct files_struct * files) |
460 | { |
461 | /* |
462 | * It is safe to dereference the fd table without RCU or |
463 | * ->file_lock because this is the last reference to the |
464 | * files structure. |
465 | */ |
466 | struct fdtable *fdt = rcu_dereference_raw(files->fdt); |
467 | unsigned int i, j = 0; |
468 | |
469 | for (;;) { |
470 | unsigned long set; |
471 | i = j * BITS_PER_LONG; |
472 | if (i >= fdt->max_fds) |
473 | break; |
474 | set = fdt->open_fds[j++]; |
475 | while (set) { |
476 | if (set & 1) { |
477 | struct file *file = fdt->fd[i]; |
478 | if (file) { |
479 | filp_close(file, id: files); |
480 | cond_resched(); |
481 | } |
482 | } |
483 | i++; |
484 | set >>= 1; |
485 | } |
486 | } |
487 | |
488 | return fdt; |
489 | } |
490 | |
491 | void put_files_struct(struct files_struct *files) |
492 | { |
493 | if (atomic_dec_and_test(v: &files->count)) { |
494 | struct fdtable *fdt = close_files(files); |
495 | |
496 | /* free the arrays if they are not embedded */ |
497 | if (fdt != &files->fdtab) |
498 | __free_fdtable(fdt); |
499 | kmem_cache_free(s: files_cachep, objp: files); |
500 | } |
501 | } |
502 | |
503 | void exit_files(struct task_struct *tsk) |
504 | { |
505 | struct files_struct * files = tsk->files; |
506 | |
507 | if (files) { |
508 | task_lock(p: tsk); |
509 | tsk->files = NULL; |
510 | task_unlock(p: tsk); |
511 | put_files_struct(files); |
512 | } |
513 | } |
514 | |
515 | struct files_struct init_files = { |
516 | .count = ATOMIC_INIT(1), |
517 | .fdt = &init_files.fdtab, |
518 | .fdtab = { |
519 | .max_fds = NR_OPEN_DEFAULT, |
520 | .fd = &init_files.fd_array[0], |
521 | .close_on_exec = init_files.close_on_exec_init, |
522 | .open_fds = init_files.open_fds_init, |
523 | .full_fds_bits = init_files.full_fds_bits_init, |
524 | }, |
525 | .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), |
526 | .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), |
527 | }; |
528 | |
529 | static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) |
530 | { |
531 | unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ |
532 | unsigned int maxbit = maxfd / BITS_PER_LONG; |
533 | unsigned int bitbit = start / BITS_PER_LONG; |
534 | unsigned int bit; |
535 | |
536 | /* |
537 | * Try to avoid looking at the second level bitmap |
538 | */ |
539 | bit = find_next_zero_bit(addr: &fdt->open_fds[bitbit], BITS_PER_LONG, |
540 | offset: start & (BITS_PER_LONG - 1)); |
541 | if (bit < BITS_PER_LONG) |
542 | return bit + bitbit * BITS_PER_LONG; |
543 | |
544 | bitbit = find_next_zero_bit(addr: fdt->full_fds_bits, size: maxbit, offset: bitbit) * BITS_PER_LONG; |
545 | if (bitbit >= maxfd) |
546 | return maxfd; |
547 | if (bitbit > start) |
548 | start = bitbit; |
549 | return find_next_zero_bit(addr: fdt->open_fds, size: maxfd, offset: start); |
550 | } |
551 | |
552 | /* |
553 | * allocate a file descriptor, mark it busy. |
554 | */ |
555 | static int alloc_fd(unsigned start, unsigned end, unsigned flags) |
556 | { |
557 | struct files_struct *files = current->files; |
558 | unsigned int fd; |
559 | int error; |
560 | struct fdtable *fdt; |
561 | |
562 | spin_lock(lock: &files->file_lock); |
563 | repeat: |
564 | fdt = files_fdtable(files); |
565 | fd = start; |
566 | if (fd < files->next_fd) |
567 | fd = files->next_fd; |
568 | |
569 | if (likely(fd < fdt->max_fds)) |
570 | fd = find_next_fd(fdt, start: fd); |
571 | |
572 | /* |
573 | * N.B. For clone tasks sharing a files structure, this test |
574 | * will limit the total number of files that can be opened. |
575 | */ |
576 | error = -EMFILE; |
577 | if (unlikely(fd >= end)) |
578 | goto out; |
579 | |
580 | if (unlikely(fd >= fdt->max_fds)) { |
581 | error = expand_files(files, nr: fd); |
582 | if (error < 0) |
583 | goto out; |
584 | |
585 | goto repeat; |
586 | } |
587 | |
588 | if (start <= files->next_fd) |
589 | files->next_fd = fd + 1; |
590 | |
591 | __set_open_fd(fd, fdt, set: flags & O_CLOEXEC); |
592 | error = fd; |
593 | VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); |
594 | |
595 | out: |
596 | spin_unlock(lock: &files->file_lock); |
597 | return error; |
598 | } |
599 | |
600 | int __get_unused_fd_flags(unsigned flags, unsigned long nofile) |
601 | { |
602 | return alloc_fd(start: 0, end: nofile, flags); |
603 | } |
604 | |
605 | int get_unused_fd_flags(unsigned flags) |
606 | { |
607 | return __get_unused_fd_flags(flags, nofile: rlimit(RLIMIT_NOFILE)); |
608 | } |
609 | EXPORT_SYMBOL(get_unused_fd_flags); |
610 | |
611 | static void __put_unused_fd(struct files_struct *files, unsigned int fd) |
612 | { |
613 | struct fdtable *fdt = files_fdtable(files); |
614 | __clear_open_fd(fd, fdt); |
615 | if (fd < files->next_fd) |
616 | files->next_fd = fd; |
617 | } |
618 | |
619 | void put_unused_fd(unsigned int fd) |
620 | { |
621 | struct files_struct *files = current->files; |
622 | spin_lock(lock: &files->file_lock); |
623 | __put_unused_fd(files, fd); |
624 | spin_unlock(lock: &files->file_lock); |
625 | } |
626 | |
627 | EXPORT_SYMBOL(put_unused_fd); |
628 | |
629 | /** |
630 | * fd_install - install a file pointer in the fd array |
631 | * @fd: file descriptor to install the file in |
632 | * @file: the file to install |
633 | * |
634 | * This consumes the "file" refcount, so callers should treat it |
635 | * as if they had called fput(file). |
636 | */ |
637 | void fd_install(unsigned int fd, struct file *file) |
638 | { |
639 | struct files_struct *files = current->files; |
640 | struct fdtable *fdt; |
641 | |
642 | if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) |
643 | return; |
644 | |
645 | rcu_read_lock_sched(); |
646 | |
647 | if (unlikely(files->resize_in_progress)) { |
648 | rcu_read_unlock_sched(); |
649 | spin_lock(lock: &files->file_lock); |
650 | fdt = files_fdtable(files); |
651 | VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); |
652 | rcu_assign_pointer(fdt->fd[fd], file); |
653 | spin_unlock(lock: &files->file_lock); |
654 | return; |
655 | } |
656 | /* coupled with smp_wmb() in expand_fdtable() */ |
657 | smp_rmb(); |
658 | fdt = rcu_dereference_sched(files->fdt); |
659 | VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); |
660 | rcu_assign_pointer(fdt->fd[fd], file); |
661 | rcu_read_unlock_sched(); |
662 | } |
663 | |
664 | EXPORT_SYMBOL(fd_install); |
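Because fd_install() consumes the file reference, callers usually reserve the descriptor first, create the file, and only publish it once nothing can fail anymore. A minimal sketch of that pattern (example_create_fd() and the "[example]" name are placeholders; anon_inode_getfile() is just one common way to obtain a file here):

```c
#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

static int example_create_fd(void *priv, const struct file_operations *fops)
{
	struct file *file;
	int fd;

	/* Reserve a descriptor; on failure nothing needs to be undone. */
	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[example]", fops, priv, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);	/* release the reserved slot */
		return PTR_ERR(file);
	}

	/* Publish: fd_install() consumes the file reference. */
	fd_install(fd, file);
	return fd;
}
```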
665 | |
666 | /** |
667 | * file_close_fd_locked - return file associated with fd |
668 | * @files: file struct to retrieve file from |
669 | * @fd: file descriptor to retrieve file for |
670 | * |
671 | * Doesn't take a separate reference count. |
672 | * |
673 | * Context: files_lock must be held. |
674 | * |
675 | * Returns: The file associated with @fd (NULL if @fd is not open) |
676 | */ |
677 | struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) |
678 | { |
679 | struct fdtable *fdt = files_fdtable(files); |
680 | struct file *file; |
681 | |
682 | lockdep_assert_held(&files->file_lock); |
683 | |
684 | if (fd >= fdt->max_fds) |
685 | return NULL; |
686 | |
687 | fd = array_index_nospec(fd, fdt->max_fds); |
688 | file = rcu_dereference_raw(fdt->fd[fd]); |
689 | if (file) { |
690 | rcu_assign_pointer(fdt->fd[fd], NULL); |
691 | __put_unused_fd(files, fd); |
692 | } |
693 | return file; |
694 | } |
695 | |
696 | int close_fd(unsigned fd) |
697 | { |
698 | struct files_struct *files = current->files; |
699 | struct file *file; |
700 | |
701 | spin_lock(lock: &files->file_lock); |
702 | file = file_close_fd_locked(files, fd); |
703 | spin_unlock(lock: &files->file_lock); |
704 | if (!file) |
705 | return -EBADF; |
706 | |
707 | return filp_close(file, id: files); |
708 | } |
709 | EXPORT_SYMBOL(close_fd); |
710 | |
711 | /** |
712 | * last_fd - return last valid index into fd table |
713 | * @fdt: File descriptor table. |
714 | * |
715 | * Context: Either rcu read lock or files_lock must be held. |
716 | * |
717 | * Returns: Last valid index into fdtable. |
718 | */ |
719 | static inline unsigned last_fd(struct fdtable *fdt) |
720 | { |
721 | return fdt->max_fds - 1; |
722 | } |
723 | |
724 | static inline void __range_cloexec(struct files_struct *cur_fds, |
725 | unsigned int fd, unsigned int max_fd) |
726 | { |
727 | struct fdtable *fdt; |
728 | |
729 | /* make sure we're using the correct maximum value */ |
730 | spin_lock(lock: &cur_fds->file_lock); |
731 | fdt = files_fdtable(cur_fds); |
732 | max_fd = min(last_fd(fdt), max_fd); |
733 | if (fd <= max_fd) |
734 | bitmap_set(map: fdt->close_on_exec, start: fd, nbits: max_fd - fd + 1); |
735 | spin_unlock(lock: &cur_fds->file_lock); |
736 | } |
737 | |
738 | static inline void __range_close(struct files_struct *files, unsigned int fd, |
739 | unsigned int max_fd) |
740 | { |
741 | struct file *file; |
742 | unsigned n; |
743 | |
744 | spin_lock(lock: &files->file_lock); |
745 | n = last_fd(files_fdtable(files)); |
746 | max_fd = min(max_fd, n); |
747 | |
748 | for (; fd <= max_fd; fd++) { |
749 | file = file_close_fd_locked(files, fd); |
750 | if (file) { |
751 | spin_unlock(lock: &files->file_lock); |
752 | filp_close(file, id: files); |
753 | cond_resched(); |
754 | spin_lock(lock: &files->file_lock); |
755 | } else if (need_resched()) { |
756 | spin_unlock(lock: &files->file_lock); |
757 | cond_resched(); |
758 | spin_lock(lock: &files->file_lock); |
759 | } |
760 | } |
761 | spin_unlock(lock: &files->file_lock); |
762 | } |
763 | |
764 | /** |
765 | * sys_close_range() - Close all file descriptors in a given range. |
766 | * |
767 | * @fd: starting file descriptor to close |
768 | * @max_fd: last file descriptor to close |
769 | * @flags: CLOSE_RANGE flags. |
770 | * |
771 | * This closes a range of file descriptors. All file descriptors |
772 | * from @fd up to and including @max_fd are closed. |
773 | * Currently, errors from closing a given file descriptor are ignored. |
774 | */ |
775 | SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, |
776 | unsigned int, flags) |
777 | { |
778 | struct task_struct *me = current; |
779 | struct files_struct *cur_fds = me->files, *fds = NULL; |
780 | |
781 | if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) |
782 | return -EINVAL; |
783 | |
784 | if (fd > max_fd) |
785 | return -EINVAL; |
786 | |
787 | if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(v: &cur_fds->count) > 1) { |
788 | struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ |
789 | |
790 | /* |
791 | * If the caller requested all fds to be made cloexec we always |
792 | * copy all of the file descriptors since they still want to |
793 | * use them. |
794 | */ |
795 | if (flags & CLOSE_RANGE_CLOEXEC) |
796 | punch_hole = NULL; |
797 | |
798 | fds = dup_fd(oldf: cur_fds, punch_hole); |
799 | if (IS_ERR(ptr: fds)) |
800 | return PTR_ERR(ptr: fds); |
801 | /* |
802 | * We used to share our file descriptor table, and have now |
803 | * created a private one, make sure we're using it below. |
804 | */ |
805 | swap(cur_fds, fds); |
806 | } |
807 | |
808 | if (flags & CLOSE_RANGE_CLOEXEC) |
809 | __range_cloexec(cur_fds, fd, max_fd); |
810 | else |
811 | __range_close(files: cur_fds, fd, max_fd); |
812 | |
813 | if (fds) { |
814 | /* |
815 | * We're done closing the files we were supposed to. Time to install |
816 | * the new file descriptor table and drop the old one. |
817 | */ |
818 | task_lock(p: me); |
819 | me->files = cur_fds; |
820 | task_unlock(p: me); |
821 | put_files_struct(files: fds); |
822 | } |
823 | |
824 | return 0; |
825 | } |
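For reference, a hedged userspace sketch of how this syscall is typically invoked (assuming headers new enough to define SYS_close_range; glibc >= 2.34 also provides a close_range() wrapper):

```c
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Close every descriptor from 3 upwards, e.g. before exec'ing a
	 * helper; stdin/stdout/stderr (0-2) are left alone. Passing
	 * CLOSE_RANGE_CLOEXEC instead of 0 would mark them close-on-exec
	 * rather than closing them now.
	 */
	if (syscall(SYS_close_range, 3, ~0U, 0) < 0)
		return 1;
	return 0;
}
```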
826 | |
827 | /** |
828 | * file_close_fd - return file associated with fd |
829 | * @fd: file descriptor to retrieve file for |
830 | * |
831 | * Doesn't take a separate reference count. |
832 | * |
833 | * Returns: The file associated with @fd (NULL if @fd is not open) |
834 | */ |
835 | struct file *file_close_fd(unsigned int fd) |
836 | { |
837 | struct files_struct *files = current->files; |
838 | struct file *file; |
839 | |
840 | spin_lock(lock: &files->file_lock); |
841 | file = file_close_fd_locked(files, fd); |
842 | spin_unlock(lock: &files->file_lock); |
843 | |
844 | return file; |
845 | } |
846 | |
847 | void do_close_on_exec(struct files_struct *files) |
848 | { |
849 | unsigned i; |
850 | struct fdtable *fdt; |
851 | |
852 | /* exec unshares first */ |
853 | spin_lock(lock: &files->file_lock); |
854 | for (i = 0; ; i++) { |
855 | unsigned long set; |
856 | unsigned fd = i * BITS_PER_LONG; |
857 | fdt = files_fdtable(files); |
858 | if (fd >= fdt->max_fds) |
859 | break; |
860 | set = fdt->close_on_exec[i]; |
861 | if (!set) |
862 | continue; |
863 | fdt->close_on_exec[i] = 0; |
864 | for ( ; set ; fd++, set >>= 1) { |
865 | struct file *file; |
866 | if (!(set & 1)) |
867 | continue; |
868 | file = fdt->fd[fd]; |
869 | if (!file) |
870 | continue; |
871 | rcu_assign_pointer(fdt->fd[fd], NULL); |
872 | __put_unused_fd(files, fd); |
873 | spin_unlock(lock: &files->file_lock); |
874 | filp_close(file, id: files); |
875 | cond_resched(); |
876 | spin_lock(lock: &files->file_lock); |
877 | } |
878 | |
879 | } |
880 | spin_unlock(lock: &files->file_lock); |
881 | } |
882 | |
883 | static struct file *__get_file_rcu(struct file __rcu **f) |
884 | { |
885 | struct file __rcu *file; |
886 | struct file __rcu *file_reloaded; |
887 | struct file __rcu *file_reloaded_cmp; |
888 | |
889 | file = rcu_dereference_raw(*f); |
890 | if (!file) |
891 | return NULL; |
892 | |
893 | if (unlikely(!file_ref_get(&file->f_ref))) |
894 | return ERR_PTR(error: -EAGAIN); |
895 | |
896 | file_reloaded = rcu_dereference_raw(*f); |
897 | |
898 | /* |
899 | * Ensure that all accesses have a dependency on the load from |
900 | * rcu_dereference_raw() above so we get correct ordering |
901 | * between reuse/allocation and the pointer check below. |
902 | */ |
903 | file_reloaded_cmp = file_reloaded; |
904 | OPTIMIZER_HIDE_VAR(file_reloaded_cmp); |
905 | |
906 | /* |
907 | * file_ref_get() above provided a full memory barrier when we |
908 | * acquired a reference. |
909 | * |
910 | * This is paired with the write barrier from assigning to the |
911 | * __rcu protected file pointer so that if that pointer still |
912 | * matches the current file, we know we have successfully |
913 | * acquired a reference to the right file. |
914 | * |
915 | * If the pointers don't match the file has been reallocated by |
916 | * SLAB_TYPESAFE_BY_RCU. |
917 | */ |
918 | if (file == file_reloaded_cmp) |
919 | return file_reloaded; |
920 | |
921 | fput(file); |
922 | return ERR_PTR(error: -EAGAIN); |
923 | } |
924 | |
925 | /** |
926 | * get_file_rcu - try to get a reference to a file under rcu |
927 | * @f: the file to get a reference on |
928 | * |
929 | * This function tries to get a reference on @f carefully verifying that |
930 | * @f hasn't been reused. |
931 | * |
932 | * This function should rarely have to be used and only by users who |
933 | * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. |
934 | * |
935 | * Return: Returns @f with the reference count increased or NULL. |
936 | */ |
937 | struct file *get_file_rcu(struct file __rcu **f) |
938 | { |
939 | for (;;) { |
940 | struct file __rcu *file; |
941 | |
942 | file = __get_file_rcu(f); |
943 | if (!IS_ERR(ptr: file)) |
944 | return file; |
945 | } |
946 | } |
947 | EXPORT_SYMBOL_GPL(get_file_rcu); |
948 | |
949 | /** |
950 | * get_file_active - try to get a reference to a file |
951 | * @f: the file to get a reference on |
952 | * |
953 | * In contrast to get_file_rcu(), the pointer itself isn't part of the |
954 | * reference counting. |
955 | * |
956 | * This function should rarely have to be used and only by users who |
957 | * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. |
958 | * |
959 | * Return: Returns @f with the reference count increased or NULL. |
960 | */ |
961 | struct file *get_file_active(struct file **f) |
962 | { |
963 | struct file __rcu *file; |
964 | |
965 | rcu_read_lock(); |
966 | file = __get_file_rcu(f); |
967 | rcu_read_unlock(); |
968 | if (IS_ERR(ptr: file)) |
969 | file = NULL; |
970 | return file; |
971 | } |
972 | EXPORT_SYMBOL_GPL(get_file_active); |
973 | |
974 | static inline struct file *__fget_files_rcu(struct files_struct *files, |
975 | unsigned int fd, fmode_t mask) |
976 | { |
977 | for (;;) { |
978 | struct file *file; |
979 | struct fdtable *fdt = rcu_dereference_raw(files->fdt); |
980 | struct file __rcu **fdentry; |
981 | unsigned long nospec_mask; |
982 | |
983 | /* Mask is a 0 for invalid fd's, ~0 for valid ones */ |
984 | nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); |
985 | |
986 | /* |
987 | * fdentry points to the 'fd' offset, or fdt->fd[0]. |
988 | * Loading from fdt->fd[0] is always safe, because the |
989 | * array always exists. |
990 | */ |
991 | fdentry = fdt->fd + (fd & nospec_mask); |
992 | |
993 | /* Do the load, then mask any invalid result */ |
994 | file = rcu_dereference_raw(*fdentry); |
995 | file = (void *)(nospec_mask & (unsigned long)file); |
996 | if (unlikely(!file)) |
997 | return NULL; |
998 | |
999 | /* |
1000 | * Ok, we have a file pointer that was valid at |
1001 | * some point, but it might have become stale since. |
1002 | * |
1003 | * We need to confirm it by incrementing the refcount |
1004 | * and then check the lookup again. |
1005 | * |
1006 | * file_ref_get() gives us a full memory barrier. We |
1007 | * only really need an 'acquire' one to protect the |
1008 | * loads below, but we don't have that. |
1009 | */ |
1010 | if (unlikely(!file_ref_get(&file->f_ref))) |
1011 | continue; |
1012 | |
1013 | /* |
1014 | * Such a race can take two forms: |
1015 | * |
1016 | * (a) the file ref already went down to zero and the |
1017 | * file hasn't been reused yet or the file count |
1018 | * isn't zero but the file has already been reused. |
1019 | * |
1020 | * (b) the file table entry has changed under us. |
1021 | * Note that we don't need to re-check the 'fdt->fd' |
1022 | * pointer having changed, because it always goes |
1023 | * hand-in-hand with 'fdt'. |
1024 | * |
1025 | * If so, we need to put our ref and try again. |
1026 | */ |
1027 | if (unlikely(file != rcu_dereference_raw(*fdentry)) || |
1028 | unlikely(rcu_dereference_raw(files->fdt) != fdt)) { |
1029 | fput(file); |
1030 | continue; |
1031 | } |
1032 | |
1033 | /* |
1034 | * This isn't the file we're looking for or we're not |
1035 | * allowed to get a reference to it. |
1036 | */ |
1037 | if (unlikely(file->f_mode & mask)) { |
1038 | fput(file); |
1039 | return NULL; |
1040 | } |
1041 | |
1042 | /* |
1043 | * Ok, we have a ref to the file, and checked that it |
1044 | * still exists. |
1045 | */ |
1046 | return file; |
1047 | } |
1048 | } |
1049 | |
1050 | static struct file *__fget_files(struct files_struct *files, unsigned int fd, |
1051 | fmode_t mask) |
1052 | { |
1053 | struct file *file; |
1054 | |
1055 | rcu_read_lock(); |
1056 | file = __fget_files_rcu(files, fd, mask); |
1057 | rcu_read_unlock(); |
1058 | |
1059 | return file; |
1060 | } |
1061 | |
1062 | static inline struct file *__fget(unsigned int fd, fmode_t mask) |
1063 | { |
1064 | return __fget_files(current->files, fd, mask); |
1065 | } |
1066 | |
1067 | struct file *fget(unsigned int fd) |
1068 | { |
1069 | return __fget(fd, FMODE_PATH); |
1070 | } |
1071 | EXPORT_SYMBOL(fget); |
1072 | |
1073 | struct file *fget_raw(unsigned int fd) |
1074 | { |
1075 | return __fget(fd, mask: 0); |
1076 | } |
1077 | EXPORT_SYMBOL(fget_raw); |
1078 | |
1079 | struct file *fget_task(struct task_struct *task, unsigned int fd) |
1080 | { |
1081 | struct file *file = NULL; |
1082 | |
1083 | task_lock(p: task); |
1084 | if (task->files) |
1085 | file = __fget_files(files: task->files, fd, mask: 0); |
1086 | task_unlock(p: task); |
1087 | |
1088 | return file; |
1089 | } |
1090 | |
1091 | struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) |
1092 | { |
1093 | /* Must be called with rcu_read_lock held */ |
1094 | struct files_struct *files; |
1095 | unsigned int fd = *ret_fd; |
1096 | struct file *file = NULL; |
1097 | |
1098 | task_lock(p: task); |
1099 | files = task->files; |
1100 | if (files) { |
1101 | rcu_read_lock(); |
1102 | for (; fd < files_fdtable(files)->max_fds; fd++) { |
1103 | file = __fget_files_rcu(files, fd, mask: 0); |
1104 | if (file) |
1105 | break; |
1106 | } |
1107 | rcu_read_unlock(); |
1108 | } |
1109 | task_unlock(p: task); |
1110 | *ret_fd = fd; |
1111 | return file; |
1112 | } |
1113 | EXPORT_SYMBOL(fget_task_next); |
1114 | |
1115 | /* |
1116 | * Lightweight file lookup - no refcnt increment if fd table isn't shared. |
1117 | * |
1118 | * You can use this instead of fget if you satisfy all of the following |
1119 | * conditions: |
1120 | * 1) You must call fput_light before exiting the syscall and returning control |
1121 | * to userspace (i.e. you cannot remember the returned struct file * after |
1122 | * returning to userspace). |
1123 | * 2) You must not call filp_close on the returned struct file * in between |
1124 | * calls to fget_light and fput_light. |
1125 | * 3) You must not clone the current task in between the calls to fget_light |
1126 | * and fput_light. |
1127 | * |
1128 | * The fput_needed flag returned by fget_light should be passed to the |
1129 | * corresponding fput_light. |
1130 | * |
1131 | * (As an exception to rule 2, you can call filp_close between fget_light and |
1132 | * fput_light provided that you capture a real refcount with get_file before |
1133 | * the call to filp_close, and ensure that this real refcount is fput *after* |
1134 | * the fput_light call.) |
1135 | * |
1136 | * See also the documentation in rust/kernel/file.rs. |
1137 | */ |
1138 | static inline struct fd __fget_light(unsigned int fd, fmode_t mask) |
1139 | { |
1140 | struct files_struct *files = current->files; |
1141 | struct file *file; |
1142 | |
1143 | /* |
1144 | * If another thread is concurrently calling close_fd() followed |
1145 | * by put_files_struct(), we must not observe the old table |
1146 | * entry combined with the new refcount - otherwise we could |
1147 | * return a file that is concurrently being freed. |
1148 | * |
1149 | * atomic_read_acquire() pairs with atomic_dec_and_test() in |
1150 | * put_files_struct(). |
1151 | */ |
1152 | if (likely(atomic_read_acquire(&files->count) == 1)) { |
1153 | file = files_lookup_fd_raw(files, fd); |
1154 | if (!file || unlikely(file->f_mode & mask)) |
1155 | return EMPTY_FD; |
1156 | return BORROWED_FD(file); |
1157 | } else { |
1158 | file = __fget_files(files, fd, mask); |
1159 | if (!file) |
1160 | return EMPTY_FD; |
1161 | return CLONED_FD(file); |
1162 | } |
1163 | } |
1164 | struct fd fdget(unsigned int fd) |
1165 | { |
1166 | return __fget_light(fd, FMODE_PATH); |
1167 | } |
1168 | EXPORT_SYMBOL(fdget); |
1169 | |
1170 | struct fd fdget_raw(unsigned int fd) |
1171 | { |
1172 | return __fget_light(fd, mask: 0); |
1173 | } |
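A minimal sketch of how a syscall-style helper consumes such a borrowed reference (example_op_on_fd() is a placeholder and vfs_fsync() stands in for "some operation on the file"; fd_empty(), fd_file() and fdput() are the struct fd accessors used in this tree):

```c
#include <linux/file.h>
#include <linux/fs.h>

static long example_op_on_fd(unsigned int fd)
{
	struct fd f = fdget(fd);
	long ret;

	if (fd_empty(f))
		return -EBADF;

	/* Use the file only within this window; do not stash the pointer. */
	ret = vfs_fsync(fd_file(f), 0);

	fdput(f);
	return ret;
}
```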
1174 | |
1175 | /* |
1176 | * Try to avoid f_pos locking. We only need it if the |
1177 | * file is marked for FMODE_ATOMIC_POS, and it can be |
1178 | * accessed multiple ways. |
1179 | * |
1180 | * Always do it for directories, because pidfd_getfd() |
1181 | * can make a file accessible even if it otherwise would |
1182 | * not be, and for directories this is a correctness |
1183 | * issue, not a "POSIX requirement". |
1184 | */ |
1185 | static inline bool file_needs_f_pos_lock(struct file *file) |
1186 | { |
1187 | if (!(file->f_mode & FMODE_ATOMIC_POS)) |
1188 | return false; |
1189 | if (__file_ref_read_raw(ref: &file->f_ref) != FILE_REF_ONEREF) |
1190 | return true; |
1191 | if (file->f_op->iterate_shared) |
1192 | return true; |
1193 | return false; |
1194 | } |
1195 | |
1196 | bool file_seek_cur_needs_f_lock(struct file *file) |
1197 | { |
1198 | if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) |
1199 | return false; |
1200 | |
1201 | VFS_WARN_ON_ONCE((file_count(file) > 1) && |
1202 | !mutex_is_locked(&file->f_pos_lock)); |
1203 | return true; |
1204 | } |
1205 | |
1206 | struct fd fdget_pos(unsigned int fd) |
1207 | { |
1208 | struct fd f = fdget(fd); |
1209 | struct file *file = fd_file(f); |
1210 | |
1211 | if (likely(file) && file_needs_f_pos_lock(file)) { |
1212 | f.word |= FDPUT_POS_UNLOCK; |
1213 | mutex_lock(&file->f_pos_lock); |
1214 | } |
1215 | return f; |
1216 | } |
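The f_pos variant pairs with fdput_pos(), which drops the lock taken above when FDPUT_POS_UNLOCK was set. A minimal sketch of the lseek-style pattern (example_llseek() is a placeholder, not kernel API):

```c
#include <linux/file.h>
#include <linux/fs.h>

static loff_t example_llseek(unsigned int fd, loff_t offset, int whence)
{
	struct fd f = fdget_pos(fd);
	loff_t ret = -EBADF;

	if (fd_empty(f))
		return ret;

	/* f->f_pos is stable here if file_needs_f_pos_lock() said so. */
	ret = vfs_llseek(fd_file(f), offset, whence);

	fdput_pos(f);
	return ret;
}
```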
1217 | |
1218 | void __f_unlock_pos(struct file *f) |
1219 | { |
1220 | mutex_unlock(lock: &f->f_pos_lock); |
1221 | } |
1222 | |
1223 | /* |
1224 | * We only lock f_pos if we have threads or if the file might be |
1225 | * shared with another process. In both cases we'll have an elevated |
1226 | * file count (done either by fdget() or by fork()). |
1227 | */ |
1228 | |
1229 | void set_close_on_exec(unsigned int fd, int flag) |
1230 | { |
1231 | struct files_struct *files = current->files; |
1232 | spin_lock(lock: &files->file_lock); |
1233 | __set_close_on_exec(fd, files_fdtable(files), set: flag); |
1234 | spin_unlock(lock: &files->file_lock); |
1235 | } |
1236 | |
1237 | bool get_close_on_exec(unsigned int fd) |
1238 | { |
1239 | bool res; |
1240 | rcu_read_lock(); |
1241 | res = close_on_exec(fd, current->files); |
1242 | rcu_read_unlock(); |
1243 | return res; |
1244 | } |
1245 | |
1246 | static int do_dup2(struct files_struct *files, |
1247 | struct file *file, unsigned fd, unsigned flags) |
1248 | __releases(&files->file_lock) |
1249 | { |
1250 | struct file *tofree; |
1251 | struct fdtable *fdt; |
1252 | |
1253 | /* |
1254 | * dup2() is expected to close the file installed in the target fd slot |
1255 | * (if any). However, userspace hand-picking a fd may be racing against |
1256 | * its own threads which happened to allocate it in open() et al but did |
1257 | * not populate it yet. |
1258 | * |
1259 | * Broadly speaking we may be racing against the following: |
1260 | * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL |
1261 | * file = hard_work_goes_here(); |
1262 | * fd_install(fd, file); // only now ->fd[fd] == file |
1263 | * |
1264 | * It is an invariant that a successfully allocated fd has a NULL entry |
1265 | * in the array until the matching fd_install(). |
1266 | * |
1267 | * If we fit the window, we have the fd to populate, yet no target file |
1268 | * to close. Trying to ignore it and install our new file would violate |
1269 | * the invariant and make fd_install() overwrite our file. |
1270 | * |
1271 | * Things can be done(tm) to handle this. However, the issue does not |
1272 | * concern legitimate programs and we only need to make sure the kernel |
1273 | * does not trip over it. |
1274 | * |
1275 | * The simplest way out is to return an error if we find ourselves here. |
1276 | * |
1277 | * POSIX is silent on the issue, so we return -EBUSY. |
1278 | */ |
1279 | fdt = files_fdtable(files); |
1280 | fd = array_index_nospec(fd, fdt->max_fds); |
1281 | tofree = rcu_dereference_raw(fdt->fd[fd]); |
1282 | if (!tofree && fd_is_open(fd, fdt)) |
1283 | goto Ebusy; |
1284 | get_file(f: file); |
1285 | rcu_assign_pointer(fdt->fd[fd], file); |
1286 | __set_open_fd(fd, fdt, set: flags & O_CLOEXEC); |
1287 | spin_unlock(lock: &files->file_lock); |
1288 | |
1289 | if (tofree) |
1290 | filp_close(tofree, id: files); |
1291 | |
1292 | return fd; |
1293 | |
1294 | Ebusy: |
1295 | spin_unlock(lock: &files->file_lock); |
1296 | return -EBUSY; |
1297 | } |
1298 | |
1299 | int replace_fd(unsigned fd, struct file *file, unsigned flags) |
1300 | { |
1301 | int err; |
1302 | struct files_struct *files = current->files; |
1303 | |
1304 | if (!file) |
1305 | return close_fd(fd); |
1306 | |
1307 | if (fd >= rlimit(RLIMIT_NOFILE)) |
1308 | return -EBADF; |
1309 | |
1310 | spin_lock(lock: &files->file_lock); |
1311 | err = expand_files(files, nr: fd); |
1312 | if (unlikely(err < 0)) |
1313 | goto out_unlock; |
1314 | return do_dup2(files, file, fd, flags); |
1315 | |
1316 | out_unlock: |
1317 | spin_unlock(lock: &files->file_lock); |
1318 | return err; |
1319 | } |
1320 | |
1321 | /** |
1322 | * receive_fd() - Install received file into file descriptor table |
1323 | * @file: struct file that was received from another process |
1324 | * @ufd: __user pointer to write new fd number to |
1325 | * @o_flags: the O_* flags to apply to the new fd entry |
1326 | * |
1327 | * Installs a received file into the file descriptor table, with appropriate |
1328 | * checks and count updates. Optionally writes the fd number to userspace, if |
1329 | * @ufd is non-NULL. |
1330 | * |
1331 | * This helper handles its own reference counting of the incoming |
1332 | * struct file. |
1333 | * |
1334 | * Returns newly install fd or -ve on error. |
1335 | */ |
1336 | int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) |
1337 | { |
1338 | int new_fd; |
1339 | int error; |
1340 | |
1341 | error = security_file_receive(file); |
1342 | if (error) |
1343 | return error; |
1344 | |
1345 | new_fd = get_unused_fd_flags(o_flags); |
1346 | if (new_fd < 0) |
1347 | return new_fd; |
1348 | |
1349 | if (ufd) { |
1350 | error = put_user(new_fd, ufd); |
1351 | if (error) { |
1352 | put_unused_fd(new_fd); |
1353 | return error; |
1354 | } |
1355 | } |
1356 | |
1357 | fd_install(new_fd, get_file(f: file)); |
1358 | __receive_sock(file); |
1359 | return new_fd; |
1360 | } |
1361 | EXPORT_SYMBOL_GPL(receive_fd); |
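A hedged sketch of a receiver-side caller handing a file into the current task's descriptor table (example_deliver_file() is a placeholder; the surrounding channel that produced 'file' is assumed):

```c
#include <linux/fcntl.h>
#include <linux/file.h>

/*
 * 'file' is a struct file the caller holds a reference on (for instance
 * obtained via fget() on the sending side). receive_fd() takes its own
 * reference, so the caller's reference remains the caller's to drop.
 */
static int example_deliver_file(struct file *file)
{
	int fd;

	fd = receive_fd(file, NULL, O_CLOEXEC);
	if (fd < 0)
		return fd;	/* LSM veto, fd table full, ... */

	/* fd now refers to 'file' in the current task's fd table. */
	return fd;
}
```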
1362 | |
1363 | int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) |
1364 | { |
1365 | int error; |
1366 | |
1367 | error = security_file_receive(file); |
1368 | if (error) |
1369 | return error; |
1370 | error = replace_fd(fd: new_fd, file, flags: o_flags); |
1371 | if (error) |
1372 | return error; |
1373 | __receive_sock(file); |
1374 | return new_fd; |
1375 | } |
1376 | |
1377 | static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) |
1378 | { |
1379 | int err = -EBADF; |
1380 | struct file *file; |
1381 | struct files_struct *files = current->files; |
1382 | |
1383 | if ((flags & ~O_CLOEXEC) != 0) |
1384 | return -EINVAL; |
1385 | |
1386 | if (unlikely(oldfd == newfd)) |
1387 | return -EINVAL; |
1388 | |
1389 | if (newfd >= rlimit(RLIMIT_NOFILE)) |
1390 | return -EBADF; |
1391 | |
1392 | spin_lock(lock: &files->file_lock); |
1393 | err = expand_files(files, nr: newfd); |
1394 | file = files_lookup_fd_locked(files, fd: oldfd); |
1395 | if (unlikely(!file)) |
1396 | goto Ebadf; |
1397 | if (unlikely(err < 0)) { |
1398 | if (err == -EMFILE) |
1399 | goto Ebadf; |
1400 | goto out_unlock; |
1401 | } |
1402 | return do_dup2(files, file, fd: newfd, flags); |
1403 | |
1404 | Ebadf: |
1405 | err = -EBADF; |
1406 | out_unlock: |
1407 | spin_unlock(lock: &files->file_lock); |
1408 | return err; |
1409 | } |
1410 | |
1411 | SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) |
1412 | { |
1413 | return ksys_dup3(oldfd, newfd, flags); |
1414 | } |
1415 | |
1416 | SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) |
1417 | { |
1418 | if (unlikely(newfd == oldfd)) { /* corner case */ |
1419 | struct files_struct *files = current->files; |
1420 | struct file *f; |
1421 | int retval = oldfd; |
1422 | |
1423 | rcu_read_lock(); |
1424 | f = __fget_files_rcu(files, fd: oldfd, mask: 0); |
1425 | if (!f) |
1426 | retval = -EBADF; |
1427 | rcu_read_unlock(); |
1428 | if (f) |
1429 | fput(f); |
1430 | return retval; |
1431 | } |
1432 | return ksys_dup3(oldfd, newfd, flags: 0); |
1433 | } |
1434 | |
1435 | SYSCALL_DEFINE1(dup, unsigned int, fildes) |
1436 | { |
1437 | int ret = -EBADF; |
1438 | struct file *file = fget_raw(fildes); |
1439 | |
1440 | if (file) { |
1441 | ret = get_unused_fd_flags(0); |
1442 | if (ret >= 0) |
1443 | fd_install(ret, file); |
1444 | else |
1445 | fput(file); |
1446 | } |
1447 | return ret; |
1448 | } |
1449 | |
1450 | int f_dupfd(unsigned int from, struct file *file, unsigned flags) |
1451 | { |
1452 | unsigned long nofile = rlimit(RLIMIT_NOFILE); |
1453 | int err; |
1454 | if (from >= nofile) |
1455 | return -EINVAL; |
1456 | err = alloc_fd(start: from, end: nofile, flags); |
1457 | if (err >= 0) { |
1458 | get_file(f: file); |
1459 | fd_install(err, file); |
1460 | } |
1461 | return err; |
1462 | } |
1463 | |
1464 | int iterate_fd(struct files_struct *files, unsigned n, |
1465 | int (*f)(const void *, struct file *, unsigned), |
1466 | const void *p) |
1467 | { |
1468 | struct fdtable *fdt; |
1469 | int res = 0; |
1470 | if (!files) |
1471 | return 0; |
1472 | spin_lock(lock: &files->file_lock); |
1473 | for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { |
1474 | struct file *file; |
1475 | file = rcu_dereference_check_fdtable(files, fdt->fd[n]); |
1476 | if (!file) |
1477 | continue; |
1478 | res = f(p, file, n); |
1479 | if (res) |
1480 | break; |
1481 | } |
1482 | spin_unlock(lock: &files->file_lock); |
1483 | return res; |
1484 | } |
1485 | EXPORT_SYMBOL(iterate_fd); |
1486 |