seccomp.c source code [linux/kernel/seccomp.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/kernel/seccomp.c
4	*
5	* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
6	*
7	* Copyright (C) 2012 Google, Inc.
8	* Will Drewry <wad@chromium.org>
9	*
10	* This defines a simple but solid secure-computing facility.
11	*
12	* Mode 1 uses a fixed list of allowed system calls.
13	* Mode 2 allows user-defined system call filters in the form
14	* of Berkeley Packet Filters/Linux Socket Filters.
15	*/
16	#define pr_fmt(fmt) "seccomp: " fmt
17
18	#include <linux/refcount.h>
19	#include <linux/audit.h>
20	#include <linux/compat.h>
21	#include <linux/coredump.h>
22	#include <linux/kmemleak.h>
23	#include <linux/nospec.h>
24	#include <linux/prctl.h>
25	#include <linux/sched.h>
26	#include <linux/sched/task_stack.h>
27	#include <linux/seccomp.h>
28	#include <linux/slab.h>
29	#include <linux/syscalls.h>
30	#include <linux/sysctl.h>
31
32	#include <asm/syscall.h>
33
/* Not exposed in headers: strictly internal use only. */
#define SECCOMP_MODE_DEAD	(SECCOMP_MODE_FILTER + 1)
36
37	#ifdef CONFIG_SECCOMP_FILTER
38	#include <linux/file.h>
39	#include <linux/filter.h>
40	#include <linux/pid.h>
41	#include <linux/ptrace.h>
42	#include <linux/capability.h>
43	#include <linux/uaccess.h>
44	#include <linux/anon_inodes.h>
45	#include <linux/lockdep.h>
46
/*
 * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
 * wrong direction flag in the ioctl number. This is the broken one,
 * which the kernel needs to keep supporting until all userspaces stop
 * using the wrong command number.
 */
#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
54
/* Lifecycle states of a user notification; see struct seccomp_knotif. */
enum notify_state {
	SECCOMP_NOTIFY_INIT,
	SECCOMP_NOTIFY_SENT,
	SECCOMP_NOTIFY_REPLIED,
};
60
61	struct seccomp_knotif {
62	/ The struct pid of the task whose filter triggered the notification /
63	struct task_struct *task;
64
65	/ The "cookie" for this request; this is unique for this filter. /
66	u64 id;
67
68	/*
69	* The seccomp data. This pointer is valid the entire time this
70	* notification is active, since it comes from __seccomp_filter which
71	* eclipses the entire lifecycle here.
72	*/
73	const struct seccomp_data *data;
74
75	/*
76	* Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
77	* struct seccomp_knotif is created and starts out in INIT. Once the
78	* handler reads the notification off of an FD, it transitions to SENT.
79	* If a signal is received the state transitions back to INIT and
80	* another message is sent. When the userspace handler replies, state
81	* transitions to REPLIED.
82	*/
83	enum notify_state state;
84
85	/ The return values, only valid when in SECCOMP_NOTIFY_REPLIED /
86	int error;
87	long val;
88	u32 flags;
89
90	/*
91	* Signals when this has changed states, such as the listener
92	* dying, a new seccomp addfd message, or changing to REPLIED
93	*/
94	struct completion ready;
95
96	struct list_head list;
97
98	/ outstanding addfd requests /
99	struct list_head addfd;
100	};
101
102	/**
103	* struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
104	*
105	* @file: A reference to the file to install in the other task
106	* @fd: The fd number to install it at. If the fd number is -1, it means the
107	* installing process should allocate the fd as normal.
108	* @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
109	* is allowed.
110	* @ioctl_flags: The flags used for the seccomp_addfd ioctl.
111	* @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd
112	* @ret: The return value of the installing process. It is set to the fd num
113	* upon success (>= 0).
114	* @completion: Indicates that the installing process has completed fd
115	* installation, or gone away (either due to successful
116	* reply, or signal)
117	* @list: list_head for chaining seccomp_kaddfd together.
118	*
119	*/
120	struct seccomp_kaddfd {
121	struct file *file;
122	int fd;
123	unsigned int flags;
124	__u32 ioctl_flags;
125
126	union {
127	bool setfd;
128	/ To only be set on reply /
129	int ret;
130	};
131	struct completion completion;
132	struct list_head list;
133	};
134
135	/**
136	* struct notification - container for seccomp userspace notifications. Since
137	* most seccomp filters will not have notification listeners attached and this
138	* structure is fairly large, we store the notification-specific stuff in a
139	* separate structure.
140	*
141	* @requests: A semaphore that users of this notification can wait on for
142	* changes. Actual reads and writes are still controlled with
143	* filter->notify_lock.
144	* @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
145	* @next_id: The id of the next request.
146	* @notifications: A list of struct seccomp_knotif elements.
147	*/
148
149	struct notification {
150	atomic_t requests;
151	u32 flags;
152	u64 next_id;
153	struct list_head notifications;
154	};
155
156	#ifdef SECCOMP_ARCH_NATIVE
157	/**
158	* struct action_cache - per-filter cache of seccomp actions per
159	* arch/syscall pair
160	*
161	* @allow_native: A bitmap where each bit represents whether the
162	* filter will always allow the syscall, for the
163	* native architecture.
164	* @allow_compat: A bitmap where each bit represents whether the
165	* filter will always allow the syscall, for the
166	* compat architecture.
167	*/
168	struct action_cache {
169	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
170	#ifdef SECCOMP_ARCH_COMPAT
171	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
172	#endif
173	};
174	#else
175	struct action_cache { };
176
177	static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
178	const struct seccomp_data *sd)
179	{
180	return false;
181	}
182
183	static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
184	{
185	}
186	#endif /* SECCOMP_ARCH_NATIVE */
187
188	/**
189	* struct seccomp_filter - container for seccomp BPF programs
190	*
191	* @refs: Reference count to manage the object lifetime.
192	* A filter's reference count is incremented for each directly
193	* attached task, once for the dependent filter, and if
194	* requested for the user notifier. When @refs reaches zero,
195	* the filter can be freed.
196	* @users: A filter's @users count is incremented for each directly
197	* attached task (filter installation, fork(), thread_sync),
198	* and once for the dependent filter (tracked in filter->prev).
199	* When it reaches zero it indicates that no direct or indirect
200	* users of that filter exist. No new tasks can get associated with
201	* this filter after reaching 0. The @users count is always smaller
202	* or equal to @refs. Hence, reaching 0 for @users does not mean
203	* the filter can be freed.
204	* @cache: cache of arch/syscall mappings to actions
205	* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
206	* @wait_killable_recv: Put notifying process in killable state once the
207	* notification is received by the userspace listener.
208	* @prev: points to a previously installed, or inherited, filter
209	* @prog: the BPF program to evaluate
210	* @notif: the struct that holds all notification related information
211	* @notify_lock: A lock for all notification-related accesses.
212	* @wqh: A wait queue for poll if a notifier is in use.
213	*
214	* seccomp_filter objects are organized in a tree linked via the @prev
215	* pointer. For any task, it appears to be a singly-linked list starting
216	* with current->seccomp.filter, the most recently attached or inherited filter.
217	* However, multiple filters may share a @prev node, by way of fork(), which
218	* results in a unidirectional tree existing in memory. This is similar to
219	* how namespaces work.
220	*
221	* seccomp_filter objects should never be modified after being attached
222	* to a task_struct (other than @refs).
223	*/
224	struct seccomp_filter {
225	refcount_t refs;
226	refcount_t users;
227	bool log;
228	bool wait_killable_recv;
229	struct action_cache cache;
230	struct seccomp_filter *prev;
231	struct bpf_prog *prog;
232	struct notification *notif;
233	struct mutex notify_lock;
234	wait_queue_head_t wqh;
235	};
236
/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
239
240	/*
241	* Endianness is explicitly ignored and left for BPF program authors to manage
242	* as per the specific architecture.
243	*/
244	static void populate_seccomp_data(struct seccomp_data *sd)
245	{
246	/*
247	* Instead of using current_pt_reg(), we're already doing the work
248	* to safely fetch "current", so just use "task" everywhere below.
249	*/
250	struct task_struct *task = current;
251	struct pt_regs *regs = task_pt_regs(task);
252	unsigned long args[`6`];
253
254	sd->nr = syscall_get_nr(task, regs);
255	sd->arch = syscall_get_arch(task);
256	syscall_get_arguments(task, regs, args);
257	sd->args[`0`] = args[`0`];
258	sd->args[`1`] = args[`1`];
259	sd->args[`2`] = args[`2`];
260	sd->args[`3`] = args[`3`];
261	sd->args[`4`] = args[`4`];
262	sd->args[`5`] = args[`5`];
263	sd->instruction_pointer = KSTK_EIP(task);
264	}
265
266	/**
267	* seccomp_check_filter - verify seccomp filter code
268	* @filter: filter to verify
269	* @flen: length of filter
270	*
271	* Takes a previously checked filter (by bpf_check_classic) and
272	* redirects all filter code that loads struct sk_buff data
273	* and related data through seccomp_bpf_load. It also
274	* enforces length and alignment checking of those loads.
275	*
276	* Returns 0 if the rule set is legal or -EINVAL if not.
277	*/
278	static int seccomp_check_filter(struct sock_filter filter, unsigned* int flen)
279	{
280	int pc;
281	for (pc = `0`; pc < flen; pc++) {
282	struct sock_filter *ftest = &filter[pc];
283	u16 code = ftest->code;
284	u32 k = ftest->k;
285
286	switch (code) {
287	case BPF_LD \| BPF_W \| BPF_ABS:
288	ftest->code = BPF_LDX \| BPF_W \| BPF_ABS;
289	/ 32-bit aligned and not out of bounds. /
290	if (k >= sizeof(struct seccomp_data) \|\| k & `3`)
291	return -EINVAL;
292	continue;
293	case BPF_LD \| BPF_W \| BPF_LEN:
294	ftest->code = BPF_LD \| BPF_IMM;
295	ftest->k = sizeof(struct seccomp_data);
296	continue;
297	case BPF_LDX \| BPF_W \| BPF_LEN:
298	ftest->code = BPF_LDX \| BPF_IMM;
299	ftest->k = sizeof(struct seccomp_data);
300	continue;
301	/ Explicitly include allowed calls. /
302	case BPF_RET \| BPF_K:
303	case BPF_RET \| BPF_A:
304	case BPF_ALU \| BPF_ADD \| BPF_K:
305	case BPF_ALU \| BPF_ADD \| BPF_X:
306	case BPF_ALU \| BPF_SUB \| BPF_K:
307	case BPF_ALU \| BPF_SUB \| BPF_X:
308	case BPF_ALU \| BPF_MUL \| BPF_K:
309	case BPF_ALU \| BPF_MUL \| BPF_X:
310	case BPF_ALU \| BPF_DIV \| BPF_K:
311	case BPF_ALU \| BPF_DIV \| BPF_X:
312	case BPF_ALU \| BPF_AND \| BPF_K:
313	case BPF_ALU \| BPF_AND \| BPF_X:
314	case BPF_ALU \| BPF_OR \| BPF_K:
315	case BPF_ALU \| BPF_OR \| BPF_X:
316	case BPF_ALU \| BPF_XOR \| BPF_K:
317	case BPF_ALU \| BPF_XOR \| BPF_X:
318	case BPF_ALU \| BPF_LSH \| BPF_K:
319	case BPF_ALU \| BPF_LSH \| BPF_X:
320	case BPF_ALU \| BPF_RSH \| BPF_K:
321	case BPF_ALU \| BPF_RSH \| BPF_X:
322	case BPF_ALU \| BPF_NEG:
323	case BPF_LD \| BPF_IMM:
324	case BPF_LDX \| BPF_IMM:
325	case BPF_MISC \| BPF_TAX:
326	case BPF_MISC \| BPF_TXA:
327	case BPF_LD \| BPF_MEM:
328	case BPF_LDX \| BPF_MEM:
329	case BPF_ST:
330	case BPF_STX:
331	case BPF_JMP \| BPF_JA:
332	case BPF_JMP \| BPF_JEQ \| BPF_K:
333	case BPF_JMP \| BPF_JEQ \| BPF_X:
334	case BPF_JMP \| BPF_JGE \| BPF_K:
335	case BPF_JMP \| BPF_JGE \| BPF_X:
336	case BPF_JMP \| BPF_JGT \| BPF_K:
337	case BPF_JMP \| BPF_JGT \| BPF_X:
338	case BPF_JMP \| BPF_JSET \| BPF_K:
339	case BPF_JMP \| BPF_JSET \| BPF_X:
340	continue;
341	default:
342	return -EINVAL;
343	}
344	}
345	return `0`;
346	}
347
348	#ifdef SECCOMP_ARCH_NATIVE
349	static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
350	size_t bitmap_size,
351	int syscall_nr)
352	{
353	if (unlikely(syscall_nr < `0` \|\| syscall_nr >= bitmap_size))
354	return false;
355	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
356
357	return test_bit(syscall_nr, bitmap);
358	}
359
360	/**
361	* seccomp_cache_check_allow - lookup seccomp cache
362	* @sfilter: The seccomp filter
363	* @sd: The seccomp data to lookup the cache with
364	*
365	* Returns true if the seccomp_data is cached and allowed.
366	*/
367	static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
368	const struct seccomp_data *sd)
369	{
370	int syscall_nr = sd->nr;
371	const struct action_cache *cache = &sfilter->cache;
372
373	#ifndef SECCOMP_ARCH_COMPAT
374	/ A native-only architecture doesn't need to check sd->arch. /
375	return seccomp_cache_check_allow_bitmap(cache->allow_native,
376	SECCOMP_ARCH_NATIVE_NR,
377	syscall_nr);
378	#else
379	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
380	return seccomp_cache_check_allow_bitmap(bitmap: cache->allow_native,
381	SECCOMP_ARCH_NATIVE_NR,
382	syscall_nr);
383	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
384	return seccomp_cache_check_allow_bitmap(bitmap: cache->allow_compat,
385	SECCOMP_ARCH_COMPAT_NR,
386	syscall_nr);
387	#endif /* SECCOMP_ARCH_COMPAT */
388
389	WARN_ON_ONCE(true);
390	return false;
391	}
392	#endif /* SECCOMP_ARCH_NATIVE */
393
394	#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
395	/**
396	* seccomp_run_filters - evaluates all seccomp filters against @sd
397	* @sd: optional seccomp data to be passed to filters
398	* @match: stores struct seccomp_filter that resulted in the return value,
399	* unless filter returned SECCOMP_RET_ALLOW, in which case it will
400	* be unchanged.
401	*
402	* Returns valid seccomp BPF response codes.
403	*/
404	static u32 seccomp_run_filters(const struct seccomp_data *sd,
405	struct seccomp_filter **match)
406	{
407	u32 ret = SECCOMP_RET_ALLOW;
408	/ Make sure cross-thread synced filter points somewhere sane. /
409	struct seccomp_filter *f =
410	READ_ONCE(current->seccomp.filter);
411
412	/ Ensure unexpected behavior doesn't result in failing open. /
413	if (WARN_ON(f == NULL))
414	return SECCOMP_RET_KILL_PROCESS;
415
416	if (seccomp_cache_check_allow(sfilter: f, sd))
417	return SECCOMP_RET_ALLOW;
418
419	/*
420	* All filters in the list are evaluated and the lowest BPF return
421	* value always takes priority (ignoring the DATA).
422	*/
423	for (; f; f = f->prev) {
424	u32 cur_ret = bpf_prog_run_pin_on_cpu(prog: f->prog, ctx: sd);
425
426	if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
427	ret = cur_ret;
428	*match = f;
429	}
430	}
431	return ret;
432	}
433	#endif /* CONFIG_SECCOMP_FILTER */
434
435	static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
436	{
437	assert_spin_locked(&current->sighand->siglock);
438
439	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
440	return false;
441
442	return true;
443	}
444
445	void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
446
447	static inline void seccomp_assign_mode(struct task_struct *task,
448	unsigned long seccomp_mode,
449	unsigned long flags)
450	{
451	assert_spin_locked(&task->sighand->siglock);
452
453	task->seccomp.mode = seccomp_mode;
454	/*
455	* Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
456	* filter) is set.
457	*/
458	smp_mb__before_atomic();
459	/ Assume default seccomp processes want spec flaw mitigation. /
460	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == `0`)
461	arch_seccomp_spec_mitigate(task);
462	set_task_syscall_work(task, SECCOMP);
463	}
464
465	#ifdef CONFIG_SECCOMP_FILTER
466	/ Returns 1 if the parent is an ancestor of the child. /
467	static int is_ancestor(struct seccomp_filter *parent,
468	struct seccomp_filter *child)
469	{
470	/ NULL is the root ancestor. /
471	if (parent == NULL)
472	return `1`;
473	for (; child; child = child->prev)
474	if (child == parent)
475	return `1`;
476	return `0`;
477	}
478
479	/**
480	* seccomp_can_sync_threads: checks if all threads can be synchronized
481	*
482	* Expects sighand and cred_guard_mutex locks to be held.
483	*
484	* Returns 0 on success, -ve on error, or the pid of a thread which was
485	* either not in the correct seccomp mode or did not have an ancestral
486	* seccomp filter.
487	*/
488	static inline pid_t seccomp_can_sync_threads(void)
489	{
490	struct task_struct thread, caller;
491
492	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
493	assert_spin_locked(&current->sighand->siglock);
494
495	/ Validate all threads being eligible for synchronization. /
496	caller = current;
497	for_each_thread(caller, thread) {
498	pid_t failed;
499
500	/ Skip current, since it is initiating the sync. /
501	if (thread == caller)
502	continue;
503	/ Skip exited threads. /
504	if (thread->flags & PF_EXITING)
505	continue;
506
507	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED \|\|
508	(thread->seccomp.mode == SECCOMP_MODE_FILTER &&
509	is_ancestor(parent: thread->seccomp.filter,
510	child: caller->seccomp.filter)))
511	continue;
512
513	/ Return the first thread that cannot be synchronized. /
514	failed = task_pid_vnr(tsk: thread);
515	/ If the pid cannot be resolved, then return -ESRCH /
516	if (WARN_ON(failed == `0`))
517	failed = -ESRCH;
518	return failed;
519	}
520
521	return `0`;
522	}
523
524	static inline void seccomp_filter_free(struct seccomp_filter *filter)
525	{
526	if (filter) {
527	bpf_prog_destroy(fp: filter->prog);
528	kfree(objp: filter);
529	}
530	}
531
532	static void __seccomp_filter_orphan(struct seccomp_filter *orig)
533	{
534	while (orig && refcount_dec_and_test(r: &orig->users)) {
535	if (waitqueue_active(wq_head: &orig->wqh))
536	wake_up_poll(&orig->wqh, EPOLLHUP);
537	orig = orig->prev;
538	}
539	}
540
541	static void __put_seccomp_filter(struct seccomp_filter *orig)
542	{
543	/ Clean up single-reference branches iteratively. /
544	while (orig && refcount_dec_and_test(r: &orig->refs)) {
545	struct seccomp_filter *freeme = orig;
546	orig = orig->prev;
547	seccomp_filter_free(filter: freeme);
548	}
549	}
550
/* Release both the @users and the @refs counts held on the tree at @orig. */
static void __seccomp_filter_release(struct seccomp_filter *orig)
{
	/* Notify about any unused filters in the task's former filter tree. */
	__seccomp_filter_orphan(orig);
	/* Finally drop all references to the task's former tree. */
	__put_seccomp_filter(orig);
}
558
559	/**
560	* seccomp_filter_release - Detach the task from its filter tree,
561	* drop its reference count, and notify
562	* about unused filters
563	*
564	* @tsk: task the filter should be released from.
565	*
566	* This function should only be called when the task is exiting as
567	* it detaches it from its filter tree. PF_EXITING has to be set
568	* for the task.
569	*/
570	void seccomp_filter_release(struct task_struct *tsk)
571	{
572	struct seccomp_filter *orig;
573
574	if (WARN_ON((tsk->flags & PF_EXITING) == `0`))
575	return;
576
577	if (READ_ONCE(tsk->seccomp.filter) == NULL)
578	return;
579
580	spin_lock_irq(lock: &tsk->sighand->siglock);
581	orig = tsk->seccomp.filter;
582	/ Detach task from its filter tree. /
583	tsk->seccomp.filter = NULL;
584	spin_unlock_irq(lock: &tsk->sighand->siglock);
585	__seccomp_filter_release(orig);
586	}
587
588	/**
589	* seccomp_sync_threads: sets all threads to use current's filter
590	*
591	* @flags: SECCOMP_FILTER_FLAG_* flags to set during sync.
592	*
593	* Expects sighand and cred_guard_mutex locks to be held, and for
594	* seccomp_can_sync_threads() to have returned success already
595	* without dropping the locks.
596	*
597	*/
598	static inline void seccomp_sync_threads(unsigned long flags)
599	{
600	struct task_struct thread, caller;
601
602	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
603	assert_spin_locked(&current->sighand->siglock);
604
605	/*
606	* Don't touch any of the threads if the process is being killed.
607	* This allows for a lockless check in seccomp_filter_release.
608	*/
609	if (current->signal->flags & SIGNAL_GROUP_EXIT)
610	return;
611
612	/ Synchronize all threads. /
613	caller = current;
614	for_each_thread(caller, thread) {
615	/ Skip current, since it needs no changes. /
616	if (thread == caller)
617	continue;
618
619	/*
620	* Skip exited threads. seccomp_filter_release could have
621	* been already called for this task.
622	*/
623	if (thread->flags & PF_EXITING)
624	continue;
625
626	/ Get a task reference for the new leaf node. /
627	get_seccomp_filter(tsk: caller);
628
629	/*
630	* Drop the task reference to the shared ancestor since
631	* current's path will hold a reference. (This also
632	* allows a put before the assignment.)
633	*/
634	__seccomp_filter_release(orig: thread->seccomp.filter);
635
636	/ Make our new filter tree visible. /
637	smp_store_release(&thread->seccomp.filter,
638	caller->seccomp.filter);
639	atomic_set(v: &thread->seccomp.filter_count,
640	i: atomic_read(v: &caller->seccomp.filter_count));
641
642	/*
643	* Don't let an unprivileged task work around
644	* the no_new_privs restriction by creating
645	* a thread that sets it up, enters seccomp,
646	* then dies.
647	*/
648	if (task_no_new_privs(p: caller))
649	task_set_no_new_privs(p: thread);
650
651	/*
652	* Opt the other thread into seccomp if needed.
653	* As threads are considered to be trust-realm
654	* equivalent (see ptrace_may_access), it is safe to
655	* allow one thread to transition the other.
656	*/
657	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
658	seccomp_assign_mode(task: thread, SECCOMP_MODE_FILTER,
659	flags);
660	}
661	}
662
663	/**
664	* seccomp_prepare_filter: Prepares a seccomp filter for use.
665	* @fprog: BPF program to install
666	*
667	* Returns filter on success or an ERR_PTR on failure.
668	*/
669	static struct seccomp_filter seccomp_prepare_filter(struct* sock_fprog *fprog)
670	{
671	struct seccomp_filter *sfilter;
672	int ret;
673	const bool save_orig =
674	#if defined(CONFIG_CHECKPOINT_RESTORE) \|\| defined(SECCOMP_ARCH_NATIVE)
675	true;
676	#else
677	false;
678	#endif
679
680	if (fprog->len == `0` \|\| fprog->len > BPF_MAXINSNS)
681	return ERR_PTR(error: -EINVAL);
682
683	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
684
685	/*
686	* Installing a seccomp filter requires that the task has
687	* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
688	* This avoids scenarios where unprivileged tasks can affect the
689	* behavior of privileged children.
690	*/
691	if (!task_no_new_privs(current) &&
692	!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
693	return ERR_PTR(error: -EACCES);
694
695	/ Allocate a new seccomp_filter /
696	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL \| __GFP_NOWARN);
697	if (!sfilter)
698	return ERR_PTR(error: -ENOMEM);
699
700	mutex_init(&sfilter->notify_lock);
701	ret = bpf_prog_create_from_user(pfp: &sfilter->prog, fprog,
702	trans: seccomp_check_filter, save_orig);
703	if (ret < `0`) {
704	kfree(objp: sfilter);
705	return ERR_PTR(error: ret);
706	}
707
708	refcount_set(r: &sfilter->refs, n: `1`);
709	refcount_set(r: &sfilter->users, n: `1`);
710	init_waitqueue_head(&sfilter->wqh);
711
712	return sfilter;
713	}
714
715	/**
716	* seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
717	* @user_filter: pointer to the user data containing a sock_fprog.
718	*
719	* Returns 0 on success and non-zero otherwise.
720	*/
721	static struct seccomp_filter *
722	seccomp_prepare_user_filter(const char __user *user_filter)
723	{
724	struct sock_fprog fprog;
725	struct seccomp_filter *filter = ERR_PTR(error: -EFAULT);
726
727	#ifdef CONFIG_COMPAT
728	if (in_compat_syscall()) {
729	struct compat_sock_fprog fprog32;
730	if (copy_from_user(to: &fprog32, from: user_filter, n: sizeof(fprog32)))
731	goto out;
732	fprog.len = fprog32.len;
733	fprog.filter = compat_ptr(uptr: fprog32.filter);
734	} else / falls through to the if below. /
735	#endif
736	if (copy_from_user(to: &fprog, from: user_filter, n: sizeof(fprog)))
737	goto out;
738	filter = seccomp_prepare_filter(fprog: &fprog);
739	out:
740	return filter;
741	}
742
743	#ifdef SECCOMP_ARCH_NATIVE
744	/**
745	* seccomp_is_const_allow - check if filter is constant allow with given data
746	* @fprog: The BPF programs
747	* @sd: The seccomp data to check against, only syscall number and arch
748	* number are considered constant.
749	*/
750	static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
751	struct seccomp_data *sd)
752	{
753	unsigned int reg_value = `0`;
754	unsigned int pc;
755	bool op_res;
756
757	if (WARN_ON_ONCE(!fprog))
758	return false;
759
760	/ Our single exception to filtering. /
761	#ifdef __NR_uretprobe
762	#ifdef SECCOMP_ARCH_COMPAT
763	if (sd->arch == SECCOMP_ARCH_NATIVE)
764	#endif
765	if (sd->nr == __NR_uretprobe)
766	return true;
767	#endif
768
769	for (pc = `0`; pc < fprog->len; pc++) {
770	struct sock_filter *insn = &fprog->filter[pc];
771	u16 code = insn->code;
772	u32 k = insn->k;
773
774	switch (code) {
775	case BPF_LD \| BPF_W \| BPF_ABS:
776	switch (k) {
777	case offsetof(struct seccomp_data, nr):
778	reg_value = sd->nr;
779	break;
780	case offsetof(struct seccomp_data, arch):
781	reg_value = sd->arch;
782	break;
783	default:
784	/ can't optimize (non-constant value load) /
785	return false;
786	}
787	break;
788	case BPF_RET \| BPF_K:
789	/ reached return with constant values only, check allow /
790	return k == SECCOMP_RET_ALLOW;
791	case BPF_JMP \| BPF_JA:
792	pc += insn->k;
793	break;
794	case BPF_JMP \| BPF_JEQ \| BPF_K:
795	case BPF_JMP \| BPF_JGE \| BPF_K:
796	case BPF_JMP \| BPF_JGT \| BPF_K:
797	case BPF_JMP \| BPF_JSET \| BPF_K:
798	switch (BPF_OP(code)) {
799	case BPF_JEQ:
800	op_res = reg_value == k;
801	break;
802	case BPF_JGE:
803	op_res = reg_value >= k;
804	break;
805	case BPF_JGT:
806	op_res = reg_value > k;
807	break;
808	case BPF_JSET:
809	op_res = !!(reg_value & k);
810	break;
811	default:
812	/ can't optimize (unknown jump) /
813	return false;
814	}
815
816	pc += op_res ? insn->jt : insn->jf;
817	break;
818	case BPF_ALU \| BPF_AND \| BPF_K:
819	reg_value &= k;
820	break;
821	default:
822	/ can't optimize (unknown insn) /
823	return false;
824	}
825	}
826
827	/ ran off the end of the filter?! /
828	WARN_ON(`1`);
829	return false;
830	}
831
832	static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
833	void bitmap, const* void *bitmap_prev,
834	size_t bitmap_size, int arch)
835	{
836	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
837	struct seccomp_data sd;
838	int nr;
839
840	if (bitmap_prev) {
841	/ The new filter must be as restrictive as the last. /
842	bitmap_copy(dst: bitmap, src: bitmap_prev, nbits: bitmap_size);
843	} else {
844	/ Before any filters, all syscalls are always allowed. /
845	bitmap_fill(dst: bitmap, nbits: bitmap_size);
846	}
847
848	for (nr = `0`; nr < bitmap_size; nr++) {
849	/ No bitmap change: not a cacheable action. /
850	if (!test_bit(nr, bitmap))
851	continue;
852
853	sd.nr = nr;
854	sd.arch = arch;
855
856	/ No bitmap change: continue to always allow. /
857	if (seccomp_is_const_allow(fprog, sd: &sd))
858	continue;
859
860	/*
861	* Not a cacheable action: always run filters.
862	* atomic clear_bit() not needed, filter not visible yet.
863	*/
864	__clear_bit(nr, bitmap);
865	}
866	}
867
868	/**
869	* seccomp_cache_prepare - emulate the filter to find cacheable syscalls
870	* @sfilter: The seccomp filter
871	*
872	* Returns 0 if successful or -errno if error occurred.
873	*/
874	static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
875	{
876	struct action_cache *cache = &sfilter->cache;
877	const struct action_cache *cache_prev =
878	sfilter->prev ? &sfilter->prev->cache : NULL;
879
880	seccomp_cache_prepare_bitmap(sfilter, bitmap: cache->allow_native,
881	bitmap_prev: cache_prev ? cache_prev->allow_native : NULL,
882	SECCOMP_ARCH_NATIVE_NR,
883	SECCOMP_ARCH_NATIVE);
884
885	#ifdef SECCOMP_ARCH_COMPAT
886	seccomp_cache_prepare_bitmap(sfilter, bitmap: cache->allow_compat,
887	bitmap_prev: cache_prev ? cache_prev->allow_compat : NULL,
888	SECCOMP_ARCH_COMPAT_NR,
889	SECCOMP_ARCH_COMPAT);
890	#endif /* SECCOMP_ARCH_COMPAT */
891	}
892	#endif /* SECCOMP_ARCH_NATIVE */
893
894	/**
895	* seccomp_attach_filter: validate and attach filter
896	* @flags: flags to change filter behavior
897	* @filter: seccomp filter to add to the current process
898	*
899	* Caller must be holding current->sighand->siglock lock.
900	*
901	* Returns 0 on success, -ve on error, or
902	* - in TSYNC mode: the pid of a thread which was either not in the correct
903	* seccomp mode or did not have an ancestral seccomp filter
904	* - in NEW_LISTENER mode: the fd of the new listener
905	*/
906	static long seccomp_attach_filter(unsigned int flags,
907	struct seccomp_filter *filter)
908	{
909	unsigned long total_insns;
910	struct seccomp_filter *walker;
911
912	assert_spin_locked(&current->sighand->siglock);
913
914	/ Validate resulting filter length. /
915	total_insns = filter->prog->len;
916	for (walker = current->seccomp.filter; walker; walker = walker->prev)
917	total_insns += walker->prog->len + `4`; / 4 instr penalty /
918	if (total_insns > MAX_INSNS_PER_PATH)
919	return -ENOMEM;
920
921	/ If thread sync has been requested, check that it is possible. /
922	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
923	int ret;
924
925	ret = seccomp_can_sync_threads();
926	if (ret) {
927	if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
928	return -ESRCH;
929	else
930	return ret;
931	}
932	}
933
934	/ Set log flag, if present. /
935	if (flags & SECCOMP_FILTER_FLAG_LOG)
936	filter->log = true;
937
938	/ Set wait killable flag, if present. /
939	if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
940	filter->wait_killable_recv = true;
941
942	/*
943	* If there is an existing filter, make it the prev and don't drop its
944	* task reference.
945	*/
946	filter->prev = current->seccomp.filter;
947	seccomp_cache_prepare(sfilter: filter);
948	current->seccomp.filter = filter;
949	atomic_inc(v: &current->seccomp.filter_count);
950
951	/ Now that the new filter is in place, synchronize to all threads. /
952	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
953	seccomp_sync_threads(flags);
954
955	return `0`;
956	}
957
958	static void __get_seccomp_filter(struct seccomp_filter *filter)
959	{
960	refcount_inc(r: &filter->refs);
961	}
962
963	/ get_seccomp_filter - increments the reference count of the filter on @tsk /
964	void get_seccomp_filter(struct task_struct *tsk)
965	{
966	struct seccomp_filter *orig = tsk->seccomp.filter;
967	if (!orig)
968	return;
969	__get_seccomp_filter(filter: orig);
970	refcount_inc(r: &orig->users);
971	}
972
973	#endif /* CONFIG_SECCOMP_FILTER */
974
975	/ For use with seccomp_actions_logged /
976	#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
977	#define SECCOMP_LOG_KILL_THREAD (1 << 1)
978	#define SECCOMP_LOG_TRAP (1 << 2)
979	#define SECCOMP_LOG_ERRNO (1 << 3)
980	#define SECCOMP_LOG_TRACE (1 << 4)
981	#define SECCOMP_LOG_LOG (1 << 5)
982	#define SECCOMP_LOG_ALLOW (1 << 6)
983	#define SECCOMP_LOG_USER_NOTIF (1 << 7)
984
985	static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS \|
986	SECCOMP_LOG_KILL_THREAD \|
987	SECCOMP_LOG_TRAP \|
988	SECCOMP_LOG_ERRNO \|
989	SECCOMP_LOG_USER_NOTIF \|
990	SECCOMP_LOG_TRACE \|
991	SECCOMP_LOG_LOG;
992
993	static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
994	bool requested)
995	{
996	bool log = false;
997
998	switch (action) {
999	case SECCOMP_RET_ALLOW:
1000	break;
1001	case SECCOMP_RET_TRAP:
1002	log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
1003	break;
1004	case SECCOMP_RET_ERRNO:
1005	log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
1006	break;
1007	case SECCOMP_RET_TRACE:
1008	log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
1009	break;
1010	case SECCOMP_RET_USER_NOTIF:
1011	log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
1012	break;
1013	case SECCOMP_RET_LOG:
1014	log = seccomp_actions_logged & SECCOMP_LOG_LOG;
1015	break;
1016	case SECCOMP_RET_KILL_THREAD:
1017	log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
1018	break;
1019	case SECCOMP_RET_KILL_PROCESS:
1020	default:
1021	log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
1022	}
1023
1024	/*
1025	* Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
1026	* FILTER_FLAG_LOG bit was set. The admin has the ability to silence
1027	* any action from being logged by removing the action name from the
1028	* seccomp_actions_logged sysctl.
1029	*/
1030	if (!log)
1031	return;
1032
1033	audit_seccomp(syscall, signr, code: action);
1034	}
1035
1036	/*
1037	* Secure computing mode 1 allows only read/write/exit/sigreturn.
1038	* To be fully secure this must be combined with rlimit
1039	* to limit the stack allocations too.
1040	*/
1041	static const int mode1_syscalls[] = {
1042	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
1043	#ifdef __NR_uretprobe
1044	__NR_uretprobe,
1045	#endif
1046	-`1`, / negative terminated /
1047	};
1048
1049	static void __secure_computing_strict(int this_syscall)
1050	{
1051	const int *allowed_syscalls = mode1_syscalls;
1052	#ifdef CONFIG_COMPAT
1053	if (in_compat_syscall())
1054	allowed_syscalls = get_compat_mode1_syscalls();
1055	#endif
1056	do {
1057	if (*allowed_syscalls == this_syscall)
1058	return;
1059	} while (*++allowed_syscalls != -`1`);
1060
1061	#ifdef SECCOMP_DEBUG
1062	dump_stack();
1063	#endif
1064	current->seccomp.mode = SECCOMP_MODE_DEAD;
1065	seccomp_log(syscall: this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, requested: true);
1066	do_exit(SIGKILL);
1067	}
1068
1069	#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
1070	void secure_computing_strict(int this_syscall)
1071	{
1072	int mode = current->seccomp.mode;
1073
1074	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1075	unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1076	return;
1077
1078	if (mode == SECCOMP_MODE_DISABLED)
1079	return;
1080	else if (mode == SECCOMP_MODE_STRICT)
1081	__secure_computing_strict(this_syscall);
1082	else
1083	BUG();
1084	}
1085	int __secure_computing(void)
1086	{
1087	int this_syscall = syscall_get_nr(current, current_pt_regs());
1088
1089	secure_computing_strict(this_syscall);
1090	return `0`;
1091	}
1092	#else
1093
1094	#ifdef CONFIG_SECCOMP_FILTER
1095	static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
1096	{
1097	/*
1098	* Note: overflow is ok here, the id just needs to be unique per
1099	* filter.
1100	*/
1101	lockdep_assert_held(&filter->notify_lock);
1102	return filter->notif->next_id++;
1103	}
1104
1105	static void seccomp_handle_addfd(struct seccomp_kaddfd addfd, struct* seccomp_knotif *n)
1106	{
1107	int fd;
1108
1109	/*
1110	* Remove the notification, and reset the list pointers, indicating
1111	* that it has been handled.
1112	*/
1113	list_del_init(entry: &addfd->list);
1114	if (!addfd->setfd)
1115	fd = receive_fd(file: addfd->file, NULL, o_flags: addfd->flags);
1116	else
1117	fd = receive_fd_replace(new_fd: addfd->fd, file: addfd->file, o_flags: addfd->flags);
1118	addfd->ret = fd;
1119
1120	if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
1121	/ If we fail reset and return an error to the notifier /
1122	if (fd < `0`) {
1123	n->state = SECCOMP_NOTIFY_SENT;
1124	} else {
1125	/ Return the FD we just added /
1126	n->flags = `0`;
1127	n->error = `0`;
1128	n->val = fd;
1129	}
1130	}
1131
1132	/*
1133	* Mark the notification as completed. From this point, addfd mem
1134	* might be invalidated and we can't safely read it anymore.
1135	*/
1136	complete(&addfd->completion);
1137	}
1138
1139	static bool should_sleep_killable(struct seccomp_filter *match,
1140	struct seccomp_knotif *n)
1141	{
1142	return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
1143	}
1144
1145	static int seccomp_do_user_notification(int this_syscall,
1146	struct seccomp_filter *match,
1147	const struct seccomp_data *sd)
1148	{
1149	int err;
1150	u32 flags = `0`;
1151	long ret = `0`;
1152	struct seccomp_knotif n = {};
1153	struct seccomp_kaddfd addfd, tmp;
1154
1155	mutex_lock(&match->notify_lock);
1156	err = -ENOSYS;
1157	if (!match->notif)
1158	goto out;
1159
1160	n.task = current;
1161	n.state = SECCOMP_NOTIFY_INIT;
1162	n.data = sd;
1163	n.id = seccomp_next_notify_id(filter: match);
1164	init_completion(x: &n.ready);
1165	list_add_tail(new: &n.list, head: &match->notif->notifications);
1166	INIT_LIST_HEAD(list: &n.addfd);
1167
1168	atomic_inc(v: &match->notif->requests);
1169	if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
1170	wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN \| EPOLLRDNORM);
1171	else
1172	wake_up_poll(&match->wqh, EPOLLIN \| EPOLLRDNORM);
1173
1174	/*
1175	* This is where we wait for a reply from userspace.
1176	*/
1177	do {
1178	bool wait_killable = should_sleep_killable(match, n: &n);
1179
1180	mutex_unlock(lock: &match->notify_lock);
1181	if (wait_killable)
1182	err = wait_for_completion_killable(x: &n.ready);
1183	else
1184	err = wait_for_completion_interruptible(x: &n.ready);
1185	mutex_lock(&match->notify_lock);
1186
1187	if (err != `0`) {
1188	/*
1189	* Check to see if the notifcation got picked up and
1190	* whether we should switch to wait killable.
1191	*/
1192	if (!wait_killable && should_sleep_killable(match, n: &n))
1193	continue;
1194
1195	goto interrupted;
1196	}
1197
1198	addfd = list_first_entry_or_null(&n.addfd,
1199	struct seccomp_kaddfd, list);
1200	/ Check if we were woken up by a addfd message /
1201	if (addfd)
1202	seccomp_handle_addfd(addfd, n: &n);
1203
1204	} while (n.state != SECCOMP_NOTIFY_REPLIED);
1205
1206	ret = n.val;
1207	err = n.error;
1208	flags = n.flags;
1209
1210	interrupted:
1211	/ If there were any pending addfd calls, clear them out /
1212	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
1213	/ The process went away before we got a chance to handle it /
1214	addfd->ret = -ESRCH;
1215	list_del_init(entry: &addfd->list);
1216	complete(&addfd->completion);
1217	}
1218
1219	/*
1220	* Note that it's possible the listener died in between the time when
1221	* we were notified of a response (or a signal) and when we were able to
1222	* re-acquire the lock, so only delete from the list if the
1223	* notification actually exists.
1224	*
1225	* Also note that this test is only valid because there's no way to
1226	* reattach to a notifier right now. If one is added, we'll need to
1227	* keep track of the notif itself and make sure they match here.
1228	*/
1229	if (match->notif)
1230	list_del(entry: &n.list);
1231	out:
1232	mutex_unlock(lock: &match->notify_lock);
1233
1234	/ Userspace requests to continue the syscall. /
1235	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1236	return `0`;
1237
1238	syscall_set_return_value(current, current_pt_regs(),
1239	error: err, val: ret);
1240	return -`1`;
1241	}
1242
1243	static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
1244	{
1245	u32 filter_ret, action;
1246	struct seccomp_data sd;
1247	struct seccomp_filter *match = NULL;
1248	int data;
1249
1250	/*
1251	* Make sure that any changes to mode from another thread have
1252	* been seen after SYSCALL_WORK_SECCOMP was seen.
1253	*/
1254	smp_rmb();
1255
1256	populate_seccomp_data(sd: &sd);
1257
1258	filter_ret = seccomp_run_filters(sd: &sd, match: &match);
1259	data = filter_ret & SECCOMP_RET_DATA;
1260	action = filter_ret & SECCOMP_RET_ACTION_FULL;
1261
1262	switch (action) {
1263	case SECCOMP_RET_ERRNO:
1264	/ Set low-order bits as an errno, capped at MAX_ERRNO. /
1265	if (data > MAX_ERRNO)
1266	data = MAX_ERRNO;
1267	syscall_set_return_value(current, current_pt_regs(),
1268	error: -data, val: `0`);
1269	goto skip;
1270
1271	case SECCOMP_RET_TRAP:
1272	/ Show the handler the original registers. /
1273	syscall_rollback(current, current_pt_regs());
1274	/ Let the filter pass back 16 bits of data. /
1275	force_sig_seccomp(syscall: this_syscall, reason: data, force_coredump: false);
1276	goto skip;
1277
1278	case SECCOMP_RET_TRACE:
1279	/ We've been put in this state by the ptracer already. /
1280	if (recheck_after_trace)
1281	return `0`;
1282
1283	/ ENOSYS these calls if there is no tracer attached. /
1284	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
1285	syscall_set_return_value(current,
1286	current_pt_regs(),
1287	error: -ENOSYS, val: `0`);
1288	goto skip;
1289	}
1290
1291	/ Allow the BPF to provide the event message /
1292	ptrace_event(PTRACE_EVENT_SECCOMP, message: data);
1293	/*
1294	* The delivery of a fatal signal during event
1295	* notification may silently skip tracer notification,
1296	* which could leave us with a potentially unmodified
1297	* syscall that the tracer would have liked to have
1298	* changed. Since the process is about to die, we just
1299	* force the syscall to be skipped and let the signal
1300	* kill the process and correctly handle any tracer exit
1301	* notifications.
1302	*/
1303	if (fatal_signal_pending(current))
1304	goto skip;
1305	/ Check if the tracer forced the syscall to be skipped. /
1306	this_syscall = syscall_get_nr(current, current_pt_regs());
1307	if (this_syscall < `0`)
1308	goto skip;
1309
1310	/*
1311	* Recheck the syscall, since it may have changed. This
1312	* intentionally uses a NULL struct seccomp_data to force
1313	* a reload of all registers. This does not goto skip since
1314	* a skip would have already been reported.
1315	*/
1316	if (__seccomp_filter(this_syscall, recheck_after_trace: true))
1317	return -`1`;
1318
1319	return `0`;
1320
1321	case SECCOMP_RET_USER_NOTIF:
1322	if (seccomp_do_user_notification(this_syscall, match, sd: &sd))
1323	goto skip;
1324
1325	return `0`;
1326
1327	case SECCOMP_RET_LOG:
1328	seccomp_log(syscall: this_syscall, signr: `0`, action, requested: true);
1329	return `0`;
1330
1331	case SECCOMP_RET_ALLOW:
1332	/*
1333	* Note that the "match" filter will always be NULL for
1334	* this action since SECCOMP_RET_ALLOW is the starting
1335	* state in seccomp_run_filters().
1336	*/
1337	return `0`;
1338
1339	case SECCOMP_RET_KILL_THREAD:
1340	case SECCOMP_RET_KILL_PROCESS:
1341	default:
1342	current->seccomp.mode = SECCOMP_MODE_DEAD;
1343	seccomp_log(syscall: this_syscall, SIGSYS, action, requested: true);
1344	/ Dump core only if this is the last remaining thread. /
1345	if (action != SECCOMP_RET_KILL_THREAD \|\|
1346	(atomic_read(v: &current->signal->live) == `1`)) {
1347	/ Show the original registers in the dump. /
1348	syscall_rollback(current, current_pt_regs());
1349	/ Trigger a coredump with SIGSYS /
1350	force_sig_seccomp(syscall: this_syscall, reason: data, force_coredump: true);
1351	} else {
1352	do_exit(SIGSYS);
1353	}
1354	return -`1`; / skip the syscall go directly to signal handling /
1355	}
1356
1357	unreachable();
1358
1359	skip:
1360	seccomp_log(syscall: this_syscall, signr: `0`, action, requested: match ? match->log : false);
1361	return -`1`;
1362	}
1363	#else
1364	static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
1365	{
1366	BUG();
1367
1368	return -`1`;
1369	}
1370	#endif
1371
1372	int __secure_computing(void)
1373	{
1374	int mode = current->seccomp.mode;
1375	int this_syscall;
1376
1377	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1378	unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1379	return `0`;
1380
1381	this_syscall = syscall_get_nr(current, current_pt_regs());
1382
1383	switch (mode) {
1384	case SECCOMP_MODE_STRICT:
1385	__secure_computing_strict(this_syscall); / may call do_exit /
1386	return `0`;
1387	case SECCOMP_MODE_FILTER:
1388	return __seccomp_filter(this_syscall, recheck_after_trace: false);
1389	/ Surviving SECCOMP_RET_KILL_* must be proactively impossible. /
1390	case SECCOMP_MODE_DEAD:
1391	WARN_ON_ONCE(`1`);
1392	do_exit(SIGKILL);
1393	return -`1`;
1394	default:
1395	BUG();
1396	}
1397	}
1398	#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
1399
1400	long prctl_get_seccomp(void)
1401	{
1402	return current->seccomp.mode;
1403	}
1404
1405	/**
1406	* seccomp_set_mode_strict: internal function for setting strict seccomp
1407	*
1408	* Once current->seccomp.mode is non-zero, it may not be changed.
1409	*
1410	* Returns 0 on success or -EINVAL on failure.
1411	*/
1412	static long seccomp_set_mode_strict(void)
1413	{
1414	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
1415	long ret = -EINVAL;
1416
1417	spin_lock_irq(lock: &current->sighand->siglock);
1418
1419	if (!seccomp_may_assign_mode(seccomp_mode))
1420	goto out;
1421
1422	#ifdef TIF_NOTSC
1423	disable_TSC();
1424	#endif
1425	seccomp_assign_mode(current, seccomp_mode, flags: `0`);
1426	ret = `0`;
1427
1428	out:
1429	spin_unlock_irq(lock: &current->sighand->siglock);
1430
1431	return ret;
1432	}
1433
1434	#ifdef CONFIG_SECCOMP_FILTER
1435	static void seccomp_notify_free(struct seccomp_filter *filter)
1436	{
1437	kfree(objp: filter->notif);
1438	filter->notif = NULL;
1439	}
1440
1441	static void seccomp_notify_detach(struct seccomp_filter *filter)
1442	{
1443	struct seccomp_knotif *knotif;
1444
1445	if (!filter)
1446	return;
1447
1448	mutex_lock(&filter->notify_lock);
1449
1450	/*
1451	* If this file is being closed because e.g. the task who owned it
1452	* died, let's wake everyone up who was waiting on us.
1453	*/
1454	list_for_each_entry(knotif, &filter->notif->notifications, list) {
1455	if (knotif->state == SECCOMP_NOTIFY_REPLIED)
1456	continue;
1457
1458	knotif->state = SECCOMP_NOTIFY_REPLIED;
1459	knotif->error = -ENOSYS;
1460	knotif->val = `0`;
1461
1462	/*
1463	* We do not need to wake up any pending addfd messages, as
1464	* the notifier will do that for us, as this just looks
1465	* like a standard reply.
1466	*/
1467	complete(&knotif->ready);
1468	}
1469
1470	seccomp_notify_free(filter);
1471	mutex_unlock(lock: &filter->notify_lock);
1472	}
1473
1474	static int seccomp_notify_release(struct inode inode, struct* file *file)
1475	{
1476	struct seccomp_filter *filter = file->private_data;
1477
1478	seccomp_notify_detach(filter);
1479	__put_seccomp_filter(orig: filter);
1480	return `0`;
1481	}
1482
1483	/ must be called with notif_lock held /
1484	static inline struct seccomp_knotif *
1485	find_notification(struct seccomp_filter *filter, u64 id)
1486	{
1487	struct seccomp_knotif *cur;
1488
1489	lockdep_assert_held(&filter->notify_lock);
1490
1491	list_for_each_entry(cur, &filter->notif->notifications, list) {
1492	if (cur->id == id)
1493	return cur;
1494	}
1495
1496	return NULL;
1497	}
1498
1499	static int recv_wake_function(wait_queue_entry_t wait, unsigned* int mode, int sync,
1500	void *key)
1501	{
1502	/ Avoid a wakeup if event not interesting for us. /
1503	if (key && !(key_to_poll(key) & (EPOLLIN \| EPOLLERR \| EPOLLHUP)))
1504	return `0`;
1505	return autoremove_wake_function(wq_entry: wait, mode, sync, key);
1506	}
1507
1508	static int recv_wait_event(struct seccomp_filter *filter)
1509	{
1510	DEFINE_WAIT_FUNC(wait, recv_wake_function);
1511	int ret;
1512
1513	if (refcount_read(r: &filter->users) == `0`)
1514	return `0`;
1515
1516	if (atomic_dec_if_positive(v: &filter->notif->requests) >= `0`)
1517	return `0`;
1518
1519	for (;;) {
1520	ret = prepare_to_wait_event(wq_head: &filter->wqh, wq_entry: &wait, TASK_INTERRUPTIBLE);
1521
1522	if (atomic_dec_if_positive(v: &filter->notif->requests) >= `0`)
1523	break;
1524	if (refcount_read(r: &filter->users) == `0`)
1525	break;
1526
1527	if (ret)
1528	return ret;
1529
1530	schedule();
1531	}
1532	finish_wait(wq_head: &filter->wqh, wq_entry: &wait);
1533	return `0`;
1534	}
1535
1536	static long seccomp_notify_recv(struct seccomp_filter *filter,
1537	void __user *buf)
1538	{
1539	struct seccomp_knotif knotif = NULL, cur;
1540	struct seccomp_notif unotif;
1541	ssize_t ret;
1542
1543	/ Verify that we're not given garbage to keep struct extensible. /
1544	ret = check_zeroed_user(from: buf, size: sizeof(unotif));
1545	if (ret < `0`)
1546	return ret;
1547	if (!ret)
1548	return -EINVAL;
1549
1550	memset(&unotif, `0`, sizeof(unotif));
1551
1552	ret = recv_wait_event(filter);
1553	if (ret < `0`)
1554	return ret;
1555
1556	mutex_lock(&filter->notify_lock);
1557	list_for_each_entry(cur, &filter->notif->notifications, list) {
1558	if (cur->state == SECCOMP_NOTIFY_INIT) {
1559	knotif = cur;
1560	break;
1561	}
1562	}
1563
1564	/*
1565	* If we didn't find a notification, it could be that the task was
1566	* interrupted by a fatal signal between the time we were woken and
1567	* when we were able to acquire the rw lock.
1568	*/
1569	if (!knotif) {
1570	ret = -ENOENT;
1571	goto out;
1572	}
1573
1574	unotif.id = knotif->id;
1575	unotif.pid = task_pid_vnr(tsk: knotif->task);
1576	unotif.data = *(knotif->data);
1577
1578	knotif->state = SECCOMP_NOTIFY_SENT;
1579	wake_up_poll(&filter->wqh, EPOLLOUT \| EPOLLWRNORM);
1580	ret = `0`;
1581	out:
1582	mutex_unlock(lock: &filter->notify_lock);
1583
1584	if (ret == `0` && copy_to_user(to: buf, from: &unotif, n: sizeof(unotif))) {
1585	ret = -EFAULT;
1586
1587	/*
1588	* Userspace screwed up. To make sure that we keep this
1589	* notification alive, let's reset it back to INIT. It
1590	* may have died when we released the lock, so we need to make
1591	* sure it's still around.
1592	*/
1593	mutex_lock(&filter->notify_lock);
1594	knotif = find_notification(filter, id: unotif.id);
1595	if (knotif) {
1596	/ Reset the process to make sure it's not stuck /
1597	if (should_sleep_killable(match: filter, n: knotif))
1598	complete(&knotif->ready);
1599	knotif->state = SECCOMP_NOTIFY_INIT;
1600	atomic_inc(v: &filter->notif->requests);
1601	wake_up_poll(&filter->wqh, EPOLLIN \| EPOLLRDNORM);
1602	}
1603	mutex_unlock(lock: &filter->notify_lock);
1604	}
1605
1606	return ret;
1607	}
1608
1609	static long seccomp_notify_send(struct seccomp_filter *filter,
1610	void __user *buf)
1611	{
1612	struct seccomp_notif_resp resp = {};
1613	struct seccomp_knotif *knotif;
1614	long ret;
1615
1616	if (copy_from_user(to: &resp, from: buf, n: sizeof(resp)))
1617	return -EFAULT;
1618
1619	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1620	return -EINVAL;
1621
1622	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
1623	(resp.error \|\| resp.val))
1624	return -EINVAL;
1625
1626	ret = mutex_lock_interruptible(&filter->notify_lock);
1627	if (ret < `0`)
1628	return ret;
1629
1630	knotif = find_notification(filter, id: resp.id);
1631	if (!knotif) {
1632	ret = -ENOENT;
1633	goto out;
1634	}
1635
1636	/ Allow exactly one reply. /
1637	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1638	ret = -EINPROGRESS;
1639	goto out;
1640	}
1641
1642	ret = `0`;
1643	knotif->state = SECCOMP_NOTIFY_REPLIED;
1644	knotif->error = resp.error;
1645	knotif->val = resp.val;
1646	knotif->flags = resp.flags;
1647	if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
1648	complete_on_current_cpu(x: &knotif->ready);
1649	else
1650	complete(&knotif->ready);
1651	out:
1652	mutex_unlock(lock: &filter->notify_lock);
1653	return ret;
1654	}
1655
1656	static long seccomp_notify_id_valid(struct seccomp_filter *filter,
1657	void __user *buf)
1658	{
1659	struct seccomp_knotif *knotif;
1660	u64 id;
1661	long ret;
1662
1663	if (copy_from_user(to: &id, from: buf, n: sizeof(id)))
1664	return -EFAULT;
1665
1666	ret = mutex_lock_interruptible(&filter->notify_lock);
1667	if (ret < `0`)
1668	return ret;
1669
1670	knotif = find_notification(filter, id);
1671	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
1672	ret = `0`;
1673	else
1674	ret = -ENOENT;
1675
1676	mutex_unlock(lock: &filter->notify_lock);
1677	return ret;
1678	}
1679
1680	static long seccomp_notify_set_flags(struct seccomp_filter *filter,
1681	unsigned long flags)
1682	{
1683	long ret;
1684
1685	if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
1686	return -EINVAL;
1687
1688	ret = mutex_lock_interruptible(&filter->notify_lock);
1689	if (ret < `0`)
1690	return ret;
1691	filter->notif->flags = flags;
1692	mutex_unlock(lock: &filter->notify_lock);
1693	return `0`;
1694	}
1695
1696	static long seccomp_notify_addfd(struct seccomp_filter *filter,
1697	struct seccomp_notif_addfd __user *uaddfd,
1698	unsigned int size)
1699	{
1700	struct seccomp_notif_addfd addfd;
1701	struct seccomp_knotif *knotif;
1702	struct seccomp_kaddfd kaddfd;
1703	int ret;
1704
1705	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
1706	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
1707
1708	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 \|\| size >= PAGE_SIZE)
1709	return -EINVAL;
1710
1711	ret = copy_struct_from_user(dst: &addfd, ksize: sizeof(addfd), src: uaddfd, usize: size);
1712	if (ret)
1713	return ret;
1714
1715	if (addfd.newfd_flags & ~O_CLOEXEC)
1716	return -EINVAL;
1717
1718	if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD \| SECCOMP_ADDFD_FLAG_SEND))
1719	return -EINVAL;
1720
1721	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
1722	return -EINVAL;
1723
1724	kaddfd.file = fget(fd: addfd.srcfd);
1725	if (!kaddfd.file)
1726	return -EBADF;
1727
1728	kaddfd.ioctl_flags = addfd.flags;
1729	kaddfd.flags = addfd.newfd_flags;
1730	kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
1731	kaddfd.fd = addfd.newfd;
1732	init_completion(x: &kaddfd.completion);
1733
1734	ret = mutex_lock_interruptible(&filter->notify_lock);
1735	if (ret < `0`)
1736	goto out;
1737
1738	knotif = find_notification(filter, id: addfd.id);
1739	if (!knotif) {
1740	ret = -ENOENT;
1741	goto out_unlock;
1742	}
1743
1744	/*
1745	* We do not want to allow for FD injection to occur before the
1746	* notification has been picked up by a userspace handler, or after
1747	* the notification has been replied to.
1748	*/
1749	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1750	ret = -EINPROGRESS;
1751	goto out_unlock;
1752	}
1753
1754	if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) {
1755	/*
1756	* Disallow queuing an atomic addfd + send reply while there are
1757	* some addfd requests still to process.
1758	*
1759	* There is no clear reason to support it and allows us to keep
1760	* the loop on the other side straight-forward.
1761	*/
1762	if (!list_empty(head: &knotif->addfd)) {
1763	ret = -EBUSY;
1764	goto out_unlock;
1765	}
1766
1767	/ Allow exactly only one reply /
1768	knotif->state = SECCOMP_NOTIFY_REPLIED;
1769	}
1770
1771	list_add(new: &kaddfd.list, head: &knotif->addfd);
1772	complete(&knotif->ready);
1773	mutex_unlock(lock: &filter->notify_lock);
1774
1775	/ Now we wait for it to be processed or be interrupted /
1776	ret = wait_for_completion_interruptible(x: &kaddfd.completion);
1777	if (ret == `0`) {
1778	/*
1779	* We had a successful completion. The other side has already
1780	* removed us from the addfd queue, and
1781	* wait_for_completion_interruptible has a memory barrier upon
1782	* success that lets us read this value directly without
1783	* locking.
1784	*/
1785	ret = kaddfd.ret;
1786	goto out;
1787	}
1788
1789	mutex_lock(&filter->notify_lock);
1790	/*
1791	* Even though we were woken up by a signal and not a successful
1792	* completion, a completion may have happened in the mean time.
1793	*
1794	* We need to check again if the addfd request has been handled,
1795	* and if not, we will remove it from the queue.
1796	*/
1797	if (list_empty(head: &kaddfd.list))
1798	ret = kaddfd.ret;
1799	else
1800	list_del(entry: &kaddfd.list);
1801
1802	out_unlock:
1803	mutex_unlock(lock: &filter->notify_lock);
1804	out:
1805	fput(kaddfd.file);
1806
1807	return ret;
1808	}
1809
1810	static long seccomp_notify_ioctl(struct file file, unsigned* int cmd,
1811	unsigned long arg)
1812	{
1813	struct seccomp_filter *filter = file->private_data;
1814	void __user buf = (void* __user *)arg;
1815
1816	/ Fixed-size ioctls /
1817	switch (cmd) {
1818	case SECCOMP_IOCTL_NOTIF_RECV:
1819	return seccomp_notify_recv(filter, buf);
1820	case SECCOMP_IOCTL_NOTIF_SEND:
1821	return seccomp_notify_send(filter, buf);
1822	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
1823	case SECCOMP_IOCTL_NOTIF_ID_VALID:
1824	return seccomp_notify_id_valid(filter, buf);
1825	case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
1826	return seccomp_notify_set_flags(filter, flags: arg);
1827	}
1828
1829	/ Extensible Argument ioctls /
1830	#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT \| IOCSIZE_MASK))
1831	switch (EA_IOCTL(cmd)) {
1832	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
1833	return seccomp_notify_addfd(filter, uaddfd: buf, _IOC_SIZE(cmd));
1834	default:
1835	return -EINVAL;
1836	}
1837	}
1838
1839	static __poll_t seccomp_notify_poll(struct file *file,
1840	struct poll_table_struct *poll_tab)
1841	{
1842	struct seccomp_filter *filter = file->private_data;
1843	__poll_t ret = `0`;
1844	struct seccomp_knotif *cur;
1845
1846	poll_wait(filp: file, wait_address: &filter->wqh, p: poll_tab);
1847
1848	if (mutex_lock_interruptible(&filter->notify_lock) < `0`)
1849	return EPOLLERR;
1850
1851	list_for_each_entry(cur, &filter->notif->notifications, list) {
1852	if (cur->state == SECCOMP_NOTIFY_INIT)
1853	ret \|= EPOLLIN \| EPOLLRDNORM;
1854	if (cur->state == SECCOMP_NOTIFY_SENT)
1855	ret \|= EPOLLOUT \| EPOLLWRNORM;
1856	if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1857	break;
1858	}
1859
1860	mutex_unlock(lock: &filter->notify_lock);
1861
1862	if (refcount_read(r: &filter->users) == `0`)
1863	ret \|= EPOLLHUP;
1864
1865	return ret;
1866	}
1867
1868	static const struct file_operations seccomp_notify_ops = {
1869	.poll = seccomp_notify_poll,
1870	.release = seccomp_notify_release,
1871	.unlocked_ioctl = seccomp_notify_ioctl,
1872	.compat_ioctl = seccomp_notify_ioctl,
1873	};
1874
1875	static struct file init_listener(struct* seccomp_filter *filter)
1876	{
1877	struct file *ret;
1878
1879	ret = ERR_PTR(error: -ENOMEM);
1880	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
1881	if (!filter->notif)
1882	goto out;
1883
1884	filter->notif->next_id = get_random_u64();
1885	INIT_LIST_HEAD(list: &filter->notif->notifications);
1886
1887	ret = anon_inode_getfile(name: "seccomp notify", fops: &seccomp_notify_ops,
1888	priv: filter, O_RDWR);
1889	if (IS_ERR(ptr: ret))
1890	goto out_notif;
1891
1892	/ The file has a reference to it now /
1893	__get_seccomp_filter(filter);
1894
1895	out_notif:
1896	if (IS_ERR(ptr: ret))
1897	seccomp_notify_free(filter);
1898	out:
1899	return ret;
1900	}
1901
1902	/*
1903	* Does @new_child have a listener while an ancestor also has a listener?
1904	* If so, we'll want to reject this filter.
1905	* This only has to be tested for the current process, even in the TSYNC case,
1906	* because TSYNC installs @child with the same parent on all threads.
1907	* Note that @new_child is not hooked up to its parent at this point yet, so
1908	* we use current->seccomp.filter.
1909	*/
1910	static bool has_duplicate_listener(struct seccomp_filter *new_child)
1911	{
1912	struct seccomp_filter *cur;
1913
1914	/ must be protected against concurrent TSYNC /
1915	lockdep_assert_held(&current->sighand->siglock);
1916
1917	if (!new_child->notif)
1918	return false;
1919	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1920	if (cur->notif)
1921	return true;
1922	}
1923
1924	return false;
1925	}
1926
1927	/**
1928	* seccomp_set_mode_filter: internal function for setting seccomp filter
1929	* @flags: flags to change filter behavior
1930	* @filter: struct sock_fprog containing filter
1931	*
1932	* This function may be called repeatedly to install additional filters.
1933	* Every filter successfully installed will be evaluated (in reverse order)
1934	* for each system call the task makes.
1935	*
1936	* Once current->seccomp.mode is non-zero, it may not be changed.
1937	*
1938	* Returns 0 on success or -EINVAL on failure.
1939	*/
1940	static long seccomp_set_mode_filter(unsigned int flags,
1941	const char __user *filter)
1942	{
1943	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
1944	struct seccomp_filter *prepared = NULL;
1945	long ret = -EINVAL;
1946	int listener = -`1`;
1947	struct file *listener_f = NULL;
1948
1949	/ Validate flags. /
1950	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
1951	return -EINVAL;
1952
1953	/*
1954	* In the successful case, NEW_LISTENER returns the new listener fd.
1955	* But in the failure case, TSYNC returns the thread that died. If you
1956	* combine these two flags, there's no way to tell whether something
1957	* succeeded or failed. So, let's disallow this combination if the user
1958	* has not explicitly requested no errors from TSYNC.
1959	*/
1960	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
1961	(flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
1962	((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == `0`))
1963	return -EINVAL;
1964
1965	/*
1966	* The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
1967	* without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
1968	*/
1969	if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
1970	((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == `0`))
1971	return -EINVAL;
1972
1973	/ Prepare the new filter before holding any locks. /
1974	prepared = seccomp_prepare_user_filter(user_filter: filter);
1975	if (IS_ERR(ptr: prepared))
1976	return PTR_ERR(ptr: prepared);
1977
1978	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1979	listener = get_unused_fd_flags(O_CLOEXEC);
1980	if (listener < `0`) {
1981	ret = listener;
1982	goto out_free;
1983	}
1984
1985	listener_f = init_listener(filter: prepared);
1986	if (IS_ERR(ptr: listener_f)) {
1987	put_unused_fd(fd: listener);
1988	ret = PTR_ERR(ptr: listener_f);
1989	goto out_free;
1990	}
1991	}
1992
1993	/*
1994	* Make sure we cannot change seccomp or nnp state via TSYNC
1995	* while another thread is in the middle of calling exec.
1996	*/
1997	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
1998	mutex_lock_killable(&current->signal->cred_guard_mutex))
1999	goto out_put_fd;
2000
2001	spin_lock_irq(lock: &current->sighand->siglock);
2002
2003	if (!seccomp_may_assign_mode(seccomp_mode))
2004	goto out;
2005
2006	if (has_duplicate_listener(new_child: prepared)) {
2007	ret = -EBUSY;
2008	goto out;
2009	}
2010
2011	ret = seccomp_attach_filter(flags, filter: prepared);
2012	if (ret)
2013	goto out;
2014	/ Do not free the successfully attached filter. /
2015	prepared = NULL;
2016
2017	seccomp_assign_mode(current, seccomp_mode, flags);
2018	out:
2019	spin_unlock_irq(lock: &current->sighand->siglock);
2020	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
2021	mutex_unlock(lock: &current->signal->cred_guard_mutex);
2022	out_put_fd:
2023	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
2024	if (ret) {
2025	listener_f->private_data = NULL;
2026	fput(listener_f);
2027	put_unused_fd(fd: listener);
2028	seccomp_notify_detach(filter: prepared);
2029	} else {
2030	fd_install(fd: listener, file: listener_f);
2031	ret = listener;
2032	}
2033	}
2034	out_free:
2035	seccomp_filter_free(filter: prepared);
2036	return ret;
2037	}
2038	#else
2039	static inline long seccomp_set_mode_filter(unsigned int flags,
2040	const char __user *filter)
2041	{
2042	return -EINVAL;
2043	}
2044	#endif
2045
2046	static long seccomp_get_action_avail(const char __user *uaction)
2047	{
2048	u32 action;
2049
2050	if (copy_from_user(to: &action, from: uaction, n: sizeof(action)))
2051	return -EFAULT;
2052
2053	switch (action) {
2054	case SECCOMP_RET_KILL_PROCESS:
2055	case SECCOMP_RET_KILL_THREAD:
2056	case SECCOMP_RET_TRAP:
2057	case SECCOMP_RET_ERRNO:
2058	case SECCOMP_RET_USER_NOTIF:
2059	case SECCOMP_RET_TRACE:
2060	case SECCOMP_RET_LOG:
2061	case SECCOMP_RET_ALLOW:
2062	break;
2063	default:
2064	return -EOPNOTSUPP;
2065	}
2066
2067	return `0`;
2068	}
2069
2070	static long seccomp_get_notif_sizes(void __user *usizes)
2071	{
2072	struct seccomp_notif_sizes sizes = {
2073	.seccomp_notif = sizeof(struct seccomp_notif),
2074	.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
2075	.seccomp_data = sizeof(struct seccomp_data),
2076	};
2077
2078	if (copy_to_user(to: usizes, from: &sizes, n: sizeof(sizes)))
2079	return -EFAULT;
2080
2081	return `0`;
2082	}
2083
2084	/ Common entry point for both prctl and syscall. /
2085	static long do_seccomp(unsigned int op, unsigned int flags,
2086	void __user *uargs)
2087	{
2088	switch (op) {
2089	case SECCOMP_SET_MODE_STRICT:
2090	if (flags != `0` \|\| uargs != NULL)
2091	return -EINVAL;
2092	return seccomp_set_mode_strict();
2093	case SECCOMP_SET_MODE_FILTER:
2094	return seccomp_set_mode_filter(flags, filter: uargs);
2095	case SECCOMP_GET_ACTION_AVAIL:
2096	if (flags != `0`)
2097	return -EINVAL;
2098
2099	return seccomp_get_action_avail(uaction: uargs);
2100	case SECCOMP_GET_NOTIF_SIZES:
2101	if (flags != `0`)
2102	return -EINVAL;
2103
2104	return seccomp_get_notif_sizes(usizes: uargs);
2105	default:
2106	return -EINVAL;
2107	}
2108	}
2109
2110	SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
2111	void __user *, uargs)
2112	{
2113	return do_seccomp(op, flags, uargs);
2114	}
2115
2116	/**
2117	* prctl_set_seccomp: configures current->seccomp.mode
2118	* @seccomp_mode: requested mode to use
2119	* @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
2120	*
2121	* Returns 0 on success or -EINVAL on failure.
2122	*/
2123	long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
2124	{
2125	unsigned int op;
2126	void __user *uargs;
2127
2128	switch (seccomp_mode) {
2129	case SECCOMP_MODE_STRICT:
2130	op = SECCOMP_SET_MODE_STRICT;
2131	/*
2132	* Setting strict mode through prctl always ignored filter,
2133	* so make sure it is always NULL here to pass the internal
2134	* check in do_seccomp().
2135	*/
2136	uargs = NULL;
2137	break;
2138	case SECCOMP_MODE_FILTER:
2139	op = SECCOMP_SET_MODE_FILTER;
2140	uargs = filter;
2141	break;
2142	default:
2143	return -EINVAL;
2144	}
2145
2146	/ prctl interface doesn't have flags, so they are always zero. /
2147	return do_seccomp(op, flags: `0`, uargs);
2148	}
2149
2150	#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
2151	static struct seccomp_filter get_nth_filter(struct* task_struct *task,
2152	unsigned long filter_off)
2153	{
2154	struct seccomp_filter orig, filter;
2155	unsigned long count;
2156
2157	/*
2158	* Note: this is only correct because the caller should be the (ptrace)
2159	* tracer of the task, otherwise lock_task_sighand is needed.
2160	*/
2161	spin_lock_irq(lock: &task->sighand->siglock);
2162
2163	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
2164	spin_unlock_irq(lock: &task->sighand->siglock);
2165	return ERR_PTR(error: -EINVAL);
2166	}
2167
2168	orig = task->seccomp.filter;
2169	__get_seccomp_filter(filter: orig);
2170	spin_unlock_irq(lock: &task->sighand->siglock);
2171
2172	count = `0`;
2173	for (filter = orig; filter; filter = filter->prev)
2174	count++;
2175
2176	if (filter_off >= count) {
2177	filter = ERR_PTR(error: -ENOENT);
2178	goto out;
2179	}
2180
2181	count -= filter_off;
2182	for (filter = orig; filter && count > `1`; filter = filter->prev)
2183	count--;
2184
2185	if (WARN_ON(count != `1` \|\| !filter)) {
2186	filter = ERR_PTR(error: -ENOENT);
2187	goto out;
2188	}
2189
2190	__get_seccomp_filter(filter);
2191
2192	out:
2193	__put_seccomp_filter(orig);
2194	return filter;
2195	}
2196
2197	long seccomp_get_filter(struct task_struct task, unsigned* long filter_off,
2198	void __user *data)
2199	{
2200	struct seccomp_filter *filter;
2201	struct sock_fprog_kern *fprog;
2202	long ret;
2203
2204	if (!capable(CAP_SYS_ADMIN) \|\|
2205	current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2206	return -EACCES;
2207	}
2208
2209	filter = get_nth_filter(task, filter_off);
2210	if (IS_ERR(ptr: filter))
2211	return PTR_ERR(ptr: filter);
2212
2213	fprog = filter->prog->orig_prog;
2214	if (!fprog) {
2215	/ This must be a new non-cBPF filter, since we save*
2216	* every cBPF filter's orig_prog above when
2217	* CONFIG_CHECKPOINT_RESTORE is enabled.
2218	*/
2219	ret = -EMEDIUMTYPE;
2220	goto out;
2221	}
2222
2223	ret = fprog->len;
2224	if (!data)
2225	goto out;
2226
2227	if (copy_to_user(to: data, from: fprog->filter, bpf_classic_proglen(fprog)))
2228	ret = -EFAULT;
2229
2230	out:
2231	__put_seccomp_filter(orig: filter);
2232	return ret;
2233	}
2234
2235	long seccomp_get_metadata(struct task_struct *task,
2236	unsigned long size, void __user *data)
2237	{
2238	long ret;
2239	struct seccomp_filter *filter;
2240	struct seccomp_metadata kmd = {};
2241
2242	if (!capable(CAP_SYS_ADMIN) \|\|
2243	current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2244	return -EACCES;
2245	}
2246
2247	size = min_t(unsigned long, size, sizeof(kmd));
2248
2249	if (size < sizeof(kmd.filter_off))
2250	return -EINVAL;
2251
2252	if (copy_from_user(to: &kmd.filter_off, from: data, n: sizeof(kmd.filter_off)))
2253	return -EFAULT;
2254
2255	filter = get_nth_filter(task, filter_off: kmd.filter_off);
2256	if (IS_ERR(ptr: filter))
2257	return PTR_ERR(ptr: filter);
2258
2259	if (filter->log)
2260	kmd.flags \|= SECCOMP_FILTER_FLAG_LOG;
2261
2262	ret = size;
2263	if (copy_to_user(to: data, from: &kmd, n: size))
2264	ret = -EFAULT;
2265
2266	__put_seccomp_filter(orig: filter);
2267	return ret;
2268	}
2269	#endif
2270
2271	#ifdef CONFIG_SYSCTL
2272
/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_PROCESS_NAME	"kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
#define SECCOMP_RET_TRAP_NAME		"trap"
#define SECCOMP_RET_ERRNO_NAME		"errno"
#define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
#define SECCOMP_RET_TRACE_NAME		"trace"
#define SECCOMP_RET_LOG_NAME		"log"
#define SECCOMP_RET_ALLOW_NAME		"allow"
2282
2283	static const char seccomp_actions_avail[] =
2284	SECCOMP_RET_KILL_PROCESS_NAME " "
2285	SECCOMP_RET_KILL_THREAD_NAME " "
2286	SECCOMP_RET_TRAP_NAME " "
2287	SECCOMP_RET_ERRNO_NAME " "
2288	SECCOMP_RET_USER_NOTIF_NAME " "
2289	SECCOMP_RET_TRACE_NAME " "
2290	SECCOMP_RET_LOG_NAME " "
2291	SECCOMP_RET_ALLOW_NAME;
2292
2293	struct seccomp_log_name {
2294	u32 log;
2295	const char *name;
2296	};
2297
2298	static const struct seccomp_log_name seccomp_log_names[] = {
2299	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
2300	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
2301	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
2302	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
2303	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
2304	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
2305	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
2306	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
2307	{ }
2308	};
2309
2310	static bool seccomp_names_from_actions_logged(char *names, size_t size,
2311	u32 actions_logged,
2312	const char *sep)
2313	{
2314	const struct seccomp_log_name *cur;
2315	bool append_sep = false;
2316
2317	for (cur = seccomp_log_names; cur->name && size; cur++) {
2318	ssize_t ret;
2319
2320	if (!(actions_logged & cur->log))
2321	continue;
2322
2323	if (append_sep) {
2324	ret = strscpy(names, sep, size);
2325	if (ret < `0`)
2326	return false;
2327
2328	names += ret;
2329	size -= ret;
2330	} else
2331	append_sep = true;
2332
2333	ret = strscpy(names, cur->name, size);
2334	if (ret < `0`)
2335	return false;
2336
2337	names += ret;
2338	size -= ret;
2339	}
2340
2341	return true;
2342	}
2343
2344	static bool seccomp_action_logged_from_name(u32 *action_logged,
2345	const char *name)
2346	{
2347	const struct seccomp_log_name *cur;
2348
2349	for (cur = seccomp_log_names; cur->name; cur++) {
2350	if (!strcmp(cur->name, name)) {
2351	*action_logged = cur->log;
2352	return true;
2353	}
2354	}
2355
2356	return false;
2357	}
2358
2359	static bool seccomp_actions_logged_from_names(u32 actions_logged, char* *names)
2360	{
2361	char *name;
2362
2363	*actions_logged = `0`;
2364	while ((name = strsep(&names, " ")) && *name) {
2365	u32 action_logged = `0`;
2366
2367	if (!seccomp_action_logged_from_name(action_logged: &action_logged, name))
2368	return false;
2369
2370	*actions_logged \|= action_logged;
2371	}
2372
2373	return true;
2374	}
2375
2376	static int read_actions_logged(const struct ctl_table ro_table, void* *buffer,
2377	size_t lenp, loff_t ppos)
2378	{
2379	char names[sizeof(seccomp_actions_avail)];
2380	struct ctl_table table;
2381
2382	memset(names, `0`, sizeof(names));
2383
2384	if (!seccomp_names_from_actions_logged(names, size: sizeof(names),
2385	actions_logged: seccomp_actions_logged, sep: " "))
2386	return -EINVAL;
2387
2388	table = *ro_table;
2389	table.data = names;
2390	table.maxlen = sizeof(names);
2391	return proc_dostring(&table, `0`, buffer, lenp, ppos);
2392	}
2393
2394	static int write_actions_logged(const struct ctl_table ro_table, void* *buffer,
2395	size_t lenp, loff_t ppos, u32 *actions_logged)
2396	{
2397	char names[sizeof(seccomp_actions_avail)];
2398	struct ctl_table table;
2399	int ret;
2400
2401	if (!capable(CAP_SYS_ADMIN))
2402	return -EPERM;
2403
2404	memset(names, `0`, sizeof(names));
2405
2406	table = *ro_table;
2407	table.data = names;
2408	table.maxlen = sizeof(names);
2409	ret = proc_dostring(&table, `1`, buffer, lenp, ppos);
2410	if (ret)
2411	return ret;
2412
2413	if (!seccomp_actions_logged_from_names(actions_logged, names: table.data))
2414	return -EINVAL;
2415
2416	if (*actions_logged & SECCOMP_LOG_ALLOW)
2417	return -EINVAL;
2418
2419	seccomp_actions_logged = *actions_logged;
2420	return `0`;
2421	}
2422
2423	static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
2424	int ret)
2425	{
2426	char names[sizeof(seccomp_actions_avail)];
2427	char old_names[sizeof(seccomp_actions_avail)];
2428	const char *new = names;
2429	const char *old = old_names;
2430
2431	if (!audit_enabled)
2432	return;
2433
2434	memset(names, `0`, sizeof(names));
2435	memset(old_names, `0`, sizeof(old_names));
2436
2437	if (ret)
2438	new = "?";
2439	else if (!actions_logged)
2440	new = "(none)";
2441	else if (!seccomp_names_from_actions_logged(names, size: sizeof(names),
2442	actions_logged, sep: ","))
2443	new = "?";
2444
2445	if (!old_actions_logged)
2446	old = "(none)";
2447	else if (!seccomp_names_from_actions_logged(names: old_names,
2448	size: sizeof(old_names),
2449	actions_logged: old_actions_logged, sep: ","))
2450	old = "?";
2451
2452	return audit_seccomp_actions_logged(names: new, old_names: old, res: !ret);
2453	}
2454
2455	static int seccomp_actions_logged_handler(const struct ctl_table ro_table, int* write,
2456	void buffer, size_t lenp,
2457	loff_t *ppos)
2458	{
2459	int ret;
2460
2461	if (write) {
2462	u32 actions_logged = `0`;
2463	u32 old_actions_logged = seccomp_actions_logged;
2464
2465	ret = write_actions_logged(ro_table, buffer, lenp, ppos,
2466	actions_logged: &actions_logged);
2467	audit_actions_logged(actions_logged, old_actions_logged, ret);
2468	} else
2469	ret = read_actions_logged(ro_table, buffer, lenp, ppos);
2470
2471	return ret;
2472	}
2473
2474	static const struct ctl_table seccomp_sysctl_table[] = {
2475	{
2476	.procname = "actions_avail",
2477	.data = (void *) &seccomp_actions_avail,
2478	.maxlen = sizeof(seccomp_actions_avail),
2479	.mode = `0444`,
2480	.proc_handler = proc_dostring,
2481	},
2482	{
2483	.procname = "actions_logged",
2484	.mode = `0644`,
2485	.proc_handler = seccomp_actions_logged_handler,
2486	},
2487	};
2488
2489	static int __init seccomp_sysctl_init(void)
2490	{
2491	register_sysctl_init("kernel/seccomp", seccomp_sysctl_table);
2492	return `0`;
2493	}
2494
2495	device_initcall(seccomp_sysctl_init)
2496
2497	#endif /* CONFIG_SYSCTL */
2498
2499	#ifdef CONFIG_SECCOMP_CACHE_DEBUG
2500	/ Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE /
2501	static void proc_pid_seccomp_cache_arch(struct seq_file m, const* char *name,
2502	const void *bitmap, size_t bitmap_size)
2503	{
2504	int nr;
2505
2506	for (nr = `0`; nr < bitmap_size; nr++) {
2507	bool cached = test_bit(nr, bitmap);
2508	char *status = cached ? "ALLOW" : "FILTER";
2509
2510	seq_printf(m, fmt: "%s %d %s\n", name, nr, status);
2511	}
2512	}
2513
2514	int proc_pid_seccomp_cache(struct seq_file m, struct* pid_namespace *ns,
2515	struct pid pid, struct* task_struct *task)
2516	{
2517	struct seccomp_filter *f;
2518	unsigned long flags;
2519
2520	/*
2521	* We don't want some sandboxed process to know what their seccomp
2522	* filters consist of.
2523	*/
2524	if (!file_ns_capable(file: m->file, ns: &init_user_ns, CAP_SYS_ADMIN))
2525	return -EACCES;
2526
2527	if (!lock_task_sighand(task, flags: &flags))
2528	return -ESRCH;
2529
2530	f = READ_ONCE(task->seccomp.filter);
2531	if (!f) {
2532	unlock_task_sighand(task, flags: &flags);
2533	return `0`;
2534	}
2535
2536	/ prevent filter from being freed while we are printing it /
2537	__get_seccomp_filter(filter: f);
2538	unlock_task_sighand(task, flags: &flags);
2539
2540	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
2541	bitmap: f->cache.allow_native,
2542	SECCOMP_ARCH_NATIVE_NR);
2543
2544	#ifdef SECCOMP_ARCH_COMPAT
2545	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
2546	bitmap: f->cache.allow_compat,
2547	SECCOMP_ARCH_COMPAT_NR);
2548	#endif /* SECCOMP_ARCH_COMPAT */
2549
2550	__put_seccomp_filter(orig: f);
2551	return `0`;
2552	}
2553	#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
2554

source code of linux/kernel/seccomp.c