1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
4 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
5 */
6
7#include <stdio.h>
8#include <stdlib.h>
9#include <stdarg.h>
10#include <unistd.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <sched.h>
14#include <signal.h>
15#include <string.h>
16#include <sys/mman.h>
17#include <sys/stat.h>
18#include <sys/wait.h>
19#include <sys/time.h>
20#include <sys/resource.h>
21#include <asm/ldt.h>
22#include <asm/unistd.h>
23#include <init.h>
24#include <os.h>
25#include <smp.h>
26#include <kern_util.h>
27#include <mem_user.h>
28#include <ptrace_user.h>
29#include <stdbool.h>
30#include <stub-data.h>
31#include <sys/prctl.h>
32#include <linux/seccomp.h>
33#include <linux/filter.h>
34#include <sysdep/mcontext.h>
35#include <sysdep/stub.h>
36#include <registers.h>
37#include <skas.h>
38#include "internal.h"
39
/*
 * Body of the forked probe child used by the ptrace capability checks.
 *
 * The child asks to be traced, stops itself with SIGSTOP so the parent can
 * take control, and then issues exactly one getpid syscall.  Its exit status
 * tells the parent what that syscall appeared to return:
 *   1 - its own pid (the parent did not interfere),
 *   0 - the parent's pid (the parent rewrote the syscall or its result),
 *   2 - anything else.
 *
 * Never returns.
 */
static void ptrace_child(void)
{
	/* Calling os_getpid because some libcs cached getpid incorrectly */
	int self = os_getpid();
	int parent = getppid();
	int observed;

	if (change_sig(SIGWINCH, 0) < 0 ||
	    ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
		perror("ptrace");
		kill(self, SIGKILL);
	}
	kill(self, SIGSTOP);

	/*
	 * This syscall will be intercepted by the parent. Don't call more than
	 * once, please.
	 */
	observed = os_getpid();

	/* Nothing modified by the parent, we are running normally. */
	if (observed == self)
		exit(1);

	/*
	 * Expected in check_ptrace and check_sysemu when they succeed
	 * in modifying the stack frame
	 */
	if (observed == parent)
		exit(0);

	/*
	 * Serious trouble! This could be caused by a bug in host 2.6
	 * SKAS3/2.6 patch before release -V6, together with a bug in
	 * the UML code itself.
	 */
	exit(2);
}
78
/*
 * Print @str followed by the description of the current errno (via
 * perror()) and terminate the process with exit status 1.
 */
static void fatal_perror(const char *str)
{
	perror(str);
	exit(1);
}
84
/*
 * Print a printf-style message to stderr and terminate the process with
 * exit status 1.  No newline is appended; callers supply their own.
 */
static void fatal(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(1);
}
95
/*
 * Print a printf-style message to stderr and return to the caller.
 * No newline is appended; callers supply their own.
 */
static void non_fatal(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}
104
/*
 * Fork a probe child running ptrace_child() and wait until it has stopped
 * itself with SIGSTOP, i.e. until it is ready to be driven with ptrace.
 *
 * Returns the pid of the stopped child.  Any failure (fork, waitpid, or the
 * child not stopping with SIGSTOP) is fatal and terminates the process.
 *
 * The diagnostics name this function rather than "check_ptrace": it is
 * called from several places, so the old label pointed at the wrong code.
 */
static int start_ptraced_child(void)
{
	int pid, n, status;

	/*
	 * The child leaves through exit(), which flushes stdio; empty the
	 * buffer first so pending output is not emitted a second time.
	 */
	fflush(stdout);

	pid = fork();
	if (pid == 0)
		ptrace_child();		/* does not return */
	else if (pid < 0)
		fatal_perror("start_ptraced_child : fork failed");

	CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
	if (n < 0)
		fatal_perror("start_ptraced_child : waitpid failed");
	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
		fatal("start_ptraced_child : expected SIGSTOP, got status = %d\n",
		      status);

	return pid;
}
126
/*
 * Resume a stopped probe child and reap it.
 *
 * @pid:      child previously returned by start_ptraced_child()
 * @exitcode: exit status the child is expected to terminate with
 *
 * Any deviation (ptrace or waitpid failure, abnormal termination, or a
 * different exit status) is fatal and terminates the process.
 */
static void stop_ptraced_child(int pid, int exitcode)
{
	int status, n;

	if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
		fatal_perror("stop_ptraced_child : ptrace failed");

	CATCH_EINTR(n = waitpid(pid, &status, 0));
	/* status is only written on success; do not inspect it otherwise. */
	if (n < 0)
		fatal_perror("stop_ptraced_child : waitpid failed");

	if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
		int exit_with = WEXITSTATUS(status);
		fatal("stop_ptraced_child : child exited with exitcode %d, "
		      "while expecting %d; status 0x%x\n", exit_with,
		      exitcode, status);
	}
}
142
143static void __init check_sysemu(void)
144{
145 int pid, n, status, count=0;
146
147 os_info("Checking syscall emulation for ptrace...");
148 pid = start_ptraced_child();
149
150 if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
151 (void *) PTRACE_O_TRACESYSGOOD) < 0))
152 fatal_perror(str: "check_sysemu: PTRACE_SETOPTIONS failed");
153
154 while (1) {
155 count++;
156 if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0)
157 goto fail;
158 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
159 if (n < 0)
160 fatal_perror(str: "check_sysemu: wait failed");
161
162 if (WIFSTOPPED(status) &&
163 (WSTOPSIG(status) == (SIGTRAP|0x80))) {
164 if (!count) {
165 non_fatal(fmt: "check_sysemu: SYSEMU_SINGLESTEP "
166 "doesn't singlestep");
167 goto fail;
168 }
169 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET,
170 os_getpid());
171 if (n < 0)
172 fatal_perror(str: "check_sysemu : failed to modify "
173 "system call return");
174 break;
175 }
176 else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP))
177 count++;
178 else {
179 non_fatal(fmt: "check_sysemu: expected SIGTRAP or "
180 "(SIGTRAP | 0x80), got status = %d\n",
181 status);
182 goto fail;
183 }
184 }
185 stop_ptraced_child(pid, exitcode: 0);
186
187 os_info("OK\n");
188
189 return;
190
191fail:
192 stop_ptraced_child(pid, exitcode: 1);
193 fatal(fmt: "missing\n");
194}
195
196static void __init check_ptrace(void)
197{
198 int pid, syscall, n, status;
199
200 os_info("Checking that ptrace can change system call numbers...");
201 pid = start_ptraced_child();
202
203 if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
204 (void *) PTRACE_O_TRACESYSGOOD) < 0))
205 fatal_perror(str: "check_ptrace: PTRACE_SETOPTIONS failed");
206
207 while (1) {
208 if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
209 fatal_perror(str: "check_ptrace : ptrace failed");
210
211 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
212 if (n < 0)
213 fatal_perror(str: "check_ptrace : wait failed");
214
215 if (!WIFSTOPPED(status) ||
216 (WSTOPSIG(status) != (SIGTRAP | 0x80)))
217 fatal(fmt: "check_ptrace : expected (SIGTRAP|0x80), "
218 "got status = %d", status);
219
220 syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
221 0);
222 if (syscall == __NR_getpid) {
223 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
224 __NR_getppid);
225 if (n < 0)
226 fatal_perror(str: "check_ptrace : failed to modify "
227 "system call");
228 break;
229 }
230 }
231 stop_ptraced_child(pid, exitcode: 0);
232 os_info("OK\n");
233 check_sysemu();
234}
235
236extern unsigned long host_fp_size;
237extern unsigned long exec_regs[MAX_REG_NR];
238extern unsigned long *exec_fp_regs;
239
240__initdata static struct stub_data *seccomp_test_stub_data;
241
242static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
243{
244 ucontext_t *uc = p;
245
246 /* Stow away the location of the mcontext in the stack */
247 seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
248 (unsigned long)&seccomp_test_stub_data->sigstack[0];
249
250 /* Prevent libc from clearing memory (mctx_offset in particular) */
251 syscall(__NR_exit, 0);
252}
253
254static int __init seccomp_helper(void *data)
255{
256 static struct sock_filter filter[] = {
257 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
258 offsetof(struct seccomp_data, nr)),
259 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
260 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
261 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
262 };
263 static struct sock_fprog prog = {
264 .len = ARRAY_SIZE(filter),
265 .filter = filter,
266 };
267 struct sigaction sa;
268
269 /* close_range is needed for the stub */
270 if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
271 exit(1);
272
273 set_sigstack(seccomp_test_stub_data->sigstack,
274 sizeof(seccomp_test_stub_data->sigstack));
275
276 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
277 sa.sa_sigaction = (void *) sigsys_handler;
278 sa.sa_restorer = NULL;
279 if (sigaction(SIGSYS, &sa, NULL) < 0)
280 exit(2);
281
282 prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
283 if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
284 SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
285 exit(3);
286
287 sleep(0);
288
289 /* Never reached. */
290 _exit(4);
291}
292
293static bool __init init_seccomp(void)
294{
295 int pid;
296 int status;
297 int n;
298 unsigned long sp;
299
300 /*
301 * We check that we can install a seccomp filter and then exit(0)
302 * from a trapped syscall.
303 *
304 * Note that we cannot verify that no seccomp filter already exists
305 * for a syscall that results in the process/thread to be killed.
306 */
307
308 os_info("Checking that seccomp filters can be installed...");
309
310 seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
311 PROT_READ | PROT_WRITE,
312 MAP_SHARED | MAP_ANON, 0, 0);
313
314 /* Use the syscall data area as stack, we just need something */
315 sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
316 sizeof(seccomp_test_stub_data->syscall_data) -
317 sizeof(void *);
318 pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
319
320 if (pid < 0)
321 fatal_perror(str: "check_seccomp : clone failed");
322
323 CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
324 if (n < 0)
325 fatal_perror(str: "check_seccomp : waitpid failed");
326
327 if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
328 struct uml_pt_regs *regs;
329 unsigned long fp_size;
330 int r;
331
332 /* Fill in the host_fp_size from the mcontext. */
333 regs = calloc(1, sizeof(struct uml_pt_regs));
334 get_stub_state(regs, seccomp_test_stub_data, &fp_size);
335 host_fp_size = fp_size;
336 free(regs);
337
338 /* Repeat with the correct size */
339 regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
340 r = get_stub_state(regs, seccomp_test_stub_data, NULL);
341
342 /* Store as the default startup registers */
343 exec_fp_regs = malloc(host_fp_size);
344 memcpy(exec_regs, regs->gp, sizeof(exec_regs));
345 memcpy(exec_fp_regs, regs->fp, host_fp_size);
346
347 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
348
349 free(regs);
350
351 if (r) {
352 os_info("failed to fetch registers: %d\n", r);
353 return false;
354 }
355
356 os_info("OK\n");
357 return true;
358 }
359
360 if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
361 os_info("missing\n");
362 else
363 os_info("error\n");
364
365 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
366 return false;
367}
368
369
370static void __init check_coredump_limit(void)
371{
372 struct rlimit lim;
373 int err = getrlimit(RLIMIT_CORE, &lim);
374
375 if (err) {
376 perror("Getting core dump limit");
377 return;
378 }
379
380 os_info("Core dump limits :\n\tsoft - ");
381 if (lim.rlim_cur == RLIM_INFINITY)
382 os_info("NONE\n");
383 else
384 os_info("%llu\n", (unsigned long long)lim.rlim_cur);
385
386 os_info("\thard - ");
387 if (lim.rlim_max == RLIM_INFINITY)
388 os_info("NONE\n");
389 else
390 os_info("%llu\n", (unsigned long long)lim.rlim_max);
391}
392
393void __init get_host_cpu_features(
394 void (*flags_helper_func)(char *line),
395 void (*cache_helper_func)(char *line))
396{
397 FILE *cpuinfo;
398 char *line = NULL;
399 size_t len = 0;
400 int done_parsing = 0;
401
402 cpuinfo = fopen("/proc/cpuinfo", "r");
403 if (cpuinfo == NULL) {
404 os_info("Failed to get host CPU features\n");
405 } else {
406 while ((getline(&line, &len, cpuinfo)) != -1) {
407 if (strstr(line, "flags")) {
408 flags_helper_func(line);
409 done_parsing++;
410 }
411 if (strstr(line, "cache_alignment")) {
412 cache_helper_func(line);
413 done_parsing++;
414 }
415 free(line);
416 line = NULL;
417 if (done_parsing > 1)
418 break;
419 }
420 fclose(cpuinfo);
421 }
422}
423
424static int seccomp_config __initdata;
425
426static int __init uml_seccomp_config(char *line, int *add)
427{
428 *add = 0;
429
430 if (strcmp(line, "off") == 0)
431 seccomp_config = 0;
432 else if (strcmp(line, "auto") == 0)
433 seccomp_config = 1;
434 else if (strcmp(line, "on") == 0)
435 seccomp_config = 2;
436 else
437 fatal(fmt: "Invalid seccomp option '%s', expected on/auto/off\n",
438 line);
439
440 return 0;
441}
442
/*
 * Associates the "seccomp=" option prefix with uml_seccomp_config(); the
 * string that follows is the descriptive text for the option.
 * NOTE(review): presumably this text is what the command line help prints;
 * confirm against the __uml_setup definition.
 */
__uml_setup("seccomp=", uml_seccomp_config,
"seccomp=<on/auto/off>\n"
" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
" processes work collaboratively with the kernel instead of being\n"
" traced using ptrace. All syscalls from the application are caught and\n"
" redirected using a signal. This signal handler in turn is permitted to\n"
" do the selected set of syscalls to communicate with the UML kernel and\n"
" do the required memory management.\n"
"\n"
" This method is overall faster than the ptrace based userspace, primarily\n"
" because it reduces the number of context switches for (minor) page faults.\n"
"\n"
" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
" userspace from reading and writing all physical memory. Userspace\n"
" processes could also trick the stub into disabling SIGALRM which\n"
" prevents it from being interrupted for scheduling purposes.\n"
"\n"
" This is insecure and should only be used with a trusted userspace\n\n"
);
462
463void __init os_early_checks(void)
464{
465 int pid;
466
467 /* Print out the core dump limits early */
468 check_coredump_limit();
469
470 /* Need to check this early because mmapping happens before the
471 * kernel is running.
472 */
473 check_tmpexec();
474
475 if (seccomp_config) {
476 if (init_seccomp()) {
477 using_seccomp = 1;
478 return;
479 }
480
481 if (seccomp_config == 2)
482 fatal(fmt: "SECCOMP userspace requested but not functional!\n");
483 }
484
485 if (uml_ncpus > 1)
486 fatal(fmt: "SMP is not supported with PTRACE userspace.\n");
487
488 using_seccomp = 0;
489 check_ptrace();
490
491 pid = start_ptraced_child();
492 if (init_pid_registers(pid))
493 fatal(fmt: "Failed to initialize default registers");
494 stop_ptraced_child(pid, exitcode: 1);
495}
496

/* Source: linux/arch/um/os-Linux/start_up.c */