| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> |
| 4 | * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) |
| 5 | */ |
| 6 | |
| 7 | #include <stdio.h> |
| 8 | #include <stdlib.h> |
| 9 | #include <stdarg.h> |
| 10 | #include <unistd.h> |
| 11 | #include <errno.h> |
| 12 | #include <fcntl.h> |
| 13 | #include <sched.h> |
| 14 | #include <signal.h> |
| 15 | #include <string.h> |
| 16 | #include <sys/mman.h> |
| 17 | #include <sys/stat.h> |
| 18 | #include <sys/wait.h> |
| 19 | #include <sys/time.h> |
| 20 | #include <sys/resource.h> |
| 21 | #include <asm/ldt.h> |
| 22 | #include <asm/unistd.h> |
| 23 | #include <init.h> |
| 24 | #include <os.h> |
| 25 | #include <smp.h> |
| 26 | #include <kern_util.h> |
| 27 | #include <mem_user.h> |
| 28 | #include <ptrace_user.h> |
| 29 | #include <stdbool.h> |
| 30 | #include <stub-data.h> |
| 31 | #include <sys/prctl.h> |
| 32 | #include <linux/seccomp.h> |
| 33 | #include <linux/filter.h> |
| 34 | #include <sysdep/mcontext.h> |
| 35 | #include <sysdep/stub.h> |
| 36 | #include <registers.h> |
| 37 | #include <skas.h> |
| 38 | #include "internal.h" |
| 39 | |
/*
 * Body of the forked helper process used by the ptrace checks.
 *
 * The child asks to be traced, stops itself with SIGSTOP so that the parent
 * can take control, and then performs exactly one os_getpid() call.  What
 * that call reports is turned into the exit status:
 *
 *   0 - the parent's pid came back (the tracer changed the call or its result)
 *   1 - our own pid came back (nothing was modified)
 *   2 - anything else
 *
 * This function never returns.
 */
static void ptrace_child(void)
{
	/* Calling os_getpid because some libcs cached getpid incorrectly */
	const int self = os_getpid();
	const int parent = getppid();
	int observed;

	if (change_sig(SIGWINCH, 0) < 0 ||
	    ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
		perror("ptrace");
		kill(self, SIGKILL);
	}
	kill(self, SIGSTOP);

	/*
	 * This syscall will be intercepted by the parent. Don't call more than
	 * once, please.
	 */
	observed = os_getpid();

	/* Nothing modified by the parent, we are running normally. */
	if (observed == self)
		exit(1);

	/*
	 * Expected in check_ptrace and check_sysemu when they succeed
	 * in modifying the stack frame
	 */
	if (observed == parent)
		exit(0);

	/*
	 * Serious trouble! This could be caused by a bug in host 2.6
	 * SKAS3/2.6 patch before release -V6, together with a bug in
	 * the UML code itself.
	 */
	exit(2);
}
| 78 | |
/*
 * Report a failed library/system call and give up.
 *
 * Prints @str followed by the description of the current errno value (via
 * perror()) and terminates the process with exit status 1.
 */
static void fatal_perror(const char *str)
{
	perror(str);
	exit(1);
}
| 84 | |
/*
 * Print a printf-style message to stderr and terminate with exit status 1.
 *
 * @fmt: printf format string; it is only read, hence const (callers pass
 *       string literals).
 * @...: arguments consumed by @fmt.
 *
 * No newline is appended; callers include one in @fmt when wanted.
 */
static void fatal(const char *fmt, ...)
{
	va_list list;

	va_start(list, fmt);
	vfprintf(stderr, fmt, list);
	va_end(list);

	exit(1);
}
| 95 | |
/*
 * Print a printf-style diagnostic to stderr and return to the caller.
 *
 * @fmt: printf format string; it is only read, hence const (callers pass
 *       string literals).
 * @...: arguments consumed by @fmt.
 *
 * No newline is appended; callers include one in @fmt when wanted.
 */
static void non_fatal(const char *fmt, ...)
{
	va_list list;

	va_start(list, fmt);
	vfprintf(stderr, fmt, list);
	va_end(list);
}
| 104 | |
/*
 * Fork a child that runs ptrace_child() and wait until it has stopped itself
 * with SIGSTOP.
 *
 * Returns the pid of the stopped child.  Any failure (fork, waitpid, or the
 * child not being stopped by SIGSTOP) terminates the process through
 * fatal()/fatal_perror().
 *
 * The diagnostics name this function rather than check_ptrace, because it is
 * also called from check_sysemu() and os_early_checks().
 */
static int start_ptraced_child(void)
{
	int pid, n, status;

	/* presumably so buffered output is not duplicated in the child */
	fflush(stdout);

	pid = fork();
	if (pid < 0)
		fatal_perror("start_ptraced_child : fork failed");
	if (pid == 0)
		ptrace_child();	/* exits, never comes back here */

	CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
	if (n < 0)
		fatal_perror("start_ptraced_child : waitpid failed");
	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
		fatal("start_ptraced_child : expected SIGSTOP, got status = %d\n",
		      status);

	return pid;
}
| 126 | |
/*
 * Let a stopped, traced child run to completion and verify how it ended.
 *
 * @pid:      child previously returned by start_ptraced_child().
 * @exitcode: exit status the child is expected to terminate with.
 *
 * Terminates the process (fatal) if the child cannot be continued, cannot be
 * waited for, did not exit normally, or exited with a different status.
 */
static void stop_ptraced_child(int pid, int exitcode)
{
	int status, n;

	if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
		fatal_perror("stop_ptraced_child : ptrace failed");

	CATCH_EINTR(n = waitpid(pid, &status, 0));
	/* status is only meaningful when waitpid() actually succeeded */
	if (n < 0)
		fatal_perror("stop_ptraced_child : waitpid failed");

	if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
		int exit_with = WEXITSTATUS(status);

		fatal("stop_ptraced_child : child exited with exitcode %d, "
		      "while expecting %d; status 0x%x\n", exit_with,
		      exitcode, status);
	}
}
| 142 | |
| 143 | static void __init check_sysemu(void) |
| 144 | { |
| 145 | int pid, n, status, count=0; |
| 146 | |
| 147 | os_info("Checking syscall emulation for ptrace..." ); |
| 148 | pid = start_ptraced_child(); |
| 149 | |
| 150 | if ((ptrace(PTRACE_SETOPTIONS, pid, 0, |
| 151 | (void *) PTRACE_O_TRACESYSGOOD) < 0)) |
| 152 | fatal_perror(str: "check_sysemu: PTRACE_SETOPTIONS failed" ); |
| 153 | |
| 154 | while (1) { |
| 155 | count++; |
| 156 | if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0) |
| 157 | goto fail; |
| 158 | CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); |
| 159 | if (n < 0) |
| 160 | fatal_perror(str: "check_sysemu: wait failed" ); |
| 161 | |
| 162 | if (WIFSTOPPED(status) && |
| 163 | (WSTOPSIG(status) == (SIGTRAP|0x80))) { |
| 164 | if (!count) { |
| 165 | non_fatal(fmt: "check_sysemu: SYSEMU_SINGLESTEP " |
| 166 | "doesn't singlestep" ); |
| 167 | goto fail; |
| 168 | } |
| 169 | n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, |
| 170 | os_getpid()); |
| 171 | if (n < 0) |
| 172 | fatal_perror(str: "check_sysemu : failed to modify " |
| 173 | "system call return" ); |
| 174 | break; |
| 175 | } |
| 176 | else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) |
| 177 | count++; |
| 178 | else { |
| 179 | non_fatal(fmt: "check_sysemu: expected SIGTRAP or " |
| 180 | "(SIGTRAP | 0x80), got status = %d\n" , |
| 181 | status); |
| 182 | goto fail; |
| 183 | } |
| 184 | } |
| 185 | stop_ptraced_child(pid, exitcode: 0); |
| 186 | |
| 187 | os_info("OK\n" ); |
| 188 | |
| 189 | return; |
| 190 | |
| 191 | fail: |
| 192 | stop_ptraced_child(pid, exitcode: 1); |
| 193 | fatal(fmt: "missing\n" ); |
| 194 | } |
| 195 | |
| 196 | static void __init check_ptrace(void) |
| 197 | { |
| 198 | int pid, syscall, n, status; |
| 199 | |
| 200 | os_info("Checking that ptrace can change system call numbers..." ); |
| 201 | pid = start_ptraced_child(); |
| 202 | |
| 203 | if ((ptrace(PTRACE_SETOPTIONS, pid, 0, |
| 204 | (void *) PTRACE_O_TRACESYSGOOD) < 0)) |
| 205 | fatal_perror(str: "check_ptrace: PTRACE_SETOPTIONS failed" ); |
| 206 | |
| 207 | while (1) { |
| 208 | if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) |
| 209 | fatal_perror(str: "check_ptrace : ptrace failed" ); |
| 210 | |
| 211 | CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); |
| 212 | if (n < 0) |
| 213 | fatal_perror(str: "check_ptrace : wait failed" ); |
| 214 | |
| 215 | if (!WIFSTOPPED(status) || |
| 216 | (WSTOPSIG(status) != (SIGTRAP | 0x80))) |
| 217 | fatal(fmt: "check_ptrace : expected (SIGTRAP|0x80), " |
| 218 | "got status = %d" , status); |
| 219 | |
| 220 | syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, |
| 221 | 0); |
| 222 | if (syscall == __NR_getpid) { |
| 223 | n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, |
| 224 | __NR_getppid); |
| 225 | if (n < 0) |
| 226 | fatal_perror(str: "check_ptrace : failed to modify " |
| 227 | "system call" ); |
| 228 | break; |
| 229 | } |
| 230 | } |
| 231 | stop_ptraced_child(pid, exitcode: 0); |
| 232 | os_info("OK\n" ); |
| 233 | check_sysemu(); |
| 234 | } |
| 235 | |
| 236 | extern unsigned long host_fp_size; |
| 237 | extern unsigned long exec_regs[MAX_REG_NR]; |
| 238 | extern unsigned long *exec_fp_regs; |
| 239 | |
| 240 | __initdata static struct stub_data *seccomp_test_stub_data; |
| 241 | |
| 242 | static void __init sigsys_handler(int sig, siginfo_t *info, void *p) |
| 243 | { |
| 244 | ucontext_t *uc = p; |
| 245 | |
| 246 | /* Stow away the location of the mcontext in the stack */ |
| 247 | seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - |
| 248 | (unsigned long)&seccomp_test_stub_data->sigstack[0]; |
| 249 | |
| 250 | /* Prevent libc from clearing memory (mctx_offset in particular) */ |
| 251 | syscall(__NR_exit, 0); |
| 252 | } |
| 253 | |
| 254 | static int __init seccomp_helper(void *data) |
| 255 | { |
| 256 | static struct sock_filter filter[] = { |
| 257 | BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
| 258 | offsetof(struct seccomp_data, nr)), |
| 259 | BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), |
| 260 | BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), |
| 261 | BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), |
| 262 | }; |
| 263 | static struct sock_fprog prog = { |
| 264 | .len = ARRAY_SIZE(filter), |
| 265 | .filter = filter, |
| 266 | }; |
| 267 | struct sigaction sa; |
| 268 | |
| 269 | /* close_range is needed for the stub */ |
| 270 | if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) |
| 271 | exit(1); |
| 272 | |
| 273 | set_sigstack(seccomp_test_stub_data->sigstack, |
| 274 | sizeof(seccomp_test_stub_data->sigstack)); |
| 275 | |
| 276 | sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; |
| 277 | sa.sa_sigaction = (void *) sigsys_handler; |
| 278 | sa.sa_restorer = NULL; |
| 279 | if (sigaction(SIGSYS, &sa, NULL) < 0) |
| 280 | exit(2); |
| 281 | |
| 282 | prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); |
| 283 | if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, |
| 284 | SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) |
| 285 | exit(3); |
| 286 | |
| 287 | sleep(0); |
| 288 | |
| 289 | /* Never reached. */ |
| 290 | _exit(4); |
| 291 | } |
| 292 | |
| 293 | static bool __init init_seccomp(void) |
| 294 | { |
| 295 | int pid; |
| 296 | int status; |
| 297 | int n; |
| 298 | unsigned long sp; |
| 299 | |
| 300 | /* |
| 301 | * We check that we can install a seccomp filter and then exit(0) |
| 302 | * from a trapped syscall. |
| 303 | * |
| 304 | * Note that we cannot verify that no seccomp filter already exists |
| 305 | * for a syscall that results in the process/thread to be killed. |
| 306 | */ |
| 307 | |
| 308 | os_info("Checking that seccomp filters can be installed..." ); |
| 309 | |
| 310 | seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), |
| 311 | PROT_READ | PROT_WRITE, |
| 312 | MAP_SHARED | MAP_ANON, 0, 0); |
| 313 | |
| 314 | /* Use the syscall data area as stack, we just need something */ |
| 315 | sp = (unsigned long)&seccomp_test_stub_data->syscall_data + |
| 316 | sizeof(seccomp_test_stub_data->syscall_data) - |
| 317 | sizeof(void *); |
| 318 | pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); |
| 319 | |
| 320 | if (pid < 0) |
| 321 | fatal_perror(str: "check_seccomp : clone failed" ); |
| 322 | |
| 323 | CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); |
| 324 | if (n < 0) |
| 325 | fatal_perror(str: "check_seccomp : waitpid failed" ); |
| 326 | |
| 327 | if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { |
| 328 | struct uml_pt_regs *regs; |
| 329 | unsigned long fp_size; |
| 330 | int r; |
| 331 | |
| 332 | /* Fill in the host_fp_size from the mcontext. */ |
| 333 | regs = calloc(1, sizeof(struct uml_pt_regs)); |
| 334 | get_stub_state(regs, seccomp_test_stub_data, &fp_size); |
| 335 | host_fp_size = fp_size; |
| 336 | free(regs); |
| 337 | |
| 338 | /* Repeat with the correct size */ |
| 339 | regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); |
| 340 | r = get_stub_state(regs, seccomp_test_stub_data, NULL); |
| 341 | |
| 342 | /* Store as the default startup registers */ |
| 343 | exec_fp_regs = malloc(host_fp_size); |
| 344 | memcpy(exec_regs, regs->gp, sizeof(exec_regs)); |
| 345 | memcpy(exec_fp_regs, regs->fp, host_fp_size); |
| 346 | |
| 347 | munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); |
| 348 | |
| 349 | free(regs); |
| 350 | |
| 351 | if (r) { |
| 352 | os_info("failed to fetch registers: %d\n" , r); |
| 353 | return false; |
| 354 | } |
| 355 | |
| 356 | os_info("OK\n" ); |
| 357 | return true; |
| 358 | } |
| 359 | |
| 360 | if (WIFEXITED(status) && WEXITSTATUS(status) == 2) |
| 361 | os_info("missing\n" ); |
| 362 | else |
| 363 | os_info("error\n" ); |
| 364 | |
| 365 | munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); |
| 366 | return false; |
| 367 | } |
| 368 | |
| 369 | |
| 370 | static void __init check_coredump_limit(void) |
| 371 | { |
| 372 | struct rlimit lim; |
| 373 | int err = getrlimit(RLIMIT_CORE, &lim); |
| 374 | |
| 375 | if (err) { |
| 376 | perror("Getting core dump limit" ); |
| 377 | return; |
| 378 | } |
| 379 | |
| 380 | os_info("Core dump limits :\n\tsoft - " ); |
| 381 | if (lim.rlim_cur == RLIM_INFINITY) |
| 382 | os_info("NONE\n" ); |
| 383 | else |
| 384 | os_info("%llu\n" , (unsigned long long)lim.rlim_cur); |
| 385 | |
| 386 | os_info("\thard - " ); |
| 387 | if (lim.rlim_max == RLIM_INFINITY) |
| 388 | os_info("NONE\n" ); |
| 389 | else |
| 390 | os_info("%llu\n" , (unsigned long long)lim.rlim_max); |
| 391 | } |
| 392 | |
| 393 | void __init get_host_cpu_features( |
| 394 | void (*flags_helper_func)(char *line), |
| 395 | void (*cache_helper_func)(char *line)) |
| 396 | { |
| 397 | FILE *cpuinfo; |
| 398 | char *line = NULL; |
| 399 | size_t len = 0; |
| 400 | int done_parsing = 0; |
| 401 | |
| 402 | cpuinfo = fopen("/proc/cpuinfo" , "r" ); |
| 403 | if (cpuinfo == NULL) { |
| 404 | os_info("Failed to get host CPU features\n" ); |
| 405 | } else { |
| 406 | while ((getline(&line, &len, cpuinfo)) != -1) { |
| 407 | if (strstr(line, "flags" )) { |
| 408 | flags_helper_func(line); |
| 409 | done_parsing++; |
| 410 | } |
| 411 | if (strstr(line, "cache_alignment" )) { |
| 412 | cache_helper_func(line); |
| 413 | done_parsing++; |
| 414 | } |
| 415 | free(line); |
| 416 | line = NULL; |
| 417 | if (done_parsing > 1) |
| 418 | break; |
| 419 | } |
| 420 | fclose(cpuinfo); |
| 421 | } |
| 422 | } |
| 423 | |
| 424 | static int seccomp_config __initdata; |
| 425 | |
| 426 | static int __init uml_seccomp_config(char *line, int *add) |
| 427 | { |
| 428 | *add = 0; |
| 429 | |
| 430 | if (strcmp(line, "off" ) == 0) |
| 431 | seccomp_config = 0; |
| 432 | else if (strcmp(line, "auto" ) == 0) |
| 433 | seccomp_config = 1; |
| 434 | else if (strcmp(line, "on" ) == 0) |
| 435 | seccomp_config = 2; |
| 436 | else |
| 437 | fatal(fmt: "Invalid seccomp option '%s', expected on/auto/off\n" , |
| 438 | line); |
| 439 | |
| 440 | return 0; |
| 441 | } |
| 442 | |
| 443 | __uml_setup("seccomp=" , uml_seccomp_config, |
| 444 | "seccomp=<on/auto/off>\n" |
| 445 | " Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" |
| 446 | " processes work collaboratively with the kernel instead of being\n" |
| 447 | " traced using ptrace. All syscalls from the application are caught and\n" |
| 448 | " redirected using a signal. This signal handler in turn is permitted to\n" |
| 449 | " do the selected set of syscalls to communicate with the UML kernel and\n" |
| 450 | " do the required memory management.\n" |
| 451 | "\n" |
| 452 | " This method is overall faster than the ptrace based userspace, primarily\n" |
| 453 | " because it reduces the number of context switches for (minor) page faults.\n" |
| 454 | "\n" |
| 455 | " However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" |
| 456 | " userspace from reading and writing all physical memory. Userspace\n" |
| 457 | " processes could also trick the stub into disabling SIGALRM which\n" |
| 458 | " prevents it from being interrupted for scheduling purposes.\n" |
| 459 | "\n" |
| 460 | " This is insecure and should only be used with a trusted userspace\n\n" |
| 461 | ); |
| 462 | |
| 463 | void __init os_early_checks(void) |
| 464 | { |
| 465 | int pid; |
| 466 | |
| 467 | /* Print out the core dump limits early */ |
| 468 | check_coredump_limit(); |
| 469 | |
| 470 | /* Need to check this early because mmapping happens before the |
| 471 | * kernel is running. |
| 472 | */ |
| 473 | check_tmpexec(); |
| 474 | |
| 475 | if (seccomp_config) { |
| 476 | if (init_seccomp()) { |
| 477 | using_seccomp = 1; |
| 478 | return; |
| 479 | } |
| 480 | |
| 481 | if (seccomp_config == 2) |
| 482 | fatal(fmt: "SECCOMP userspace requested but not functional!\n" ); |
| 483 | } |
| 484 | |
| 485 | if (uml_ncpus > 1) |
| 486 | fatal(fmt: "SMP is not supported with PTRACE userspace.\n" ); |
| 487 | |
| 488 | using_seccomp = 0; |
| 489 | check_ptrace(); |
| 490 | |
| 491 | pid = start_ptraced_child(); |
| 492 | if (init_pid_registers(pid)) |
| 493 | fatal(fmt: "Failed to initialize default registers" ); |
| 494 | stop_ptraced_child(pid, exitcode: 1); |
| 495 | } |
| 496 | |