1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2020 Intel Corporation. |
4 | ** |
5 | ** Permission is hereby granted, free of charge, to any person obtaining a copy |
6 | ** of this software and associated documentation files (the "Software"), to deal |
7 | ** in the Software without restriction, including without limitation the rights |
8 | ** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
9 | ** copies of the Software, and to permit persons to whom the Software is |
10 | ** furnished to do so, subject to the following conditions: |
11 | ** |
12 | ** The above copyright notice and this permission notice shall be included in |
13 | ** all copies or substantial portions of the Software. |
14 | ** |
15 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
16 | ** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
17 | ** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
18 | ** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
19 | ** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
20 | ** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
21 | ** THE SOFTWARE. |
22 | ** |
23 | ****************************************************************************/ |
24 | |
25 | #ifndef _GNU_SOURCE |
26 | # define _GNU_SOURCE |
27 | #endif |
28 | |
29 | #include "forkfd.h" |
30 | |
31 | #include <errno.h> |
32 | #include <fcntl.h> |
33 | #include <limits.h> |
34 | #include <sched.h> |
35 | #include <signal.h> |
36 | #include <stdio.h> |
37 | #include <stdlib.h> |
38 | #include <string.h> |
39 | #include <sys/resource.h> |
40 | #include <sys/syscall.h> |
41 | #include <sys/types.h> |
42 | #include <sys/wait.h> |
43 | #include <unistd.h> |
44 | |
45 | #include "forkfd_atomic.h" |
46 | |
/* Fallback definitions, used only when the system headers do not already
 * provide these constants. */
#ifndef CLONE_PIDFD
# define CLONE_PIDFD 0x00001000
#endif
#ifndef P_PIDFD
# define P_PIDFD 3
#endif

/* NOTE(review): presumably tested by forkfd.c to decide whether
 * system_vforkfd() may be used -- confirm against the including file. */
#define SYSTEM_FORKFD_CAN_VFORK

/* Helpers defined in forkfd.c; only forward-declared here. */
static int convertForkfdWaitFlagsToWaitFlags(int ffdoptions);
static void convertStatusToForkfdInfo(int status, struct forkfd_info *info);

/* Cached result of the CLONE_PIDFD/P_PIDFD detection:
 *    0  detection has not run yet
 *  > 0  the kernel supports what we need
 *  < 0  the kernel does not support it
 */
static ffd_atomic_int system_forkfd_state = FFD_ATOMIC_INIT(0);
61 | |
/*
 * Thin wrapper around the raw waitid system call.
 *
 * The raw call is used instead of the libc waitid() because it takes one
 * more argument (a struct rusage pointer) that glibc does not expose.
 *
 * Returns whatever syscall() returns, narrowed to int.
 */
static int sys_waitid(int which, int pid_or_pidfd, siginfo_t *infop, int options,
                      struct rusage *ru)
{
    long rc = syscall(__NR_waitid, which, pid_or_pidfd, infop, options, ru);
    return (int)rc;
}
69 | |
/*
 * Invoke the raw clone system call with no child stack, no child-tid pointer
 * and no TLS, i.e. with fork()-like semantics plus whatever `cloneflags`
 * requests.
 *
 * cloneflags: flags (and exit signal) passed to the kernel.
 * ptid:       parent-tid pointer passed to the kernel; the callers in this
 *             file use it to receive the pidfd when CLONE_PIDFD is set.
 *
 * The argument order of the raw system call differs between architectures,
 * hence the preprocessor ladder below. On architectures that are not listed,
 * the call is not attempted: errno is set to ENOSYS and -1 is returned.
 *
 * Returns the result of syscall() (narrowed to int), or -1 as described above.
 */
static int sys_clone(unsigned long cloneflags, int *ptid)
{
    void *child_stack = NULL;
    int *ctid = NULL;
    unsigned long newtls = 0;
#if defined(__NR_clone2)
    size_t stack_size = 0;
    return syscall(__NR_clone2, cloneflags, child_stack, stack_size, ptid, ctid, newtls);
#elif defined(__cris__) || defined(__s390__)
    /* a.k.a., CONFIG_CLONE_BACKWARDS2 architectures */
    return syscall(__NR_clone, child_stack, cloneflags, ptid, newtls, ctid);
#elif defined(__microblaze__)
    /* a.k.a., CONFIG_CLONE_BACKWARDS3 architectures */
    size_t stack_size = 0;
    return syscall(__NR_clone, cloneflags, child_stack, stack_size, ptid, newtls, ctid);
#elif defined(__arc__) || defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
    defined(__nds32__) || defined(__hppa__) || defined(__powerpc__) || defined(__i386__) || \
    defined(__x86_64__) || defined(__xtensa__) || defined(__alpha__) || defined(__riscv) || \
    defined(__loongarch__)
    /* ctid and newtls are inverted on CONFIG_CLONE_BACKWARDS architectures,
     * but since both values are 0, there's no harm. */
    return syscall(__NR_clone, cloneflags, child_stack, ptid, ctid, newtls);
#else
    /* Unknown argument order: refuse rather than guess. Nothing is used in
     * this branch, so silence the unused warnings for locals and parameters
     * alike. */
    (void) cloneflags;
    (void) ptid;
    (void) child_stack;
    (void) ctid;
    (void) newtls;
    errno = ENOSYS;
    return -1;
#endif
}
100 | |
101 | static int detect_clone_pidfd_support() |
102 | { |
103 | /* |
104 | * Detect support for CLONE_PIDFD and P_PIDFD. Support was added in steps: |
105 | * - Linux 5.2 added CLONE_PIDFD support in clone(2) system call |
106 | * - Linux 5.2 added pidfd_send_signal(2) |
107 | * - Linux 5.3 added support for poll(2) on pidfds |
108 | * - Linux 5.3 added clone3(2) |
109 | * - Linux 5.4 added P_PIDFD support in waitid(2) |
110 | * |
111 | * We need CLONE_PIDFD and the poll(2) support. We could emulate the |
112 | * P_PIDFD support by reading the PID from /proc/self/fdinfo/n, which works |
113 | * in Linux 5.2, but without poll(2), we can't guarantee the functionality |
114 | * anyway. |
115 | * |
116 | * So we detect by trying to waitid(2) on a positive file descriptor that |
117 | * is definitely closed (INT_MAX). If P_PIDFD is supported, waitid(2) will |
118 | * return EBADF. If it isn't supported, it returns EINVAL (as it would for |
119 | * a negative file descriptor). This will succeed on Linux 5.4. |
120 | * |
121 | * We could have instead detected by the existence of the clone3(2) system |
122 | * call, but for that we would have needed to wait for __NR_clone3 to show |
123 | * up on the libcs. We choose to go via the waitid(2) route, which requires |
124 | * platform-independent constants only. It would have simplified the |
125 | * sys_clone() mess above... |
126 | */ |
127 | |
128 | sys_waitid(P_PIDFD, INT_MAX, NULL, WEXITED|WNOHANG, NULL); |
129 | return errno == EBADF ? 1 : -1; |
130 | } |
131 | |
132 | int system_has_forkfd() |
133 | { |
134 | return ffd_atomic_load(&system_forkfd_state, FFD_ATOMIC_RELAXED) > 0; |
135 | } |
136 | |
137 | static int system_forkfd_availability(void) |
138 | { |
139 | int state = ffd_atomic_load(&system_forkfd_state, FFD_ATOMIC_RELAXED); |
140 | if (state == 0) { |
141 | state = detect_clone_pidfd_support(); |
142 | ffd_atomic_store(&system_forkfd_state, state, FFD_ATOMIC_RELAXED); |
143 | } |
144 | return state; |
145 | } |
146 | |
147 | static int system_forkfd_pidfd_set_flags(int pidfd, int flags) |
148 | { |
149 | if ((flags & FFD_CLOEXEC) == 0) { |
150 | /* pidfd defaults to O_CLOEXEC */ |
151 | fcntl(fd: pidfd, F_SETFD, 0); |
152 | } |
153 | if (flags & FFD_NONBLOCK) |
154 | fcntl(fd: pidfd, F_SETFL, fcntl(fd: pidfd, F_GETFL) | O_NONBLOCK); |
155 | return pidfd; |
156 | } |
157 | |
/*
 * vfork-like variant of system_forkfd(): run `childFn(token)` in a child
 * created with CLONE_VFORK | CLONE_VM and return a pidfd for it.
 *
 * flags:   FFD_* flags applied to the returned file descriptor.
 * ppid:    if non-NULL, receives the child's PID on success.
 * childFn: function executed in the child, on a small stack local to this
 *          function.
 * token:   opaque argument handed to childFn.
 * system:  set to 1 if the kernel-based implementation was attempted, or to
 *          0 if it is unavailable; must not be NULL.
 *
 * Returns the pidfd on success. Returns a negative value if the kernel
 * support is missing (*system == 0) or if clone() failed (*system == 1).
 */
int system_vforkfd(int flags, pid_t *ppid, int (*childFn)(void *), void *token, int *system)
{
    __attribute__((aligned(64))) char childStack[4096];
    pid_t pid;
    int pidfd;
    unsigned long cloneflags = CLONE_PIDFD | CLONE_VFORK | CLONE_VM | SIGCHLD;

    int state = system_forkfd_availability();
    if (state < 0) {
        *system = 0;
        return state;
    }
    *system = 1;

    /* the end of the buffer is passed as the child's stack pointer; with
     * CLONE_PIDFD the pidfd is stored through the parent-tid argument */
    pid = clone(childFn, childStack + sizeof(childStack), cloneflags, token, &pidfd, NULL, NULL);
    if (pid < 0)
        return pid;
    if (ppid)
        *ppid = pid;
    return system_forkfd_pidfd_set_flags(pidfd, flags);
}
179 | |
180 | int system_forkfd(int flags, pid_t *ppid, int *system) |
181 | { |
182 | pid_t pid; |
183 | int pidfd; |
184 | |
185 | int state = system_forkfd_availability(); |
186 | if (state < 0) { |
187 | *system = 0; |
188 | return state; |
189 | } |
190 | |
191 | *system = 1; |
192 | unsigned long cloneflags = CLONE_PIDFD | SIGCHLD; |
193 | pid = sys_clone(cloneflags, ptid: &pidfd); |
194 | if (pid < 0) |
195 | return pid; |
196 | if (ppid) |
197 | *ppid = pid; |
198 | |
199 | if (pid == 0) { |
200 | /* Child process */ |
201 | return FFD_CHILD_PROCESS; |
202 | } |
203 | |
204 | /* parent process */ |
205 | return system_forkfd_pidfd_set_flags(pidfd, flags); |
206 | } |
207 | |
208 | int system_forkfd_wait(int ffd, struct forkfd_info *info, int ffdoptions, struct rusage *rusage) |
209 | { |
210 | siginfo_t si; |
211 | int ret; |
212 | int options = convertForkfdWaitFlagsToWaitFlags(ffdoptions); |
213 | |
214 | if ((options & WNOHANG) == 0) { |
215 | /* check if the file descriptor is non-blocking */ |
216 | ret = fcntl(fd: ffd, F_GETFL); |
217 | if (ret == -1) |
218 | return ret; |
219 | if (ret & O_NONBLOCK) |
220 | options |= WNOHANG; |
221 | } |
222 | |
223 | si.si_status = si.si_code = 0; |
224 | ret = sys_waitid(P_PIDFD, pid_or_pidfd: ffd, infop: &si, options, ru: rusage); |
225 | if (info) { |
226 | info->code = si.si_code; |
227 | info->status = si.si_status; |
228 | } |
229 | return ret; |
230 | } |
231 | |