1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #define _GNU_SOURCE |
4 | #include <errno.h> |
5 | #include <fcntl.h> |
6 | #include <limits.h> |
7 | #include <linux/types.h> |
8 | #include <sched.h> |
9 | #include <signal.h> |
10 | #include <stdio.h> |
11 | #include <stdlib.h> |
12 | #include <string.h> |
13 | #include <syscall.h> |
14 | #include <sys/prctl.h> |
15 | #include <sys/wait.h> |
16 | #include <unistd.h> |
17 | #include <sys/socket.h> |
18 | #include <sys/stat.h> |
19 | |
20 | #include "pidfd.h" |
21 | #include "../clone3/clone3_selftests.h" |
22 | #include "../kselftest_harness.h" |
23 | |
/*
 * Indices into ns_info[] and into the per-fixture namespace fd arrays.
 * PIDFD_NS_MAX is the number of entries and must stay last.
 */
enum {
	PIDFD_NS_USER = 0,
	PIDFD_NS_MNT,
	PIDFD_NS_PID,
	PIDFD_NS_UTS,
	PIDFD_NS_IPC,
	PIDFD_NS_NET,
	PIDFD_NS_CGROUP,
	PIDFD_NS_PIDCLD,
	PIDFD_NS_TIME,
	PIDFD_NS_MAX,
};
36 | |
37 | const struct ns_info { |
38 | const char *name; |
39 | int flag; |
40 | } ns_info[] = { |
41 | [PIDFD_NS_USER] = { .name: "user" , CLONE_NEWUSER, }, |
42 | [PIDFD_NS_MNT] = { .name: "mnt" , CLONE_NEWNS, }, |
43 | [PIDFD_NS_PID] = { .name: "pid" , CLONE_NEWPID, }, |
44 | [PIDFD_NS_UTS] = { .name: "uts" , CLONE_NEWUTS, }, |
45 | [PIDFD_NS_IPC] = { .name: "ipc" , CLONE_NEWIPC, }, |
46 | [PIDFD_NS_NET] = { .name: "net" , CLONE_NEWNET, }, |
47 | [PIDFD_NS_CGROUP] = { .name: "cgroup" , CLONE_NEWCGROUP, }, |
48 | [PIDFD_NS_PIDCLD] = { .name: "pid_for_children" , .flag: 0, }, |
49 | [PIDFD_NS_TIME] = { .name: "time" , CLONE_NEWTIME, }, |
50 | }; |
51 | |
/*
 * State shared between FIXTURE_SETUP, the tests and FIXTURE_TEARDOWN.
 * The namespace fd arrays are indexed by PIDFD_NS_*; setup presets every
 * slot to -EBADF and a slot stays negative when the namespace could not be
 * opened.
 */
FIXTURE(current_nsset)
{
	/* The test process itself: its pid, a pidfd for it, and its namespace fds. */
	pid_t pid;
	int pidfd;
	int nsfds[PIDFD_NS_MAX];

	/* Child that calls _exit() immediately after being created. */
	pid_t child_pid_exited;
	int child_pidfd_exited;

	/* First child that blocks in pause(), with fds for its namespaces. */
	pid_t child_pid1;
	int child_pidfd1;
	int child_nsfds1[PIDFD_NS_MAX];

	/* Second child that blocks in pause(), with fds for its namespaces. */
	pid_t child_pid2;
	int child_pidfd2;
	int child_nsfds2[PIDFD_NS_MAX];
};
69 | |
/*
 * Invoke the raw waitid system call with NULL as its third and fifth
 * arguments, so only the return value is reported to the caller.
 */
static int sys_waitid(int which, pid_t pid, int options)
{
	long rc = syscall(__NR_waitid, which, pid, NULL, options, NULL);

	return rc;
}
74 | |
75 | pid_t create_child(int *pidfd, unsigned flags) |
76 | { |
77 | struct __clone_args args = { |
78 | .flags = CLONE_PIDFD | flags, |
79 | .exit_signal = SIGCHLD, |
80 | .pidfd = ptr_to_u64(pidfd), |
81 | }; |
82 | |
83 | return sys_clone3(args: &args, size: sizeof(struct clone_args)); |
84 | } |
85 | |
86 | static bool switch_timens(void) |
87 | { |
88 | int fd, ret; |
89 | |
90 | if (unshare(CLONE_NEWTIME)) |
91 | return false; |
92 | |
93 | fd = open("/proc/self/ns/time_for_children" , O_RDONLY | O_CLOEXEC); |
94 | if (fd < 0) |
95 | return false; |
96 | |
97 | ret = setns(fd, CLONE_NEWTIME); |
98 | close(fd); |
99 | return ret == 0; |
100 | } |
101 | |
/*
 * read() that is transparently restarted while it fails with EINTR.
 * Any other result (success, EOF, or a different error) is returned as is.
 */
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
112 | |
/*
 * write() that is transparently restarted while it fails with EINTR.
 * Any other result (including a short write or a different error) is
 * returned as is.
 */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
123 | |
124 | FIXTURE_SETUP(current_nsset) |
125 | { |
126 | int i, proc_fd, ret; |
127 | int ipc_sockets[2]; |
128 | char c; |
129 | |
130 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
131 | self->nsfds[i] = -EBADF; |
132 | self->child_nsfds1[i] = -EBADF; |
133 | self->child_nsfds2[i] = -EBADF; |
134 | } |
135 | |
136 | proc_fd = open("/proc/self/ns" , O_DIRECTORY | O_CLOEXEC); |
137 | ASSERT_GE(proc_fd, 0) { |
138 | TH_LOG("%m - Failed to open /proc/self/ns" ); |
139 | } |
140 | |
141 | self->pid = getpid(); |
142 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
143 | const struct ns_info *info = &ns_info[i]; |
144 | self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); |
145 | if (self->nsfds[i] < 0) { |
146 | EXPECT_EQ(errno, ENOENT) { |
147 | TH_LOG("%m - Failed to open %s namespace for process %d" , |
148 | info->name, self->pid); |
149 | } |
150 | } |
151 | } |
152 | |
153 | self->pidfd = sys_pidfd_open(pid: self->pid, flags: 0); |
154 | EXPECT_GT(self->pidfd, 0) { |
155 | TH_LOG("%m - Failed to open pidfd for process %d" , self->pid); |
156 | } |
157 | |
158 | /* Create task that exits right away. */ |
159 | self->child_pid_exited = create_child(pidfd: &self->child_pidfd_exited, |
160 | CLONE_NEWUSER | CLONE_NEWNET); |
161 | EXPECT_GT(self->child_pid_exited, 0); |
162 | |
163 | if (self->child_pid_exited == 0) |
164 | _exit(EXIT_SUCCESS); |
165 | |
166 | ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); |
167 | |
168 | self->pidfd = sys_pidfd_open(pid: self->pid, flags: 0); |
169 | EXPECT_GE(self->pidfd, 0) { |
170 | TH_LOG("%m - Failed to open pidfd for process %d" , self->pid); |
171 | } |
172 | |
173 | ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); |
174 | EXPECT_EQ(ret, 0); |
175 | |
176 | /* Create tasks that will be stopped. */ |
177 | self->child_pid1 = create_child(pidfd: &self->child_pidfd1, |
178 | CLONE_NEWUSER | CLONE_NEWNS | |
179 | CLONE_NEWCGROUP | CLONE_NEWIPC | |
180 | CLONE_NEWUTS | CLONE_NEWPID | |
181 | CLONE_NEWNET); |
182 | EXPECT_GE(self->child_pid1, 0); |
183 | |
184 | if (self->child_pid1 == 0) { |
185 | close(ipc_sockets[0]); |
186 | |
187 | if (!switch_timens()) |
188 | _exit(EXIT_FAILURE); |
189 | |
190 | if (write_nointr(ipc_sockets[1], "1" , 1) < 0) |
191 | _exit(EXIT_FAILURE); |
192 | |
193 | close(ipc_sockets[1]); |
194 | |
195 | pause(); |
196 | _exit(EXIT_SUCCESS); |
197 | } |
198 | |
199 | close(ipc_sockets[1]); |
200 | ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); |
201 | close(ipc_sockets[0]); |
202 | |
203 | ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); |
204 | EXPECT_EQ(ret, 0); |
205 | |
206 | self->child_pid2 = create_child(pidfd: &self->child_pidfd2, |
207 | CLONE_NEWUSER | CLONE_NEWNS | |
208 | CLONE_NEWCGROUP | CLONE_NEWIPC | |
209 | CLONE_NEWUTS | CLONE_NEWPID | |
210 | CLONE_NEWNET); |
211 | EXPECT_GE(self->child_pid2, 0); |
212 | |
213 | if (self->child_pid2 == 0) { |
214 | close(ipc_sockets[0]); |
215 | |
216 | if (!switch_timens()) |
217 | _exit(EXIT_FAILURE); |
218 | |
219 | if (write_nointr(ipc_sockets[1], "1" , 1) < 0) |
220 | _exit(EXIT_FAILURE); |
221 | |
222 | close(ipc_sockets[1]); |
223 | |
224 | pause(); |
225 | _exit(EXIT_SUCCESS); |
226 | } |
227 | |
228 | close(ipc_sockets[1]); |
229 | ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); |
230 | close(ipc_sockets[0]); |
231 | |
232 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
233 | char p[100]; |
234 | |
235 | const struct ns_info *info = &ns_info[i]; |
236 | |
237 | self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); |
238 | if (self->nsfds[i] < 0) { |
239 | EXPECT_EQ(errno, ENOENT) { |
240 | TH_LOG("%m - Failed to open %s namespace for process %d" , |
241 | info->name, self->pid); |
242 | } |
243 | } |
244 | |
245 | ret = snprintf(buf: p, size: sizeof(p), fmt: "/proc/%d/ns/%s" , |
246 | self->child_pid1, info->name); |
247 | EXPECT_GT(ret, 0); |
248 | EXPECT_LT(ret, sizeof(p)); |
249 | |
250 | self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC); |
251 | if (self->child_nsfds1[i] < 0) { |
252 | EXPECT_EQ(errno, ENOENT) { |
253 | TH_LOG("%m - Failed to open %s namespace for process %d" , |
254 | info->name, self->child_pid1); |
255 | } |
256 | } |
257 | |
258 | ret = snprintf(buf: p, size: sizeof(p), fmt: "/proc/%d/ns/%s" , |
259 | self->child_pid2, info->name); |
260 | EXPECT_GT(ret, 0); |
261 | EXPECT_LT(ret, sizeof(p)); |
262 | |
263 | self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC); |
264 | if (self->child_nsfds2[i] < 0) { |
265 | EXPECT_EQ(errno, ENOENT) { |
266 | TH_LOG("%m - Failed to open %s namespace for process %d" , |
267 | info->name, self->child_pid1); |
268 | } |
269 | } |
270 | } |
271 | |
272 | close(proc_fd); |
273 | } |
274 | |
275 | FIXTURE_TEARDOWN(current_nsset) |
276 | { |
277 | int i; |
278 | |
279 | ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1, |
280 | SIGKILL, NULL, 0), 0); |
281 | ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2, |
282 | SIGKILL, NULL, 0), 0); |
283 | |
284 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
285 | if (self->nsfds[i] >= 0) |
286 | close(self->nsfds[i]); |
287 | if (self->child_nsfds1[i] >= 0) |
288 | close(self->child_nsfds1[i]); |
289 | if (self->child_nsfds2[i] >= 0) |
290 | close(self->child_nsfds2[i]); |
291 | } |
292 | |
293 | if (self->child_pidfd1 >= 0) |
294 | EXPECT_EQ(0, close(self->child_pidfd1)); |
295 | if (self->child_pidfd2 >= 0) |
296 | EXPECT_EQ(0, close(self->child_pidfd2)); |
297 | ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0); |
298 | ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0); |
299 | ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0); |
300 | } |
301 | |
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
 *
 * @pid: process whose namespace entry is opened.
 * @ns:  name of the entry below /proc/<pid>/ns/.
 *
 * Returns the result of open() (a new fd, or -1 with errno set), or -EIO
 * when the path could not be formatted into the local buffer.
 */
static int preserve_ns(const int pid, const char *ns)
{
	int ret;
	char path[50];

	ret = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns);
	/* Reject both formatting errors and truncated paths. */
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -EIO;

	return open(path, O_RDONLY | O_CLOEXEC);
}
313 | |
/*
 * Check whether namespace fd @ns_fd1 and the @ns namespace of process @pid2
 * refer to the same object, by comparing st_dev and st_ino of both.
 *
 * Returns 1 if they match, 0 if they differ, and -1 if either fstat() or
 * opening /proc/<pid2>/ns/<ns> failed.
 */
static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
{
	struct stat ns_st1, ns_st2;
	int ns_fd2, ret;

	if (fstat(ns_fd1, &ns_st1) < 0)
		return -1;

	ns_fd2 = preserve_ns(pid2, ns);
	if (ns_fd2 < 0)
		return -1;

	/* Close the temporary fd before looking at the fstat() result. */
	ret = fstat(ns_fd2, &ns_st2);
	close(ns_fd2);
	if (ret < 0)
		return -1;

	/* processes are in the same namespace */
	if ((ns_st1.st_dev == ns_st2.st_dev) &&
	    (ns_st1.st_ino == ns_st2.st_ino))
		return 1;

	/* processes are in different namespaces */
	return 0;
}
341 | |
/*
 * Test that we can't pass garbage to the kernel.
 *
 * Every call targets the pidfd of the test process itself and is expected
 * to fail with EINVAL.
 */
TEST_F(current_nsset, invalid_flags)
{
	/* No namespace flag at all. */
	ASSERT_NE(setns(self->pidfd, 0), 0);
	EXPECT_EQ(errno, EINVAL);

	/* Every bit set. */
	ASSERT_NE(setns(self->pidfd, -1), 0);
	EXPECT_EQ(errno, EINVAL);

	/* CLONE_VM does not appear among the flags in ns_info. */
	ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
	EXPECT_EQ(errno, EINVAL);

	/* A flag from ns_info combined with CLONE_VM. */
	ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
	EXPECT_EQ(errno, EINVAL);
}
357 | |
358 | /* Test that we can't attach to a task that has already exited. */ |
359 | TEST_F(current_nsset, pidfd_exited_child) |
360 | { |
361 | int i; |
362 | pid_t pid; |
363 | |
364 | ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET), |
365 | 0); |
366 | EXPECT_EQ(errno, ESRCH); |
367 | |
368 | pid = getpid(); |
369 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
370 | const struct ns_info *info = &ns_info[i]; |
371 | /* Verify that we haven't changed any namespaces. */ |
372 | if (self->nsfds[i] >= 0) |
373 | ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1); |
374 | } |
375 | } |
376 | |
377 | TEST_F(current_nsset, pidfd_incremental_setns) |
378 | { |
379 | int i; |
380 | pid_t pid; |
381 | |
382 | pid = getpid(); |
383 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
384 | const struct ns_info *info = &ns_info[i]; |
385 | int nsfd; |
386 | |
387 | if (self->child_nsfds1[i] < 0) |
388 | continue; |
389 | |
390 | if (info->flag) { |
391 | ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) { |
392 | TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d" , |
393 | info->name, self->child_pid1, |
394 | self->child_pidfd1); |
395 | } |
396 | } |
397 | |
398 | /* Verify that we have changed to the correct namespaces. */ |
399 | if (info->flag == CLONE_NEWPID) |
400 | nsfd = self->nsfds[i]; |
401 | else |
402 | nsfd = self->child_nsfds1[i]; |
403 | ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { |
404 | TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d" , |
405 | info->name, self->child_pid1, |
406 | self->child_pidfd1); |
407 | } |
408 | TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d" , |
409 | info->name, self->child_pid1, self->child_pidfd1); |
410 | } |
411 | } |
412 | |
413 | TEST_F(current_nsset, nsfd_incremental_setns) |
414 | { |
415 | int i; |
416 | pid_t pid; |
417 | |
418 | pid = getpid(); |
419 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
420 | const struct ns_info *info = &ns_info[i]; |
421 | int nsfd; |
422 | |
423 | if (self->child_nsfds1[i] < 0) |
424 | continue; |
425 | |
426 | if (info->flag) { |
427 | ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) { |
428 | TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d" , |
429 | info->name, self->child_pid1, |
430 | self->child_nsfds1[i]); |
431 | } |
432 | } |
433 | |
434 | /* Verify that we have changed to the correct namespaces. */ |
435 | if (info->flag == CLONE_NEWPID) |
436 | nsfd = self->nsfds[i]; |
437 | else |
438 | nsfd = self->child_nsfds1[i]; |
439 | ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { |
440 | TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d" , |
441 | info->name, self->child_pid1, |
442 | self->child_nsfds1[i]); |
443 | } |
444 | TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d" , |
445 | info->name, self->child_pid1, self->child_nsfds1[i]); |
446 | } |
447 | } |
448 | |
449 | TEST_F(current_nsset, pidfd_one_shot_setns) |
450 | { |
451 | unsigned flags = 0; |
452 | int i; |
453 | pid_t pid; |
454 | |
455 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
456 | const struct ns_info *info = &ns_info[i]; |
457 | |
458 | if (self->child_nsfds1[i] < 0) |
459 | continue; |
460 | |
461 | flags |= info->flag; |
462 | TH_LOG("Adding %s namespace of %d to list of namespaces to attach to" , |
463 | info->name, self->child_pid1); |
464 | } |
465 | |
466 | ASSERT_EQ(setns(self->child_pidfd1, flags), 0) { |
467 | TH_LOG("%m - Failed to setns to namespaces of %d" , |
468 | self->child_pid1); |
469 | } |
470 | |
471 | pid = getpid(); |
472 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
473 | const struct ns_info *info = &ns_info[i]; |
474 | int nsfd; |
475 | |
476 | if (self->child_nsfds1[i] < 0) |
477 | continue; |
478 | |
479 | /* Verify that we have changed to the correct namespaces. */ |
480 | if (info->flag == CLONE_NEWPID) |
481 | nsfd = self->nsfds[i]; |
482 | else |
483 | nsfd = self->child_nsfds1[i]; |
484 | ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { |
485 | TH_LOG("setns failed to place us correctly into %s namespace of %d" , |
486 | info->name, self->child_pid1); |
487 | } |
488 | TH_LOG("Managed to correctly setns to %s namespace of %d" , |
489 | info->name, self->child_pid1); |
490 | } |
491 | } |
492 | |
493 | TEST_F(current_nsset, no_foul_play) |
494 | { |
495 | unsigned flags = 0; |
496 | int i; |
497 | |
498 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
499 | const struct ns_info *info = &ns_info[i]; |
500 | |
501 | if (self->child_nsfds1[i] < 0) |
502 | continue; |
503 | |
504 | flags |= info->flag; |
505 | if (info->flag) /* No use logging pid_for_children. */ |
506 | TH_LOG("Adding %s namespace of %d to list of namespaces to attach to" , |
507 | info->name, self->child_pid1); |
508 | } |
509 | |
510 | ASSERT_EQ(setns(self->child_pidfd1, flags), 0) { |
511 | TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d" , |
512 | self->child_pid1, self->child_pidfd1); |
513 | } |
514 | |
515 | /* |
516 | * Can't setns to a user namespace outside of our hierarchy since we |
517 | * don't have caps in there and didn't create it. That means that under |
518 | * no circumstances should we be able to setns to any of the other |
519 | * ones since they aren't owned by our user namespace. |
520 | */ |
521 | for (i = 0; i < PIDFD_NS_MAX; i++) { |
522 | const struct ns_info *info = &ns_info[i]; |
523 | |
524 | if (self->child_nsfds2[i] < 0 || !info->flag) |
525 | continue; |
526 | |
527 | ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) { |
528 | TH_LOG("Managed to setns to %s namespace of %d via pidfd %d" , |
529 | info->name, self->child_pid2, |
530 | self->child_pidfd2); |
531 | } |
532 | TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d" , |
533 | info->name, self->child_pid2, |
534 | self->child_pidfd2); |
535 | |
536 | ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) { |
537 | TH_LOG("Managed to setns to %s namespace of %d via nsfd %d" , |
538 | info->name, self->child_pid2, |
539 | self->child_nsfds2[i]); |
540 | } |
541 | TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d" , |
542 | info->name, self->child_pid2, |
543 | self->child_nsfds2[i]); |
544 | } |
545 | } |
546 | |
/*
 * setns() on an fd obtained from sys_memfd_create(), i.e. neither a pidfd
 * nor a namespace fd, must fail with EINVAL.
 */
TEST(setns_einval)
{
	int fd;

	fd = sys_memfd_create("rostock", 0);
	/*
	 * Without a valid fd the setns() call below would fail for an
	 * unrelated reason, so stop here. 0 is a valid descriptor.
	 */
	ASSERT_GE(fd, 0);

	ASSERT_NE(setns(fd, 0), 0);
	EXPECT_EQ(errno, EINVAL);
	close(fd);
}
558 | |
/*
 * Harness macro from ../kselftest_harness.h; presumably supplies the
 * program entry point that runs the tests declared above - confirm there.
 */
TEST_HARNESS_MAIN
560 | |