// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <sys/uio.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

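/*
 * Convert a size to its order relative to the base page size,
 * e.g., with 4 KiB base pages, sz2ord(2 MiB) == 9 (512 == 2^9 base pages).
 */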
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

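/*
 * Collect all THP sizes supported by the kernel: the PMD size plus any
 * additional orders reported via thp_supported_orders(), capped at "max"
 * entries.
 */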
static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* THP not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

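/*
 * The huge zeropage is only used by the kernel when
 * /sys/kernel/mm/transparent_hugepage/use_zero_page reads as 1; detect that
 * here so the corresponding tests can be skipped otherwise.
 */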
static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

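/*
 * Pipe pair for parent/child synchronization: the child writes one byte to
 * child_ready[1] once it is set up, then blocks reading parent_ready[0]
 * until the parent has modified the memory under test.
 */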
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

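/*
 * Child: take a R/O GUP pin on the pages via vmsplice(), unmap them, let
 * the parent modify them, and check whether the data read back from the
 * pipe still matches the original content.
 */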
static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

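/*
 * Fork off a child running "fn", optionally exercise the mprotect()
 * write-upgrade optimization in the parent, modify the pages, and verify
 * via the child's exit status that the modification did not leak from
 * parent to child.
 */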
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}

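/*
 * The reverse direction: the parent takes a R/O GUP pin via vmsplice(),
 * either before or after fork(), the child modifies the pages, and we
 * verify that the data read back from the pipe still matches the original
 * content, i.e., nothing leaked from child to parent.
 */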
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
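/*
 * Register the test range as an io_uring fixed buffer, which takes a
 * FOLL_WRITE | FOLL_PIN | FOLL_LONGTERM pin. Then trigger a write fault
 * (via fork() or via a temporary R/O protection), modify the memory, and
 * verify that the new content is observable through the pin by writing it
 * to a file with IORING_OP_WRITE_FIXED and reading it back.
 */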
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

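/*
 * Scenarios for do_test_ro_pin(): pin the pages as-is (RO_PIN_TEST), while
 * they are COW-shared with a child (RO_PIN_TEST_SHARED), after the child
 * quit so they are exclusive but still mapped R/O
 * (RO_PIN_TEST_PREVIOUSLY_SHARED), or after remapping exclusive pages R/O
 * (RO_PIN_TEST_RO_EXCLUSIVE). A R/O pin of an anonymous page should
 * trigger unsharing, so later modifications stay visible through the pin.
 */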
enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size);

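/*
 * Run a test on a single base page, optionally swapping it out first via
 * MADV_PAGEOUT so that the test starts out with a swap entry instead of a
 * present PTE.
 */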
static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not supported by this kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

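/*
 * THP setups exercised by do_run_with_thp(): PMD-mapped, PTE-mapped (via
 * temporary R/O protection of one subpage), a single remaining PTE of a
 * THP, a partially mremap()'ed THP and a partially shared THP; the first
 * three also come in swapped-out variants.
 */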
enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

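/*
 * Run a test on a freshly faulted-in hugetlb page. A second (dummy)
 * hugetlb page is mapped and immediately unmapped up front to make sure at
 * least two huge pages are available; COW/unsharing during the test needs
 * a second page, and running out would mean getting killed by SIGBUS.
 */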
static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get killed by SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications
	 * by the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

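/*
 * MADV_COLLAPSE scenarios for a PTE-mapped THP: collapse before
 * COW-sharing anything, or collapse after COW-sharing the full THP, only
 * its lower half, or only its upper half with a child.
 */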
enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

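/*
 * Non-anonymous tests operate on two mappings of the same backing page:
 * "mem" is a private (COW) R/W mapping that gets modified, and "smem" is a
 * second R/O mapping used to observe whether the modification wrongly
 * reaches the shared backing page.
 */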
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}

