1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Stress userfaultfd syscall. |
4 | * |
5 | * Copyright (C) 2015 Red Hat, Inc. |
6 | * |
7 | * This test allocates two virtual areas and bounces the physical |
8 | * memory across the two virtual areas (from area_src to area_dst) |
9 | * using userfaultfd. |
10 | * |
11 | * There are three threads running per CPU: |
12 | * |
13 | * 1) one per-CPU thread takes a per-page pthread_mutex in a random |
14 | * page of the area_dst (while the physical page may still be in |
15 | * area_src), and increments a per-page counter in the same page, |
16 | * and checks its value against a verification region. |
17 | * |
18 | * 2) another per-CPU thread handles the userfaults generated by |
19 | * thread 1 above. userfaultfd blocking reads or poll() modes are |
20 | * exercised interleaved. |
21 | * |
22 | * 3) one last per-CPU thread transfers the memory in the background |
23 | * at maximum bandwidth (if not already transferred by thread |
24 | * 2). Each cpu thread takes care of transferring a portion of the |
25 | * area. |
26 | * |
27 | * When all threads of type 3 have completed the transfer, one bounce is |
28 | * complete. area_src and area_dst are then swapped. All threads are |
29 | * respawned and so the bounce is immediately restarted in the |
30 | * opposite direction. |
31 | * |
32 | * per-CPU threads 1 by triggering userfaults inside |
33 | * pthread_mutex_lock will also verify the atomicity of the memory |
34 | * transfer (UFFDIO_COPY). |
35 | */ |
36 | |
37 | #include "uffd-common.h" |
38 | |
39 | #ifdef __NR_userfaultfd |
40 | |
/*
 * Per-pass mode bits.  The remaining bounce count is itself used as the
 * bitmask, so successive passes exercise different mode combinations.
 */
#define BOUNCE_RANDOM		0x1	/* locking threads pick pages with getrandom() */
#define BOUNCE_RACINGFAULTS	0x2	/* sequential walkers do not get a per-CPU start offset */
#define BOUNCE_VERIFY		0x4	/* compare every page counter after the pass */
#define BOUNCE_POLL		0x8	/* run uffd_poll_thread instead of uffd_read_thread */

/* Passes still to run; also tested against the BOUNCE_* bits above. */
static int bounces;
46 | |
/*
 * Period, in seconds, of the SIGALRM that re-enables the
 * test_uffdio_*_eexist checks.
 */
#define ALARM_INTERVAL_SECS	10

/* Page-sized, page-aligned, zero-filled buffer set up by userfaultfd_stress(). */
static char *zeropage;

/* Thread attributes (stack size) shared by every thread created in stress(). */
pthread_attr_t attr;
51 | |
/*
 * swap() - exchange the values of two lvalues of the same type.
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.  __typeof__ is used instead of the bare typeof keyword
 * because the latter is not available in strict ISO modes before C23,
 * and the temporary avoids the reserved "__" identifier prefix.
 */
#define swap(a, b) \
	do { __typeof__(a) swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)
54 | |
/* Sample invocations, printed verbatim at the end of the usage() text. */
const char *examples =
	/* anonymous memory */
	"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
	"./uffd-stress anon 100 99999\n"
	"\n"
	/* shared memory */
	"# Run share memory test on 1GiB region with 99 bounces:\n"
	"./uffd-stress shmem 1000 99\n"
	"\n"
	/* hugetlb, shared */
	"# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
	"./uffd-stress hugetlb 256 50\n"
	"\n"
	/* hugetlb, private */
	"# Run the same hugetlb test but using private file:\n"
	"./uffd-stress hugetlb-private 256 50\n"
	"\n"
	/* endless loop over random sizes */
	"# 10MiB-~6GiB 999 bounces anonymous test, continue forever unless an error triggers\n"
	"while ./uffd-stress anon $[RANDOM % 6000 + 10] 999; do true; done\n"
	"\n";
67 | |
68 | static void usage(void) |
69 | { |
70 | fprintf(stderr, "\nUsage: ./uffd-stress <test type> <MiB> <bounces>\n\n" ); |
71 | fprintf(stderr, "Supported <test type>: anon, hugetlb, " |
72 | "hugetlb-private, shmem, shmem-private\n\n" ); |
73 | fprintf(stderr, "Examples:\n\n" ); |
74 | fprintf(stderr, "%s" , examples); |
75 | exit(1); |
76 | } |
77 | |
78 | static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) |
79 | { |
80 | int i; |
81 | |
82 | for (i = 0; i < n_cpus; i++) { |
83 | args[i].cpu = i; |
84 | args[i].apply_wp = test_uffdio_wp; |
85 | args[i].missing_faults = 0; |
86 | args[i].wp_faults = 0; |
87 | args[i].minor_faults = 0; |
88 | } |
89 | } |
90 | |
91 | static void *locking_thread(void *arg) |
92 | { |
93 | unsigned long cpu = (unsigned long) arg; |
94 | unsigned long page_nr; |
95 | unsigned long long count; |
96 | |
97 | if (!(bounces & BOUNCE_RANDOM)) { |
98 | page_nr = -bounces; |
99 | if (!(bounces & BOUNCE_RACINGFAULTS)) |
100 | page_nr += cpu * nr_pages_per_cpu; |
101 | } |
102 | |
103 | while (!finished) { |
104 | if (bounces & BOUNCE_RANDOM) { |
105 | if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) |
106 | err("getrandom failed" ); |
107 | } else |
108 | page_nr += 1; |
109 | page_nr %= nr_pages; |
110 | pthread_mutex_lock(area_mutex(area_dst, page_nr)); |
111 | count = *area_count(area_dst, page_nr); |
112 | if (count != count_verify[page_nr]) |
113 | err("page_nr %lu memory corruption %llu %llu" , |
114 | page_nr, count, count_verify[page_nr]); |
115 | count++; |
116 | *area_count(area_dst, page_nr) = count_verify[page_nr] = count; |
117 | pthread_mutex_unlock(area_mutex(area_dst, page_nr)); |
118 | } |
119 | |
120 | return NULL; |
121 | } |
122 | |
123 | static int copy_page_retry(int ufd, unsigned long offset) |
124 | { |
125 | return __copy_page(ufd, offset, true, test_uffdio_wp); |
126 | } |
127 | |
128 | pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; |
129 | |
130 | static void *uffd_read_thread(void *arg) |
131 | { |
132 | struct uffd_args *args = (struct uffd_args *)arg; |
133 | struct uffd_msg msg; |
134 | |
135 | pthread_mutex_unlock(&uffd_read_mutex); |
136 | /* from here cancellation is ok */ |
137 | |
138 | for (;;) { |
139 | if (uffd_read_msg(uffd, &msg)) |
140 | continue; |
141 | uffd_handle_page_fault(&msg, args); |
142 | } |
143 | |
144 | return NULL; |
145 | } |
146 | |
147 | static void *background_thread(void *arg) |
148 | { |
149 | unsigned long cpu = (unsigned long) arg; |
150 | unsigned long page_nr, start_nr, mid_nr, end_nr; |
151 | |
152 | start_nr = cpu * nr_pages_per_cpu; |
153 | end_nr = (cpu+1) * nr_pages_per_cpu; |
154 | mid_nr = (start_nr + end_nr) / 2; |
155 | |
156 | /* Copy the first half of the pages */ |
157 | for (page_nr = start_nr; page_nr < mid_nr; page_nr++) |
158 | copy_page_retry(uffd, page_nr * page_size); |
159 | |
160 | /* |
161 | * If we need to test uffd-wp, set it up now. Then we'll have |
162 | * at least the first half of the pages mapped already which |
163 | * can be write-protected for testing |
164 | */ |
165 | if (test_uffdio_wp) |
166 | wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, |
167 | nr_pages_per_cpu * page_size, true); |
168 | |
169 | /* |
170 | * Continue the 2nd half of the page copying, handling write |
171 | * protection faults if any |
172 | */ |
173 | for (page_nr = mid_nr; page_nr < end_nr; page_nr++) |
174 | copy_page_retry(uffd, page_nr * page_size); |
175 | |
176 | return NULL; |
177 | } |
178 | |
179 | static int stress(struct uffd_args *args) |
180 | { |
181 | unsigned long cpu; |
182 | pthread_t locking_threads[nr_cpus]; |
183 | pthread_t uffd_threads[nr_cpus]; |
184 | pthread_t background_threads[nr_cpus]; |
185 | |
186 | finished = 0; |
187 | for (cpu = 0; cpu < nr_cpus; cpu++) { |
188 | if (pthread_create(&locking_threads[cpu], &attr, |
189 | locking_thread, (void *)cpu)) |
190 | return 1; |
191 | if (bounces & BOUNCE_POLL) { |
192 | if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, &args[cpu])) |
193 | err("uffd_poll_thread create" ); |
194 | } else { |
195 | if (pthread_create(&uffd_threads[cpu], &attr, |
196 | uffd_read_thread, |
197 | (void *)&args[cpu])) |
198 | return 1; |
199 | pthread_mutex_lock(&uffd_read_mutex); |
200 | } |
201 | if (pthread_create(&background_threads[cpu], &attr, |
202 | background_thread, (void *)cpu)) |
203 | return 1; |
204 | } |
205 | for (cpu = 0; cpu < nr_cpus; cpu++) |
206 | if (pthread_join(background_threads[cpu], NULL)) |
207 | return 1; |
208 | |
209 | /* |
210 | * Be strict and immediately zap area_src, the whole area has |
211 | * been transferred already by the background treads. The |
212 | * area_src could then be faulted in a racy way by still |
213 | * running uffdio_threads reading zeropages after we zapped |
214 | * area_src (but they're guaranteed to get -EEXIST from |
215 | * UFFDIO_COPY without writing zero pages into area_dst |
216 | * because the background threads already completed). |
217 | */ |
218 | uffd_test_ops->release_pages(area_src); |
219 | |
220 | finished = 1; |
221 | for (cpu = 0; cpu < nr_cpus; cpu++) |
222 | if (pthread_join(locking_threads[cpu], NULL)) |
223 | return 1; |
224 | |
225 | for (cpu = 0; cpu < nr_cpus; cpu++) { |
226 | char c; |
227 | if (bounces & BOUNCE_POLL) { |
228 | if (write(pipefd[cpu*2+1], &c, 1) != 1) |
229 | err("pipefd write error" ); |
230 | if (pthread_join(uffd_threads[cpu], |
231 | (void *)&args[cpu])) |
232 | return 1; |
233 | } else { |
234 | if (pthread_cancel(uffd_threads[cpu])) |
235 | return 1; |
236 | if (pthread_join(uffd_threads[cpu], NULL)) |
237 | return 1; |
238 | } |
239 | } |
240 | |
241 | return 0; |
242 | } |
243 | |
244 | static int userfaultfd_stress(void) |
245 | { |
246 | void *area; |
247 | unsigned long nr; |
248 | struct uffd_args args[nr_cpus]; |
249 | uint64_t mem_size = nr_pages * page_size; |
250 | |
251 | memset(args, 0, sizeof(struct uffd_args) * nr_cpus); |
252 | |
253 | if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL)) |
254 | err("context init failed" ); |
255 | |
256 | if (posix_memalign(&area, page_size, page_size)) |
257 | err("out of memory" ); |
258 | zeropage = area; |
259 | bzero(zeropage, page_size); |
260 | |
261 | pthread_mutex_lock(&uffd_read_mutex); |
262 | |
263 | pthread_attr_init(&attr); |
264 | pthread_attr_setstacksize(&attr, 16*1024*1024); |
265 | |
266 | while (bounces--) { |
267 | printf("bounces: %d, mode:" , bounces); |
268 | if (bounces & BOUNCE_RANDOM) |
269 | printf(" rnd" ); |
270 | if (bounces & BOUNCE_RACINGFAULTS) |
271 | printf(" racing" ); |
272 | if (bounces & BOUNCE_VERIFY) |
273 | printf(" ver" ); |
274 | if (bounces & BOUNCE_POLL) |
275 | printf(" poll" ); |
276 | else |
277 | printf(" read" ); |
278 | printf(", " ); |
279 | fflush(stdout); |
280 | |
281 | if (bounces & BOUNCE_POLL) |
282 | fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); |
283 | else |
284 | fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); |
285 | |
286 | /* register */ |
287 | if (uffd_register(uffd, area_dst, mem_size, |
288 | true, test_uffdio_wp, false)) |
289 | err("register failure" ); |
290 | |
291 | if (area_dst_alias) { |
292 | if (uffd_register(uffd, area_dst_alias, mem_size, |
293 | true, test_uffdio_wp, false)) |
294 | err("register failure alias" ); |
295 | } |
296 | |
297 | /* |
298 | * The madvise done previously isn't enough: some |
299 | * uffd_thread could have read userfaults (one of |
300 | * those already resolved by the background thread) |
301 | * and it may be in the process of calling |
302 | * UFFDIO_COPY. UFFDIO_COPY will read the zapped |
303 | * area_src and it would map a zero page in it (of |
304 | * course such a UFFDIO_COPY is perfectly safe as it'd |
305 | * return -EEXIST). The problem comes at the next |
306 | * bounce though: that racing UFFDIO_COPY would |
307 | * generate zeropages in the area_src, so invalidating |
308 | * the previous MADV_DONTNEED. Without this additional |
309 | * MADV_DONTNEED those zeropages leftovers in the |
310 | * area_src would lead to -EEXIST failure during the |
311 | * next bounce, effectively leaving a zeropage in the |
312 | * area_dst. |
313 | * |
314 | * Try to comment this out madvise to see the memory |
315 | * corruption being caught pretty quick. |
316 | * |
317 | * khugepaged is also inhibited to collapse THP after |
318 | * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's |
319 | * required to MADV_DONTNEED here. |
320 | */ |
321 | uffd_test_ops->release_pages(area_dst); |
322 | |
323 | uffd_stats_reset(args, nr_cpus); |
324 | |
325 | /* bounce pass */ |
326 | if (stress(args)) { |
327 | uffd_test_ctx_clear(); |
328 | return 1; |
329 | } |
330 | |
331 | /* Clear all the write protections if there is any */ |
332 | if (test_uffdio_wp) |
333 | wp_range(uffd, (unsigned long)area_dst, |
334 | nr_pages * page_size, false); |
335 | |
336 | /* unregister */ |
337 | if (uffd_unregister(uffd, area_dst, mem_size)) |
338 | err("unregister failure" ); |
339 | if (area_dst_alias) { |
340 | if (uffd_unregister(uffd, area_dst_alias, mem_size)) |
341 | err("unregister failure alias" ); |
342 | } |
343 | |
344 | /* verification */ |
345 | if (bounces & BOUNCE_VERIFY) |
346 | for (nr = 0; nr < nr_pages; nr++) |
347 | if (*area_count(area_dst, nr) != count_verify[nr]) |
348 | err("error area_count %llu %llu %lu\n" , |
349 | *area_count(area_src, nr), |
350 | count_verify[nr], nr); |
351 | |
352 | /* prepare next bounce */ |
353 | swap(area_src, area_dst); |
354 | |
355 | swap(area_src_alias, area_dst_alias); |
356 | |
357 | uffd_stats_report(args, nr_cpus); |
358 | } |
359 | uffd_test_ctx_clear(); |
360 | |
361 | return 0; |
362 | } |
363 | |
364 | static void set_test_type(const char *type) |
365 | { |
366 | if (!strcmp(type, "anon" )) { |
367 | test_type = TEST_ANON; |
368 | uffd_test_ops = &anon_uffd_test_ops; |
369 | } else if (!strcmp(type, "hugetlb" )) { |
370 | test_type = TEST_HUGETLB; |
371 | uffd_test_ops = &hugetlb_uffd_test_ops; |
372 | map_shared = true; |
373 | } else if (!strcmp(type, "hugetlb-private" )) { |
374 | test_type = TEST_HUGETLB; |
375 | uffd_test_ops = &hugetlb_uffd_test_ops; |
376 | } else if (!strcmp(type, "shmem" )) { |
377 | map_shared = true; |
378 | test_type = TEST_SHMEM; |
379 | uffd_test_ops = &shmem_uffd_test_ops; |
380 | } else if (!strcmp(type, "shmem-private" )) { |
381 | test_type = TEST_SHMEM; |
382 | uffd_test_ops = &shmem_uffd_test_ops; |
383 | } |
384 | } |
385 | |
386 | static void parse_test_type_arg(const char *raw_type) |
387 | { |
388 | uint64_t features = UFFD_API_FEATURES; |
389 | |
390 | set_test_type(raw_type); |
391 | |
392 | if (!test_type) |
393 | err("failed to parse test type argument: '%s'" , raw_type); |
394 | |
395 | if (test_type == TEST_HUGETLB) |
396 | page_size = default_huge_page_size(); |
397 | else |
398 | page_size = sysconf(_SC_PAGE_SIZE); |
399 | |
400 | if (!page_size) |
401 | err("Unable to determine page size" ); |
402 | if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 |
403 | > page_size) |
404 | err("Impossible to run this test" ); |
405 | |
406 | /* |
407 | * Whether we can test certain features depends not just on test type, |
408 | * but also on whether or not this particular kernel supports the |
409 | * feature. |
410 | */ |
411 | |
412 | if (userfaultfd_open(&features)) |
413 | err("Userfaultfd open failed" ); |
414 | |
415 | test_uffdio_wp = test_uffdio_wp && |
416 | (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); |
417 | |
418 | close(uffd); |
419 | uffd = -1; |
420 | } |
421 | |
422 | static void sigalrm(int sig) |
423 | { |
424 | if (sig != SIGALRM) |
425 | abort(); |
426 | test_uffdio_copy_eexist = true; |
427 | alarm(ALARM_INTERVAL_SECS); |
428 | } |
429 | |
430 | int main(int argc, char **argv) |
431 | { |
432 | size_t bytes; |
433 | |
434 | if (argc < 4) |
435 | usage(); |
436 | |
437 | if (signal(SIGALRM, sigalrm) == SIG_ERR) |
438 | err("failed to arm SIGALRM" ); |
439 | alarm(ALARM_INTERVAL_SECS); |
440 | |
441 | parse_test_type_arg(argv[1]); |
442 | bytes = atol(argv[2]) * 1024 * 1024; |
443 | |
444 | if (test_type == TEST_HUGETLB && |
445 | get_free_hugepages() < bytes / page_size) { |
446 | printf("skip: Skipping userfaultfd... not enough hugepages\n" ); |
447 | return KSFT_SKIP; |
448 | } |
449 | |
450 | nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); |
451 | |
452 | nr_pages_per_cpu = bytes / page_size / nr_cpus; |
453 | if (!nr_pages_per_cpu) { |
454 | _err("invalid MiB" ); |
455 | usage(); |
456 | } |
457 | |
458 | bounces = atoi(argv[3]); |
459 | if (bounces <= 0) { |
460 | _err("invalid bounces" ); |
461 | usage(); |
462 | } |
463 | nr_pages = nr_pages_per_cpu * nr_cpus; |
464 | |
465 | printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n" , |
466 | nr_pages, nr_pages_per_cpu); |
467 | return userfaultfd_stress(); |
468 | } |
469 | |
470 | #else /* __NR_userfaultfd */ |
471 | |
472 | #warning "missing __NR_userfaultfd definition" |
473 | |
474 | int main(void) |
475 | { |
476 | printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n" ); |
477 | return KSFT_SKIP; |
478 | } |
479 | |
480 | #endif /* __NR_userfaultfd */ |
481 | |