1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Userfaultfd tests util functions |
4 | * |
5 | * Copyright (C) 2015-2023 Red Hat, Inc. |
6 | */ |
7 | |
8 | #include "uffd-common.h" |
9 | |
/*
 * Fixed base address (1UL << 30) at which shmem_allocate_area() places
 * its mappings.
 */
#define BASE_PMD_ADDR ((void *)(1UL << 30))

/* One-shot switch: __copy_page() clears it once the retry path has run. */
volatile bool test_uffdio_copy_eexist = true;
/* Test geometry; every test area spans nr_pages * page_size bytes. */
unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
/* Test areas; the *_alias pointers are filled in by the shmem/hugetlb allocators. */
char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
/* uffd is -1 while no userfaultfd is open; pipefd holds nr_cpus * 2 descriptors. */
int uffd = -1, uffd_flags, finished, *pipefd, test_type;
/* Selects MAP_SHARED over MAP_PRIVATE for the hugetlb mappings. */
bool map_shared;
bool test_uffdio_wp = true;
/* Array of nr_pages counters, allocated by uffd_test_ctx_init(). */
unsigned long long *count_verify;
/* Memory-type operations used by uffd_test_ctx_init() and the copy helpers. */
uffd_test_ops_t *uffd_test_ops;
/* Optional per-test-case pre_alloc/post_alloc hooks; may be NULL. */
uffd_test_case_ops_t *uffd_test_case_ops;
/* Set to true by uffd_poll_thread() right before it enters its poll loop. */
atomic_bool ready_for_fork;
22 | |
/*
 * Create the memfd that backs a shmem or hugetlb test area.
 *
 * @mem_size: size the file is truncated to
 * @hugetlb:  when true the memfd is created with MFD_HUGETLB
 *
 * Returns the new file descriptor.  Every failing step is reported
 * through err() (defined outside this file; presumably fatal).
 */
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
	const unsigned int flags = hugetlb ? MFD_HUGETLB : 0;
	int fd;

	fd = memfd_create("uffd-test", flags);
	if (fd < 0)
		err("memfd_create");

	if (ftruncate(fd, mem_size) != 0)
		err("ftruncate");

	/* Punch a hole over the whole file while keeping its size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, mem_size) != 0)
		err("fallocate");

	return fd;
}
42 | |
43 | static void anon_release_pages(char *rel_area) |
44 | { |
45 | if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) |
46 | err("madvise(MADV_DONTNEED) failed" ); |
47 | } |
48 | |
49 | static int anon_allocate_area(void **alloc_area, bool is_src) |
50 | { |
51 | *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, |
52 | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
53 | if (*alloc_area == MAP_FAILED) { |
54 | *alloc_area = NULL; |
55 | return -errno; |
56 | } |
57 | return 0; |
58 | } |
59 | |
60 | static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) |
61 | { |
62 | } |
63 | |
64 | static void hugetlb_release_pages(char *rel_area) |
65 | { |
66 | if (!map_shared) { |
67 | if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) |
68 | err("madvise(MADV_DONTNEED) failed" ); |
69 | } else { |
70 | if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) |
71 | err("madvise(MADV_REMOVE) failed" ); |
72 | } |
73 | } |
74 | |
75 | static int hugetlb_allocate_area(void **alloc_area, bool is_src) |
76 | { |
77 | off_t size = nr_pages * page_size; |
78 | off_t offset = is_src ? 0 : size; |
79 | void *area_alias = NULL; |
80 | char **alloc_area_alias; |
81 | int mem_fd = uffd_mem_fd_create(mem_size: size * 2, hugetlb: true); |
82 | |
83 | *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, |
84 | (map_shared ? MAP_SHARED : MAP_PRIVATE) | |
85 | (is_src ? 0 : MAP_NORESERVE), |
86 | mem_fd, offset); |
87 | if (*alloc_area == MAP_FAILED) { |
88 | *alloc_area = NULL; |
89 | return -errno; |
90 | } |
91 | |
92 | if (map_shared) { |
93 | area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, |
94 | MAP_SHARED, mem_fd, offset); |
95 | if (area_alias == MAP_FAILED) |
96 | return -errno; |
97 | } |
98 | |
99 | if (is_src) { |
100 | alloc_area_alias = &area_src_alias; |
101 | } else { |
102 | alloc_area_alias = &area_dst_alias; |
103 | } |
104 | if (area_alias) |
105 | *alloc_area_alias = area_alias; |
106 | |
107 | close(mem_fd); |
108 | return 0; |
109 | } |
110 | |
111 | static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) |
112 | { |
113 | if (!map_shared) |
114 | return; |
115 | |
116 | *start = (unsigned long) area_dst_alias + offset; |
117 | } |
118 | |
119 | static void shmem_release_pages(char *rel_area) |
120 | { |
121 | if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) |
122 | err("madvise(MADV_REMOVE) failed" ); |
123 | } |
124 | |
125 | static int shmem_allocate_area(void **alloc_area, bool is_src) |
126 | { |
127 | void *area_alias = NULL; |
128 | size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); |
129 | unsigned long offset = is_src ? 0 : bytes; |
130 | char *p = NULL, *p_alias = NULL; |
131 | int mem_fd = uffd_mem_fd_create(mem_size: bytes * 2, hugetlb: false); |
132 | |
133 | /* TODO: clean this up. Use a static addr is ugly */ |
134 | p = BASE_PMD_ADDR; |
135 | if (!is_src) |
136 | /* src map + alias + interleaved hpages */ |
137 | p += 2 * (bytes + hpage_size); |
138 | p_alias = p; |
139 | p_alias += bytes; |
140 | p_alias += hpage_size; /* Prevent src/dst VMA merge */ |
141 | |
142 | *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, |
143 | mem_fd, offset); |
144 | if (*alloc_area == MAP_FAILED) { |
145 | *alloc_area = NULL; |
146 | return -errno; |
147 | } |
148 | if (*alloc_area != p) |
149 | err("mmap of memfd failed at %p" , p); |
150 | |
151 | area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, |
152 | mem_fd, offset); |
153 | if (area_alias == MAP_FAILED) { |
154 | munmap(*alloc_area, bytes); |
155 | *alloc_area = NULL; |
156 | return -errno; |
157 | } |
158 | if (area_alias != p_alias) |
159 | err("mmap of anonymous memory failed at %p" , p_alias); |
160 | |
161 | if (is_src) |
162 | area_src_alias = area_alias; |
163 | else |
164 | area_dst_alias = area_alias; |
165 | |
166 | close(mem_fd); |
167 | return 0; |
168 | } |
169 | |
170 | static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) |
171 | { |
172 | *start = (unsigned long)area_dst_alias + offset; |
173 | } |
174 | |
175 | static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) |
176 | { |
177 | if (!check_huge_shmem(addr: area_dst_alias, nr_hpages: expect_nr_hpages, |
178 | hpage_size: read_pmd_pagesize())) |
179 | err("Did not find expected %d number of hugepages" , |
180 | expect_nr_hpages); |
181 | } |
182 | |
183 | struct uffd_test_ops anon_uffd_test_ops = { |
184 | .allocate_area = anon_allocate_area, |
185 | .release_pages = anon_release_pages, |
186 | .alias_mapping = noop_alias_mapping, |
187 | .check_pmd_mapping = NULL, |
188 | }; |
189 | |
190 | struct uffd_test_ops shmem_uffd_test_ops = { |
191 | .allocate_area = shmem_allocate_area, |
192 | .release_pages = shmem_release_pages, |
193 | .alias_mapping = shmem_alias_mapping, |
194 | .check_pmd_mapping = shmem_check_pmd_mapping, |
195 | }; |
196 | |
197 | struct uffd_test_ops hugetlb_uffd_test_ops = { |
198 | .allocate_area = hugetlb_allocate_area, |
199 | .release_pages = hugetlb_release_pages, |
200 | .alias_mapping = hugetlb_alias_mapping, |
201 | .check_pmd_mapping = NULL, |
202 | }; |
203 | |
204 | void uffd_stats_report(struct uffd_args *args, int n_cpus) |
205 | { |
206 | int i; |
207 | unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; |
208 | |
209 | for (i = 0; i < n_cpus; i++) { |
210 | miss_total += args[i].missing_faults; |
211 | wp_total += args[i].wp_faults; |
212 | minor_total += args[i].minor_faults; |
213 | } |
214 | |
215 | printf("userfaults: " ); |
216 | if (miss_total) { |
217 | printf("%llu missing (" , miss_total); |
218 | for (i = 0; i < n_cpus; i++) |
219 | printf("%lu+" , args[i].missing_faults); |
220 | printf("\b) " ); |
221 | } |
222 | if (wp_total) { |
223 | printf("%llu wp (" , wp_total); |
224 | for (i = 0; i < n_cpus; i++) |
225 | printf("%lu+" , args[i].wp_faults); |
226 | printf("\b) " ); |
227 | } |
228 | if (minor_total) { |
229 | printf("%llu minor (" , minor_total); |
230 | for (i = 0; i < n_cpus; i++) |
231 | printf("%lu+" , args[i].minor_faults); |
232 | printf("\b)" ); |
233 | } |
234 | printf("\n" ); |
235 | } |
236 | |
237 | int userfaultfd_open(uint64_t *features) |
238 | { |
239 | struct uffdio_api uffdio_api; |
240 | |
241 | uffd = uffd_open(UFFD_FLAGS); |
242 | if (uffd < 0) |
243 | return -1; |
244 | uffd_flags = fcntl(uffd, F_GETFD, NULL); |
245 | |
246 | uffdio_api.api = UFFD_API; |
247 | uffdio_api.features = *features; |
248 | if (ioctl(uffd, UFFDIO_API, &uffdio_api)) |
249 | /* Probably lack of CAP_PTRACE? */ |
250 | return -1; |
251 | if (uffdio_api.api != UFFD_API) |
252 | err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); |
253 | |
254 | *features = uffdio_api.features; |
255 | return 0; |
256 | } |
257 | |
258 | static inline void munmap_area(void **area) |
259 | { |
260 | if (*area) |
261 | if (munmap(*area, nr_pages * page_size)) |
262 | err("munmap" ); |
263 | |
264 | *area = NULL; |
265 | } |
266 | |
267 | void uffd_test_ctx_clear(void) |
268 | { |
269 | size_t i; |
270 | |
271 | if (pipefd) { |
272 | for (i = 0; i < nr_cpus * 2; ++i) { |
273 | if (close(pipefd[i])) |
274 | err("close pipefd" ); |
275 | } |
276 | free(pipefd); |
277 | pipefd = NULL; |
278 | } |
279 | |
280 | if (count_verify) { |
281 | free(count_verify); |
282 | count_verify = NULL; |
283 | } |
284 | |
285 | if (uffd != -1) { |
286 | if (close(uffd)) |
287 | err("close uffd" ); |
288 | uffd = -1; |
289 | } |
290 | |
291 | munmap_area(area: (void **)&area_src); |
292 | munmap_area(area: (void **)&area_src_alias); |
293 | munmap_area(area: (void **)&area_dst); |
294 | munmap_area(area: (void **)&area_dst_alias); |
295 | munmap_area(area: (void **)&area_remap); |
296 | } |
297 | |
298 | int uffd_test_ctx_init(uint64_t features, const char **errmsg) |
299 | { |
300 | unsigned long nr, cpu; |
301 | int ret; |
302 | |
303 | if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) { |
304 | ret = uffd_test_case_ops->pre_alloc(errmsg); |
305 | if (ret) |
306 | return ret; |
307 | } |
308 | |
309 | ret = uffd_test_ops->allocate_area((void **)&area_src, true); |
310 | ret |= uffd_test_ops->allocate_area((void **)&area_dst, false); |
311 | if (ret) { |
312 | if (errmsg) |
313 | *errmsg = "memory allocation failed" ; |
314 | return ret; |
315 | } |
316 | |
317 | if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) { |
318 | ret = uffd_test_case_ops->post_alloc(errmsg); |
319 | if (ret) |
320 | return ret; |
321 | } |
322 | |
323 | ret = userfaultfd_open(features: &features); |
324 | if (ret) { |
325 | if (errmsg) |
326 | *errmsg = "possible lack of priviledge" ; |
327 | return ret; |
328 | } |
329 | |
330 | count_verify = malloc(nr_pages * sizeof(unsigned long long)); |
331 | if (!count_verify) |
332 | err("count_verify" ); |
333 | |
334 | for (nr = 0; nr < nr_pages; nr++) { |
335 | *area_mutex(area_src, nr) = |
336 | (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; |
337 | count_verify[nr] = *area_count(area_src, nr) = 1; |
338 | /* |
339 | * In the transition between 255 to 256, powerpc will |
340 | * read out of order in my_bcmp and see both bytes as |
341 | * zero, so leave a placeholder below always non-zero |
342 | * after the count, to avoid my_bcmp to trigger false |
343 | * positives. |
344 | */ |
345 | *(area_count(area_src, nr) + 1) = 1; |
346 | } |
347 | |
348 | /* |
349 | * After initialization of area_src, we must explicitly release pages |
350 | * for area_dst to make sure it's fully empty. Otherwise we could have |
351 | * some area_dst pages be errornously initialized with zero pages, |
352 | * hence we could hit memory corruption later in the test. |
353 | * |
354 | * One example is when THP is globally enabled, above allocate_area() |
355 | * calls could have the two areas merged into a single VMA (as they |
356 | * will have the same VMA flags so they're mergeable). When we |
357 | * initialize the area_src above, it's possible that some part of |
358 | * area_dst could have been faulted in via one huge THP that will be |
359 | * shared between area_src and area_dst. It could cause some of the |
360 | * area_dst won't be trapped by missing userfaults. |
361 | * |
362 | * This release_pages() will guarantee even if that happened, we'll |
363 | * proactively split the thp and drop any accidentally initialized |
364 | * pages within area_dst. |
365 | */ |
366 | uffd_test_ops->release_pages(area_dst); |
367 | |
368 | pipefd = malloc(sizeof(int) * nr_cpus * 2); |
369 | if (!pipefd) |
370 | err("pipefd" ); |
371 | for (cpu = 0; cpu < nr_cpus; cpu++) |
372 | if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) |
373 | err("pipe" ); |
374 | |
375 | return 0; |
376 | } |
377 | |
378 | void wp_range(int ufd, __u64 start, __u64 len, bool wp) |
379 | { |
380 | struct uffdio_writeprotect prms; |
381 | |
382 | /* Write protection page faults */ |
383 | prms.range.start = start; |
384 | prms.range.len = len; |
385 | /* Undo write-protect, do wakeup after that */ |
386 | prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; |
387 | |
388 | if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) |
389 | err("clear WP failed: address=0x%" PRIx64, (uint64_t)start); |
390 | } |
391 | |
392 | static void continue_range(int ufd, __u64 start, __u64 len, bool wp) |
393 | { |
394 | struct uffdio_continue req; |
395 | int ret; |
396 | |
397 | req.range.start = start; |
398 | req.range.len = len; |
399 | req.mode = 0; |
400 | if (wp) |
401 | req.mode |= UFFDIO_CONTINUE_MODE_WP; |
402 | |
403 | if (ioctl(ufd, UFFDIO_CONTINUE, &req)) |
404 | err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, |
405 | (uint64_t)start); |
406 | |
407 | /* |
408 | * Error handling within the kernel for continue is subtly different |
409 | * from copy or zeropage, so it may be a source of bugs. Trigger an |
410 | * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. |
411 | */ |
412 | req.mapped = 0; |
413 | ret = ioctl(ufd, UFFDIO_CONTINUE, &req); |
414 | if (ret >= 0 || req.mapped != -EEXIST) |
415 | err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, |
416 | ret, (int64_t) req.mapped); |
417 | } |
418 | |
419 | int uffd_read_msg(int ufd, struct uffd_msg *msg) |
420 | { |
421 | int ret = read(uffd, msg, sizeof(*msg)); |
422 | |
423 | if (ret != sizeof(*msg)) { |
424 | if (ret < 0) { |
425 | if (errno == EAGAIN || errno == EINTR) |
426 | return 1; |
427 | err("blocking read error" ); |
428 | } else { |
429 | err("short read" ); |
430 | } |
431 | } |
432 | |
433 | return 0; |
434 | } |
435 | |
436 | void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) |
437 | { |
438 | unsigned long offset; |
439 | |
440 | if (msg->event != UFFD_EVENT_PAGEFAULT) |
441 | err("unexpected msg event %u" , msg->event); |
442 | |
443 | if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { |
444 | /* Write protect page faults */ |
445 | wp_range(ufd: uffd, start: msg->arg.pagefault.address, len: page_size, wp: false); |
446 | args->wp_faults++; |
447 | } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { |
448 | uint8_t *area; |
449 | int b; |
450 | |
451 | /* |
452 | * Minor page faults |
453 | * |
454 | * To prove we can modify the original range for testing |
455 | * purposes, we're going to bit flip this range before |
456 | * continuing. |
457 | * |
458 | * Note that this requires all minor page fault tests operate on |
459 | * area_dst (non-UFFD-registered) and area_dst_alias |
460 | * (UFFD-registered). |
461 | */ |
462 | |
463 | area = (uint8_t *)(area_dst + |
464 | ((char *)msg->arg.pagefault.address - |
465 | area_dst_alias)); |
466 | for (b = 0; b < page_size; ++b) |
467 | area[b] = ~area[b]; |
468 | continue_range(ufd: uffd, start: msg->arg.pagefault.address, len: page_size, |
469 | wp: args->apply_wp); |
470 | args->minor_faults++; |
471 | } else { |
472 | /* |
473 | * Missing page faults. |
474 | * |
475 | * Here we force a write check for each of the missing mode |
476 | * faults. It's guaranteed because the only threads that |
477 | * will trigger uffd faults are the locking threads, and |
478 | * their first instruction to touch the missing page will |
479 | * always be pthread_mutex_lock(). |
480 | * |
481 | * Note that here we relied on an NPTL glibc impl detail to |
482 | * always read the lock type at the entry of the lock op |
483 | * (pthread_mutex_t.__data.__type, offset 0x10) before |
484 | * doing any locking operations to guarantee that. It's |
485 | * actually not good to rely on this impl detail because |
486 | * logically a pthread-compatible lib can implement the |
487 | * locks without types and we can fail when linking with |
488 | * them. However since we used to find bugs with this |
489 | * strict check we still keep it around. Hopefully this |
490 | * could be a good hint when it fails again. If one day |
491 | * it'll break on some other impl of glibc we'll revisit. |
492 | */ |
493 | if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) |
494 | err("unexpected write fault" ); |
495 | |
496 | offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; |
497 | offset &= ~(page_size-1); |
498 | |
499 | if (copy_page(uffd, offset, args->apply_wp)) |
500 | args->missing_faults++; |
501 | } |
502 | } |
503 | |
504 | void *uffd_poll_thread(void *arg) |
505 | { |
506 | struct uffd_args *args = (struct uffd_args *)arg; |
507 | unsigned long cpu = args->cpu; |
508 | struct pollfd pollfd[2]; |
509 | struct uffd_msg msg; |
510 | struct uffdio_register uffd_reg; |
511 | int ret; |
512 | char tmp_chr; |
513 | |
514 | if (!args->handle_fault) |
515 | args->handle_fault = uffd_handle_page_fault; |
516 | |
517 | pollfd[0].fd = uffd; |
518 | pollfd[0].events = POLLIN; |
519 | pollfd[1].fd = pipefd[cpu*2]; |
520 | pollfd[1].events = POLLIN; |
521 | |
522 | ready_for_fork = true; |
523 | |
524 | for (;;) { |
525 | ret = poll(pollfd, 2, -1); |
526 | if (ret <= 0) { |
527 | if (errno == EINTR || errno == EAGAIN) |
528 | continue; |
529 | err("poll error: %d" , ret); |
530 | } |
531 | if (pollfd[1].revents) { |
532 | if (!(pollfd[1].revents & POLLIN)) |
533 | err("pollfd[1].revents %d" , pollfd[1].revents); |
534 | if (read(pollfd[1].fd, &tmp_chr, 1) != 1) |
535 | err("read pipefd error" ); |
536 | break; |
537 | } |
538 | if (!(pollfd[0].revents & POLLIN)) |
539 | err("pollfd[0].revents %d" , pollfd[0].revents); |
540 | if (uffd_read_msg(ufd: uffd, msg: &msg)) |
541 | continue; |
542 | switch (msg.event) { |
543 | default: |
544 | err("unexpected msg event %u\n" , msg.event); |
545 | break; |
546 | case UFFD_EVENT_PAGEFAULT: |
547 | args->handle_fault(&msg, args); |
548 | break; |
549 | case UFFD_EVENT_FORK: |
550 | close(uffd); |
551 | uffd = msg.arg.fork.ufd; |
552 | pollfd[0].fd = uffd; |
553 | break; |
554 | case UFFD_EVENT_REMOVE: |
555 | uffd_reg.range.start = msg.arg.remove.start; |
556 | uffd_reg.range.len = msg.arg.remove.end - |
557 | msg.arg.remove.start; |
558 | if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) |
559 | err("remove failure" ); |
560 | break; |
561 | case UFFD_EVENT_REMAP: |
562 | area_remap = area_dst; /* save for later unmap */ |
563 | area_dst = (char *)(unsigned long)msg.arg.remap.to; |
564 | break; |
565 | } |
566 | } |
567 | |
568 | return NULL; |
569 | } |
570 | |
571 | static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, |
572 | unsigned long offset) |
573 | { |
574 | uffd_test_ops->alias_mapping(&uffdio_copy->dst, |
575 | uffdio_copy->len, |
576 | offset); |
577 | if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { |
578 | /* real retval in ufdio_copy.copy */ |
579 | if (uffdio_copy->copy != -EEXIST) |
580 | err("UFFDIO_COPY retry error: %" PRId64, |
581 | (int64_t)uffdio_copy->copy); |
582 | } else { |
583 | err("UFFDIO_COPY retry unexpected: %" PRId64, |
584 | (int64_t)uffdio_copy->copy); |
585 | } |
586 | } |
587 | |
588 | static void wake_range(int ufd, unsigned long addr, unsigned long len) |
589 | { |
590 | struct uffdio_range uffdio_wake; |
591 | |
592 | uffdio_wake.start = addr; |
593 | uffdio_wake.len = len; |
594 | |
595 | if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) |
596 | fprintf(stderr, "error waking %lu\n" , |
597 | addr), exit(1); |
598 | } |
599 | |
600 | int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) |
601 | { |
602 | struct uffdio_copy uffdio_copy; |
603 | |
604 | if (offset >= nr_pages * page_size) |
605 | err("unexpected offset %lu\n" , offset); |
606 | uffdio_copy.dst = (unsigned long) area_dst + offset; |
607 | uffdio_copy.src = (unsigned long) area_src + offset; |
608 | uffdio_copy.len = page_size; |
609 | if (wp) |
610 | uffdio_copy.mode = UFFDIO_COPY_MODE_WP; |
611 | else |
612 | uffdio_copy.mode = 0; |
613 | uffdio_copy.copy = 0; |
614 | if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { |
615 | /* real retval in ufdio_copy.copy */ |
616 | if (uffdio_copy.copy != -EEXIST) |
617 | err("UFFDIO_COPY error: %" PRId64, |
618 | (int64_t)uffdio_copy.copy); |
619 | wake_range(ufd, addr: uffdio_copy.dst, len: page_size); |
620 | } else if (uffdio_copy.copy != page_size) { |
621 | err("UFFDIO_COPY error: %" PRId64, (int64_t)uffdio_copy.copy); |
622 | } else { |
623 | if (test_uffdio_copy_eexist && retry) { |
624 | test_uffdio_copy_eexist = false; |
625 | retry_copy_page(ufd, uffdio_copy: &uffdio_copy, offset); |
626 | } |
627 | return 1; |
628 | } |
629 | return 0; |
630 | } |
631 | |
/*
 * Copy one page at @offset from area_src to area_dst without the
 * -EEXIST retry exercise.  Returns what __copy_page() returns.
 */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry, wp);
}
636 | |
637 | int move_page(int ufd, unsigned long offset, unsigned long len) |
638 | { |
639 | struct uffdio_move uffdio_move; |
640 | |
641 | if (offset + len > nr_pages * page_size) |
642 | err("unexpected offset %lu and length %lu\n" , offset, len); |
643 | uffdio_move.dst = (unsigned long) area_dst + offset; |
644 | uffdio_move.src = (unsigned long) area_src + offset; |
645 | uffdio_move.len = len; |
646 | uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES; |
647 | uffdio_move.move = 0; |
648 | if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) { |
649 | /* real retval in uffdio_move.move */ |
650 | if (uffdio_move.move != -EEXIST) |
651 | err("UFFDIO_MOVE error: %" PRId64, |
652 | (int64_t)uffdio_move.move); |
653 | wake_range(ufd, addr: uffdio_move.dst, len); |
654 | } else if (uffdio_move.move != len) { |
655 | err("UFFDIO_MOVE error: %" PRId64, (int64_t)uffdio_move.move); |
656 | } else |
657 | return 1; |
658 | return 0; |
659 | } |
660 | |
661 | int uffd_open_dev(unsigned int flags) |
662 | { |
663 | int fd, uffd; |
664 | |
665 | fd = open("/dev/userfaultfd" , O_RDWR | O_CLOEXEC); |
666 | if (fd < 0) |
667 | return fd; |
668 | uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags); |
669 | close(fd); |
670 | |
671 | return uffd; |
672 | } |
673 | |
/*
 * Create a userfaultfd with the userfaultfd system call.
 *
 * @flags: passed to the system call
 *
 * Returns the syscall result, or -1 when __NR_userfaultfd is not
 * defined at build time.
 */
int uffd_open_sys(unsigned int flags)
{
#ifndef __NR_userfaultfd
	return -1;
#else
	return syscall(__NR_userfaultfd, flags);
#endif
}
682 | |
/*
 * Create a userfaultfd, trying the system call first and falling back
 * to /dev/userfaultfd when the system call returns a negative value.
 *
 * @flags: forwarded to whichever method is used
 *
 * Returns the new descriptor, or the negative result of the fallback.
 */
int uffd_open(unsigned int flags)
{
	int fd = uffd_open_sys(flags);

	if (fd >= 0)
		return fd;

	return uffd_open_dev(flags);
}
692 | |
693 | int uffd_get_features(uint64_t *features) |
694 | { |
695 | struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 }; |
696 | /* |
697 | * This should by default work in most kernels; the feature list |
698 | * will be the same no matter what we pass in here. |
699 | */ |
700 | int fd = uffd_open(UFFD_USER_MODE_ONLY); |
701 | |
702 | if (fd < 0) |
703 | /* Maybe the kernel is older than user-only mode? */ |
704 | fd = uffd_open(flags: 0); |
705 | |
706 | if (fd < 0) |
707 | return fd; |
708 | |
709 | if (ioctl(fd, UFFDIO_API, &uffdio_api)) { |
710 | close(fd); |
711 | return -errno; |
712 | } |
713 | |
714 | *features = uffdio_api.features; |
715 | close(fd); |
716 | |
717 | return 0; |
718 | } |
719 | |