1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * memfd GUP test-case |
4 | * This tests memfd interactions with get_user_pages(). We require the |
5 | * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This |
6 | * file-system delays _all_ reads by 1s and forces direct-IO. This means, any |
7 | * read() on files in that file-system will pin the receive-buffer pages for at |
8 | * least 1s via get_user_pages(). |
9 | * |
10 | * We use this trick to race ADD_SEALS against a write on a memfd object. The |
11 | * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use |
12 | * the read() syscall with our memory-mapped memfd object as receive buffer to |
13 | * force the kernel to write into our memfd object. |
14 | */ |
15 | |
16 | #define _GNU_SOURCE |
17 | #define |
18 | |
19 | #include <errno.h> |
20 | #include <inttypes.h> |
21 | #include <limits.h> |
22 | #include <linux/falloc.h> |
23 | #include <fcntl.h> |
24 | #include <linux/memfd.h> |
25 | #include <linux/types.h> |
26 | #include <sched.h> |
27 | #include <stdio.h> |
28 | #include <stdlib.h> |
29 | #include <signal.h> |
30 | #include <string.h> |
31 | #include <sys/mman.h> |
32 | #include <sys/stat.h> |
33 | #include <sys/syscall.h> |
34 | #include <sys/wait.h> |
35 | #include <unistd.h> |
36 | |
37 | #include "common.h" |
38 | |
/* Default size in bytes of the memfd object and of its mappings. */
#define MFD_DEF_SIZE (8 * 1024)
/* Size in bytes of the stack buffer handed to the clone()d sealing thread. */
#define STACK_SIZE (64 * 1024)

/*
 * Size actually used for the memfd, its mappings and the racing read().
 * Starts out as MFD_DEF_SIZE; main() overrides it when the "hugetlbfs"
 * option is given.
 */
static size_t mfd_def_size = MFD_DEF_SIZE;
43 | |
/*
 * Create a memfd named @name with creation flags @flags and resize it to
 * @sz bytes.
 *
 * Returns the new file descriptor. If either memfd_create() or ftruncate()
 * fails, the error is printed and the test is terminated via abort().
 */
static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
{
	int memfd = sys_memfd_create(name, flags);

	if (memfd < 0) {
		printf("memfd_create(\"%s\", %u) failed: %m\n", name, flags);
		abort();
	}

	if (ftruncate(memfd, sz) < 0) {
		printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
		abort();
	}

	return memfd;
}
63 | |
/*
 * Query the seal set of @fd with F_GET_SEALS.
 *
 * Returns the seals reported by fcntl(). A failing fcntl() is printed and
 * terminates the test via abort().
 */
static __u64 mfd_assert_get_seals(int fd)
{
	long seals = fcntl(fd, F_GET_SEALS);

	if (seals >= 0)
		return seals;

	printf("GET_SEALS(%d) failed: %m\n", fd);
	abort();
}
76 | |
/*
 * Assert that the seal set of @fd is exactly @seals.
 *
 * On mismatch, the expected value, the value actually read and the file
 * descriptor are printed, and the test is terminated via abort().
 */
static void mfd_assert_has_seals(int fd, __u64 seals)
{
	__u64 current = mfd_assert_get_seals(fd);

	if (current == seals)
		return;

	printf("%llu != %llu = GET_SEALS(%d)\n",
	       (unsigned long long)seals, (unsigned long long)current, fd);
	abort();
}
88 | |
/*
 * Add @seals to @fd with F_ADD_SEALS and require that this succeeds.
 *
 * On failure, the file descriptor, the seals read beforehand and the
 * requested seals are printed, and the test is terminated via abort().
 */
static void mfd_assert_add_seals(int fd, __u64 seals)
{
	/* sampled before sealing; only used by the failure message below */
	__u64 before = mfd_assert_get_seals(fd);

	if (fcntl(fd, F_ADD_SEALS, seals) >= 0)
		return;

	printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
	       fd, (unsigned long long)before, (unsigned long long)seals);
	abort();
}
102 | |
/*
 * Try to add @seals to @fd while the file may still be busy.
 *
 * Both success and an EBUSY failure are tolerated; the result of the
 * F_ADD_SEALS call is handed back so the caller can tell them apart
 * (negative on EBUSY). Any other failure is printed and terminates the test
 * via abort().
 */
static int mfd_busy_add_seals(int fd, __u64 seals)
{
	long res;
	__u64 before;

	/* the previous seal set only feeds the diagnostic; fall back to 0 */
	res = fcntl(fd, F_GET_SEALS);
	before = res < 0 ? 0 : res;

	res = fcntl(fd, F_ADD_SEALS, seals);
	if (res >= 0 || errno == EBUSY)
		return res;

	printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
	       fd, (unsigned long long)before, (unsigned long long)seals);
	abort();
}
123 | |
124 | static void *mfd_assert_mmap_shared(int fd) |
125 | { |
126 | void *p; |
127 | |
128 | p = mmap(NULL, |
129 | mfd_def_size, |
130 | PROT_READ | PROT_WRITE, |
131 | MAP_SHARED, |
132 | fd, |
133 | 0); |
134 | if (p == MAP_FAILED) { |
135 | printf("mmap() failed: %m\n" ); |
136 | abort(); |
137 | } |
138 | |
139 | return p; |
140 | } |
141 | |
142 | static void *mfd_assert_mmap_private(int fd) |
143 | { |
144 | void *p; |
145 | |
146 | p = mmap(NULL, |
147 | mfd_def_size, |
148 | PROT_READ | PROT_WRITE, |
149 | MAP_PRIVATE, |
150 | fd, |
151 | 0); |
152 | if (p == MAP_FAILED) { |
153 | printf("mmap() failed: %m\n" ); |
154 | abort(); |
155 | } |
156 | |
157 | return p; |
158 | } |
159 | |
/*
 * State handed from main() to sealing_thread_fn(): the memfd to seal and the
 * shared mapping of it that the sealing thread unmaps first.
 */
static int global_mfd = -1;
static void *global_p;
162 | |
163 | static int sealing_thread_fn(void *arg) |
164 | { |
165 | int sig, r; |
166 | |
167 | /* |
168 | * This thread first waits 200ms so any pending operation in the parent |
169 | * is correctly started. After that, it tries to seal @global_mfd as |
170 | * SEAL_WRITE. This _must_ fail as the parent thread has a read() into |
171 | * that memory mapped object still ongoing. |
172 | * We then wait one more second and try sealing again. This time it |
173 | * must succeed as there shouldn't be anyone else pinning the pages. |
174 | */ |
175 | |
176 | /* wait 200ms for FUSE-request to be active */ |
177 | usleep(200000); |
178 | |
179 | /* unmount mapping before sealing to avoid i_mmap_writable failures */ |
180 | munmap(global_p, mfd_def_size); |
181 | |
182 | /* Try sealing the global file; expect EBUSY or success. Current |
183 | * kernels will never succeed, but in the future, kernels might |
184 | * implement page-replacements or other fancy ways to avoid racing |
185 | * writes. */ |
186 | r = mfd_busy_add_seals(fd: global_mfd, seals: F_SEAL_WRITE); |
187 | if (r >= 0) { |
188 | printf("HURRAY! This kernel fixed GUP races!\n" ); |
189 | } else { |
190 | /* wait 1s more so the FUSE-request is done */ |
191 | sleep(1); |
192 | |
193 | /* try sealing the global file again */ |
194 | mfd_assert_add_seals(fd: global_mfd, seals: F_SEAL_WRITE); |
195 | } |
196 | |
197 | return 0; |
198 | } |
199 | |
200 | static pid_t spawn_sealing_thread(void) |
201 | { |
202 | uint8_t *stack; |
203 | pid_t pid; |
204 | |
205 | stack = malloc(STACK_SIZE); |
206 | if (!stack) { |
207 | printf("malloc(STACK_SIZE) failed: %m\n" ); |
208 | abort(); |
209 | } |
210 | |
211 | pid = clone(sealing_thread_fn, |
212 | stack + STACK_SIZE, |
213 | SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, |
214 | NULL); |
215 | if (pid < 0) { |
216 | printf("clone() failed: %m\n" ); |
217 | abort(); |
218 | } |
219 | |
220 | return pid; |
221 | } |
222 | |
/*
 * Block until the sealing thread @pid has terminated.
 *
 * The exit status is not collected (NULL status pointer) and the return
 * value of waitpid() is ignored.
 */
static void join_sealing_thread(pid_t pid)
{
	const int wait_options = 0;

	(void)waitpid(pid, NULL, wait_options);
}
227 | |
228 | int main(int argc, char **argv) |
229 | { |
230 | char *zero; |
231 | int fd, mfd, r; |
232 | void *p; |
233 | int was_sealed; |
234 | pid_t pid; |
235 | |
236 | if (argc < 2) { |
237 | printf("error: please pass path to file in fuse_mnt mount-point\n" ); |
238 | abort(); |
239 | } |
240 | |
241 | if (argc >= 3) { |
242 | if (!strcmp(argv[2], "hugetlbfs" )) { |
243 | unsigned long hpage_size = default_huge_page_size(); |
244 | |
245 | if (!hpage_size) { |
246 | printf("Unable to determine huge page size\n" ); |
247 | abort(); |
248 | } |
249 | |
250 | hugetlbfs_test = 1; |
251 | mfd_def_size = hpage_size * 2; |
252 | } else { |
253 | printf("Unknown option: %s\n" , argv[2]); |
254 | abort(); |
255 | } |
256 | } |
257 | |
258 | zero = calloc(sizeof(*zero), mfd_def_size); |
259 | |
260 | /* open FUSE memfd file for GUP testing */ |
261 | printf("opening: %s\n" , argv[1]); |
262 | fd = open(argv[1], O_RDONLY | O_CLOEXEC); |
263 | if (fd < 0) { |
264 | printf("cannot open(\"%s\"): %m\n" , argv[1]); |
265 | abort(); |
266 | } |
267 | |
268 | /* create new memfd-object */ |
269 | mfd = mfd_assert_new(name: "kern_memfd_fuse" , |
270 | sz: mfd_def_size, |
271 | flags: MFD_CLOEXEC | MFD_ALLOW_SEALING); |
272 | |
273 | /* mmap memfd-object for writing */ |
274 | p = mfd_assert_mmap_shared(fd: mfd); |
275 | |
276 | /* pass mfd+mapping to a separate sealing-thread which tries to seal |
277 | * the memfd objects with SEAL_WRITE while we write into it */ |
278 | global_mfd = mfd; |
279 | global_p = p; |
280 | pid = spawn_sealing_thread(); |
281 | |
282 | /* Use read() on the FUSE file to read into our memory-mapped memfd |
283 | * object. This races the other thread which tries to seal the |
284 | * memfd-object. |
285 | * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. |
286 | * This guarantees that the receive-buffer is pinned for 1s until the |
287 | * data is written into it. The racing ADD_SEALS should thus fail as |
288 | * the pages are still pinned. */ |
289 | r = read(fd, p, mfd_def_size); |
290 | if (r < 0) { |
291 | printf("read() failed: %m\n" ); |
292 | abort(); |
293 | } else if (!r) { |
294 | printf("unexpected EOF on read()\n" ); |
295 | abort(); |
296 | } |
297 | |
298 | was_sealed = mfd_assert_get_seals(fd: mfd) & F_SEAL_WRITE; |
299 | |
300 | /* Wait for sealing-thread to finish and verify that it |
301 | * successfully sealed the file after the second try. */ |
302 | join_sealing_thread(pid); |
303 | mfd_assert_has_seals(fd: mfd, seals: F_SEAL_WRITE); |
304 | |
305 | /* *IF* the memfd-object was sealed at the time our read() returned, |
306 | * then the kernel did a page-replacement or canceled the read() (or |
307 | * whatever magic it did..). In that case, the memfd object is still |
308 | * all zero. |
309 | * In case the memfd-object was *not* sealed, the read() was successfull |
310 | * and the memfd object must *not* be all zero. |
311 | * Note that in real scenarios, there might be a mixture of both, but |
312 | * in this test-cases, we have explicit 200ms delays which should be |
313 | * enough to avoid any in-flight writes. */ |
314 | |
315 | p = mfd_assert_mmap_private(fd: mfd); |
316 | if (was_sealed && memcmp(p, zero, mfd_def_size)) { |
317 | printf("memfd sealed during read() but data not discarded\n" ); |
318 | abort(); |
319 | } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { |
320 | printf("memfd sealed after read() but data discarded\n" ); |
321 | abort(); |
322 | } |
323 | |
324 | close(mfd); |
325 | close(fd); |
326 | |
327 | printf("fuse: DONE\n" ); |
328 | free(zero); |
329 | |
330 | return 0; |
331 | } |
332 | |