1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Copyright (c) 2025, Google LLC.
5 * Pasha Tatashin <pasha.tatashin@soleen.com>
6 *
7 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8 * Pratyush Yadav <ptyadav@amazon.de>
9 */
10
11/**
12 * DOC: Memfd Preservation via LUO
13 *
14 * Overview
15 * ========
16 *
17 * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18 * Update Orchestrator (LUO) file preservation. This allows userspace to
19 * transfer its memory contents to the next kernel after a kexec.
20 *
21 * The preservation is not intended to be transparent. Only select properties of
22 * the file are preserved. All others are reset to default. The preserved
23 * properties are described below.
24 *
 * .. note::
 *    The LUO API is not stabilized yet, so the preserved properties of a memfd
 *    are also not stable and are subject to backwards incompatible changes.
 *
 * .. note::
 *    Currently a memfd backed by Hugetlb is not supported. Memfds created
 *    with ``MFD_HUGETLB`` will be rejected.
32 *
33 * Preserved Properties
34 * ====================
35 *
36 * The following properties of the memfd are preserved across kexec:
37 *
 * File Contents
 *   All data stored in the file is preserved.
 *
 * File Size
 *   The size of the file is preserved. Holes in the file are filled by
 *   allocating pages for them during preservation.
 *
 * File Position
 *   The current file position is preserved, allowing applications to continue
 *   reading/writing from their last position.
 *
 * File Status Flags
 *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
 *   is maintained.
52 *
53 * Non-Preserved Properties
54 * ========================
55 *
56 * All properties which are not preserved must be assumed to be reset to
57 * default. This section describes some of those properties which may be more of
58 * note.
59 *
 * ``FD_CLOEXEC`` flag
 *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
 *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
 *   again after restore via ``fcntl()``.
 *
 * Seals
 *   File seals are not preserved. The file is unsealed on restore and if
 *   needed, must be sealed again via ``fcntl()``.
68 */
69
70#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71
72#include <linux/bits.h>
73#include <linux/err.h>
74#include <linux/file.h>
75#include <linux/io.h>
76#include <linux/kexec_handover.h>
77#include <linux/kho/abi/memfd.h>
78#include <linux/liveupdate.h>
79#include <linux/shmem_fs.h>
80#include <linux/vmalloc.h>
81#include <linux/memfd.h>
82#include "internal.h"
83
84static int memfd_luo_preserve_folios(struct file *file,
85 struct kho_vmalloc *kho_vmalloc,
86 struct memfd_luo_folio_ser **out_folios_ser,
87 u64 *nr_foliosp)
88{
89 struct inode *inode = file_inode(f: file);
90 struct memfd_luo_folio_ser *folios_ser;
91 unsigned int max_folios;
92 long i, size, nr_pinned;
93 struct folio **folios;
94 int err = -EINVAL;
95 pgoff_t offset;
96 u64 nr_folios;
97
98 size = i_size_read(inode);
99 /*
100 * If the file has zero size, then the folios and nr_folios properties
101 * are not set.
102 */
103 if (!size) {
104 *nr_foliosp = 0;
105 *out_folios_ser = NULL;
106 memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
107 return 0;
108 }
109
110 /*
111 * Guess the number of folios based on inode size. Real number might end
112 * up being smaller if there are higher order folios.
113 */
114 max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
115 folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
116 if (!folios)
117 return -ENOMEM;
118
119 /*
120 * Pin the folios so they don't move around behind our back. This also
121 * ensures none of the folios are in CMA -- which ensures they don't
122 * fall in KHO scratch memory. It also moves swapped out folios back to
123 * memory.
124 *
125 * A side effect of doing this is that it allocates a folio for all
126 * indices in the file. This might waste memory on sparse memfds. If
127 * that is really a problem in the future, we can have a
128 * memfd_pin_folios() variant that does not allocate a page on empty
129 * slots.
130 */
131 nr_pinned = memfd_pin_folios(memfd: file, start: 0, end: size - 1, folios, max_folios,
132 offset: &offset);
133 if (nr_pinned < 0) {
134 err = nr_pinned;
135 pr_err("failed to pin folios: %d\n", err);
136 goto err_free_folios;
137 }
138 nr_folios = nr_pinned;
139
140 folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
141 if (!folios_ser) {
142 err = -ENOMEM;
143 goto err_unpin;
144 }
145
146 for (i = 0; i < nr_folios; i++) {
147 struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
148 struct folio *folio = folios[i];
149 unsigned int flags = 0;
150
151 err = kho_preserve_folio(folio);
152 if (err)
153 goto err_unpreserve;
154
155 if (folio_test_dirty(folio))
156 flags |= MEMFD_LUO_FOLIO_DIRTY;
157 if (folio_test_uptodate(folio))
158 flags |= MEMFD_LUO_FOLIO_UPTODATE;
159
160 pfolio->pfn = folio_pfn(folio);
161 pfolio->flags = flags;
162 pfolio->index = folio->index;
163 }
164
165 err = kho_preserve_vmalloc(ptr: folios_ser, preservation: kho_vmalloc);
166 if (err)
167 goto err_unpreserve;
168
169 kvfree(addr: folios);
170 *nr_foliosp = nr_folios;
171 *out_folios_ser = folios_ser;
172
173 /*
174 * Note: folios_ser is purposely not freed here. It is preserved
175 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
176 * that is passed via private_data.
177 */
178 return 0;
179
180err_unpreserve:
181 for (i = i - 1; i >= 0; i--)
182 kho_unpreserve_folio(folio: folios[i]);
183 vfree(addr: folios_ser);
184err_unpin:
185 unpin_folios(folios, nfolios: nr_folios);
186err_free_folios:
187 kvfree(addr: folios);
188
189 return err;
190}
191
192static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
193 struct memfd_luo_folio_ser *folios_ser,
194 u64 nr_folios)
195{
196 long i;
197
198 if (!nr_folios)
199 return;
200
201 kho_unpreserve_vmalloc(preservation: kho_vmalloc);
202
203 for (i = 0; i < nr_folios; i++) {
204 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
205 struct folio *folio;
206
207 if (!pfolio->pfn)
208 continue;
209
210 folio = pfn_folio(pfn: pfolio->pfn);
211
212 kho_unpreserve_folio(folio);
213 unpin_folio(folio);
214 }
215
216 vfree(addr: folios_ser);
217}
218
219static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
220{
221 struct inode *inode = file_inode(f: args->file);
222 struct memfd_luo_folio_ser *folios_ser;
223 struct memfd_luo_ser *ser;
224 u64 nr_folios;
225 int err = 0;
226
227 inode_lock(inode);
228 shmem_freeze(inode, freeze: true);
229
230 /* Allocate the main serialization structure in preserved memory */
231 ser = kho_alloc_preserve(size: sizeof(*ser));
232 if (IS_ERR(ptr: ser)) {
233 err = PTR_ERR(ptr: ser);
234 goto err_unlock;
235 }
236
237 ser->pos = args->file->f_pos;
238 ser->size = i_size_read(inode);
239
240 err = memfd_luo_preserve_folios(file: args->file, kho_vmalloc: &ser->folios,
241 out_folios_ser: &folios_ser, nr_foliosp: &nr_folios);
242 if (err)
243 goto err_free_ser;
244
245 ser->nr_folios = nr_folios;
246 inode_unlock(inode);
247
248 args->private_data = folios_ser;
249 args->serialized_data = virt_to_phys(address: ser);
250
251 return 0;
252
253err_free_ser:
254 kho_unpreserve_free(mem: ser);
255err_unlock:
256 shmem_freeze(inode, freeze: false);
257 inode_unlock(inode);
258 return err;
259}
260
261static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
262{
263 struct memfd_luo_ser *ser;
264
265 if (WARN_ON_ONCE(!args->serialized_data))
266 return -EINVAL;
267
268 ser = phys_to_virt(address: args->serialized_data);
269
270 /*
271 * The pos might have changed since prepare. Everything else stays the
272 * same.
273 */
274 ser->pos = args->file->f_pos;
275
276 return 0;
277}
278
279static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
280{
281 struct inode *inode = file_inode(f: args->file);
282 struct memfd_luo_ser *ser;
283
284 if (WARN_ON_ONCE(!args->serialized_data))
285 return;
286
287 inode_lock(inode);
288 shmem_freeze(inode, freeze: false);
289
290 ser = phys_to_virt(address: args->serialized_data);
291
292 memfd_luo_unpreserve_folios(kho_vmalloc: &ser->folios, folios_ser: args->private_data,
293 nr_folios: ser->nr_folios);
294
295 kho_unpreserve_free(mem: ser);
296 inode_unlock(inode);
297}
298
299static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
300 u64 nr_folios)
301{
302 u64 i;
303
304 for (i = 0; i < nr_folios; i++) {
305 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
306 struct folio *folio;
307 phys_addr_t phys;
308
309 if (!pfolio->pfn)
310 continue;
311
312 phys = PFN_PHYS(pfolio->pfn);
313 folio = kho_restore_folio(phys);
314 if (!folio) {
315 pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
316 phys);
317 continue;
318 }
319
320 folio_put(folio);
321 }
322}
323
324static void memfd_luo_finish(struct liveupdate_file_op_args *args)
325{
326 struct memfd_luo_folio_ser *folios_ser;
327 struct memfd_luo_ser *ser;
328
329 if (args->retrieved)
330 return;
331
332 ser = phys_to_virt(address: args->serialized_data);
333 if (!ser)
334 return;
335
336 if (ser->nr_folios) {
337 folios_ser = kho_restore_vmalloc(preservation: &ser->folios);
338 if (!folios_ser)
339 goto out;
340
341 memfd_luo_discard_folios(folios_ser, nr_folios: ser->nr_folios);
342 vfree(addr: folios_ser);
343 }
344
345out:
346 kho_restore_free(mem: ser);
347}
348
349static int memfd_luo_retrieve_folios(struct file *file,
350 struct memfd_luo_folio_ser *folios_ser,
351 u64 nr_folios)
352{
353 struct inode *inode = file_inode(f: file);
354 struct address_space *mapping = inode->i_mapping;
355 struct folio *folio;
356 int err = -EIO;
357 long i;
358
359 for (i = 0; i < nr_folios; i++) {
360 const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
361 phys_addr_t phys;
362 u64 index;
363 int flags;
364
365 if (!pfolio->pfn)
366 continue;
367
368 phys = PFN_PHYS(pfolio->pfn);
369 folio = kho_restore_folio(phys);
370 if (!folio) {
371 pr_err("Unable to restore folio at physical address: %llx\n",
372 phys);
373 goto put_folios;
374 }
375 index = pfolio->index;
376 flags = pfolio->flags;
377
378 /* Set up the folio for insertion. */
379 __folio_set_locked(folio);
380 __folio_set_swapbacked(folio);
381
382 err = mem_cgroup_charge(folio, NULL, gfp: mapping_gfp_mask(mapping));
383 if (err) {
384 pr_err("shmem: failed to charge folio index %ld: %d\n",
385 i, err);
386 goto unlock_folio;
387 }
388
389 err = shmem_add_to_page_cache(folio, mapping, index, NULL,
390 gfp: mapping_gfp_mask(mapping));
391 if (err) {
392 pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
393 i, err);
394 goto unlock_folio;
395 }
396
397 if (flags & MEMFD_LUO_FOLIO_UPTODATE)
398 folio_mark_uptodate(folio);
399 if (flags & MEMFD_LUO_FOLIO_DIRTY)
400 folio_mark_dirty(folio);
401
402 err = shmem_inode_acct_blocks(inode, pages: 1);
403 if (err) {
404 pr_err("shmem: failed to account folio index %ld: %d\n",
405 i, err);
406 goto unlock_folio;
407 }
408
409 shmem_recalc_inode(inode, alloced: 1, swapped: 0);
410 folio_add_lru(folio);
411 folio_unlock(folio);
412 folio_put(folio);
413 }
414
415 return 0;
416
417unlock_folio:
418 folio_unlock(folio);
419 folio_put(folio);
420put_folios:
421 /*
422 * Note: don't free the folios already added to the file. They will be
423 * freed when the file is freed. Free the ones not added yet here.
424 */
425 for (long j = i + 1; j < nr_folios; j++) {
426 const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
427
428 folio = kho_restore_folio(phys: pfolio->pfn);
429 if (folio)
430 folio_put(folio);
431 }
432
433 return err;
434}
435
436static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
437{
438 struct memfd_luo_folio_ser *folios_ser;
439 struct memfd_luo_ser *ser;
440 struct file *file;
441 int err;
442
443 ser = phys_to_virt(address: args->serialized_data);
444 if (!ser)
445 return -EINVAL;
446
447 file = memfd_alloc_file(name: "", flags: 0);
448 if (IS_ERR(ptr: file)) {
449 pr_err("failed to setup file: %pe\n", file);
450 err = PTR_ERR(ptr: file);
451 goto free_ser;
452 }
453
454 vfs_setpos(file, offset: ser->pos, MAX_LFS_FILESIZE);
455 file->f_inode->i_size = ser->size;
456
457 if (ser->nr_folios) {
458 folios_ser = kho_restore_vmalloc(preservation: &ser->folios);
459 if (!folios_ser) {
460 err = -EINVAL;
461 goto put_file;
462 }
463
464 err = memfd_luo_retrieve_folios(file, folios_ser, nr_folios: ser->nr_folios);
465 vfree(addr: folios_ser);
466 if (err)
467 goto put_file;
468 }
469
470 args->file = file;
471 kho_restore_free(mem: ser);
472
473 return 0;
474
475put_file:
476 fput(file);
477free_ser:
478 kho_restore_free(mem: ser);
479 return err;
480}
481
482static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
483 struct file *file)
484{
485 struct inode *inode = file_inode(f: file);
486
487 return shmem_file(file) && !inode->i_nlink;
488}
489
490static const struct liveupdate_file_ops memfd_luo_file_ops = {
491 .freeze = memfd_luo_freeze,
492 .finish = memfd_luo_finish,
493 .retrieve = memfd_luo_retrieve,
494 .preserve = memfd_luo_preserve,
495 .unpreserve = memfd_luo_unpreserve,
496 .can_preserve = memfd_luo_can_preserve,
497 .owner = THIS_MODULE,
498};
499
500static struct liveupdate_file_handler memfd_luo_handler = {
501 .ops = &memfd_luo_file_ops,
502 .compatible = MEMFD_LUO_FH_COMPATIBLE,
503};
504
505static int __init memfd_luo_init(void)
506{
507 int err = liveupdate_register_file_handler(fh: &memfd_luo_handler);
508
509 if (err && err != -EOPNOTSUPP) {
510 pr_err("Could not register luo filesystem handler: %pe\n",
511 ERR_PTR(err));
512
513 return err;
514 }
515
516 return 0;
517}
518late_initcall(memfd_luo_init);
519

source code of linux/mm/memfd_luo.c