// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked PTEs.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that pages are remapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};

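/*
 * Split the vmemmap PMD pointed to by @pmd, which maps the huge page @head,
 * into a PTE table that maps the same pages one base page at a time, so the
 * lowest-level remapping done by vmemmap_remap_pte() and vmemmap_restore_pte()
 * can operate on individual vmemmap pages.
 */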
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

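/*
 * pagewalk PMD callback: refuse to operate on self-hosted vmemmap pages
 * (memory_hotplug.memmap_on_memory), otherwise split a still-huge PMD mapping
 * via vmemmap_split_pmd(). Split-only walks (no ->remap_pte) do not descend
 * to the PTE level.
 */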
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements, and since the vmemmap pages
	 * sit at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                  hotplugged memory                  ]
	 * [        section        ][...][        section        ]
	 * [ vmemmap ][              usable memory               ]
	 *   ^   |                       ^                       |
	 *   +---+                       |                       |
	 *                               +-----------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse page is the one found 'first' in the page table walk,
	 * before remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

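/*
 * Walk the kernel vmemmap page tables over [@start, @end) with @walk,
 * splitting leaf PMDs and invoking @walk->remap_pte on the walked PTEs, then
 * flush the TLB for the range unless the caller deferred the flush.
 */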
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator and must be freed via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of vmemmap pages. */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

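/*
 * Remap callback used when freeing vmemmap: point @pte at @walk->reuse_page
 * (read-only for tail addresses, read-write for the reuse address itself) and
 * queue the page previously mapped at @addr on @walk->vmemmap_pages so it can
 * be freed.
 */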
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * must not be copied to the tail struct page structs; such invalid values
 * are checked in free_tail_page_prepare(). To avoid triggering the
 * "corrupted mapping in tail page" message, we need to reset at least 3
 * struct page structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE		3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

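/*
 * Remap callback used when restoring vmemmap: take a fresh page from
 * @walk->vmemmap_pages, fill it with a copy of the reuse page, reset the
 * struct pages that must not carry copied metadata, and point @pte at the
 * new page with a writable mapping.
 */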
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                       range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}


/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *			caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more hugepage allocations. Fall back to the currently mapped
	 * head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
	}

	/*
	 * In order to make the remapping routine most efficient for the huge
	 * pages, the vmemmap page table walking routine obeys the following
	 * rules (see more details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking, which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

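/*
 * Allocate one page for every base page in [@start, @end) on the node of
 * @start and add them to @list. On failure, free whatever was already
 * allocated and return -ENOMEM.
 */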
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap each page of the vmemmap virtual address range
 *			 [@start, @end) to a newly allocated vmemmap page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

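/*
 * Reallocate and remap the vmemmap pages of @folio that were previously freed
 * by HVO; @flags is passed through to the vmemmap_remap_walk. Clears the
 * vmemmap-optimized flag on success.
 */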
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true if and only if @folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

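/*
 * Remap the vmemmap pages of @folio beyond the reserved portion to the single
 * reuse page and collect the now-unused pages on @vmemmap_pages; the caller is
 * responsible for freeing them. @flags is passed through to the
 * vmemmap_remap_walk.
 */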
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

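/*
 * Split the vmemmap PMDs covering @folio ahead of optimization so that the
 * later remap pass only has to modify PTEs.
 */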
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

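/**
 * hugetlb_vmemmap_optimize_folios - optimize the vmemmap of every folio on the list.
 * @h:			struct hstate.
 * @folio_list:		list of folios whose vmemmap pages will be optimized.
 *
 * PMDs backing the folios' vmemmap are split first, then the PTE remapping is
 * done with TLB flushes deferred and batched into flush_tlb_all() calls.
 */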
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so fail early
		 * once we encounter the first OOM. There is no point in
		 * retrying, as it can be dynamically done on remap with the
		 * memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur when both splitting fails halfway and the
		 * head page allocation also fails. In this case
		 * __hugetlb_vmemmap_optimize_folio() would free memory,
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);