// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
 * virt_to_page, page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
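/*
 * Illustrative sketch only (the real definitions live in
 * include/asm-generic/memory_model.h and per-arch headers): with the
 * memmap virtually mapped at a fixed base, the conversions reduce to
 * pointer arithmetic, roughly:
 *
 *	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
 *	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 */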
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */

static void * __ref __earlyonly_bootmem_alloc(int node,
				unsigned long size,
				unsigned long align,
				unsigned long goal)
{
	return memblock_alloc_try_nid_raw(size, align, goal,
					  MEMBLOCK_ALLOC_ACCESSIBLE, node);
}

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
	/* If the main allocator is up use that, fallback to bootmem. */
	if (slab_is_available()) {
		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
		int order = get_order(size);
		static bool warned;
		struct page *page;

		page = alloc_pages_node(node, gfp_mask, order);
		if (page)
			return page_address(page);

		if (!warned) {
			warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
				   "vmemmap alloc failure: order:%u", order);
			warned = true;
		}
		return NULL;
	} else
		return __earlyonly_bootmem_alloc(node, size, size,
				__pa(MAX_DMA_ADDRESS));
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap);

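/*
 * Allocate a block to back part of the vmemmap: prefer the device-provided
 * altmap when one is given, then the early sparse buffer, and finally fall
 * back to vmemmap_alloc_block().
 */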
/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
					 struct vmem_altmap *altmap)
{
	void *ptr;

	if (altmap)
		return altmap_alloc_block_buf(size, altmap);

	ptr = sparse_buffer_alloc(size);
	if (!ptr)
		ptr = vmemmap_alloc_block(size, node);
	return ptr;
}

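/*
 * Accounting helpers for the altmap: the next free pfn follows the
 * reserved area plus everything already handed out (including padding
 * consumed for alignment), and the number of free pfns is whatever is
 * left of the advertised free space after those allocations.
 */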
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
	unsigned long allocated = altmap->alloc + altmap->align;

	if (altmap->free > allocated)
		return altmap->free - allocated;
	return 0;
}

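/*
 * Carve a page-aligned block out of the altmap. The start pfn is rounded
 * up to the natural alignment of the request (its lowest set bit in pages),
 * and both the allocation and the alignment padding are charged against
 * the altmap's free space.
 */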
static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap)
{
	unsigned long pfn, nr_pfns, nr_align;

	if (size & ~PAGE_MASK) {
		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
			     __func__, size);
		return NULL;
	}

	pfn = vmem_altmap_next_pfn(altmap);
	nr_pfns = size >> PAGE_SHIFT;
	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
	nr_align = ALIGN(pfn, nr_align) - pfn;
	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
		return NULL;

	altmap->alloc += nr_pfns;
	altmap->align += nr_align;
	pfn += nr_align;

	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
		 __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
	return __va(__pfn_to_phys(pfn));
}

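/*
 * Warn (once) if the page backing this range of struct pages was not
 * allocated on (or near) the node the range describes, i.e. the page
 * structs may end up off-node.
 */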
void __meminit vmemmap_verify(pte_t *pte, int node,
				unsigned long start, unsigned long end)
{
	unsigned long pfn = pte_pfn(ptep_get(pte));
	int actual_node = early_pfn_to_nid(pfn);

	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
		pr_warn_once("[%lx-%lx] potential offnode page_structs\n",
			start, end - 1);
}

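/*
 * Populate the PTE for one page of the vmemmap. When @reuse is given, no
 * new page is allocated; the existing page is mapped again and its
 * refcount raised, so every mapping holds a reference that is dropped
 * when the PTE is torn down.
 */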
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
				       struct vmem_altmap *altmap,
				       struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(ptep_get(pte))) {
		pte_t entry;
		void *p;

		if (!reuse) {
			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p)
				return NULL;
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called via certain ZONE_DEVICE
			 * paths, and through vmemmap_populate_compound_pages()
			 * when slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
		}
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
}

static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
	void *p = vmemmap_alloc_block(size, node);

	if (!p)
		return NULL;
	memset(p, 0, size);

	return p;
}

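/*
 * The helpers below populate one page-table level each for a vmemmap
 * address, allocating a zeroed page for the next-lower table when the
 * entry is empty. The pmd_init()/pud_init() hooks are weak no-ops that
 * architectures can override to initialize freshly allocated tables.
 */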
pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pmd_populate_kernel(&init_mm, pmd, p);
	}
	return pmd;
}

void __weak __meminit pmd_init(void *addr)
{
}

pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
	pud_t *pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pmd_init(p);
		pud_populate(&init_mm, pud, p);
	}
	return pud;
}

void __weak __meminit pud_init(void *addr)
{
}

p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
	p4d_t *p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pud_init(p);
		p4d_populate(&init_mm, p4d, p);
	}
	return p4d;
}

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
	pgd_t *pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pgd_populate(&init_mm, pgd, p);
	}
	return pgd;
}

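/*
 * Walk (and populate as needed) every page-table level down to the PTE
 * that maps the vmemmap page at @addr, then verify the backing page is
 * node-local.
 */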
static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
						  struct vmem_altmap *altmap,
						  struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = vmemmap_pgd_populate(addr, node);
	if (!pgd)
		return NULL;
	p4d = vmemmap_p4d_populate(pgd, addr, node);
	if (!p4d)
		return NULL;
	pud = vmemmap_pud_populate(p4d, addr, node);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_populate(pud, addr, node);
	if (!pmd)
		return NULL;
	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
	if (!pte)
		return NULL;
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

static int __meminit vmemmap_populate_range(unsigned long start,
					    unsigned long end, int node,
					    struct vmem_altmap *altmap,
					    struct page *reuse)
{
	unsigned long addr = start;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pte = vmemmap_populate_address(addr, node, altmap, reuse);
		if (!pte)
			return -ENOMEM;
	}

	return 0;
}

int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
					 int node, struct vmem_altmap *altmap)
{
	return vmemmap_populate_range(start, end, node, altmap, NULL);
}

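/*
 * Weak hooks for PMD-level (huge) vmemmap mappings: architectures that
 * support them override vmemmap_set_pmd() to install the mapping and
 * vmemmap_check_pmd() to recognize an already-populated huge entry.
 */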
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
				      unsigned long addr, unsigned long next)
{
}

int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
				       unsigned long addr, unsigned long next)
{
	return 0;
}

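/*
 * Populate the vmemmap with PMD-sized blocks where possible, falling back
 * to base pages when a PMD-sized allocation cannot be satisfied (except
 * from an altmap, where such a failure is treated as an error).
 */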
int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
					 int node, struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;

		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(READ_ONCE(*pmd))) {
			void *p;

			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				vmemmap_set_pmd(pmd, p, node, addr, next);
				continue;
			} else if (altmap) {
				/*
				 * No fallback: In any case we care about, the
				 * altmap should be reasonably sized and aligned
				 * such that vmemmap_alloc_block_buf() will always
				 * succeed. For consistency with the PTE case,
				 * return an error here as failure could indicate
				 * a configuration issue with the size of the altmap.
				 */
				return -ENOMEM;
			}
		} else if (vmemmap_check_pmd(pmd, node, addr, next))
			continue;
		if (vmemmap_populate_basepages(addr, next, node, altmap))
			return -ENOMEM;
	}
	return 0;
}

#ifndef vmemmap_populate_compound_pages
/*
 * For compound pages bigger than section size (e.g. x86 1G compound
 * pages with 2M subsection size) fill the rest of sections as tail
 * pages.
 *
 * Note that memremap_pages() resets @nr_range and increments it after
 * each successful range onlining. Thus the value of @nr_range at section
 * memmap populate time corresponds to the in-progress range being
 * onlined here.
 */
static bool __meminit reuse_compound_section(unsigned long start_pfn,
					     struct dev_pagemap *pgmap)
{
	unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
	unsigned long offset = start_pfn -
		PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);

	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
}

static pte_t * __meminit compound_section_tail_page(unsigned long addr)
{
	pte_t *pte;

	addr -= PAGE_SIZE;

	/*
	 * Assuming sections are populated sequentially, the previous section's
	 * page data can be reused.
	 */
	pte = pte_offset_kernel(pmd_off_k(addr), addr);
	if (!pte)
		return NULL;

	return pte;
}

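/*
 * Populate the memmap for a section of compound ZONE_DEVICE pages. When
 * the section starts in the middle of a compound page (so it holds only
 * tail struct pages), simply remap the vmemmap page that backed the
 * previous section. Otherwise populate one head vmemmap page and one
 * unique tail vmemmap page per compound page, and map the remaining
 * vmemmap pages to that tail page (see
 * Documentation/mm/vmemmap_dedup.rst).
 */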
static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
						     unsigned long start,
						     unsigned long end, int node,
						     struct dev_pagemap *pgmap)
{
	unsigned long size, addr;
	pte_t *pte;
	int rc;

	if (reuse_compound_section(start_pfn, pgmap)) {
		pte = compound_section_tail_page(start);
		if (!pte)
			return -ENOMEM;

		/*
		 * Reuse the page that was populated in the prior iteration
		 * with just tail struct pages.
		 */
		return vmemmap_populate_range(start, end, node, NULL,
					      pte_page(ptep_get(pte)));
	}

	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
	for (addr = start; addr < end; addr += size) {
		unsigned long next, last = addr + size;

		/* Populate the head page vmemmap page */
		pte = vmemmap_populate_address(addr, node, NULL, NULL);
		if (!pte)
			return -ENOMEM;

		/* Populate the tail pages vmemmap page */
		next = addr + PAGE_SIZE;
		pte = vmemmap_populate_address(next, node, NULL, NULL);
		if (!pte)
			return -ENOMEM;

		/*
		 * Reuse the previous page for the rest of tail pages
		 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
		 */
		next += PAGE_SIZE;
		rc = vmemmap_populate_range(next, last, node, NULL,
					    pte_page(ptep_get(pte)));
		if (rc)
			return -ENOMEM;
	}

	return 0;
}

#endif

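/*
 * Populate the memmap for a subsection-aligned pfn range, using the
 * compound-page optimized path when the pagemap allows it, and the
 * architecture's vmemmap_populate() otherwise.
 */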
struct page * __meminit __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);
	int r;

	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
		return NULL;

	if (vmemmap_can_optimize(altmap, pgmap))
		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
	else
		r = vmemmap_populate(start, end, nid, altmap);

	if (r < 0)
		return NULL;

	return pfn_to_page(pfn);
}