1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Page table handling routines for radix page table. |
4 | * |
5 | * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. |
6 | */ |
7 | |
8 | #define pr_fmt(fmt) "radix-mmu: " fmt |
9 | |
10 | #include <linux/io.h> |
11 | #include <linux/kernel.h> |
12 | #include <linux/sched/mm.h> |
13 | #include <linux/memblock.h> |
14 | #include <linux/of.h> |
15 | #include <linux/of_fdt.h> |
16 | #include <linux/mm.h> |
17 | #include <linux/hugetlb.h> |
18 | #include <linux/string_helpers.h> |
19 | #include <linux/memory.h> |
20 | |
21 | #include <asm/pgalloc.h> |
22 | #include <asm/mmu_context.h> |
23 | #include <asm/dma.h> |
24 | #include <asm/machdep.h> |
25 | #include <asm/mmu.h> |
26 | #include <asm/firmware.h> |
27 | #include <asm/powernv.h> |
28 | #include <asm/sections.h> |
29 | #include <asm/smp.h> |
30 | #include <asm/trace.h> |
31 | #include <asm/uaccess.h> |
32 | #include <asm/ultravisor.h> |
33 | #include <asm/set_memory.h> |
34 | |
35 | #include <trace/events/thp.h> |
36 | |
37 | #include <mm/mmu_decl.h> |
38 | |
39 | unsigned int mmu_base_pid; |
40 | |
41 | static __ref void *early_alloc_pgtable(unsigned long size, int nid, |
42 | unsigned long region_start, unsigned long region_end) |
43 | { |
44 | phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; |
45 | phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; |
46 | void *ptr; |
47 | |
48 | if (region_start) |
49 | min_addr = region_start; |
50 | if (region_end) |
51 | max_addr = region_end; |
52 | |
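	/*
	 * Align the allocation to its own size so that multi-page tables
	 * (e.g. the process table allocated via this helper) come back
	 * naturally aligned.
	 */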
	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
54 | |
55 | if (!ptr) |
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);
58 | |
59 | return ptr; |
60 | } |
61 | |
62 | /* |
63 | * When allocating pud or pmd pointers, we allocate a complete page |
64 | * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This |
65 | * is to ensure that the page obtained from the memblock allocator |
66 | * can be completely used as page table page and can be freed |
67 | * correctly when the page table entries are removed. |
68 | */ |
69 | static int early_map_kernel_page(unsigned long ea, unsigned long pa, |
70 | pgprot_t flags, |
71 | unsigned int map_page_size, |
72 | int nid, |
73 | unsigned long region_start, unsigned long region_end) |
74 | { |
75 | unsigned long pfn = pa >> PAGE_SHIFT; |
76 | pgd_t *pgdp; |
77 | p4d_t *p4dp; |
78 | pud_t *pudp; |
79 | pmd_t *pmdp; |
80 | pte_t *ptep; |
81 | |
82 | pgdp = pgd_offset_k(ea); |
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
					   region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);
110 | |
111 | set_the_pte: |
112 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); |
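	/* Order the PTE update before any subsequent hardware table walk. */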
	asm volatile("ptesync": : :"memory");
114 | return 0; |
115 | } |
116 | |
117 | /* |
118 | * nid, region_start, and region_end are hints to try to place the page |
119 | * table memory in the same node or region. |
120 | */ |
121 | static int __map_kernel_page(unsigned long ea, unsigned long pa, |
122 | pgprot_t flags, |
123 | unsigned int map_page_size, |
124 | int nid, |
125 | unsigned long region_start, unsigned long region_end) |
126 | { |
127 | unsigned long pfn = pa >> PAGE_SHIFT; |
128 | pgd_t *pgdp; |
129 | p4d_t *p4dp; |
130 | pud_t *pudp; |
131 | pmd_t *pmdp; |
132 | pte_t *ptep; |
133 | /* |
	 * Make sure task size is correct as per the max address
135 | */ |
136 | BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); |
137 | |
138 | #ifdef CONFIG_PPC_64K_PAGES |
139 | BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); |
140 | #endif |
141 | |
142 | if (unlikely(!slab_is_available())) |
143 | return early_map_kernel_page(ea, pa, flags, map_page_size, |
144 | nid, region_start, region_end); |
145 | |
146 | /* |
147 | * Should make page table allocation functions be able to take a |
148 | * node, so we can place kernel page tables on the right nodes after |
149 | * boot. |
150 | */ |
151 | pgdp = pgd_offset_k(ea); |
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
154 | if (!pudp) |
155 | return -ENOMEM; |
156 | if (map_page_size == PUD_SIZE) { |
157 | ptep = (pte_t *)pudp; |
158 | goto set_the_pte; |
159 | } |
	pmdp = pmd_alloc(&init_mm, pudp, ea);
161 | if (!pmdp) |
162 | return -ENOMEM; |
163 | if (map_page_size == PMD_SIZE) { |
164 | ptep = pmdp_ptep(pmdp); |
165 | goto set_the_pte; |
166 | } |
167 | ptep = pte_alloc_kernel(pmdp, ea); |
168 | if (!ptep) |
169 | return -ENOMEM; |
170 | |
171 | set_the_pte: |
172 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); |
	asm volatile("ptesync": : :"memory");
174 | return 0; |
175 | } |
176 | |
177 | int radix__map_kernel_page(unsigned long ea, unsigned long pa, |
178 | pgprot_t flags, |
179 | unsigned int map_page_size) |
180 | { |
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
182 | } |
183 | |
184 | #ifdef CONFIG_STRICT_KERNEL_RWX |
185 | static void radix__change_memory_range(unsigned long start, unsigned long end, |
186 | unsigned long clear) |
187 | { |
188 | unsigned long idx; |
189 | pgd_t *pgdp; |
190 | p4d_t *p4dp; |
191 | pud_t *pudp; |
192 | pmd_t *pmdp; |
193 | pte_t *ptep; |
194 | |
195 | start = ALIGN_DOWN(start, PAGE_SIZE); |
196 | end = PAGE_ALIGN(end); // aligns up |
197 | |
	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
199 | start, end, clear); |
200 | |
201 | for (idx = start; idx < end; idx += PAGE_SIZE) { |
202 | pgdp = pgd_offset_k(idx); |
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_leaf(*pmdp)) {
215 | ptep = pmdp_ptep(pmdp); |
216 | goto update_the_pte; |
217 | } |
218 | ptep = pte_alloc_kernel(pmdp, idx); |
219 | if (!ptep) |
220 | continue; |
221 | update_the_pte: |
222 | radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); |
223 | } |
224 | |
225 | radix__flush_tlb_kernel_range(start, end); |
226 | } |
227 | |
228 | void radix__mark_rodata_ro(void) |
229 | { |
230 | unsigned long start, end; |
231 | |
232 | start = (unsigned long)_stext; |
233 | end = (unsigned long)__end_rodata; |
234 | |
	radix__change_memory_range(start, end, _PAGE_WRITE);
236 | |
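	/*
	 * When the kernel is relocated, the copy of the interrupt vectors at
	 * the bottom of the linear map lies outside _stext..__end_rodata, so
	 * walk those pages and mark them read-only as well.
	 */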
237 | for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { |
238 | end = start + PAGE_SIZE; |
239 | if (overlaps_interrupt_vector_text(start, end)) |
			radix__change_memory_range(start, end, _PAGE_WRITE);
241 | else |
242 | break; |
243 | } |
244 | } |
245 | |
246 | void radix__mark_initmem_nx(void) |
247 | { |
248 | unsigned long start = (unsigned long)__init_begin; |
249 | unsigned long end = (unsigned long)__init_end; |
250 | |
	radix__change_memory_range(start, end, _PAGE_EXEC);
252 | } |
253 | #endif /* CONFIG_STRICT_KERNEL_RWX */ |
254 | |
255 | static inline void __meminit |
256 | print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) |
257 | { |
258 | char buf[10]; |
259 | |
260 | if (end <= start) |
261 | return; |
262 | |
	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
267 | } |
268 | |
269 | static unsigned long next_boundary(unsigned long addr, unsigned long end) |
270 | { |
271 | #ifdef CONFIG_STRICT_KERNEL_RWX |
272 | unsigned long stext_phys; |
273 | |
274 | stext_phys = __pa_symbol(_stext); |
275 | |
276 | // Relocatable kernel running at non-zero real address |
277 | if (stext_phys != 0) { |
278 | // The end of interrupts code at zero is a rodata boundary |
279 | unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; |
280 | if (addr < end_intr) |
281 | return end_intr; |
282 | |
283 | // Start of relocated kernel text is a rodata boundary |
284 | if (addr < stext_phys) |
285 | return stext_phys; |
286 | } |
287 | |
288 | if (addr < __pa_symbol(__srwx_boundary)) |
289 | return __pa_symbol(__srwx_boundary); |
290 | #endif |
291 | return end; |
292 | } |
293 | |
294 | static int __meminit create_physical_mapping(unsigned long start, |
295 | unsigned long end, |
296 | int nid, pgprot_t _prot) |
297 | { |
298 | unsigned long vaddr, addr, mapping_size = 0; |
299 | bool prev_exec, exec = false; |
300 | pgprot_t prot; |
301 | int psize; |
302 | unsigned long max_mapping_size = memory_block_size; |
303 | |
304 | if (debug_pagealloc_enabled_or_kfence()) |
305 | max_mapping_size = PAGE_SIZE; |
306 | |
307 | start = ALIGN(start, PAGE_SIZE); |
308 | end = ALIGN_DOWN(end, PAGE_SIZE); |
309 | for (addr = start; addr < end; addr += mapping_size) { |
310 | unsigned long gap, previous_size; |
311 | int rc; |
312 | |
313 | gap = next_boundary(addr, end) - addr; |
314 | if (gap > max_mapping_size) |
315 | gap = max_mapping_size; |
316 | previous_size = mapping_size; |
317 | prev_exec = exec; |
318 | |
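		/*
		 * Pick the largest page size that both the alignment of
		 * addr and the gap to the next boundary allow.
		 */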
319 | if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && |
320 | mmu_psize_defs[MMU_PAGE_1G].shift) { |
321 | mapping_size = PUD_SIZE; |
322 | psize = MMU_PAGE_1G; |
323 | } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && |
324 | mmu_psize_defs[MMU_PAGE_2M].shift) { |
325 | mapping_size = PMD_SIZE; |
326 | psize = MMU_PAGE_2M; |
327 | } else { |
328 | mapping_size = PAGE_SIZE; |
329 | psize = mmu_virtual_psize; |
330 | } |
331 | |
332 | vaddr = (unsigned long)__va(addr); |
333 | |
334 | if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || |
335 | overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { |
336 | prot = PAGE_KERNEL_X; |
337 | exec = true; |
338 | } else { |
339 | prot = _prot; |
340 | exec = false; |
341 | } |
342 | |
343 | if (mapping_size != previous_size || exec != prev_exec) { |
			print_mapping(start, addr, previous_size, prev_exec);
345 | start = addr; |
346 | } |
347 | |
		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
349 | if (rc) |
350 | return rc; |
351 | |
		update_page_count(psize, 1);
353 | } |
354 | |
	print_mapping(start, addr, mapping_size, exec);
356 | return 0; |
357 | } |
358 | |
359 | static void __init radix_init_pgtable(void) |
360 | { |
361 | unsigned long rts_field; |
362 | phys_addr_t start, end; |
363 | u64 i; |
364 | |
	/* We don't support SLB for radix */
366 | slb_set_size(0); |
367 | |
368 | /* |
369 | * Create the linear mapping |
370 | */ |
371 | for_each_mem_range(i, &start, &end) { |
372 | /* |
373 | * The memblock allocator is up at this point, so the |
374 | * page tables will be allocated within the range. No |
		 * need for a node (which we don't have yet).
376 | */ |
377 | |
378 | if (end >= RADIX_VMALLOC_START) { |
			pr_warn("Outside the supported range\n");
380 | continue; |
381 | } |
382 | |
383 | WARN_ON(create_physical_mapping(start, end, |
384 | -1, PAGE_KERNEL)); |
385 | } |
386 | |
387 | if (!cpu_has_feature(CPU_FTR_HVMODE) && |
388 | cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { |
389 | /* |
390 | * Older versions of KVM on these machines prefer if the |
391 | * guest only uses the low 19 PID bits. |
392 | */ |
393 | mmu_pid_bits = 19; |
394 | } |
395 | mmu_base_pid = 1; |
396 | |
397 | /* |
398 | * Allocate Partition table and process table for the |
399 | * host. |
400 | */ |
401 | BUG_ON(PRTB_SIZE_SHIFT > 36); |
402 | process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); |
403 | /* |
404 | * Fill in the process table. |
405 | */ |
406 | rts_field = radix__get_tree_size(); |
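	/*
	 * Dword 0 of the entry encodes the radix tree size, the physical
	 * address of the kernel's root page directory, and the size of
	 * its root level.
	 */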
407 | process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); |
408 | |
409 | /* |
410 | * The init_mm context is given the first available (non-zero) PID, |
411 | * which is the "guard PID" and contains no page table. PIDR should |
412 | * never be set to zero because that duplicates the kernel address |
413 | * space at the 0x0... offset (quadrant 0)! |
414 | * |
415 | * An arbitrary PID that may later be allocated by the PID allocator |
416 | * for userspace processes must not be used either, because that |
417 | * would cause stale user mappings for that PID on CPUs outside of |
418 | * the TLB invalidation scheme (because it won't be in mm_cpumask). |
419 | * |
420 | * So permanently carve out one PID for the purpose of a guard PID. |
421 | */ |
422 | init_mm.context.id = mmu_base_pid; |
423 | mmu_base_pid++; |
424 | } |
425 | |
426 | static void __init radix_init_partition_table(void) |
427 | { |
428 | unsigned long rts_field, dw0, dw1; |
429 | |
430 | mmu_partition_table_init(); |
431 | rts_field = radix__get_tree_size(); |
432 | dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; |
433 | dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; |
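	/*
	 * PATB_HR and PATB_GR flag the partition-table and process-table
	 * entries, respectively, as using radix translation.
	 */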
434 | mmu_partition_table_set_entry(0, dw0, dw1, false); |
435 | |
	pr_info("Initializing Radix MMU\n");
437 | } |
438 | |
439 | static int __init get_idx_from_shift(unsigned int shift) |
440 | { |
441 | int idx = -1; |
442 | |
443 | switch (shift) { |
444 | case 0xc: |
445 | idx = MMU_PAGE_4K; |
446 | break; |
447 | case 0x10: |
448 | idx = MMU_PAGE_64K; |
449 | break; |
450 | case 0x15: |
451 | idx = MMU_PAGE_2M; |
452 | break; |
453 | case 0x1e: |
454 | idx = MMU_PAGE_1G; |
455 | break; |
456 | } |
457 | return idx; |
458 | } |
459 | |
460 | static int __init radix_dt_scan_page_sizes(unsigned long node, |
461 | const char *uname, int depth, |
462 | void *data) |
463 | { |
464 | int size = 0; |
465 | int shift, idx; |
466 | unsigned int ap; |
467 | const __be32 *prop; |
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
469 | |
470 | /* We are scanning "cpu" nodes only */ |
	if (type == NULL || strcmp(type, "cpu") != 0)
472 | return 0; |
473 | |
474 | /* Grab page size encodings */ |
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
476 | if (!prop) |
477 | return 0; |
478 | |
	pr_info("Page sizes from device-tree:\n");
480 | for (; size >= 4; size -= 4, ++prop) { |
481 | |
482 | struct mmu_psize_def *def; |
483 | |
		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
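		/*
		 * For example, a cell of 0xa0000010 decodes to AP = 0x5 and
		 * shift = 0x10, i.e. 64K pages, matching the fallback values
		 * used in radix__early_init_devtree below.
		 */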
488 | |
489 | idx = get_idx_from_shift(shift); |
490 | if (idx < 0) |
491 | continue; |
492 | |
493 | def = &mmu_psize_defs[idx]; |
494 | def->shift = shift; |
495 | def->ap = ap; |
496 | def->h_rpt_pgsize = psize_to_rpti_pgsize(idx); |
497 | } |
498 | |
499 | /* needed ? */ |
500 | cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; |
501 | return 1; |
502 | } |
503 | |
504 | void __init radix__early_init_devtree(void) |
505 | { |
506 | int rc; |
507 | |
508 | /* |
509 | * Try to find the available page sizes in the device-tree |
510 | */ |
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
512 | if (!rc) { |
513 | /* |
514 | * No page size details found in device tree. |
515 | * Let's assume we have page 4k and 64k support |
516 | */ |
517 | mmu_psize_defs[MMU_PAGE_4K].shift = 12; |
518 | mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; |
519 | mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize = |
520 | psize_to_rpti_pgsize(MMU_PAGE_4K); |
521 | |
522 | mmu_psize_defs[MMU_PAGE_64K].shift = 16; |
523 | mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; |
524 | mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize = |
525 | psize_to_rpti_pgsize(MMU_PAGE_64K); |
526 | } |
528 | } |
529 | |
530 | void __init radix__early_init_mmu(void) |
531 | { |
532 | unsigned long lpcr; |
533 | |
534 | #ifdef CONFIG_PPC_64S_HASH_MMU |
535 | #ifdef CONFIG_PPC_64K_PAGES |
536 | /* PAGE_SIZE mappings */ |
537 | mmu_virtual_psize = MMU_PAGE_64K; |
538 | #else |
539 | mmu_virtual_psize = MMU_PAGE_4K; |
540 | #endif |
541 | #endif |
542 | /* |
543 | * initialize page table size |
544 | */ |
545 | __pte_index_size = RADIX_PTE_INDEX_SIZE; |
546 | __pmd_index_size = RADIX_PMD_INDEX_SIZE; |
547 | __pud_index_size = RADIX_PUD_INDEX_SIZE; |
548 | __pgd_index_size = RADIX_PGD_INDEX_SIZE; |
549 | __pud_cache_index = RADIX_PUD_INDEX_SIZE; |
550 | __pte_table_size = RADIX_PTE_TABLE_SIZE; |
551 | __pmd_table_size = RADIX_PMD_TABLE_SIZE; |
552 | __pud_table_size = RADIX_PUD_TABLE_SIZE; |
553 | __pgd_table_size = RADIX_PGD_TABLE_SIZE; |
554 | |
555 | __pmd_val_bits = RADIX_PMD_VAL_BITS; |
556 | __pud_val_bits = RADIX_PUD_VAL_BITS; |
557 | __pgd_val_bits = RADIX_PGD_VAL_BITS; |
558 | |
559 | __kernel_virt_start = RADIX_KERN_VIRT_START; |
560 | __vmalloc_start = RADIX_VMALLOC_START; |
561 | __vmalloc_end = RADIX_VMALLOC_END; |
562 | __kernel_io_start = RADIX_KERN_IO_START; |
563 | __kernel_io_end = RADIX_KERN_IO_END; |
564 | vmemmap = (struct page *)RADIX_VMEMMAP_START; |
565 | ioremap_bot = IOREMAP_BASE; |
566 | |
567 | #ifdef CONFIG_PCI |
568 | pci_io_base = ISA_IO_BASE; |
569 | #endif |
570 | __pte_frag_nr = RADIX_PTE_FRAG_NR; |
571 | __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; |
572 | __pmd_frag_nr = RADIX_PMD_FRAG_NR; |
573 | __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; |
574 | |
575 | radix_init_pgtable(); |
576 | |
577 | if (!firmware_has_feature(FW_FEATURE_LPAR)) { |
578 | lpcr = mfspr(SPRN_LPCR); |
579 | mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); |
580 | radix_init_partition_table(); |
581 | } else { |
582 | radix_init_pseries(); |
583 | } |
584 | |
585 | memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); |
586 | |
587 | /* Switch to the guard PID before turning on MMU */ |
588 | radix__switch_mmu_context(NULL, &init_mm); |
589 | tlbiel_all(); |
590 | } |
591 | |
592 | void radix__early_init_mmu_secondary(void) |
593 | { |
594 | unsigned long lpcr; |
595 | /* |
596 | * update partition table control register and UPRT |
597 | */ |
598 | if (!firmware_has_feature(FW_FEATURE_LPAR)) { |
599 | lpcr = mfspr(SPRN_LPCR); |
600 | mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); |
601 | |
602 | set_ptcr_when_no_uv(__pa(partition_tb) | |
603 | (PATB_SIZE_SHIFT - 12)); |
604 | } |
605 | |
606 | radix__switch_mmu_context(NULL, &init_mm); |
607 | tlbiel_all(); |
608 | |
609 | /* Make sure userspace can't change the AMR */ |
610 | mtspr(SPRN_UAMOR, 0); |
611 | } |
612 | |
613 | /* Called during kexec sequence with MMU off */ |
614 | notrace void radix__mmu_cleanup_all(void) |
615 | { |
616 | unsigned long lpcr; |
617 | |
618 | if (!firmware_has_feature(FW_FEATURE_LPAR)) { |
619 | lpcr = mfspr(SPRN_LPCR); |
620 | mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); |
621 | set_ptcr_when_no_uv(0); |
622 | powernv_set_nmmu_ptcr(0); |
623 | radix__flush_tlb_all(); |
624 | } |
625 | } |
626 | |
627 | #ifdef CONFIG_MEMORY_HOTPLUG |
628 | static void free_pte_table(pte_t *pte_start, pmd_t *pmd) |
629 | { |
630 | pte_t *pte; |
631 | int i; |
632 | |
633 | for (i = 0; i < PTRS_PER_PTE; i++) { |
634 | pte = pte_start + i; |
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
641 | } |
642 | |
643 | static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) |
644 | { |
645 | pmd_t *pmd; |
646 | int i; |
647 | |
648 | for (i = 0; i < PTRS_PER_PMD; i++) { |
649 | pmd = pmd_start + i; |
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
656 | } |
657 | |
658 | static void free_pud_table(pud_t *pud_start, p4d_t *p4d) |
659 | { |
660 | pud_t *pud; |
661 | int i; |
662 | |
663 | for (i = 0; i < PTRS_PER_PUD; i++) { |
664 | pud = pud_start + i; |
		if (!pud_none(*pud))
			return;
	}

	pud_free(&init_mm, pud_start);
	p4d_clear(p4d);
671 | } |
672 | |
673 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
674 | static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) |
675 | { |
676 | unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); |
677 | |
678 | return !vmemmap_populated(start, PMD_SIZE); |
679 | } |
680 | |
681 | static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) |
682 | { |
683 | unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); |
684 | |
685 | return !vmemmap_populated(start, PAGE_SIZE); |
687 | } |
688 | #endif |
689 | |
690 | static void __meminit free_vmemmap_pages(struct page *page, |
691 | struct vmem_altmap *altmap, |
692 | int order) |
693 | { |
694 | unsigned int nr_pages = 1 << order; |
695 | |
696 | if (altmap) { |
697 | unsigned long alt_start, alt_end; |
698 | unsigned long base_pfn = page_to_pfn(page); |
699 | |
700 | /* |
701 | * with 2M vmemmap mmaping we can have things setup |
702 | * such that even though atlmap is specified we never |
703 | * used altmap. |
704 | */ |
705 | alt_start = altmap->base_pfn; |
706 | alt_end = altmap->base_pfn + altmap->reserve + altmap->free; |
707 | |
708 | if (base_pfn >= alt_start && base_pfn < alt_end) { |
			vmem_altmap_free(altmap, nr_pages);
710 | return; |
711 | } |
712 | } |
713 | |
714 | if (PageReserved(page)) { |
715 | /* allocated from memblock */ |
		while (nr_pages--)
			free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
720 | } |
721 | |
722 | static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, |
723 | unsigned long end, bool direct, |
724 | struct vmem_altmap *altmap) |
725 | { |
726 | unsigned long next, pages = 0; |
727 | pte_t *pte; |
728 | |
	pte = pte_start + pte_index(addr);
730 | for (; addr < end; addr = next, pte++) { |
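		/* Advance to the next page boundary; addr may start unaligned. */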
731 | next = (addr + PAGE_SIZE) & PAGE_MASK; |
732 | if (next > end) |
733 | next = end; |
734 | |
		if (!pte_present(*pte))
736 | continue; |
737 | |
738 | if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { |
739 | if (!direct) |
				free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
			pages++;
		}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
		else if (!direct && vmemmap_page_is_unused(addr, next)) {
			free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
748 | } |
749 | #endif |
750 | } |
751 | if (direct) |
752 | update_page_count(mmu_virtual_psize, -pages); |
753 | } |
754 | |
755 | static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, |
756 | unsigned long end, bool direct, |
757 | struct vmem_altmap *altmap) |
758 | { |
759 | unsigned long next, pages = 0; |
760 | pte_t *pte_base; |
761 | pmd_t *pmd; |
762 | |
	pmd = pmd_start + pmd_index(addr);
764 | for (; addr < end; addr = next, pmd++) { |
765 | next = pmd_addr_end(addr, end); |
766 | |
		if (!pmd_present(*pmd))
768 | continue; |
769 | |
		if (pmd_leaf(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
782 | } |
783 | #endif |
784 | continue; |
785 | } |
786 | |
		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct, altmap);
		free_pte_table(pte_base, pmd);
790 | } |
791 | if (direct) |
792 | update_page_count(MMU_PAGE_2M, -pages); |
793 | } |
794 | |
795 | static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, |
796 | unsigned long end, bool direct, |
797 | struct vmem_altmap *altmap) |
798 | { |
799 | unsigned long next, pages = 0; |
800 | pmd_t *pmd_base; |
801 | pud_t *pud; |
802 | |
	pud = pud_start + pud_index(addr);
804 | for (; addr < end; addr = next, pud++) { |
805 | next = pud_addr_end(addr, end); |
806 | |
		if (!pud_present(*pud))
808 | continue; |
809 | |
		if (pud_leaf(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pud);
817 | pages++; |
818 | continue; |
819 | } |
820 | |
		pmd_base = pud_pgtable(*pud);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
824 | } |
825 | if (direct) |
826 | update_page_count(MMU_PAGE_1G, -pages); |
827 | } |
828 | |
829 | static void __meminit |
830 | remove_pagetable(unsigned long start, unsigned long end, bool direct, |
831 | struct vmem_altmap *altmap) |
832 | { |
833 | unsigned long addr, next; |
834 | pud_t *pud_base; |
835 | pgd_t *pgd; |
836 | p4d_t *p4d; |
837 | |
	spin_lock(&init_mm.page_table_lock);
839 | |
840 | for (addr = start; addr < end; addr = next) { |
841 | next = pgd_addr_end(addr, end); |
842 | |
843 | pgd = pgd_offset_k(addr); |
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
846 | continue; |
847 | |
		if (p4d_leaf(*p4d)) {
			if (!IS_ALIGNED(addr, P4D_SIZE) ||
			    !IS_ALIGNED(next, P4D_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pgd);
856 | continue; |
857 | } |
858 | |
		pud_base = p4d_pgtable(*p4d);
		remove_pud_table(pud_base, addr, next, direct, altmap);
		free_pud_table(pud_base, p4d);
862 | } |
863 | |
	spin_unlock(&init_mm.page_table_lock);
865 | radix__flush_tlb_kernel_range(start, end); |
866 | } |
867 | |
868 | int __meminit radix__create_section_mapping(unsigned long start, |
869 | unsigned long end, int nid, |
870 | pgprot_t prot) |
871 | { |
872 | if (end >= RADIX_VMALLOC_START) { |
		pr_warn("Outside the supported range\n");
874 | return -1; |
875 | } |
876 | |
	return create_physical_mapping(__pa(start), __pa(end),
				       nid, prot);
879 | } |
880 | |
881 | int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) |
882 | { |
	remove_pagetable(start, end, true, NULL);
884 | return 0; |
885 | } |
886 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
887 | |
888 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
889 | static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, |
890 | pgprot_t flags, unsigned int map_page_size, |
891 | int nid) |
892 | { |
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
894 | } |
895 | |
896 | int __meminit radix__vmemmap_create_mapping(unsigned long start, |
897 | unsigned long page_size, |
898 | unsigned long phys) |
899 | { |
900 | /* Create a PTE encoding */ |
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
902 | int ret; |
903 | |
904 | if ((start + page_size) >= RADIX_VMEMMAP_END) { |
		pr_warn("Outside the supported range\n");
906 | return -1; |
907 | } |
908 | |
	ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
910 | BUG_ON(ret); |
911 | |
912 | return 0; |
913 | } |
914 | |
915 | |
916 | bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) |
917 | { |
918 | if (radix_enabled()) |
919 | return __vmemmap_can_optimize(altmap, pgmap); |
920 | |
921 | return false; |
922 | } |
923 | |
924 | int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, |
925 | unsigned long addr, unsigned long next) |
926 | { |
	int large = pmd_leaf(*pmdp);
928 | |
929 | if (large) |
930 | vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); |
931 | |
932 | return large; |
933 | } |
934 | |
935 | void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, |
936 | unsigned long addr, unsigned long next) |
937 | { |
938 | pte_t entry; |
939 | pte_t *ptep = pmdp_ptep(pmdp); |
940 | |
941 | VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); |
942 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
943 | set_pte_at(&init_mm, addr, ptep, entry); |
	asm volatile("ptesync": : :"memory");
945 | |
946 | vmemmap_verify(ptep, node, addr, next); |
947 | } |
948 | |
949 | static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, |
950 | int node, |
951 | struct vmem_altmap *altmap, |
952 | struct page *reuse) |
953 | { |
	pte_t *pte = pte_offset_kernel(pmdp, addr);

	if (pte_none(*pte)) {
957 | pte_t entry; |
958 | void *p; |
959 | |
960 | if (!reuse) { |
961 | /* |
962 | * make sure we don't create altmap mappings |
963 | * covering things outside the device. |
964 | */ |
965 | if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) |
966 | altmap = NULL; |
967 | |
968 | p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); |
969 | if (!p && altmap) |
970 | p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); |
971 | if (!p) |
972 | return NULL; |
			pr_debug("PAGE_SIZE vmemmap mapping\n");
974 | } else { |
975 | /* |
976 | * When a PTE/PMD entry is freed from the init_mm |
977 | * there's a free_pages() call to this page allocated |
978 | * above. Thus this get_page() is paired with the |
979 | * put_page_testzero() on the freeing path. |
			 * This can only be called by certain ZONE_DEVICE paths,
981 | * and through vmemmap_populate_compound_pages() when |
982 | * slab is available. |
983 | */ |
			get_page(reuse);
			p = page_to_virt(reuse);
			pr_debug("Tail page reuse vmemmap mapping\n");
987 | } |
988 | |
989 | VM_BUG_ON(!PAGE_ALIGNED(addr)); |
990 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
991 | set_pte_at(&init_mm, addr, pte, entry); |
		asm volatile("ptesync": : :"memory");
993 | } |
994 | return pte; |
995 | } |
996 | |
997 | static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, |
998 | unsigned long address) |
999 | { |
1000 | pud_t *pud; |
1001 | |
	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(p4d_none(*p4dp))) {
		if (unlikely(!slab_is_available())) {
			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			p4d_populate(&init_mm, p4dp, pud);
			/* go to the pud_offset */
		} else
			return pud_alloc(&init_mm, p4dp, address);
	}
	return pud_offset(p4dp, address);
1012 | } |
1013 | |
1014 | static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, |
1015 | unsigned long address) |
1016 | { |
1017 | pmd_t *pmd; |
1018 | |
	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(pud_none(*pudp))) {
		if (unlikely(!slab_is_available())) {
			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pud_populate(&init_mm, pudp, pmd);
		} else
			return pmd_alloc(&init_mm, pudp, address);
	}
	return pmd_offset(pudp, address);
1028 | } |
1029 | |
1030 | static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, |
1031 | unsigned long address) |
1032 | { |
1033 | pte_t *pte; |
1034 | |
	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(pmd_none(*pmdp))) {
		if (unlikely(!slab_is_available())) {
			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pmd_populate(&init_mm, pmdp, pte);
		} else
			return pte_alloc_kernel(pmdp, address);
	}
	return pte_offset_kernel(pmdp, address);
1044 | } |
1045 | |
1046 | |
1047 | |
1048 | int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, |
1049 | struct vmem_altmap *altmap) |
1050 | { |
1051 | unsigned long addr; |
1052 | unsigned long next; |
1053 | pgd_t *pgd; |
1054 | p4d_t *p4d; |
1055 | pud_t *pud; |
1056 | pmd_t *pmd; |
1057 | pte_t *pte; |
1058 | |
1059 | for (addr = start; addr < end; addr = next) { |
1060 | next = pmd_addr_end(addr, end); |
1061 | |
1062 | pgd = pgd_offset_k(addr); |
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
1068 | if (!pmd) |
1069 | return -ENOMEM; |
1070 | |
1071 | if (pmd_none(READ_ONCE(*pmd))) { |
1072 | void *p; |
1073 | |
1074 | /* |
1075 | * keep it simple by checking addr PMD_SIZE alignment |
1076 | * and verifying the device boundary condition. |
1077 | * For us to use a pmd mapping, both addr and pfn should |
1078 | * be aligned. We skip if addr is not aligned and for |
1079 | * pfn we hope we have extra area in the altmap that |
1080 | * can help to find an aligned block. This can result |
1081 | * in altmap block allocation failures, in which case |
1082 | * we fallback to RAM for vmemmap allocation. |
1083 | */ |
1084 | if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || |
1085 | altmap_cross_boundary(altmap, addr, PMD_SIZE))) { |
1086 | /* |
1087 | * make sure we don't create altmap mappings |
1088 | * covering things outside the device. |
1089 | */ |
1090 | goto base_mapping; |
1091 | } |
1092 | |
1093 | p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); |
1094 | if (p) { |
				vmemmap_set_pmd(pmd, p, node, addr, next);
				pr_debug("PMD_SIZE vmemmap mapping\n");
1097 | continue; |
1098 | } else if (altmap) { |
1099 | /* |
1100 | * A vmemmap block allocation can fail due to |
1101 | * alignment requirements and we trying to align |
1102 | * things aggressively there by running out of |
1103 | * space. Try base mapping on failure. |
1104 | */ |
1105 | goto base_mapping; |
1106 | } |
		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1108 | /* |
			 * If a huge mapping exists due to an early call to
			 * vmemmap_populate, let's try to use it.
1111 | */ |
1112 | continue; |
1113 | } |
1114 | base_mapping: |
1115 | /* |
1116 | * Not able allocate higher order memory to back memmap |
1117 | * or we found a pointer to pte page. Allocate base page |
1118 | * size vmemmap |
1119 | */ |
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;

		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1125 | if (!pte) |
1126 | return -ENOMEM; |
1127 | |
1128 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); |
1129 | next = addr + PAGE_SIZE; |
1130 | } |
1131 | return 0; |
1132 | } |
1133 | |
1134 | static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, |
1135 | struct vmem_altmap *altmap, |
1136 | struct page *reuse) |
1137 | { |
1138 | pgd_t *pgd; |
1139 | p4d_t *p4d; |
1140 | pud_t *pud; |
1141 | pmd_t *pmd; |
1142 | pte_t *pte; |
1143 | |
1144 | pgd = pgd_offset_k(addr); |
	p4d = p4d_offset(pgd, addr);
	pud = vmemmap_pud_alloc(p4d, node, addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The address is covered by a huge mapping from a nearby
		 * request. Force our mapping to page size without
		 * deduplication.
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, addr);
	if (!pte)
		return NULL;
	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1162 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); |
1163 | |
1164 | return pte; |
1165 | } |
1166 | |
1167 | static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, |
1168 | unsigned long pfn_offset, int node) |
1169 | { |
1170 | pgd_t *pgd; |
1171 | p4d_t *p4d; |
1172 | pud_t *pud; |
1173 | pmd_t *pmd; |
1174 | pte_t *pte; |
1175 | unsigned long map_addr; |
1176 | |
1177 | /* the second vmemmap page which we use for duplication */ |
1178 | map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; |
1179 | pgd = pgd_offset_k(map_addr); |
	p4d = p4d_offset(pgd, map_addr);
	pud = vmemmap_pud_alloc(p4d, node, map_addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby
		 * request. Force our mapping to page size without
		 * deduplication.
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, map_addr);
1194 | if (!pte) |
1195 | return NULL; |
1196 | /* |
1197 | * Check if there exist a mapping to the left |
1198 | */ |
1199 | if (pte_none(pte: *pte)) { |
1200 | /* |
1201 | * Populate the head page vmemmap page. |
1202 | * It can fall in different pmd, hence |
1203 | * vmemmap_populate_address() |
1204 | */ |
1205 | pte = radix__vmemmap_populate_address(addr: map_addr - PAGE_SIZE, node, NULL, NULL); |
1206 | if (!pte) |
1207 | return NULL; |
1208 | /* |
1209 | * Populate the tail pages vmemmap page |
1210 | */ |
1211 | pte = radix__vmemmap_pte_populate(pmdp: pmd, addr: map_addr, node, NULL, NULL); |
1212 | if (!pte) |
1213 | return NULL; |
1214 | vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); |
1215 | return pte; |
1216 | } |
1217 | return pte; |
1218 | } |
1219 | |
1220 | int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, |
1221 | unsigned long start, |
1222 | unsigned long end, int node, |
1223 | struct dev_pagemap *pgmap) |
1224 | { |
1225 | /* |
1226 | * we want to map things as base page size mapping so that |
1227 | * we can save space in vmemmap. We could have huge mapping |
1228 | * covering out both edges. |
1229 | */ |
1230 | unsigned long addr; |
1231 | unsigned long addr_pfn = start_pfn; |
1232 | unsigned long next; |
1233 | pgd_t *pgd; |
1234 | p4d_t *p4d; |
1235 | pud_t *pud; |
1236 | pmd_t *pmd; |
1237 | pte_t *pte; |
1238 | |
1239 | for (addr = start; addr < end; addr = next) { |
1240 | |
1241 | pgd = pgd_offset_k(addr); |
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
1247 | if (!pmd) |
1248 | return -ENOMEM; |
1249 | |
1250 | if (pmd_leaf(READ_ONCE(*pmd))) { |
1251 | /* existing huge mapping. Skip the range */ |
1252 | addr_pfn += (PMD_SIZE >> PAGE_SHIFT); |
1253 | next = pmd_addr_end(addr, end); |
1254 | continue; |
1255 | } |
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;
		if (!pte_none(*pte)) {
1260 | /* |
1261 | * This could be because we already have a compound |
1262 | * page whose VMEMMAP_RESERVE_NR pages were mapped and |
1263 | * this request fall in those pages. |
1264 | */ |
1265 | addr_pfn += 1; |
1266 | next = addr + PAGE_SIZE; |
1267 | continue; |
1268 | } else { |
1269 | unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); |
1270 | unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); |
1271 | pte_t *tail_page_pte; |
1272 | |
1273 | /* |
1274 | * if the address is aligned to huge page size it is the |
1275 | * head mapping. |
1276 | */ |
1277 | if (pfn_offset == 0) { |
1278 | /* Populate the head page vmemmap page */ |
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1280 | if (!pte) |
1281 | return -ENOMEM; |
1282 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); |
1283 | |
1284 | /* |
1285 | * Populate the tail pages vmemmap page |
1286 | * It can fall in different pmd, hence |
1287 | * vmemmap_populate_address() |
1288 | */ |
1289 | pte = radix__vmemmap_populate_address(addr: addr + PAGE_SIZE, node, NULL, NULL); |
1290 | if (!pte) |
1291 | return -ENOMEM; |
1292 | |
1293 | addr_pfn += 2; |
1294 | next = addr + 2 * PAGE_SIZE; |
1295 | continue; |
1296 | } |
1297 | /* |
1298 | * get the 2nd mapping details |
1299 | * Also create it if that doesn't exist |
1300 | */ |
1301 | tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); |
1302 | if (!tail_page_pte) { |
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1305 | if (!pte) |
1306 | return -ENOMEM; |
1307 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); |
1308 | |
1309 | addr_pfn += 1; |
1310 | next = addr + PAGE_SIZE; |
1311 | continue; |
1312 | } |
1313 | |
			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
1315 | if (!pte) |
1316 | return -ENOMEM; |
1317 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); |
1318 | |
1319 | addr_pfn += 1; |
1320 | next = addr + PAGE_SIZE; |
1321 | continue; |
1322 | } |
1323 | } |
1324 | return 0; |
1325 | } |
1326 | |
1327 | |
1328 | #ifdef CONFIG_MEMORY_HOTPLUG |
1329 | void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) |
1330 | { |
	remove_pagetable(start, start + page_size, true, NULL);
1332 | } |
1333 | |
1334 | void __ref radix__vmemmap_free(unsigned long start, unsigned long end, |
1335 | struct vmem_altmap *altmap) |
1336 | { |
	remove_pagetable(start, end, false, altmap);
1338 | } |
1339 | #endif |
1340 | #endif |
1341 | |
1342 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1343 | |
1344 | unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, |
1345 | pmd_t *pmdp, unsigned long clr, |
1346 | unsigned long set) |
1347 | { |
1348 | unsigned long old; |
1349 | |
1350 | #ifdef CONFIG_DEBUG_VM |
1351 | WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); |
1352 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
1353 | #endif |
1354 | |
1355 | old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); |
	trace_hugepage_update_pmd(addr, old, clr, set);
1357 | |
1358 | return old; |
1359 | } |
1360 | |
1361 | unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, |
1362 | pud_t *pudp, unsigned long clr, |
1363 | unsigned long set) |
1364 | { |
1365 | unsigned long old; |
1366 | |
1367 | #ifdef CONFIG_DEBUG_VM |
1368 | WARN_ON(!pud_devmap(*pudp)); |
1369 | assert_spin_locked(pud_lockptr(mm, pudp)); |
1370 | #endif |
1371 | |
1372 | old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); |
	trace_hugepage_update_pud(addr, old, clr, set);
1374 | |
1375 | return old; |
1376 | } |
1377 | |
1378 | pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, |
1379 | pmd_t *pmdp) |
1381 | { |
1382 | pmd_t pmd; |
1383 | |
1384 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
1385 | VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); |
1386 | VM_BUG_ON(pmd_devmap(*pmdp)); |
1387 | /* |
1388 | * khugepaged calls this for normal pmd |
1389 | */ |
1390 | pmd = *pmdp; |
1391 | pmd_clear(pmdp); |
1392 | |
1393 | radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); |
1394 | |
1395 | return pmd; |
1396 | } |
1397 | |
1398 | /* |
1399 | * For us pgtable_t is pte_t *. Inorder to save the deposisted |
1400 | * page table, we consider the allocated page table as a list |
1401 | * head. On withdraw we need to make sure we zero out the used |
1402 | * list_head memory area. |
1403 | */ |
1404 | void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
1405 | pgtable_t pgtable) |
1406 | { |
1407 | struct list_head *lh = (struct list_head *) pgtable; |
1408 | |
1409 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
1410 | |
1411 | /* FIFO */ |
1412 | if (!pmd_huge_pte(mm, pmdp)) |
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1416 | pmd_huge_pte(mm, pmdp) = pgtable; |
1417 | } |
1418 | |
1419 | pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) |
1420 | { |
1421 | pte_t *ptep; |
1422 | pgtable_t pgtable; |
1423 | struct list_head *lh; |
1424 | |
1425 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
1426 | |
1427 | /* FIFO */ |
1428 | pgtable = pmd_huge_pte(mm, pmdp); |
1429 | lh = (struct list_head *) pgtable; |
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
1435 | } |
1436 | ptep = (pte_t *) pgtable; |
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
1440 | return pgtable; |
1441 | } |
1442 | |
1443 | pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, |
1444 | unsigned long addr, pmd_t *pmdp) |
1445 | { |
1446 | pmd_t old_pmd; |
1447 | unsigned long old; |
1448 | |
	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
1451 | return old_pmd; |
1452 | } |
1453 | |
1454 | pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, |
1455 | unsigned long addr, pud_t *pudp) |
1456 | { |
1457 | pud_t old_pud; |
1458 | unsigned long old; |
1459 | |
	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
	old_pud = __pud(old);
1462 | return old_pud; |
1463 | } |
1464 | |
1465 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
1466 | |
1467 | void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, |
1468 | pte_t entry, unsigned long address, int psize) |
1469 | { |
1470 | struct mm_struct *mm = vma->vm_mm; |
1471 | unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY | |
1472 | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); |
1473 | |
	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1475 | /* |
1476 | * On POWER9, the NMMU is not able to relax PTE access permissions |
1477 | * for a translation with a TLB. The PTE must be invalidated, TLB |
1478 | * flushed before the new PTE is installed. |
1479 | * |
1480 | * This only needs to be done for radix, because hash translation does |
1481 | * flush when updating the linux pte (and we don't support NMMU |
1482 | * accelerators on HPT on POWER9 anyway XXX: do we?). |
1483 | * |
1484 | * POWER10 (and P9P) NMMU does behave as per ISA. |
1485 | */ |
1486 | if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) && |
1487 | atomic_read(&mm->context.copros) > 0) { |
1488 | unsigned long old_pte, new_pte; |
1489 | |
1490 | old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); |
1491 | new_pte = old_pte | set; |
1492 | radix__flush_tlb_page_psize(mm, address, psize); |
1493 | __radix_pte_update(ptep, _PAGE_INVALID, new_pte); |
1494 | } else { |
1495 | __radix_pte_update(ptep, 0, set); |
1496 | /* |
1497 | * Book3S does not require a TLB flush when relaxing access |
1498 | * restrictions when the address space (modulo the POWER9 nest |
1499 | * MMU issue above) because the MMU will reload the PTE after |
1500 | * taking an access fault, as defined by the architecture. See |
1501 | * "Setting a Reference or Change Bit or Upgrading Access |
1502 | * Authority (PTE Subject to Atomic Hardware Updates)" in |
1503 | * Power ISA Version 3.1B. |
1504 | */ |
1505 | } |
1506 | /* See ptesync comment in radix__set_pte_at */ |
1507 | } |
1508 | |
1509 | void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, |
1510 | unsigned long addr, pte_t *ptep, |
1511 | pte_t old_pte, pte_t pte) |
1512 | { |
1513 | struct mm_struct *mm = vma->vm_mm; |
1514 | |
1515 | /* |
1516 | * POWER9 NMMU must flush the TLB after clearing the PTE before |
1517 | * installing a PTE with more relaxed access permissions, see |
1518 | * radix__ptep_set_access_flags. |
1519 | */ |
1520 | if (!cpu_has_feature(CPU_FTR_ARCH_31) && |
1521 | is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && |
1522 | (atomic_read(&mm->context.copros) > 0)) |
1523 | radix__flush_tlb_page(vma, addr); |
1524 | |
1525 | set_pte_at(mm, addr, ptep, pte); |
1526 | } |
1527 | |
1528 | int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) |
1529 | { |
1530 | pte_t *ptep = (pte_t *)pud; |
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1532 | |
1533 | if (!radix_enabled()) |
1534 | return 0; |
1535 | |
1536 | set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); |
1537 | |
1538 | return 1; |
1539 | } |
1540 | |
1541 | int pud_clear_huge(pud_t *pud) |
1542 | { |
	if (pud_leaf(*pud)) {
		pud_clear(pud);
1545 | return 1; |
1546 | } |
1547 | |
1548 | return 0; |
1549 | } |
1550 | |
1551 | int pud_free_pmd_page(pud_t *pud, unsigned long addr) |
1552 | { |
1553 | pmd_t *pmd; |
1554 | int i; |
1555 | |
	pmd = pud_pgtable(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);
1571 | |
1572 | return 1; |
1573 | } |
1574 | |
1575 | int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) |
1576 | { |
1577 | pte_t *ptep = (pte_t *)pmd; |
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1579 | |
1580 | if (!radix_enabled()) |
1581 | return 0; |
1582 | |
1583 | set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); |
1584 | |
1585 | return 1; |
1586 | } |
1587 | |
1588 | int pmd_clear_huge(pmd_t *pmd) |
1589 | { |
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
1592 | return 1; |
1593 | } |
1594 | |
1595 | return 0; |
1596 | } |
1597 | |
1598 | int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) |
1599 | { |
1600 | pte_t *pte; |
1601 | |
	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);
1608 | |
1609 | return 1; |
1610 | } |
1611 | |