1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * arch/arm64/mm/hugetlbpage.c |
4 | * |
5 | * Copyright (C) 2013 Linaro Ltd. |
6 | * |
7 | * Based on arch/x86/mm/hugetlbpage.c. |
8 | */ |
9 | |
10 | #include <linux/init.h> |
11 | #include <linux/fs.h> |
12 | #include <linux/mm.h> |
13 | #include <linux/hugetlb.h> |
14 | #include <linux/pagemap.h> |
15 | #include <linux/err.h> |
16 | #include <linux/sysctl.h> |
17 | #include <asm/mman.h> |
18 | #include <asm/tlb.h> |
19 | #include <asm/tlbflush.h> |
20 | |
21 | /* |
22 | * HugeTLB Support Matrix |
23 | * |
24 | * --------------------------------------------------- |
25 | * | Page Size | CONT PTE | PMD | CONT PMD | PUD | |
26 | * --------------------------------------------------- |
27 | * | 4K | 64K | 2M | 32M | 1G | |
28 | * | 16K | 2M | 32M | 1G | | |
29 | * | 64K | 2M | 512M | 16G | | |
30 | * --------------------------------------------------- |
31 | */ |
32 | |
33 | /* |
34 | * Reserve CMA areas for the largest supported gigantic |
35 | * huge page when requested. Any other smaller gigantic |
36 | * huge pages could still be served from those areas. |
37 | */ |
38 | #ifdef CONFIG_CMA |
39 | void __init arm64_hugetlb_cma_reserve(void) |
40 | { |
41 | int order; |
42 | |
43 | if (pud_sect_supported()) |
44 | order = PUD_SHIFT - PAGE_SHIFT; |
45 | else |
46 | order = CONT_PMD_SHIFT - PAGE_SHIFT; |
47 | |
48 | hugetlb_cma_reserve(order); |
49 | } |
50 | #endif /* CONFIG_CMA */ |
51 | |
52 | static bool __hugetlb_valid_size(unsigned long size) |
53 | { |
54 | switch (size) { |
55 | #ifndef __PAGETABLE_PMD_FOLDED |
56 | case PUD_SIZE: |
57 | return pud_sect_supported(); |
58 | #endif |
59 | case CONT_PMD_SIZE: |
60 | case PMD_SIZE: |
61 | case CONT_PTE_SIZE: |
62 | return true; |
63 | } |
64 | |
65 | return false; |
66 | } |
67 | |
68 | #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION |
69 | bool arch_hugetlb_migration_supported(struct hstate *h) |
70 | { |
71 | size_t pagesize = huge_page_size(h); |
72 | |
73 | if (!__hugetlb_valid_size(size: pagesize)) { |
74 | pr_warn("%s: unrecognized huge page size 0x%lx\n" , |
75 | __func__, pagesize); |
76 | return false; |
77 | } |
78 | return true; |
79 | } |
80 | #endif |
81 | |
82 | int pmd_huge(pmd_t pmd) |
83 | { |
84 | return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); |
85 | } |
86 | |
87 | int pud_huge(pud_t pud) |
88 | { |
89 | #ifndef __PAGETABLE_PMD_FOLDED |
90 | return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); |
91 | #else |
92 | return 0; |
93 | #endif |
94 | } |
95 | |
96 | static int find_num_contig(struct mm_struct *mm, unsigned long addr, |
97 | pte_t *ptep, size_t *pgsize) |
98 | { |
99 | pgd_t *pgdp = pgd_offset(mm, addr); |
100 | p4d_t *p4dp; |
101 | pud_t *pudp; |
102 | pmd_t *pmdp; |
103 | |
104 | *pgsize = PAGE_SIZE; |
105 | p4dp = p4d_offset(pgd: pgdp, address: addr); |
106 | pudp = pud_offset(p4d: p4dp, address: addr); |
107 | pmdp = pmd_offset(pud: pudp, address: addr); |
108 | if ((pte_t *)pmdp == ptep) { |
109 | *pgsize = PMD_SIZE; |
110 | return CONT_PMDS; |
111 | } |
112 | return CONT_PTES; |
113 | } |
114 | |
115 | static inline int num_contig_ptes(unsigned long size, size_t *pgsize) |
116 | { |
117 | int contig_ptes = 0; |
118 | |
119 | *pgsize = size; |
120 | |
121 | switch (size) { |
122 | #ifndef __PAGETABLE_PMD_FOLDED |
123 | case PUD_SIZE: |
124 | if (pud_sect_supported()) |
125 | contig_ptes = 1; |
126 | break; |
127 | #endif |
128 | case PMD_SIZE: |
129 | contig_ptes = 1; |
130 | break; |
131 | case CONT_PMD_SIZE: |
132 | *pgsize = PMD_SIZE; |
133 | contig_ptes = CONT_PMDS; |
134 | break; |
135 | case CONT_PTE_SIZE: |
136 | *pgsize = PAGE_SIZE; |
137 | contig_ptes = CONT_PTES; |
138 | break; |
139 | } |
140 | |
141 | return contig_ptes; |
142 | } |
143 | |
144 | pte_t huge_ptep_get(pte_t *ptep) |
145 | { |
146 | int ncontig, i; |
147 | size_t pgsize; |
148 | pte_t orig_pte = __ptep_get(ptep); |
149 | |
150 | if (!pte_present(a: orig_pte) || !pte_cont(orig_pte)) |
151 | return orig_pte; |
152 | |
153 | ncontig = num_contig_ptes(size: page_size(pte_page(orig_pte)), pgsize: &pgsize); |
154 | for (i = 0; i < ncontig; i++, ptep++) { |
155 | pte_t pte = __ptep_get(ptep); |
156 | |
157 | if (pte_dirty(pte)) |
158 | orig_pte = pte_mkdirty(pte: orig_pte); |
159 | |
160 | if (pte_young(pte)) |
161 | orig_pte = pte_mkyoung(pte: orig_pte); |
162 | } |
163 | return orig_pte; |
164 | } |
165 | |
166 | /* |
167 | * Changing some bits of contiguous entries requires us to follow a |
168 | * Break-Before-Make approach, breaking the whole contiguous set |
169 | * before we can change any entries. See ARM DDI 0487A.k_iss10775, |
170 | * "Misprogramming of the Contiguous bit", page D4-1762. |
171 | * |
172 | * This helper performs the break step. |
173 | */ |
174 | static pte_t get_clear_contig(struct mm_struct *mm, |
175 | unsigned long addr, |
176 | pte_t *ptep, |
177 | unsigned long pgsize, |
178 | unsigned long ncontig) |
179 | { |
180 | pte_t orig_pte = __ptep_get(ptep); |
181 | unsigned long i; |
182 | |
183 | for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { |
184 | pte_t pte = __ptep_get_and_clear(mm, addr, ptep); |
185 | |
186 | /* |
187 | * If HW_AFDBM is enabled, then the HW could turn on |
188 | * the dirty or accessed bit for any page in the set, |
189 | * so check them all. |
190 | */ |
191 | if (pte_dirty(pte)) |
192 | orig_pte = pte_mkdirty(pte: orig_pte); |
193 | |
194 | if (pte_young(pte)) |
195 | orig_pte = pte_mkyoung(pte: orig_pte); |
196 | } |
197 | return orig_pte; |
198 | } |
199 | |
200 | static pte_t get_clear_contig_flush(struct mm_struct *mm, |
201 | unsigned long addr, |
202 | pte_t *ptep, |
203 | unsigned long pgsize, |
204 | unsigned long ncontig) |
205 | { |
206 | pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig); |
207 | struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); |
208 | |
209 | flush_tlb_range(&vma, addr, addr + (pgsize * ncontig)); |
210 | return orig_pte; |
211 | } |
212 | |
213 | /* |
214 | * Changing some bits of contiguous entries requires us to follow a |
215 | * Break-Before-Make approach, breaking the whole contiguous set |
216 | * before we can change any entries. See ARM DDI 0487A.k_iss10775, |
217 | * "Misprogramming of the Contiguous bit", page D4-1762. |
218 | * |
219 | * This helper performs the break step for use cases where the |
220 | * original pte is not needed. |
221 | */ |
222 | static void clear_flush(struct mm_struct *mm, |
223 | unsigned long addr, |
224 | pte_t *ptep, |
225 | unsigned long pgsize, |
226 | unsigned long ncontig) |
227 | { |
228 | struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); |
229 | unsigned long i, saddr = addr; |
230 | |
231 | for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) |
232 | __ptep_get_and_clear(mm, addr, ptep); |
233 | |
234 | flush_tlb_range(&vma, saddr, addr); |
235 | } |
236 | |
237 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
238 | pte_t *ptep, pte_t pte, unsigned long sz) |
239 | { |
240 | size_t pgsize; |
241 | int i; |
242 | int ncontig; |
243 | unsigned long pfn, dpfn; |
244 | pgprot_t hugeprot; |
245 | |
246 | ncontig = num_contig_ptes(size: sz, pgsize: &pgsize); |
247 | |
248 | if (!pte_present(a: pte)) { |
249 | for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) |
250 | __set_ptes(mm, addr, ptep, pte, 1); |
251 | return; |
252 | } |
253 | |
254 | if (!pte_cont(pte)) { |
255 | __set_ptes(mm, addr, ptep, pte, 1); |
256 | return; |
257 | } |
258 | |
259 | pfn = pte_pfn(pte); |
260 | dpfn = pgsize >> PAGE_SHIFT; |
261 | hugeprot = pte_pgprot(pte); |
262 | |
263 | clear_flush(mm, addr, ptep, pgsize, ncontig); |
264 | |
265 | for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) |
266 | __set_ptes(mm, addr, ptep, pfn_pte(page_nr: pfn, pgprot: hugeprot), 1); |
267 | } |
268 | |
269 | pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
270 | unsigned long addr, unsigned long sz) |
271 | { |
272 | pgd_t *pgdp; |
273 | p4d_t *p4dp; |
274 | pud_t *pudp; |
275 | pmd_t *pmdp; |
276 | pte_t *ptep = NULL; |
277 | |
278 | pgdp = pgd_offset(mm, addr); |
279 | p4dp = p4d_alloc(mm, pgd: pgdp, address: addr); |
280 | if (!p4dp) |
281 | return NULL; |
282 | |
283 | pudp = pud_alloc(mm, p4d: p4dp, address: addr); |
284 | if (!pudp) |
285 | return NULL; |
286 | |
287 | if (sz == PUD_SIZE) { |
288 | ptep = (pte_t *)pudp; |
289 | } else if (sz == (CONT_PTE_SIZE)) { |
290 | pmdp = pmd_alloc(mm, pud: pudp, address: addr); |
291 | if (!pmdp) |
292 | return NULL; |
293 | |
294 | WARN_ON(addr & (sz - 1)); |
295 | ptep = pte_alloc_huge(mm, pmd: pmdp, address: addr); |
296 | } else if (sz == PMD_SIZE) { |
297 | if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp))) |
298 | ptep = huge_pmd_share(mm, vma, addr, pud: pudp); |
299 | else |
300 | ptep = (pte_t *)pmd_alloc(mm, pud: pudp, address: addr); |
301 | } else if (sz == (CONT_PMD_SIZE)) { |
302 | pmdp = pmd_alloc(mm, pud: pudp, address: addr); |
303 | WARN_ON(addr & (sz - 1)); |
304 | return (pte_t *)pmdp; |
305 | } |
306 | |
307 | return ptep; |
308 | } |
309 | |
310 | pte_t *huge_pte_offset(struct mm_struct *mm, |
311 | unsigned long addr, unsigned long sz) |
312 | { |
313 | pgd_t *pgdp; |
314 | p4d_t *p4dp; |
315 | pud_t *pudp, pud; |
316 | pmd_t *pmdp, pmd; |
317 | |
318 | pgdp = pgd_offset(mm, addr); |
319 | if (!pgd_present(READ_ONCE(*pgdp))) |
320 | return NULL; |
321 | |
322 | p4dp = p4d_offset(pgd: pgdp, address: addr); |
323 | if (!p4d_present(READ_ONCE(*p4dp))) |
324 | return NULL; |
325 | |
326 | pudp = pud_offset(p4d: p4dp, address: addr); |
327 | pud = READ_ONCE(*pudp); |
328 | if (sz != PUD_SIZE && pud_none(pud)) |
329 | return NULL; |
330 | /* hugepage or swap? */ |
331 | if (pud_huge(pud) || !pud_present(pud)) |
332 | return (pte_t *)pudp; |
333 | /* table; check the next level */ |
334 | |
335 | if (sz == CONT_PMD_SIZE) |
336 | addr &= CONT_PMD_MASK; |
337 | |
338 | pmdp = pmd_offset(pud: pudp, address: addr); |
339 | pmd = READ_ONCE(*pmdp); |
340 | if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && |
341 | pmd_none(pmd)) |
342 | return NULL; |
343 | if (pmd_huge(pmd) || !pmd_present(pmd)) |
344 | return (pte_t *)pmdp; |
345 | |
346 | if (sz == CONT_PTE_SIZE) |
347 | return pte_offset_huge(pmd: pmdp, address: (addr & CONT_PTE_MASK)); |
348 | |
349 | return NULL; |
350 | } |
351 | |
352 | unsigned long hugetlb_mask_last_page(struct hstate *h) |
353 | { |
354 | unsigned long hp_size = huge_page_size(h); |
355 | |
356 | switch (hp_size) { |
357 | #ifndef __PAGETABLE_PMD_FOLDED |
358 | case PUD_SIZE: |
359 | return PGDIR_SIZE - PUD_SIZE; |
360 | #endif |
361 | case CONT_PMD_SIZE: |
362 | return PUD_SIZE - CONT_PMD_SIZE; |
363 | case PMD_SIZE: |
364 | return PUD_SIZE - PMD_SIZE; |
365 | case CONT_PTE_SIZE: |
366 | return PMD_SIZE - CONT_PTE_SIZE; |
367 | default: |
368 | break; |
369 | } |
370 | |
371 | return 0UL; |
372 | } |
373 | |
374 | pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) |
375 | { |
376 | size_t pagesize = 1UL << shift; |
377 | |
378 | entry = pte_mkhuge(pte: entry); |
379 | if (pagesize == CONT_PTE_SIZE) { |
380 | entry = pte_mkcont(entry); |
381 | } else if (pagesize == CONT_PMD_SIZE) { |
382 | entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); |
383 | } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { |
384 | pr_warn("%s: unrecognized huge page size 0x%lx\n" , |
385 | __func__, pagesize); |
386 | } |
387 | return entry; |
388 | } |
389 | |
390 | void huge_pte_clear(struct mm_struct *mm, unsigned long addr, |
391 | pte_t *ptep, unsigned long sz) |
392 | { |
393 | int i, ncontig; |
394 | size_t pgsize; |
395 | |
396 | ncontig = num_contig_ptes(size: sz, pgsize: &pgsize); |
397 | |
398 | for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) |
399 | __pte_clear(mm, addr, ptep); |
400 | } |
401 | |
402 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, |
403 | unsigned long addr, pte_t *ptep) |
404 | { |
405 | int ncontig; |
406 | size_t pgsize; |
407 | pte_t orig_pte = __ptep_get(ptep); |
408 | |
409 | if (!pte_cont(orig_pte)) |
410 | return __ptep_get_and_clear(mm, addr, ptep); |
411 | |
412 | ncontig = find_num_contig(mm, addr, ptep, pgsize: &pgsize); |
413 | |
414 | return get_clear_contig(mm, addr, ptep, pgsize, ncontig); |
415 | } |
416 | |
417 | /* |
418 | * huge_ptep_set_access_flags will update access flags (dirty, accesssed) |
419 | * and write permission. |
420 | * |
421 | * For a contiguous huge pte range we need to check whether or not write |
422 | * permission has to change only on the first pte in the set. Then for |
423 | * all the contiguous ptes we need to check whether or not there is a |
424 | * discrepancy between dirty or young. |
425 | */ |
426 | static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) |
427 | { |
428 | int i; |
429 | |
430 | if (pte_write(pte) != pte_write(__ptep_get(ptep))) |
431 | return 1; |
432 | |
433 | for (i = 0; i < ncontig; i++) { |
434 | pte_t orig_pte = __ptep_get(ptep + i); |
435 | |
436 | if (pte_dirty(pte) != pte_dirty(pte: orig_pte)) |
437 | return 1; |
438 | |
439 | if (pte_young(pte) != pte_young(pte: orig_pte)) |
440 | return 1; |
441 | } |
442 | |
443 | return 0; |
444 | } |
445 | |
446 | int huge_ptep_set_access_flags(struct vm_area_struct *vma, |
447 | unsigned long addr, pte_t *ptep, |
448 | pte_t pte, int dirty) |
449 | { |
450 | int ncontig, i; |
451 | size_t pgsize = 0; |
452 | unsigned long pfn = pte_pfn(pte), dpfn; |
453 | struct mm_struct *mm = vma->vm_mm; |
454 | pgprot_t hugeprot; |
455 | pte_t orig_pte; |
456 | |
457 | if (!pte_cont(pte)) |
458 | return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); |
459 | |
460 | ncontig = find_num_contig(mm, addr, ptep, pgsize: &pgsize); |
461 | dpfn = pgsize >> PAGE_SHIFT; |
462 | |
463 | if (!__cont_access_flags_changed(ptep, pte, ncontig)) |
464 | return 0; |
465 | |
466 | orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); |
467 | |
468 | /* Make sure we don't lose the dirty or young state */ |
469 | if (pte_dirty(pte: orig_pte)) |
470 | pte = pte_mkdirty(pte); |
471 | |
472 | if (pte_young(pte: orig_pte)) |
473 | pte = pte_mkyoung(pte); |
474 | |
475 | hugeprot = pte_pgprot(pte); |
476 | for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) |
477 | __set_ptes(mm, addr, ptep, pfn_pte(page_nr: pfn, pgprot: hugeprot), 1); |
478 | |
479 | return 1; |
480 | } |
481 | |
482 | void huge_ptep_set_wrprotect(struct mm_struct *mm, |
483 | unsigned long addr, pte_t *ptep) |
484 | { |
485 | unsigned long pfn, dpfn; |
486 | pgprot_t hugeprot; |
487 | int ncontig, i; |
488 | size_t pgsize; |
489 | pte_t pte; |
490 | |
491 | if (!pte_cont(__ptep_get(ptep))) { |
492 | __ptep_set_wrprotect(mm, addr, ptep); |
493 | return; |
494 | } |
495 | |
496 | ncontig = find_num_contig(mm, addr, ptep, pgsize: &pgsize); |
497 | dpfn = pgsize >> PAGE_SHIFT; |
498 | |
499 | pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); |
500 | pte = pte_wrprotect(pte); |
501 | |
502 | hugeprot = pte_pgprot(pte); |
503 | pfn = pte_pfn(pte); |
504 | |
505 | for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) |
506 | __set_ptes(mm, addr, ptep, pfn_pte(page_nr: pfn, pgprot: hugeprot), 1); |
507 | } |
508 | |
509 | pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, |
510 | unsigned long addr, pte_t *ptep) |
511 | { |
512 | struct mm_struct *mm = vma->vm_mm; |
513 | size_t pgsize; |
514 | int ncontig; |
515 | |
516 | if (!pte_cont(__ptep_get(ptep))) |
517 | return ptep_clear_flush(vma, address: addr, ptep); |
518 | |
519 | ncontig = find_num_contig(mm, addr, ptep, pgsize: &pgsize); |
520 | return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); |
521 | } |
522 | |
523 | static int __init hugetlbpage_init(void) |
524 | { |
525 | if (pud_sect_supported()) |
526 | hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); |
527 | |
528 | hugetlb_add_hstate(order: CONT_PMD_SHIFT - PAGE_SHIFT); |
529 | hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); |
530 | hugetlb_add_hstate(order: CONT_PTE_SHIFT - PAGE_SHIFT); |
531 | |
532 | return 0; |
533 | } |
534 | arch_initcall(hugetlbpage_init); |
535 | |
536 | bool __init arch_hugetlb_valid_size(unsigned long size) |
537 | { |
538 | return __hugetlb_valid_size(size); |
539 | } |
540 | |
541 | pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) |
542 | { |
543 | if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { |
544 | /* |
545 | * Break-before-make (BBM) is required for all user space mappings |
546 | * when the permission changes from executable to non-executable |
547 | * in cases where cpu is affected with errata #2645198. |
548 | */ |
549 | if (pte_user_exec(__ptep_get(ptep))) |
550 | return huge_ptep_clear_flush(vma, addr, ptep); |
551 | } |
552 | return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); |
553 | } |
554 | |
555 | void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, |
556 | pte_t old_pte, pte_t pte) |
557 | { |
558 | unsigned long psize = huge_page_size(h: hstate_vma(vma)); |
559 | |
560 | set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); |
561 | } |
562 | |