// SPDX-License-Identifier: GPL-2.0
/*
 *  IBM System z Huge TLB Page Support for Kernel.
 *
 *    Copyright IBM Corp. 2007,2020
 *    Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
 */

#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/security.h>

/*
 * If the bit selected by single-bit bitmask "a" is set within "x", move
 * it to the position indicated by single-bit bitmask "b".
 */
#define move_set_bit(x, a, b)	(((x) & (a)) >> ilog2(a) << ilog2(b))

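/*
 * Convert a (huge) pte into the corresponding region or segment table
 * entry (rste), moving the software/hardware pte bits to their pmd/pud
 * counterparts as described in the table below.
 */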
static inline unsigned long __pte_to_rste(pte_t pte)
{
	unsigned long rste;

	/*
	 * Convert encoding		pte bits	pmd / pud bits
	 *				lIR.uswrdy.p	dy..R...I...wr
	 * empty			010.000000.0 -> 00..0...1...00
	 * prot-none, clean, old	111.000000.1 -> 00..1...1...00
	 * prot-none, clean, young	111.000001.1 -> 01..1...1...00
	 * prot-none, dirty, old	111.000010.1 -> 10..1...1...00
	 * prot-none, dirty, young	111.000011.1 -> 11..1...1...00
	 * read-only, clean, old	111.000100.1 -> 00..1...1...01
	 * read-only, clean, young	101.000101.1 -> 01..1...0...01
	 * read-only, dirty, old	111.000110.1 -> 10..1...1...01
	 * read-only, dirty, young	101.000111.1 -> 11..1...0...01
	 * read-write, clean, old	111.001100.1 -> 00..1...1...11
	 * read-write, clean, young	101.001101.1 -> 01..1...0...11
	 * read-write, dirty, old	110.001110.1 -> 10..0...1...11
	 * read-write, dirty, young	100.001111.1 -> 11..0...0...11
	 * HW-bits: R read-only, I invalid
	 * SW-bits: p present, y young, d dirty, r read, w write, s special,
	 *	    u unused, l large
	 */
	if (pte_present(pte)) {
		rste = pte_val(pte) & PAGE_MASK;
		rste |= move_set_bit(pte_val(pte), _PAGE_READ,
				     _SEGMENT_ENTRY_READ);
		rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
				     _SEGMENT_ENTRY_WRITE);
		rste |= move_set_bit(pte_val(pte), _PAGE_INVALID,
				     _SEGMENT_ENTRY_INVALID);
		rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT,
				     _SEGMENT_ENTRY_PROTECT);
		rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY,
				     _SEGMENT_ENTRY_DIRTY);
		rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG,
				     _SEGMENT_ENTRY_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
		rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY,
				     _SEGMENT_ENTRY_SOFT_DIRTY);
#endif
		rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
				     _SEGMENT_ENTRY_NOEXEC);
	} else
		rste = _SEGMENT_ENTRY_EMPTY;
	return rste;
}

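/*
 * Convert a region or segment table entry (rste) back into a (huge) pte,
 * i.e. the inverse of __pte_to_rste(). Non-present entries are mapped to
 * _PAGE_INVALID.
 */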
static inline pte_t __rste_to_pte(unsigned long rste)
{
	unsigned long pteval;
	int present;

	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
		present = pud_present(__pud(rste));
	else
		present = pmd_present(__pmd(rste));

	/*
	 * Convert encoding		pmd / pud bits	pte bits
	 *				dy..R...I...wr	lIR.uswrdy.p
	 * empty			00..0...1...00 -> 010.000000.0
	 * prot-none, clean, old	00..1...1...00 -> 111.000000.1
	 * prot-none, clean, young	01..1...1...00 -> 111.000001.1
	 * prot-none, dirty, old	10..1...1...00 -> 111.000010.1
	 * prot-none, dirty, young	11..1...1...00 -> 111.000011.1
	 * read-only, clean, old	00..1...1...01 -> 111.000100.1
	 * read-only, clean, young	01..1...0...01 -> 101.000101.1
	 * read-only, dirty, old	10..1...1...01 -> 111.000110.1
	 * read-only, dirty, young	11..1...0...01 -> 101.000111.1
	 * read-write, clean, old	00..1...1...11 -> 111.001100.1
	 * read-write, clean, young	01..1...0...11 -> 101.001101.1
	 * read-write, dirty, old	10..0...1...11 -> 110.001110.1
	 * read-write, dirty, young	11..0...0...11 -> 100.001111.1
	 * HW-bits: R read-only, I invalid
	 * SW-bits: p present, y young, d dirty, r read, w write, s special,
	 *	    u unused, l large
	 */
	if (present) {
		pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
		pteval |= _PAGE_LARGE | _PAGE_PRESENT;
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
	} else
		pteval = _PAGE_INVALID;
	return __pte(pteval);
}

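/*
 * Initialize the storage keys for the huge page backing a valid rste,
 * but only if the mm uses storage keys. The PG_arch_1 bit of the page
 * marks the range as already initialized, so this happens at most once.
 */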
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
{
	struct page *page;
	unsigned long size, paddr;

	if (!mm_uses_skeys(mm) ||
	    rste & _SEGMENT_ENTRY_INVALID)
		return;

	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
		page = pud_page(__pud(rste));
		size = PUD_SIZE;
		paddr = rste & PUD_MASK;
	} else {
		page = pmd_page(__pmd(rste));
		size = PMD_SIZE;
		paddr = rste & PMD_MASK;
	}

	if (!test_and_set_bit(PG_arch_1, &page->flags))
		__storage_key_init_range(paddr, paddr + size - 1);
}

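/*
 * Install a huge pte: convert the pte to a segment (1M) or region third
 * (2G) table entry, clear the NOEXEC bit if the machine does not support
 * it, and initialize storage keys if needed.
 */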
void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t pte)
{
	unsigned long rste;

	rste = __pte_to_rste(pte);
	if (!MACHINE_HAS_NX)
		rste &= ~_SEGMENT_ENTRY_NOEXEC;

	/* Set correct table type for 2G hugepages */
	if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
		if (likely(pte_present(pte)))
			rste |= _REGION3_ENTRY_LARGE;
		rste |= _REGION_ENTRY_TYPE_R3;
	} else if (likely(pte_present(pte)))
		rste |= _SEGMENT_ENTRY_LARGE;

	clear_huge_pte_skeys(mm, rste);
	set_pte(ptep, __pte(rste));
}

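/*
 * Arch hook called by the generic hugetlb code. The size argument is not
 * needed here, since the huge page size follows from the page table level
 * of the entry that ptep points to.
 */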
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte, unsigned long sz)
{
	__set_huge_pte_at(mm, addr, ptep, pte);
}

pte_t huge_ptep_get(pte_t *ptep)
{
	return __rste_to_pte(pte_val(*ptep));
}

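/*
 * Return the current huge pte and clear the underlying pud or pmd entry
 * via the direct exchange primitives.
 */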
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep)
{
	pte_t pte = huge_ptep_get(ptep);
	pmd_t *pmdp = (pmd_t *) ptep;
	pud_t *pudp = (pud_t *) ptep;

	if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
		pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
	else
		pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
	return pte;
}

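/*
 * Allocate the page table entry for a huge page mapping: a pud entry for
 * 2G (PUD_SIZE) pages, a pmd entry for 1M (PMD_SIZE) pages. Returns NULL
 * if an intermediate table level cannot be allocated.
 */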
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp = NULL;

	pgdp = pgd_offset(mm, addr);
	p4dp = p4d_alloc(mm, pgdp, addr);
	if (p4dp) {
		pudp = pud_alloc(mm, p4dp, addr);
		if (pudp) {
			if (sz == PUD_SIZE)
				return (pte_t *) pudp;
			else if (sz == PMD_SIZE)
				pmdp = pmd_alloc(mm, pudp, addr);
		}
	}
	return (pte_t *) pmdp;
}

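/*
 * Walk the page table and return a pointer to the entry mapping a huge
 * page at addr: the pud entry for a 2G mapping, otherwise the pmd entry.
 * Returns NULL if an intermediate table level is not present.
 */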
pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp = NULL;

	pgdp = pgd_offset(mm, addr);
	if (pgd_present(*pgdp)) {
		p4dp = p4d_offset(pgdp, addr);
		if (p4d_present(*p4dp)) {
			pudp = pud_offset(p4dp, addr);
			if (pud_present(*pudp)) {
				if (pud_leaf(*pudp))
					return (pte_t *) pudp;
				pmdp = pmd_offset(pudp, addr);
			}
		}
	}
	return (pte_t *) pmdp;
}

int pmd_huge(pmd_t pmd)
{
	return pmd_leaf(pmd);
}

int pud_huge(pud_t pud)
{
	return pud_leaf(pud);
}

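/*
 * 1M (PMD_SIZE) huge pages require the enhanced DAT facility (EDAT1),
 * 2G (PUD_SIZE) huge pages require EDAT2.
 */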
bool __init arch_hugetlb_valid_size(unsigned long size)
{
	if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
		return true;
	else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
		return true;
	else
		return false;
}

static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	info.flags = 0;
	info.length = len;
	info.low_limit = current->mm->mmap_base;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}

static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;
	unsigned long addr;

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = current->mm->mmap_base;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (addr & ~PAGE_MASK) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

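/*
 * Find a free, huge-page-aligned address range for a hugetlbfs mapping.
 * A hint address or MAP_FIXED request is honored when possible; otherwise
 * the search is delegated to the bottom-up or top-down helper, depending
 * on the mm layout. The result is checked against the ASCE limit.
 */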
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE - mmap_min_addr)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		goto check_asce_limit;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
		    (!vma || addr + len <= vm_start_gap(vma)))
			goto check_asce_limit;
	}

	if (mm->get_unmapped_area == arch_get_unmapped_area)
		addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
	if (offset_in_page(addr))
		return addr;

check_asce_limit:
	return check_asce_limit(mm, addr, len);
}