1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | |
3 | /* |
4 | * Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp. |
5 | * <benh@kernel.crashing.org> |
6 | */ |
7 | |
8 | #include <linux/errno.h> |
9 | #include <linux/sched.h> |
10 | #include <linux/kernel.h> |
11 | #include <linux/mm.h> |
12 | #include <linux/smp.h> |
13 | #include <linux/stddef.h> |
14 | #include <linux/unistd.h> |
15 | #include <linux/slab.h> |
16 | #include <linux/user.h> |
17 | #include <linux/elf.h> |
18 | #include <linux/security.h> |
19 | #include <linux/memblock.h> |
20 | #include <linux/syscalls.h> |
21 | #include <linux/time_namespace.h> |
22 | #include <vdso/datapage.h> |
23 | |
24 | #include <asm/syscall.h> |
25 | #include <asm/processor.h> |
26 | #include <asm/mmu.h> |
27 | #include <asm/mmu_context.h> |
28 | #include <asm/machdep.h> |
29 | #include <asm/cputable.h> |
30 | #include <asm/sections.h> |
31 | #include <asm/firmware.h> |
32 | #include <asm/vdso.h> |
33 | #include <asm/vdso_datapage.h> |
34 | #include <asm/setup.h> |
35 | |
/* The vDSO mappings are placed at a 64K-aligned (1 << 16) address. */
#define VDSO_ALIGNMENT	(1 << 16)

/* Start and end markers of the 32-bit and 64-bit vDSO images. */
extern char vdso32_start;
extern char vdso32_end;
extern char vdso64_start;
extern char vdso64_end;

/*
 * Syscall table slots that point at this function are left out of the
 * syscall bitmaps built by vdso_setup_syscall_map().
 */
long sys_ni_syscall(void);
43 | |
44 | /* |
45 | * The vdso data page (aka. systemcfg for old ppc64 fans) is here. |
46 | * Once the early boot kernel code no longer needs to muck around |
47 | * with it, it will become dynamically allocated |
48 | */ |
49 | static union { |
50 | struct vdso_arch_data data; |
51 | u8 page[PAGE_SIZE]; |
52 | } vdso_data_store __page_aligned_data; |
53 | struct vdso_arch_data *vdso_data = &vdso_data_store.data; |
54 | |
/* Page offsets inside the "[vvar]" mapping, as checked by vvar_fault(). */
enum vvar_pages {
	VVAR_DATA_PAGE_OFFSET = 0,	/* data page seen by the task */
	VVAR_TIMENS_PAGE_OFFSET,	/* extra page used with time namespaces */
	VVAR_NR_PAGES,			/* number of pages in the mapping */
};
60 | |
61 | static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, |
62 | unsigned long text_size) |
63 | { |
64 | unsigned long new_size = new_vma->vm_end - new_vma->vm_start; |
65 | |
66 | if (new_size != text_size) |
67 | return -EINVAL; |
68 | |
69 | current->mm->context.vdso = (void __user *)new_vma->vm_start; |
70 | |
71 | return 0; |
72 | } |
73 | |
74 | static int vdso32_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) |
75 | { |
76 | return vdso_mremap(sm, new_vma, text_size: &vdso32_end - &vdso32_start); |
77 | } |
78 | |
79 | static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) |
80 | { |
81 | return vdso_mremap(sm, new_vma, text_size: &vdso64_end - &vdso64_start); |
82 | } |
83 | |
84 | static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, |
85 | struct vm_area_struct *vma, struct vm_fault *vmf); |
86 | |
87 | static struct vm_special_mapping vvar_spec __ro_after_init = { |
88 | .name = "[vvar]" , |
89 | .fault = vvar_fault, |
90 | }; |
91 | |
92 | static struct vm_special_mapping vdso32_spec __ro_after_init = { |
93 | .name = "[vdso]" , |
94 | .mremap = vdso32_mremap, |
95 | }; |
96 | |
97 | static struct vm_special_mapping vdso64_spec __ro_after_init = { |
98 | .name = "[vdso]" , |
99 | .mremap = vdso64_mremap, |
100 | }; |
101 | |
102 | #ifdef CONFIG_TIME_NS |
103 | struct vdso_data *arch_get_vdso_data(void *vvar_page) |
104 | { |
105 | return ((struct vdso_arch_data *)vvar_page)->data; |
106 | } |
107 | |
108 | /* |
109 | * The vvar mapping contains data for a specific time namespace, so when a task |
110 | * changes namespace we must unmap its vvar data for the old namespace. |
111 | * Subsequent faults will map in data for the new namespace. |
112 | * |
113 | * For more details see timens_setup_vdso_data(). |
114 | */ |
115 | int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) |
116 | { |
117 | struct mm_struct *mm = task->mm; |
118 | VMA_ITERATOR(vmi, mm, 0); |
119 | struct vm_area_struct *vma; |
120 | |
121 | mmap_read_lock(mm); |
122 | for_each_vma(vmi, vma) { |
123 | if (vma_is_special_mapping(vma, sm: &vvar_spec)) |
124 | zap_vma_pages(vma); |
125 | } |
126 | mmap_read_unlock(mm); |
127 | |
128 | return 0; |
129 | } |
130 | #endif |
131 | |
132 | static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, |
133 | struct vm_area_struct *vma, struct vm_fault *vmf) |
134 | { |
135 | struct page *timens_page = find_timens_vvar_page(vma); |
136 | unsigned long pfn; |
137 | |
138 | switch (vmf->pgoff) { |
139 | case VVAR_DATA_PAGE_OFFSET: |
140 | if (timens_page) |
141 | pfn = page_to_pfn(timens_page); |
142 | else |
143 | pfn = virt_to_pfn(vdso_data); |
144 | break; |
145 | #ifdef CONFIG_TIME_NS |
146 | case VVAR_TIMENS_PAGE_OFFSET: |
147 | /* |
148 | * If a task belongs to a time namespace then a namespace |
149 | * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and |
150 | * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET |
151 | * offset. |
152 | * See also the comment near timens_setup_vdso_data(). |
153 | */ |
154 | if (!timens_page) |
155 | return VM_FAULT_SIGBUS; |
156 | pfn = virt_to_pfn(vdso_data); |
157 | break; |
158 | #endif /* CONFIG_TIME_NS */ |
159 | default: |
160 | return VM_FAULT_SIGBUS; |
161 | } |
162 | |
163 | return vmf_insert_pfn(vma, addr: vmf->address, pfn); |
164 | } |
165 | |
166 | /* |
167 | * This is called from binfmt_elf, we create the special vma for the |
168 | * vDSO and insert it into the mm struct tree |
169 | */ |
170 | static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) |
171 | { |
172 | unsigned long vdso_size, vdso_base, mappings_size; |
173 | struct vm_special_mapping *vdso_spec; |
174 | unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE; |
175 | struct mm_struct *mm = current->mm; |
176 | struct vm_area_struct *vma; |
177 | |
178 | if (is_32bit_task()) { |
179 | vdso_spec = &vdso32_spec; |
180 | vdso_size = &vdso32_end - &vdso32_start; |
181 | } else { |
182 | vdso_spec = &vdso64_spec; |
183 | vdso_size = &vdso64_end - &vdso64_start; |
184 | } |
185 | |
186 | mappings_size = vdso_size + vvar_size; |
187 | mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK; |
188 | |
189 | /* |
190 | * Pick a base address for the vDSO in process space. |
191 | * Add enough to the size so that the result can be aligned. |
192 | */ |
193 | vdso_base = get_unmapped_area(NULL, 0, mappings_size, 0, 0); |
194 | if (IS_ERR_VALUE(vdso_base)) |
195 | return vdso_base; |
196 | |
197 | /* Add required alignment. */ |
198 | vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); |
199 | |
200 | /* |
201 | * Put vDSO base into mm struct. We need to do this before calling |
202 | * install_special_mapping or the perf counter mmap tracking code |
203 | * will fail to recognise it as a vDSO. |
204 | */ |
205 | mm->context.vdso = (void __user *)vdso_base + vvar_size; |
206 | |
207 | vma = _install_special_mapping(mm, addr: vdso_base, len: vvar_size, |
208 | VM_READ | VM_MAYREAD | VM_IO | |
209 | VM_DONTDUMP | VM_PFNMAP, spec: &vvar_spec); |
210 | if (IS_ERR(ptr: vma)) |
211 | return PTR_ERR(ptr: vma); |
212 | |
213 | /* |
214 | * our vma flags don't have VM_WRITE so by default, the process isn't |
215 | * allowed to write those pages. |
216 | * gdb can break that with ptrace interface, and thus trigger COW on |
217 | * those pages but it's then your responsibility to never do that on |
218 | * the "data" page of the vDSO or you'll stop getting kernel updates |
219 | * and your nice userland gettimeofday will be totally dead. |
220 | * It's fine to use that for setting breakpoints in the vDSO code |
221 | * pages though. |
222 | */ |
223 | vma = _install_special_mapping(mm, addr: vdso_base + vvar_size, len: vdso_size, |
224 | VM_READ | VM_EXEC | VM_MAYREAD | |
225 | VM_MAYWRITE | VM_MAYEXEC, spec: vdso_spec); |
226 | if (IS_ERR(ptr: vma)) |
227 | do_munmap(mm, vdso_base, vvar_size, NULL); |
228 | |
229 | return PTR_ERR_OR_ZERO(ptr: vma); |
230 | } |
231 | |
232 | int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) |
233 | { |
234 | struct mm_struct *mm = current->mm; |
235 | int rc; |
236 | |
237 | mm->context.vdso = NULL; |
238 | |
239 | if (mmap_write_lock_killable(mm)) |
240 | return -EINTR; |
241 | |
242 | rc = __arch_setup_additional_pages(bprm, uses_interp); |
243 | if (rc) |
244 | mm->context.vdso = NULL; |
245 | |
246 | mmap_write_unlock(mm); |
247 | return rc; |
248 | } |
249 | |
/*
 * Run do_<type>_fixups() with @value over one fixup section of the
 * <bits>-bit vDSO image. The section bounds are obtained by passing
 * <sec>_start and <sec>_end, together with the address of
 * vdso<bits>_start, to VDSO<bits>_SYMBOL().
 */
#define VDSO_DO_FIXUPS(type, value, bits, sec) do {						\
	void *__sec_start = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_start);	\
	void *__sec_end = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_end);		\
												\
	do_##type##_fixups((value), __sec_start, __sec_end);					\
} while (0)
256 | |
257 | static void __init vdso_fixup_features(void) |
258 | { |
259 | #ifdef CONFIG_PPC64 |
260 | VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 64, ftr_fixup); |
261 | VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 64, mmu_ftr_fixup); |
262 | VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 64, fw_ftr_fixup); |
263 | VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 64, lwsync_fixup); |
264 | #endif /* CONFIG_PPC64 */ |
265 | |
266 | #ifdef CONFIG_VDSO32 |
267 | VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 32, ftr_fixup); |
268 | VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 32, mmu_ftr_fixup); |
269 | #ifdef CONFIG_PPC64 |
270 | VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 32, fw_ftr_fixup); |
271 | #endif /* CONFIG_PPC64 */ |
272 | VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 32, lwsync_fixup); |
273 | #endif |
274 | } |
275 | |
276 | /* |
277 | * Called from setup_arch to initialize the bitmap of available |
278 | * syscalls in the systemcfg page |
279 | */ |
280 | static void __init vdso_setup_syscall_map(void) |
281 | { |
282 | unsigned int i; |
283 | |
284 | for (i = 0; i < NR_syscalls; i++) { |
285 | if (sys_call_table[i] != (void *)&sys_ni_syscall) |
286 | vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); |
287 | if (IS_ENABLED(CONFIG_COMPAT) && |
288 | compat_sys_call_table[i] != (void *)&sys_ni_syscall) |
289 | vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); |
290 | } |
291 | } |
292 | |
293 | #ifdef CONFIG_PPC64 |
294 | int vdso_getcpu_init(void) |
295 | { |
296 | unsigned long cpu, node, val; |
297 | |
298 | /* |
299 | * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node |
300 | * in the next 16 bits. The VDSO uses this to implement getcpu(). |
301 | */ |
302 | cpu = get_cpu(); |
303 | WARN_ON_ONCE(cpu > 0xffff); |
304 | |
305 | node = cpu_to_node(cpu); |
306 | WARN_ON_ONCE(node > 0xffff); |
307 | |
308 | val = (cpu & 0xffff) | ((node & 0xffff) << 16); |
309 | mtspr(SPRN_SPRG_VDSO_WRITE, val); |
310 | get_paca()->sprg_vdso = val; |
311 | |
312 | put_cpu(); |
313 | |
314 | return 0; |
315 | } |
316 | /* We need to call this before SMP init */ |
317 | early_initcall(vdso_getcpu_init); |
318 | #endif |
319 | |
320 | static struct page ** __init vdso_setup_pages(void *start, void *end) |
321 | { |
322 | int i; |
323 | struct page **pagelist; |
324 | int pages = (end - start) >> PAGE_SHIFT; |
325 | |
326 | pagelist = kcalloc(n: pages + 1, size: sizeof(struct page *), GFP_KERNEL); |
327 | if (!pagelist) |
328 | panic(fmt: "%s: Cannot allocate page list for VDSO" , __func__); |
329 | |
330 | for (i = 0; i < pages; i++) |
331 | pagelist[i] = virt_to_page(start + i * PAGE_SIZE); |
332 | |
333 | return pagelist; |
334 | } |
335 | |
336 | static int __init vdso_init(void) |
337 | { |
338 | #ifdef CONFIG_PPC64 |
339 | /* |
340 | * Fill up the "systemcfg" stuff for backward compatibility |
341 | */ |
342 | strcpy((char *)vdso_data->eye_catcher, "SYSTEMCFG:PPC64" ); |
343 | vdso_data->version.major = SYSTEMCFG_MAJOR; |
344 | vdso_data->version.minor = SYSTEMCFG_MINOR; |
345 | vdso_data->processor = mfspr(SPRN_PVR); |
346 | /* |
347 | * Fake the old platform number for pSeries and add |
348 | * in LPAR bit if necessary |
349 | */ |
350 | vdso_data->platform = 0x100; |
351 | if (firmware_has_feature(FW_FEATURE_LPAR)) |
352 | vdso_data->platform |= 1; |
353 | vdso_data->physicalMemorySize = memblock_phys_mem_size(); |
354 | vdso_data->dcache_size = ppc64_caches.l1d.size; |
355 | vdso_data->dcache_line_size = ppc64_caches.l1d.line_size; |
356 | vdso_data->icache_size = ppc64_caches.l1i.size; |
357 | vdso_data->icache_line_size = ppc64_caches.l1i.line_size; |
358 | vdso_data->dcache_block_size = ppc64_caches.l1d.block_size; |
359 | vdso_data->icache_block_size = ppc64_caches.l1i.block_size; |
360 | vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; |
361 | vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; |
362 | #endif /* CONFIG_PPC64 */ |
363 | |
364 | vdso_setup_syscall_map(); |
365 | |
366 | vdso_fixup_features(); |
367 | |
368 | if (IS_ENABLED(CONFIG_VDSO32)) |
369 | vdso32_spec.pages = vdso_setup_pages(start: &vdso32_start, end: &vdso32_end); |
370 | |
371 | if (IS_ENABLED(CONFIG_PPC64)) |
372 | vdso64_spec.pages = vdso_setup_pages(start: &vdso64_start, end: &vdso64_end); |
373 | |
374 | smp_wmb(); |
375 | |
376 | return 0; |
377 | } |
378 | arch_initcall(vdso_init); |
379 | |