// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/elf.h>
#include <asm/page-states.h>
#include <asm/boot_data.h>
#include <asm/sections.h>
#include <asm/maccess.h>
#include <asm/cpu_mf.h>
#include <asm/setup.h>
#include <asm/kasan.h>
#include <asm/kexec.h>
#include <asm/sclp.h>
#include <asm/diag.h>
#include <asm/uv.h>
#include <asm/abs_lowcore.h>
#include <asm/physmem_info.h>
#include "decompressor.h"
#include "boot.h"
#include "uv.h"

unsigned long __bootdata_preserved(__kaslr_offset);
unsigned long __bootdata_preserved(__abs_lowcore);
unsigned long __bootdata_preserved(__memcpy_real_area);
pte_t *__bootdata_preserved(memcpy_real_ptep);
unsigned long __bootdata_preserved(VMALLOC_START);
unsigned long __bootdata_preserved(VMALLOC_END);
struct page *__bootdata_preserved(vmemmap);
unsigned long __bootdata_preserved(vmemmap_size);
unsigned long __bootdata_preserved(MODULES_VADDR);
unsigned long __bootdata_preserved(MODULES_END);
unsigned long __bootdata_preserved(max_mappable);
unsigned long __bootdata(ident_map_size);

u64 __bootdata_preserved(stfle_fac_list[16]);
u64 __bootdata_preserved(alt_stfle_fac_list[16]);
struct oldmem_data __bootdata_preserved(oldmem_data);

struct machine_info machine;

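/*
 * Print an error message via the SCLP early console and stop the CPU in
 * disabled wait state. This is the only way to report errors this early
 * during boot.
 */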
void error(char *x)
{
	sclp_early_printk("\n\n");
	sclp_early_printk(x);
	sclp_early_printk("\n\n -- System halted");

	disabled_wait();
}

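/*
 * Probe hardware facilities used later during boot: facility 8 is
 * EDAT-1 (1MB segment mappings), facility 78 is EDAT-2 (2GB region
 * mappings), and facility 130 is instruction-execution protection,
 * which backs the kernel's NX handling.
 */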
static void detect_facilities(void)
{
	if (test_facility(8)) {
		machine.has_edat1 = 1;
		local_ctl_set_bit(0, CR0_EDAT_BIT);
	}
	if (test_facility(78))
		machine.has_edat2 = 1;
	if (test_facility(130))
		machine.has_nx = 1;
}

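/*
 * Check whether the ESSA instruction (used for CMMA page state
 * handling) is available. A temporary program check PSW is installed
 * so that the program check taken on machines without ESSA lands at
 * label 1 and is reported via the return code instead of stopping the
 * boot: returns 0 if ESSA works, 1 otherwise.
 */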
static int cmma_test_essa(void)
{
	unsigned long reg1, reg2, tmp = 0;
	int rc = 1;
	psw_t old;

	/* Test ESSA_GET_STATE */
	asm volatile(
		" mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
		" epsw %[reg1],%[reg2]\n"
		" st %[reg1],0(%[psw_pgm])\n"
		" st %[reg2],4(%[psw_pgm])\n"
		" larl %[reg1],1f\n"
		" stg %[reg1],8(%[psw_pgm])\n"
		" .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n"
		" la %[rc],0\n"
		"1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
		: [reg1] "=&d" (reg1),
		  [reg2] "=&a" (reg2),
		  [rc] "+&d" (rc),
		  [tmp] "=&d" (tmp),
		  "+Q" (S390_lowcore.program_new_psw),
		  "=Q" (old)
		: [psw_old] "a" (&old),
		  [psw_pgm] "a" (&S390_lowcore.program_new_psw),
		  [cmd] "i" (ESSA_GET_STATE)
		: "cc", "memory");
	return rc;
}

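/*
 * Finalize the CMMA mode: disable CMMA altogether if ESSA is not
 * available, otherwise switch to the extended mode (cmma_flag == 2)
 * when facility 147 is installed.
 */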
static void cmma_init(void)
{
	if (!cmma_flag)
		return;
	if (cmma_test_essa()) {
		cmma_flag = 0;
		return;
	}
	if (test_facility(147))
		cmma_flag = 2;
}

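/*
 * Load the Linux magic value into the program parameter register via
 * the load-program-parameter facility (facility 40), so that CPU
 * measurement sampling data can be attributed to Linux.
 */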
static void setup_lpp(void)
{
	S390_lowcore.current_pid = 0;
	S390_lowcore.lpp = LPP_MAGIC;
	if (test_facility(40))
		lpp(&S390_lowcore.lpp);
}

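/*
 * With an uncompressed kernel there is no decompression heap or buffer;
 * the first address safe to use for allocations lies right behind the
 * kernel image, including its .bss.
 */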
#ifdef CONFIG_KERNEL_UNCOMPRESSED
unsigned long mem_safe_offset(void)
{
	return vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size;
}
#endif

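/*
 * Move the initrd to a freshly allocated area if it is not fully
 * contained within the [min, max] range, e.g. if it would be
 * overwritten by the decompressed kernel or lies above the identity
 * mapping limit.
 */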
static void rescue_initrd(unsigned long min, unsigned long max)
{
	unsigned long old_addr, addr, size;

	if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD))
		return;
	if (!get_physmem_reserved(RR_INITRD, &addr, &size))
		return;
	if (addr >= min && addr + size <= max)
		return;
	old_addr = addr;
	physmem_free(RR_INITRD);
	addr = physmem_alloc_top_down(RR_INITRD, size, 0);
	memmove((void *)addr, (void *)old_addr, size);
}

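/*
 * Copy the .boot.data and .boot.preserved.data sections, which were
 * filled in by the decompressor, over to the decompressed kernel image.
 */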
static void copy_bootdata(void)
{
	if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size)
		error(".boot.data section size mismatch");
	memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size);
	if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size)
		error(".boot.preserved.data section size mismatch");
	memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size);
}

#ifdef CONFIG_PIE_BUILD
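/*
 * For PIE builds apply the KASLR offset by processing the relocation
 * entries from the .rela.dyn section of the decompressed kernel.
 */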
static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
{
	Elf64_Rela *rela_start, *rela_end, *rela;
	int r_type, r_sym, rc;
	Elf64_Addr loc, val;
	Elf64_Sym *dynsym;

	rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start;
	rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end;
	dynsym = (Elf64_Sym *) vmlinux.dynsym_start;
	for (rela = rela_start; rela < rela_end; rela++) {
		loc = rela->r_offset + offset;
		val = rela->r_addend;
		r_sym = ELF64_R_SYM(rela->r_info);
		if (r_sym) {
			if (dynsym[r_sym].st_shndx != SHN_UNDEF)
				val += dynsym[r_sym].st_value + offset;
		} else {
			/*
			 * 0 == undefined symbol table index (STN_UNDEF),
			 * used for R_390_RELATIVE, only add KASLR offset
			 */
			val += offset;
		}
		r_type = ELF64_R_TYPE(rela->r_info);
		rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0);
		if (rc)
			error("Unknown relocation type");
	}
}

static void kaslr_adjust_got(unsigned long offset) {}
static void rescue_relocs(void) {}
static void free_relocs(void) {}
#else
static int *vmlinux_relocs_64_start;
static int *vmlinux_relocs_64_end;

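/*
 * The build-time relocation table lies within the decompressor image
 * and would risk being overwritten once that memory is reused, so copy
 * it to a safely allocated area first; free_relocs() releases that area
 * again after the relocations have been applied.
 */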
static void rescue_relocs(void)
{
	unsigned long size = __vmlinux_relocs_64_end - __vmlinux_relocs_64_start;

	vmlinux_relocs_64_start = (void *)physmem_alloc_top_down(RR_RELOC, size, 0);
	vmlinux_relocs_64_end = (void *)vmlinux_relocs_64_start + size;
	memmove(vmlinux_relocs_64_start, __vmlinux_relocs_64_start, size);
}

static void free_relocs(void)
{
	physmem_free(RR_RELOC);
}

static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
{
	int *reloc;
	long loc;

	/* Adjust R_390_64 relocations */
	for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) {
		loc = (long)*reloc + offset;
		if (loc < min_addr || loc > max_addr)
			error("64-bit relocation outside of kernel!\n");
		*(u64 *)loc += offset;
	}
}

static void kaslr_adjust_got(unsigned long offset)
{
	u64 *entry;

	/*
	 * Even without -fPIE, Clang still uses a global offset table for some
	 * reason. Adjust the GOT entries.
	 */
	for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++)
		*entry += offset;
}
#endif

/*
 * Merge information from several sources into a single ident_map_size value.
 * "ident_map_size" represents the upper limit of physical memory we may ever
 * reach. It might not be all online memory, but may also include standby
 * (offline) memory. "ident_map_size" could be lower than the actual standby
 * or even online memory present, due to limiting factors. We should never go
 * above this limit. It is the size of our identity mapping.
 *
 * Consider the following factors:
 * 1. max_physmem_end - end of physical memory online or standby.
 *    Always >= end of the last online memory range (get_physmem_online_end()).
 * 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the
 *    kernel is able to support.
 * 3. "mem=" kernel command line option which limits physical memory usage.
 * 4. OLDMEM_BASE which is a kdump memory limit when the kernel is executed as
 *    crash kernel.
 * 5. "hsa" size which is a memory limit when the kernel is executed during
 *    zfcp/nvme dump.
 */
static void setup_ident_map_size(unsigned long max_physmem_end)
{
	unsigned long hsa_size;

	ident_map_size = max_physmem_end;
	if (memory_limit)
		ident_map_size = min(ident_map_size, memory_limit);
	ident_map_size = min(ident_map_size, 1UL << MAX_PHYSMEM_BITS);

#ifdef CONFIG_CRASH_DUMP
	if (oldmem_data.start) {
		__kaslr_enabled = 0;
		ident_map_size = min(ident_map_size, oldmem_data.size);
	} else if (ipl_block_valid && is_ipl_block_dump()) {
		__kaslr_enabled = 0;
		if (!sclp_early_get_hsa_size(&hsa_size) && hsa_size)
			ident_map_size = min(ident_map_size, hsa_size);
	}
#endif
}

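/*
 * Lay out the kernel virtual address space from the top down: the
 * memcpy real area and absolute lowcore mapping sit right below the
 * highest usable address (vmax), followed by the module area and the
 * vmalloc area; the space below is split between the identity mapping
 * and the struct page (vmemmap) array. Returns the chosen asce_limit,
 * which determines whether 3 or 4 page table levels are used.
 */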
static unsigned long setup_kernel_memory_layout(void)
{
	unsigned long vmemmap_start;
	unsigned long asce_limit;
	unsigned long rte_size;
	unsigned long pages;
	unsigned long vsize;
	unsigned long vmax;

	pages = ident_map_size / PAGE_SIZE;
	/* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
	vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);

	/* choose kernel address space layout: 4 or 3 levels. */
	vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size +
		MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE;
	vsize = size_add(vsize, vmalloc_size);
	if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) {
		asce_limit = _REGION1_SIZE;
		rte_size = _REGION2_SIZE;
	} else {
		asce_limit = _REGION2_SIZE;
		rte_size = _REGION3_SIZE;
	}

	/*
	 * Force modules and the vmalloc area under the ultravisor secure
	 * storage limit, so that any vmalloc allocation we do can be used
	 * to back secure guest storage.
	 */
	vmax = adjust_to_uv_max(asce_limit);
#ifdef CONFIG_KASAN
	/* force vmalloc and modules below kasan shadow */
	vmax = min(vmax, KASAN_SHADOW_START);
#endif
	__memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE);
	__abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
				   sizeof(struct lowcore));
	MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
	MODULES_VADDR = MODULES_END - MODULES_LEN;
	VMALLOC_END = MODULES_VADDR;

	/* allow the vmalloc area to occupy up to 1/2 of the remaining virtual space */
	vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE);
	vmalloc_size = min(vmalloc_size, vsize);
	VMALLOC_START = VMALLOC_END - vmalloc_size;

	/* split remaining virtual space between 1:1 mapping & vmemmap array */
	pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
	pages = SECTION_ALIGN_UP(pages);
	/* keep vmemmap_start aligned to a top level region table entry */
	vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size);
	vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS);
	/* maximum mappable address as seen by arch_get_mappable_range() */
	max_mappable = vmemmap_start;
	/* make sure the identity map does not overlap with the vmemmap array */
	ident_map_size = min(ident_map_size, vmemmap_start);
	vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page);
	/* make sure the vmemmap array does not overlap with the vmalloc area */
	VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
	vmemmap = (struct page *)vmemmap_start;

	return asce_limit;
}

/*
 * This function clears the BSS section of the decompressed Linux kernel,
 * NOT the decompressor's.
 */
static void clear_bss_section(unsigned long vmlinux_lma)
{
	memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size);
}

/*
 * Set the vmalloc area size to an 8th of (potential) physical memory
 * size, unless the size has been set by kernel command line parameter.
 */
static void setup_vmalloc_size(void)
{
	unsigned long size;

	if (vmalloc_size_set)
		return;
	size = round_up(ident_map_size / 8, _SEGMENT_SIZE);
	vmalloc_size = max(size, vmalloc_size);
}

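/*
 * The vmlinux info struct contains addresses derived at build time from
 * the default load address; adjust all of them by the KASLR offset so
 * they point into the randomized image.
 */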
static void kaslr_adjust_vmlinux_info(unsigned long offset)
{
	*(unsigned long *)(&vmlinux.entry) += offset;
	vmlinux.bootdata_off += offset;
	vmlinux.bootdata_preserved_off += offset;
#ifdef CONFIG_PIE_BUILD
	vmlinux.rela_dyn_start += offset;
	vmlinux.rela_dyn_end += offset;
	vmlinux.dynsym_start += offset;
#else
	vmlinux.got_start += offset;
	vmlinux.got_end += offset;
#endif
	vmlinux.init_mm_off += offset;
	vmlinux.swapper_pg_dir_off += offset;
	vmlinux.invalid_pg_dir_off += offset;
#ifdef CONFIG_KASAN
	vmlinux.kasan_early_shadow_page_off += offset;
	vmlinux.kasan_early_shadow_pte_off += offset;
	vmlinux.kasan_early_shadow_pmd_off += offset;
	vmlinux.kasan_early_shadow_pud_off += offset;
	vmlinux.kasan_early_shadow_p4d_off += offset;
#endif
}

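/*
 * Entry point of the decompressor's C code. Sets up the physical and
 * virtual memory layout, applies KASLR, decompresses (or moves) the
 * kernel image, applies relocations, and finally jumps to the
 * decompressed kernel's entry point with DAT enabled.
 */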
void startup_kernel(void)
{
	unsigned long max_physmem_end;
	unsigned long vmlinux_lma = 0;
	unsigned long amode31_lma = 0;
	unsigned long asce_limit;
	unsigned long safe_addr;
	void *img;
	psw_t psw;

	setup_lpp();
	safe_addr = mem_safe_offset();

	/*
	 * Reserve decompressor memory together with decompression heap,
	 * buffer and memory which might be occupied by the uncompressed
	 * kernel at the default 1 MB position (if KASLR is off or failed).
	 */
	physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size)
		physmem_reserve(RR_INITRD, parmarea.initrd_start, parmarea.initrd_size);
	oldmem_data.start = parmarea.oldmem_base;
	oldmem_data.size = parmarea.oldmem_size;

	store_ipl_parmblock();
	read_ipl_report();
	uv_query_info();
	sclp_early_read_info();
	setup_boot_command_line();
	parse_boot_command_line();
	detect_facilities();
	cmma_init();
	sanitize_prot_virt_host();
	max_physmem_end = detect_max_physmem_end();
	setup_ident_map_size(max_physmem_end);
	setup_vmalloc_size();
	asce_limit = setup_kernel_memory_layout();
	/* got final ident_map_size, physmem allocations can be performed now */
	physmem_set_usable_limit(ident_map_size);
	detect_physmem_online_ranges(max_physmem_end);
	save_ipl_cert_comp_list();
	rescue_initrd(safe_addr, ident_map_size);
	rescue_relocs();

	if (kaslr_enabled()) {
		vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size,
						     THREAD_SIZE, vmlinux.default_lma,
						     ident_map_size);
		if (vmlinux_lma) {
			__kaslr_offset = vmlinux_lma - vmlinux.default_lma;
			kaslr_adjust_vmlinux_info(__kaslr_offset);
		}
	}
	vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma;
	physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size);

	if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) {
		img = decompress_kernel();
		memmove((void *)vmlinux_lma, img, vmlinux.image_size);
	} else if (__kaslr_offset) {
		img = (void *)vmlinux.default_lma;
		memmove((void *)vmlinux_lma, img, vmlinux.image_size);
		memset(img, 0, vmlinux.image_size);
	}

	/* vmlinux decompression is done, shrink reserved low memory */
	physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end);
	if (kaslr_enabled())
		amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G);
	amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size;
	physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size);

	/*
	 * The order of the following operations is important:
	 *
	 * - kaslr_adjust_relocs() must follow clear_bss_section() to establish
	 *   static memory references to data in .bss to be used by setup_vmem()
	 *   (i.e. init_mm.pgd)
	 *
	 * - setup_vmem() must follow kaslr_adjust_relocs() to be able to use
	 *   static memory references to data in .bss (i.e. init_mm.pgd)
	 *
	 * - copy_bootdata() must follow setup_vmem() to propagate changes
	 *   to bootdata made by setup_vmem()
	 */
	clear_bss_section(vmlinux_lma);
	kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, __kaslr_offset);
	kaslr_adjust_got(__kaslr_offset);
	free_relocs();
	setup_vmem(asce_limit);
	copy_bootdata();

	/*
	 * Save the KASLR offset for early dumps, before vmcore_info is set.
	 * Mark it as odd to distinguish it from a real vmcore_info pointer.
	 */
	S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0;

	/*
	 * Jump to the decompressed kernel entry point and switch DAT mode on.
	 */
	psw.addr = vmlinux.entry;
	psw.mask = PSW_KERNEL_BITS;
	__load_psw(psw);
}