// SPDX-License-Identifier: GPL-2.0

#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/kallsyms.h>
#include <linux/kcore.h>
#include <linux/pgtable.h>

#include <asm/cpu_entry_area.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
#include <asm/kasan.h>
#include <asm/setup.h>

static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);

static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);

static __always_inline unsigned int cea_offset(unsigned int cpu)
{
	return per_cpu(_cea_offset, cpu);
}

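/*
 * When KASLR is enabled, scatter the CPU to entry-area slot assignment
 * randomly instead of using the identity mapping, so a given CPU's
 * cpu_entry_area address is not predictable.
 */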
static __init void init_cea_offsets(void)
{
	unsigned int max_cea;
	unsigned int i, j;

	if (!kaslr_enabled()) {
		for_each_possible_cpu(i)
			per_cpu(_cea_offset, i) = i;
		return;
	}

	max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE;

	/* O(sodding terrible) */
	for_each_possible_cpu(i) {
		unsigned int cea;

again:
		cea = get_random_u32_below(max_cea);

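		/*
		 * Keep the offset only if no already-assigned CPU uses it;
		 * CPUs after @i have not been assigned yet, so stop the
		 * scan once it reaches @i itself.
		 */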
		for_each_possible_cpu(j) {
			if (cea_offset(j) == cea)
				goto again;

			if (i == j)
				break;
		}

		per_cpu(_cea_offset, i) = cea;
	}
}
#else /* !X86_64 */
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);

static __always_inline unsigned int cea_offset(unsigned int cpu)
{
	return cpu;
}
static inline void init_cea_offsets(void) { }
#endif

/* Is called from entry code, so must be noinstr */
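/*
 * The returned address is CPU_ENTRY_AREA_PER_CPU plus the CPU's slot times
 * CPU_ENTRY_AREA_SIZE; e.g. with KASLR off, CPU 2 lives at
 * CPU_ENTRY_AREA_PER_CPU + 2 * CPU_ENTRY_AREA_SIZE.
 */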
noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE;
	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);

	return (struct cpu_entry_area *) va;
}
EXPORT_SYMBOL(get_cpu_entry_area);

void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
	unsigned long va = (unsigned long) cea_vaddr;
	pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags);

	/*
	 * The cpu_entry_area is shared between the user and kernel
	 * page tables. All of its ptes can safely be global.
	 * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for
	 * non-present PTEs, so be careful not to set it in that
	 * case to avoid confusion.
	 */
	if (boot_cpu_has(X86_FEATURE_PGE) &&
	    (pgprot_val(flags) & _PAGE_PRESENT))
		pte = pte_set_flags(pte, _PAGE_GLOBAL);

	set_pte_vaddr(va, pte);
}

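/*
 * Map @pages per-CPU pages starting at @ptr into the entry area at
 * @cea_vaddr, one PTE per page.
 */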
static void __init
cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
{
	for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}

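/*
 * On Intel CPUs, map the per-CPU debug store into the entry area and set up
 * non-present PTEs for the debug store buffers so the page tables covering
 * them are populated up front.
 */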
static void __init percpu_setup_debug_store(unsigned int cpu)
{
#ifdef CONFIG_CPU_SUP_INTEL
	unsigned int npages;
	void *cea;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
	npages = sizeof(struct debug_store) / PAGE_SIZE;
	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
			     PAGE_KERNEL);

	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
	/*
	 * Force the population of PMDs for not yet allocated per cpu
	 * memory like debug store buffers.
	 */
	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
	for (; npages; npages--, cea += PAGE_SIZE)
		cea_set_pte(cea, 0, PAGE_NONE);
#endif
}

#ifdef CONFIG_X86_64

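/*
 * Map one named per-CPU exception stack into its slot in the entry area.
 * Expects 'cea', 'estacks' and 'npages' to be in scope at the expansion site.
 */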
#define cea_map_stack(name) do {					\
	npages = sizeof(estacks->name## _stack) / PAGE_SIZE;		\
	cea_map_percpu_pages(cea->estacks.name## _stack,		\
			estacks->name## _stack, npages, PAGE_KERNEL);	\
	} while (0)

static void __init percpu_setup_exception_stacks(unsigned int cpu)
{
	struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu);
	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
	unsigned int npages;

	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);

	per_cpu(cea_exception_stacks, cpu) = &cea->estacks;

	/*
	 * The exception stack mappings in the per-CPU area are protected
	 * by guard pages, so each stack must be mapped separately. DB2 is
	 * not mapped; it just exists to catch triple nesting of #DB.
	 */
	cea_map_stack(DF);
	cea_map_stack(NMI);
	cea_map_stack(DB);
	cea_map_stack(MCE);

	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
		if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
			cea_map_stack(VC);
			cea_map_stack(VC2);
		}
	}
}
#else
static inline void percpu_setup_exception_stacks(unsigned int cpu)
{
	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);

	cea_map_percpu_pages(&cea->doublefault_stack,
			     &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
}
#endif

/* Set up the cpu_entry_area mappings only once per CPU */
static void __init setup_cpu_entry_area(unsigned int cpu)
{
	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
#ifdef CONFIG_X86_64
	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
	pgprot_t gdt_prot = PAGE_KERNEL_RO;
	pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
	/*
	 * On 32-bit systems, the GDT cannot be read-only because
	 * our double fault handler uses a task gate, and entering through
	 * a task gate needs to change an available TSS to busy. If the
	 * GDT is read-only, that will triple fault. The TSS cannot be
	 * read-only because the CPU writes to it on task switches.
	 */
	pgprot_t gdt_prot = PAGE_KERNEL;
	pgprot_t tss_prot = PAGE_KERNEL;
#endif

	kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE,
					early_cpu_to_node(cpu));

	cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);

	cea_map_percpu_pages(&cea->entry_stack_page,
			     per_cpu_ptr(&entry_stack_storage, cpu), 1,
			     PAGE_KERNEL);

	/*
	 * The Intel SDM says (Volume 3, 7.2.1):
	 *
	 * Avoid placing a page boundary in the part of the TSS that the
	 * processor reads during a task switch (the first 104 bytes). The
	 * processor may not correctly perform address translations if a
	 * boundary occurs in this area. During a task switch, the processor
	 * reads and writes into the first 104 bytes of each TSS (using
	 * contiguous physical addresses beginning with the physical address
	 * of the first byte of the TSS). So, after TSS access begins, if
	 * part of the 104 bytes is not physically contiguous, the processor
	 * will access incorrect information without generating a page-fault
	 * exception.
	 *
	 * There are also a lot of errata involving the TSS spanning a page
	 * boundary. Assert that we're not doing that.
	 */
	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
	/*
	 * VMX changes the host TR limit to 0x67 after a VM exit. This is
	 * okay, since 0x67 covers the size of struct x86_hw_tss. Make sure
	 * that this is correct.
	 */
	BUILD_BUG_ON(offsetof(struct tss_struct, x86_tss) != 0);
	BUILD_BUG_ON(sizeof(struct x86_hw_tss) != 0x68);

	cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu),
			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);

#ifdef CONFIG_X86_32
	per_cpu(cpu_entry_area, cpu) = cea;
#endif

	percpu_setup_exception_stacks(cpu);

	percpu_setup_debug_store(cpu);
}

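/*
 * 32-bit only: populate the page tables covering the whole cpu_entry_area
 * map so the PTE slots exist before setup_cpu_entry_area() fills them in.
 */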
static __init void setup_cpu_entry_area_ptes(void)
{
#ifdef CONFIG_X86_32
	unsigned long start, end;

	/* The +1 is for the readonly IDT: */
	BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
	BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);

	start = CPU_ENTRY_AREA_BASE;
	end = start + CPU_ENTRY_AREA_MAP_SIZE;

	/* Careful here: start + PMD_SIZE might wrap around */
	for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
		populate_extra_pte(start);
#endif
}

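/*
 * Called once at boot: pick the per-CPU slots (randomized under KASLR),
 * populate the 32-bit page tables, then build every possible CPU's entry
 * area.
 */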
void __init setup_cpu_entry_areas(void)
{
	unsigned int cpu;

	init_cea_offsets();

	setup_cpu_entry_area_ptes();

	for_each_possible_cpu(cpu)
		setup_cpu_entry_area(cpu);

	/*
	 * This is the last essential update to swapper_pgdir which needs
	 * to be synchronized to initial_page_table on 32-bit.
	 */
	sync_initial_page_table();
}