// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 * Copyright (C) 2012 Regents of the University of California
 */


#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/uaccess.h>
#include <linux/kprobes.h>
#include <linux/kfence.h>
#include <linux/entry-common.h>

#include <asm/ptrace.h>
#include <asm/tlbflush.h>

#include "../kernel/head.h"

static void die_kernel_fault(const char *msg, unsigned long addr,
		struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", msg,
		 addr);

	bust_spinlocks(0);
	die(regs, "Oops");
	make_task_dead(SIGKILL);
}

static inline void no_context(struct pt_regs *regs, unsigned long addr)
{
	const char *msg;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	if (addr < PAGE_SIZE)
		msg = "NULL pointer dereference";
	else {
		if (kfence_handle_page_fault(addr, regs->cause == EXC_STORE_PAGE_FAULT, regs))
			return;

		msg = "paging request";
	}

	die_kernel_fault(msg, addr, regs);
}

static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault)
{
	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to userspace
		 * (which will retry the fault, or kill us if we got oom-killed).
		 */
		if (!user_mode(regs)) {
			no_context(regs, addr);
			return;
		}
		pagefault_out_of_memory();
		return;
	} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
		/* Kernel mode? Handle exceptions or die */
		if (!user_mode(regs)) {
			no_context(regs, addr);
			return;
		}
		do_trap(regs, SIGBUS, BUS_ADRERR, addr);
		return;
	}
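	/* Any other fault result is unexpected here. */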
	BUG();
}

static inline void
bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
{
	/*
	 * Something tried to access memory that isn't in our memory map.
	 * Fix it, but check if it's kernel or user first.
	 */
	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs)) {
		do_trap(regs, SIGSEGV, code, addr);
		return;
	}

	no_context(regs, addr);
}

static inline void
bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
	 unsigned long addr)
{
	mmap_read_unlock(mm);

	bad_area_nosemaphore(regs, code, addr);
}

static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
{
	pgd_t *pgd, *pgd_k;
	pud_t *pud_k;
	p4d_t *p4d_k;
	pmd_t *pmd_k;
	pte_t *pte_k;
	int index;
	unsigned long pfn;

	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs))
		return do_trap(regs, SIGSEGV, code, addr);

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "tsk->active_mm->pgd" here.
	 * We might be inside an interrupt in the middle
	 * of a task switch.
	 */
	index = pgd_index(addr);
	pfn = csr_read(CSR_SATP) & SATP_PPN;
	pgd = (pgd_t *)pfn_to_virt(pfn) + index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(pgdp_get(pgd_k))) {
		no_context(regs, addr);
		return;
	}
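	/* Copy the missing top-level entry from the kernel reference page table. */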
	set_pgd(pgd, pgdp_get(pgd_k));

	p4d_k = p4d_offset(pgd_k, addr);
	if (!p4d_present(p4dp_get(p4d_k))) {
		no_context(regs, addr);
		return;
	}

	pud_k = pud_offset(p4d_k, addr);
	if (!pud_present(pudp_get(pud_k))) {
		no_context(regs, addr);
		return;
	}
	if (pud_leaf(pudp_get(pud_k)))
		goto flush_tlb;

	/*
	 * Since the vmalloc area is global, it is unnecessary
	 * to copy individual PTEs
	 */
	pmd_k = pmd_offset(pud_k, addr);
	if (!pmd_present(pmdp_get(pmd_k))) {
		no_context(regs, addr);
		return;
	}
	if (pmd_leaf(pmdp_get(pmd_k)))
		goto flush_tlb;

	/*
	 * Make sure the actual PTE exists as well to
	 * catch kernel vmalloc-area accesses to non-mapped
	 * addresses. If we don't do this, this will just
	 * silently loop forever.
	 */
	pte_k = pte_offset_kernel(pmd_k, addr);
	if (!pte_present(ptep_get(pte_k))) {
		no_context(regs, addr);
		return;
	}

	/*
	 * The kernel assumes that TLBs don't cache invalid
	 * entries, but in RISC-V, SFENCE.VMA specifies an
	 * ordering constraint, not a cache flush; it is
	 * necessary even after writing invalid entries.
	 */
flush_tlb:
	local_flush_tlb_page(addr);
}

static inline bool access_error(unsigned long cause, struct vm_area_struct *vma)
{
	switch (cause) {
	case EXC_INST_PAGE_FAULT:
		if (!(vma->vm_flags & VM_EXEC))
			return true;
		break;
	case EXC_LOAD_PAGE_FAULT:
		/* Write implies read */
		if (!(vma->vm_flags & (VM_READ | VM_WRITE)))
			return true;
		break;
	case EXC_STORE_PAGE_FAULT:
		if (!(vma->vm_flags & VM_WRITE))
			return true;
		break;
	default:
		panic("%s: unhandled cause %lu", __func__, cause);
	}
	return false;
}

/*
 * This routine handles page faults. It determines the address and the
 * problem, and then passes it off to one of the appropriate routines.
 */
void handle_page_fault(struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	unsigned long addr, cause;
	unsigned int flags = FAULT_FLAG_DEFAULT;
	int code = SEGV_MAPERR;
	vm_fault_t fault;

	cause = regs->cause;
	addr = regs->badaddr;

	tsk = current;
	mm = tsk->mm;

	if (kprobe_page_fault(regs, cause))
		return;

	/*
	 * Fault-in kernel-space virtual memory on-demand.
	 * The 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if ((!IS_ENABLED(CONFIG_MMU) || !IS_ENABLED(CONFIG_64BIT)) &&
	    unlikely(addr >= VMALLOC_START && addr < VMALLOC_END)) {
		vmalloc_fault(regs, code, addr);
		return;
	}

	/* Enable interrupts if they were enabled in the parent context. */
	if (!regs_irqs_disabled(regs))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context, or are running
	 * in an atomic region, then we must not take the fault.
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		tsk->thread.bad_cause = cause;
		no_context(regs, addr);
		return;
	}

	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

	if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) {
		if (fixup_exception(regs))
			return;

		die_kernel_fault("access to user memory without uaccess routines", addr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	if (cause == EXC_STORE_PAGE_FAULT)
		flags |= FAULT_FLAG_WRITE;
	else if (cause == EXC_INST_PAGE_FAULT)
		flags |= FAULT_FLAG_INSTRUCTION;
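	/* Only user-mode faults try the per-VMA lock fast path below. */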
	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;

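	/* Fast path: try to handle the fault under the per-VMA lock without taking mmap_lock. */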
	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	if (unlikely(access_error(cause, vma))) {
		vma_end_read(vma);
		goto lock_mmap;
	}

	fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		flags |= FAULT_FLAG_TRIED;

	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			no_context(regs, addr);
		return;
	}
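	/* Fall back to the mmap_lock-protected slow path. */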
lock_mmap:

retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		tsk->thread.bad_cause = cause;
		bad_area_nosemaphore(regs, code, addr);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it.
	 */
	code = SEGV_ACCERR;

	if (unlikely(access_error(cause, vma))) {
		tsk->thread.bad_cause = cause;
		bad_area(regs, mm, code, addr);
		return;
	}

	/*
	 * If for any reason at all we could not handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, addr, flags, regs);

	/*
	 * If we need to retry but a fatal signal is pending, handle the
	 * signal first. We do not need to release the mmap_lock because it
	 * would already be released in __lock_page_or_retry in mm/filemap.c.
	 */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			no_context(regs, addr);
		return;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return;

	if (unlikely(fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;

		/*
		 * No need to mmap_read_unlock(mm) as we would
		 * have already released it in __lock_page_or_retry
		 * in mm/filemap.c.
		 */
		goto retry;
	}

	mmap_read_unlock(mm);

done:
	if (unlikely(fault & VM_FAULT_ERROR)) {
		tsk->thread.bad_cause = cause;
		mm_fault_error(regs, addr, fault);
		return;
	}
	return;
}