// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Kernel Probes Jump Optimization (Optprobes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 * Copyright (C) Hitachi Ltd., 2012
 */
#include <linux/kprobes.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/static_call.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/nospec-branch.h>

#include "common.h"

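/*
 * Recover the original instruction bytes at @addr when @addr falls inside
 * the jump-modified area of an optimized kprobe.  Returns @addr unchanged
 * if no such probe covers it, 0 if the kernel text cannot be read, or the
 * address of @buf holding the reconstructed bytes otherwise.
 */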
unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
	struct optimized_kprobe *op;
	struct kprobe *kp;
	long offs;
	int i;

	for (i = 0; i < JMP32_INSN_SIZE; i++) {
		kp = get_kprobe((void *)addr - i);
		/* This function only handles jump-optimized kprobes */
		if (kp && kprobe_optimized(kp)) {
			op = container_of(kp, struct optimized_kprobe, kp);
			/* If op is optimized or under unoptimizing */
			if (list_empty(&op->list) || optprobe_queued_unopt(op))
				goto found;
		}
	}

	return addr;
found:
	/*
	 * If the kprobe is optimized, the original bytes may have been
	 * overwritten by the jump destination address. In that case, the
	 * original bytes must be recovered from the op->optinsn.copied_insn
	 * buffer.
	 */
	if (copy_from_kernel_nofault(buf, (void *)addr,
			MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
		return 0UL;

	if (addr == (unsigned long)kp->addr) {
		buf[0] = kp->opcode;
		memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
	} else {
		offs = addr - (unsigned long)kp->addr - 1;
		memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
	}

	return (unsigned long)buf;
}

static void synthesize_clac(kprobe_opcode_t *addr)
{
	/*
	 * Can't be static_cpu_has() due to how objtool treats this feature bit.
	 * This isn't a fast path anyway.
	 */
	if (!boot_cpu_has(X86_FEATURE_SMAP))
		return;

	/* Replace the NOP3 with CLAC */
	addr[0] = 0x0f;
	addr[1] = 0x01;
	addr[2] = 0xca;
}

/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
	*addr++ = 0x48;
	*addr++ = 0xbf;
#else
	*addr++ = 0xb8;
#endif
	*(unsigned long *)addr = val;
}

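/*
 * Template for the out-of-line detour buffer.  arch_prepare_optimized_kprobe()
 * copies everything up to optprobe_template_end into an optinsn slot and then
 * patches the NOP placeholders: optprobe_template_clac becomes CLAC (if SMAP
 * is supported), optprobe_template_val becomes a move of the optimized_kprobe
 * pointer into the first argument register, and optprobe_template_call becomes
 * a relative call to optimized_callback().  The copied target instructions and
 * a jump back to the original code are appended after the template.
 */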
asm (
	".pushsection .rodata\n"
	"optprobe_template_func:\n"
	".global optprobe_template_entry\n"
	"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
	"	pushq $" __stringify(__KERNEL_DS) "\n"
	/* Save the 'sp - 8', this will be fixed later. */
	"	pushq %rsp\n"
	"	pushfq\n"
	".global optprobe_template_clac\n"
	"optprobe_template_clac:\n"
	ASM_NOP3
	SAVE_REGS_STRING
	"	movq %rsp, %rsi\n"
	".global optprobe_template_val\n"
	"optprobe_template_val:\n"
	ASM_NOP5
	ASM_NOP5
	".global optprobe_template_call\n"
	"optprobe_template_call:\n"
	ASM_NOP5
	/* Copy 'regs->flags' into 'regs->ss'. */
	"	movq 18*8(%rsp), %rdx\n"
	"	movq %rdx, 20*8(%rsp)\n"
	RESTORE_REGS_STRING
	/* Skip 'regs->flags' and 'regs->sp'. */
	"	addq $16, %rsp\n"
	/* And pop flags register from 'regs->ss'. */
	"	popfq\n"
#else /* CONFIG_X86_32 */
	"	pushl %ss\n"
	/* Save the 'sp - 4', this will be fixed later. */
	"	pushl %esp\n"
	"	pushfl\n"
	".global optprobe_template_clac\n"
	"optprobe_template_clac:\n"
	ASM_NOP3
	SAVE_REGS_STRING
	"	movl %esp, %edx\n"
	".global optprobe_template_val\n"
	"optprobe_template_val:\n"
	ASM_NOP5
	".global optprobe_template_call\n"
	"optprobe_template_call:\n"
	ASM_NOP5
	/* Copy 'regs->flags' into 'regs->ss'. */
	"	movl 14*4(%esp), %edx\n"
	"	movl %edx, 16*4(%esp)\n"
	RESTORE_REGS_STRING
	/* Skip 'regs->flags' and 'regs->sp'. */
	"	addl $8, %esp\n"
	/* And pop flags register from 'regs->ss'. */
	"	popfl\n"
#endif
	".global optprobe_template_end\n"
	"optprobe_template_end:\n"
	".popsection\n" );

void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);

#define TMPL_CLAC_IDX \
	((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
	((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
	((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
	((long)optprobe_template_end - (long)optprobe_template_entry)

/* Optimized kprobe callback function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
	/* This is possible if op is under delayed unoptimizing */
	if (kprobe_disabled(&op->kp))
		return;

	preempt_disable();
	if (kprobe_running()) {
		kprobes_inc_nmissed_count(&op->kp);
	} else {
		struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
		/* Adjust stack pointer */
		regs->sp += sizeof(long);
		/* Save skipped registers */
		regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
		regs->gs = 0;
#endif
		regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
		regs->orig_ax = ~0UL;

		__this_cpu_write(current_kprobe, &op->kp);
		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
		opt_pre_handler(&op->kp, regs);
		__this_cpu_write(current_kprobe, NULL);
	}
	preempt_enable();
}
NOKPROBE_SYMBOL(optimized_callback);

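/*
 * Copy at least JMP32_INSN_SIZE bytes of whole instructions from @src into
 * @dest, relocated so that they can run from @real.  Fails if any copied
 * instruction cannot be safely executed out of line, or if the range is
 * reserved by ftrace, alternatives, jump labels or static calls.
 */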
static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
	struct insn insn;
	int len = 0, ret;

	while (len < JMP32_INSN_SIZE) {
		ret = __copy_instruction(dest + len, src + len, real + len, &insn);
		if (!ret || !can_boost(&insn, src + len))
			return -EINVAL;
		len += ret;
	}
	/* Check whether the address range is reserved */
	if (ftrace_text_reserved(src, src + len - 1) ||
	    alternatives_text_reserved(src, src + len - 1) ||
	    jump_label_text_reserved(src, src + len - 1) ||
	    static_call_text_reserved(src, src + len - 1))
		return -EBUSY;

	return len;
}

/* Check whether insn is an indirect jump */
static int insn_is_indirect_jump(struct insn *insn)
{
	return ((insn->opcode.bytes[0] == 0xff &&
		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
}

/* Check whether insn jumps into the specified address range */
static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
{
	unsigned long target = 0;

	switch (insn->opcode.bytes[0]) {
	case 0xe0:	/* loopne */
	case 0xe1:	/* loope */
	case 0xe2:	/* loop */
	case 0xe3:	/* jcxz */
	case 0xe9:	/* near relative jump */
	case 0xeb:	/* short relative jump */
		break;
	case 0x0f:
		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
			break;
		return 0;
	default:
		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
			break;
		return 0;
	}
	target = (unsigned long)insn->next_byte + insn->immediate.value;

	return (start <= target && target <= start + len);
}

/* Decode the whole function to ensure that no instruction jumps into the target */
static int can_optimize(unsigned long paddr)
{
	unsigned long addr, size = 0, offset = 0;
	struct insn insn;
	kprobe_opcode_t buf[MAX_INSN_SIZE];

	/* Lookup symbol including addr */
	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
		return 0;

	/*
	 * Do not optimize in the entry code due to the unstable
	 * stack handling and registers setup.
	 */
	if (((paddr >= (unsigned long)__entry_text_start) &&
	     (paddr <  (unsigned long)__entry_text_end)))
		return 0;

	/* Check that there is enough space for a relative jump. */
	if (size - offset < JMP32_INSN_SIZE)
		return 0;

	/* Decode instructions */
	addr = paddr - offset;
	while (addr < paddr - offset + size) { /* Decode until function end */
		unsigned long recovered_insn;
		int ret;

		if (search_exception_tables(addr))
			/*
			 * Since some fixup code jumps into this function,
			 * we can't optimize a kprobe in this function.
			 */
			return 0;
		recovered_insn = recover_probed_instruction(buf, addr);
		if (!recovered_insn)
			return 0;

		ret = insn_decode_kernel(&insn, (void *)recovered_insn);
		if (ret < 0)
			return 0;
#ifdef CONFIG_KGDB
		/*
		 * If there is a dynamically installed kgdb sw breakpoint,
		 * this function should not be probed.
		 */
		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
		    kgdb_has_hit_break(addr))
			return 0;
#endif
		/* Recover address */
		insn.kaddr = (void *)addr;
		insn.next_byte = (void *)(addr + insn.length);
		/*
		 * Check that no instruction jumps into the target, either
		 * directly or indirectly.
		 *
		 * The indirect case is present to handle code with jump
		 * tables. When the kernel uses retpolines, the check should
		 * in theory additionally look for jumps to indirect thunks.
		 * However, a kernel built with retpolines or IBT has jump
		 * tables disabled, so the check can be skipped altogether.
		 */
		if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) &&
		    !IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
		    insn_is_indirect_jump(&insn))
			return 0;
		if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
					 DISP32_SIZE))
			return 0;
		addr += insn.length;
	}

	return 1;
}

/* Check whether the optimized_kprobe can actually be optimized. */
int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
	int i;
	struct kprobe *p;

	for (i = 1; i < op->optinsn.size; i++) {
		p = get_kprobe(op->kp.addr + i);
		if (p && !kprobe_disarmed(p))
			return -EEXIST;
	}

	return 0;
}

/* Check whether the addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
				 kprobe_opcode_t *addr)
{
	return (op->kp.addr <= addr &&
		op->kp.addr + op->optinsn.size > addr);
}

/* Free optimized instruction slot */
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
	u8 *slot = op->optinsn.insn;
	if (slot) {
		int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;

		/* Record the perf event before freeing the slot */
		if (dirty)
			perf_event_text_poke(slot, slot, len, NULL, 0);

		free_optinsn_slot(slot, dirty);
		op->optinsn.insn = NULL;
		op->optinsn.size = 0;
	}
}

void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
	__arch_remove_optimized_kprobe(op, 1);
}

/*
 * Copy the target instructions that are being replaced.
 * The target instructions MUST be relocatable (checked inside).
 * This is called when a new aggr(opt)probe is allocated or reused.
 */
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
				  struct kprobe *__unused)
{
	u8 *buf = NULL, *slot;
	int ret, len;
	long rel;

	if (!can_optimize((unsigned long)op->kp.addr))
		return -EILSEQ;

	buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	op->optinsn.insn = slot = get_optinsn_slot();
	if (!slot) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Verify that the address gap is within the 2GB range, because
	 * this uses a relative jump.
	 */
	rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
	if (abs(rel) > 0x7fffffff) {
		ret = -ERANGE;
		goto err;
	}

	/* Copy arch-dep-instance from template */
	memcpy(buf, optprobe_template_entry, TMPL_END_IDX);

	/* Copy instructions into the out-of-line buffer */
	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
					  slot + TMPL_END_IDX);
	if (ret < 0)
		goto err;
	op->optinsn.size = ret;
	len = TMPL_END_IDX + op->optinsn.size;

	synthesize_clac(buf + TMPL_CLAC_IDX);

	/* Set probe information */
	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);

	/* Set probe function call */
	synthesize_relcall(buf + TMPL_CALL_IDX,
			   slot + TMPL_CALL_IDX, optimized_callback);

	/* Set returning jmp instruction at the tail of the out-of-line buffer */
	synthesize_reljump(buf + len, slot + len,
			   (u8 *)op->kp.addr + op->optinsn.size);
	len += JMP32_INSN_SIZE;

	/*
	 * Note: len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
	 * used in __arch_remove_optimized_kprobe().
	 */

	/* We have to use text_poke() for the instruction buffer because it is RO */
	perf_event_text_poke(slot, NULL, 0, buf, len);
	text_poke(slot, buf, len);

	ret = 0;
out:
	kfree(buf);
	return ret;

err:
	__arch_remove_optimized_kprobe(op, 0);
	goto out;
}

/*
 * Replace breakpoints (INT3) with relative jumps (JMP.d32).
 * Caller must hold kprobe_mutex and text_mutex.
 *
 * The caller will have installed a regular kprobe and after that issued
 * synchronize_rcu_tasks(); this ensures that the instruction(s) that live in
 * the 4 bytes after the INT3 are unused and can now be overwritten.
 */
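/*
 * For example, with op->kp.addr == A and op->optinsn.insn == D, the bytes
 * poked at A below are 0xE9 followed by the 32-bit displacement
 * D - (A + JMP32_INSN_SIZE), i.e. relative to the end of the jump instruction.
 */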
void arch_optimize_kprobes(struct list_head *oplist)
{
	struct optimized_kprobe *op, *tmp;
	u8 insn_buff[JMP32_INSN_SIZE];

	list_for_each_entry_safe(op, tmp, oplist, list) {
		s32 rel = (s32)((long)op->optinsn.insn -
			((long)op->kp.addr + JMP32_INSN_SIZE));

		WARN_ON(kprobe_disabled(&op->kp));

		/* Backup instructions which will be replaced by jump address */
		memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
		       DISP32_SIZE);

		insn_buff[0] = JMP32_INSN_OPCODE;
		*(s32 *)(&insn_buff[1]) = rel;

		text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);

		list_del_init(&op->list);
	}
}

/*
 * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
 *
 * After that, we can restore the 4 bytes after the INT3 to undo what
 * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
 * unused once the INT3 lands.
 */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
	u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
	u8 old[JMP32_INSN_SIZE];
	u8 *addr = op->kp.addr;

	memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
	memcpy(new + INT3_INSN_SIZE,
	       op->optinsn.copied_insn,
	       JMP32_INSN_SIZE - INT3_INSN_SIZE);

	text_poke(addr, new, INT3_INSN_SIZE);
	text_poke_sync();
	text_poke(addr + INT3_INSN_SIZE,
		  new + INT3_INSN_SIZE,
		  JMP32_INSN_SIZE - INT3_INSN_SIZE);
	text_poke_sync();

	perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}

/*
 * Recover original instructions and breakpoints from relative jumps.
 * Caller must hold kprobe_mutex.
 */
extern void arch_unoptimize_kprobes(struct list_head *oplist,
				    struct list_head *done_list)
{
	struct optimized_kprobe *op, *tmp;

	list_for_each_entry_safe(op, tmp, oplist, list) {
		arch_unoptimize_kprobe(op);
		list_move(&op->list, done_list);
	}
}

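/*
 * If @p is flagged as optimized, resume execution at the copied instructions
 * in the out-of-line detour buffer (skipping the template prologue) instead
 * of single-stepping; this typically covers the window in which the INT3 is
 * still in place while the jump is being installed or removed.
 */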
int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
	struct optimized_kprobe *op;

	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
		/* This kprobe is really able to run the optimized path. */
		op = container_of(p, struct optimized_kprobe, kp);
		/* Detour through copied instructions */
		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
		if (!reenter)
			reset_current_kprobe();
		return 1;
	}
	return 0;
}
NOKPROBE_SYMBOL(setup_detour_execution);