1// SPDX-License-Identifier: GPL-2.0-only
2#define pr_fmt(fmt) "SMP alternatives: " fmt
3
4#include <linux/mmu_context.h>
5#include <linux/perf_event.h>
6#include <linux/vmalloc.h>
7#include <linux/memory.h>
8#include <linux/execmem.h>
9
10#include <asm/text-patching.h>
11#include <asm/insn.h>
12#include <asm/ibt.h>
13#include <asm/set_memory.h>
14#include <asm/nmi.h>
15
16int __read_mostly alternatives_patched;
17
18EXPORT_SYMBOL_GPL(alternatives_patched);
19
20#define MAX_PATCH_LEN (255-1)
21
22#define DA_ALL (~0)
23#define DA_ALT 0x01
24#define DA_RET 0x02
25#define DA_RETPOLINE 0x04
26#define DA_ENDBR 0x08
27#define DA_SMP 0x10
28
29static unsigned int debug_alternative;
30
31static int __init debug_alt(char *str)
32{
33 if (str && *str == '=')
34 str++;
35
	if (!str || kstrtouint(str, 0, &debug_alternative))
37 debug_alternative = DA_ALL;
38
39 return 1;
40}
41__setup("debug-alternative", debug_alt);
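/*
 * Example (illustrative values): booting with "debug-alternative=0x06"
 * sets debug_alternative = DA_RET | DA_RETPOLINE, so only the return and
 * retpoline patching paths emit DPRINTK/DUMP_BYTES output, while a bare
 * "debug-alternative" selects DA_ALL.
 */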
42
43static int noreplace_smp;
44
45static int __init setup_noreplace_smp(char *str)
46{
47 noreplace_smp = 1;
48 return 1;
49}
50__setup("noreplace-smp", setup_noreplace_smp);
51
52#define DPRINTK(type, fmt, args...) \
53do { \
54 if (debug_alternative & DA_##type) \
55 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
56} while (0)
57
58#define DUMP_BYTES(type, buf, len, fmt, args...) \
59do { \
60 if (unlikely(debug_alternative & DA_##type)) { \
61 int j; \
62 \
63 if (!(len)) \
64 break; \
65 \
66 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
67 for (j = 0; j < (len) - 1; j++) \
68 printk(KERN_CONT "%02hhx ", buf[j]); \
69 printk(KERN_CONT "%02hhx\n", buf[j]); \
70 } \
71} while (0)
72
73static const unsigned char x86nops[] =
74{
75 BYTES_NOP1,
76 BYTES_NOP2,
77 BYTES_NOP3,
78 BYTES_NOP4,
79 BYTES_NOP5,
80 BYTES_NOP6,
81 BYTES_NOP7,
82 BYTES_NOP8,
83#ifdef CONFIG_64BIT
84 BYTES_NOP9,
85 BYTES_NOP10,
86 BYTES_NOP11,
87#endif
88};
89
90const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
91{
92 NULL,
93 x86nops,
94 x86nops + 1,
95 x86nops + 1 + 2,
96 x86nops + 1 + 2 + 3,
97 x86nops + 1 + 2 + 3 + 4,
98 x86nops + 1 + 2 + 3 + 4 + 5,
99 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
100 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
101#ifdef CONFIG_64BIT
102 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
103 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
104 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
105#endif
106};
107
108#ifdef CONFIG_FINEIBT
109static bool cfi_paranoid __ro_after_init;
110#endif
111
112#ifdef CONFIG_MITIGATION_ITS
113
114#ifdef CONFIG_MODULES
115static struct module *its_mod;
116#endif
117static void *its_page;
118static unsigned int its_offset;
119
120/* Initialize a thunk with the "jmp *reg; int3" instructions. */
121static void *its_init_thunk(void *thunk, int reg)
122{
123 u8 *bytes = thunk;
124 int offset = 0;
125 int i = 0;
126
127#ifdef CONFIG_FINEIBT
128 if (cfi_paranoid) {
129 /*
		 * When ITS uses an indirect branch thunk, the fineibt_paranoid
131 * caller sequence doesn't fit in the caller site. So put the
132 * remaining part of the sequence (<ea> + JNE) into the ITS
133 * thunk.
134 */
135 bytes[i++] = 0xea; /* invalid instruction */
136 bytes[i++] = 0x75; /* JNE */
137 bytes[i++] = 0xfd;
138
139 offset = 1;
140 }
141#endif
142
143 if (reg >= 8) {
144 bytes[i++] = 0x41; /* REX.B prefix */
145 reg -= 8;
146 }
147 bytes[i++] = 0xff;
148 bytes[i++] = 0xe0 + reg; /* jmp *reg */
149 bytes[i++] = 0xcc;
150
151 return thunk + offset;
152}
153
154#ifdef CONFIG_MODULES
155void its_init_mod(struct module *mod)
156{
157 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
158 return;
159
160 mutex_lock(&text_mutex);
161 its_mod = mod;
162 its_page = NULL;
163}
164
165void its_fini_mod(struct module *mod)
166{
167 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
168 return;
169
170 WARN_ON_ONCE(its_mod != mod);
171
172 its_mod = NULL;
173 its_page = NULL;
	mutex_unlock(&text_mutex);
175
176 for (int i = 0; i < mod->its_num_pages; i++) {
177 void *page = mod->its_page_array[i];
		execmem_restore_rox(page, PAGE_SIZE);
179 }
180}
181
182void its_free_mod(struct module *mod)
183{
184 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
185 return;
186
187 for (int i = 0; i < mod->its_num_pages; i++) {
188 void *page = mod->its_page_array[i];
		execmem_free(page);
	}
	kfree(mod->its_page_array);
192}
193#endif /* CONFIG_MODULES */
194
195static void *its_alloc(void)
196{
	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
198
199 if (!page)
200 return NULL;
201
202#ifdef CONFIG_MODULES
203 if (its_mod) {
204 void *tmp = krealloc(its_mod->its_page_array,
205 (its_mod->its_num_pages+1) * sizeof(void *),
206 GFP_KERNEL);
207 if (!tmp)
208 return NULL;
209
210 its_mod->its_page_array = tmp;
211 its_mod->its_page_array[its_mod->its_num_pages++] = page;
212
		execmem_make_temp_rw(page, PAGE_SIZE);
214 }
215#endif /* CONFIG_MODULES */
216
217 return no_free_ptr(page);
218}
219
220static void *its_allocate_thunk(int reg)
221{
222 int size = 3 + (reg / 8);
223 void *thunk;
224
225#ifdef CONFIG_FINEIBT
226 /*
227 * The ITS thunk contains an indirect jump and an int3 instruction so
228 * its size is 3 or 4 bytes depending on the register used. If CFI
229 * paranoid is used then 3 extra bytes are added in the ITS thunk to
230 * complete the fineibt_paranoid caller sequence.
231 */
232 if (cfi_paranoid)
233 size += 3;
234#endif
235
236 if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
237 its_page = its_alloc();
238 if (!its_page) {
239 pr_err("ITS page allocation failed\n");
240 return NULL;
241 }
242 memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
243 its_offset = 32;
244 }
245
246 /*
247 * If the indirect branch instruction will be in the lower half
248 * of a cacheline, then update the offset to reach the upper half.
249 */
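	/*
	 * Worked example: with its_offset == 61 and a 4-byte thunk the last
	 * byte would land at offset 64, i.e. in the lower half of the next
	 * cacheline, so the offset is bumped to ((61 - 1) | 0x3F) + 33 == 96,
	 * which is exactly the upper-half boundary (96 % 64 == 32).
	 */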
250 if ((its_offset + size - 1) % 64 < 32)
251 its_offset = ((its_offset - 1) | 0x3F) + 33;
252
253 thunk = its_page + its_offset;
254 its_offset += size;
255
256 return its_init_thunk(thunk, reg);
257}
258
259u8 *its_static_thunk(int reg)
260{
261 u8 *thunk = __x86_indirect_its_thunk_array[reg];
262
263#ifdef CONFIG_FINEIBT
264 /* Paranoid thunk starts 2 bytes before */
265 if (cfi_paranoid)
266 return thunk - 2;
267#endif
268 return thunk;
269}
270
271#endif
272
273/*
274 * Nomenclature for variable names to simplify and clarify this code and ease
275 * any potential staring at it:
276 *
277 * @instr: source address of the original instructions in the kernel text as
278 * generated by the compiler.
279 *
280 * @buf: temporary buffer on which the patching operates. This buffer is
281 * eventually text-poked into the kernel image.
282 *
283 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
284 * in the .altinstr_replacement section.
285 */
286
287/*
288 * Fill the buffer with a single effective instruction of size @len.
289 *
290 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
291 * for every single-byte NOP, try to generate the maximally available NOP of
 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
 * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3s and
 * *jump* over them instead of executing long and daft NOPs.
295 */
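/*
 * E.g. filling 20 bytes on a 64-bit kernel (ASM_NOP_MAX == 11) emits a
 * 2-byte JMP8 over 18 bytes of INT3 padding rather than chaining NOPs:
 *
 *	eb 12			jmp    +18
 *	cc cc ... cc		int3 x 18
 */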
296static void add_nop(u8 *buf, unsigned int len)
297{
298 u8 *target = buf + len;
299
300 if (!len)
301 return;
302
303 if (len <= ASM_NOP_MAX) {
304 memcpy(buf, x86_nops[len], len);
305 return;
306 }
307
308 if (len < 128) {
		__text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
310 buf += JMP8_INSN_SIZE;
311 } else {
		__text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
313 buf += JMP32_INSN_SIZE;
314 }
315
316 for (;buf < target; buf++)
317 *buf = INT3_INSN_OPCODE;
318}
319
320/*
321 * Matches NOP and NOPL, not any of the other possible NOPs.
322 */
323static bool insn_is_nop(struct insn *insn)
324{
325 /* Anything NOP, but no REP NOP */
326 if (insn->opcode.bytes[0] == 0x90 &&
327 (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
328 return true;
329
330 /* NOPL */
331 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
332 return true;
333
334 /* TODO: more nops */
335
336 return false;
337}
338
339/*
340 * Find the offset of the first non-NOP instruction starting at @offset
341 * but no further than @len.
342 */
343static int skip_nops(u8 *buf, int offset, int len)
344{
345 struct insn insn;
346
347 for (; offset < len; offset += insn.length) {
348 if (insn_decode_kernel(&insn, &buf[offset]))
349 break;
350
		if (!insn_is_nop(&insn))
352 break;
353 }
354
355 return offset;
356}
357
358/*
359 * "noinline" to cause control flow change and thus invalidate I$ and
360 * cause refetch after modification.
361 */
362static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
363{
364 for (int next, i = 0; i < len; i = next) {
365 struct insn insn;
366
367 if (insn_decode_kernel(&insn, &buf[i]))
368 return;
369
370 next = i + insn.length;
371
		if (insn_is_nop(&insn)) {
373 int nop = i;
374
375 /* Has the NOP already been optimized? */
376 if (i + insn.length == len)
377 return;
378
			next = skip_nops(buf, next, len);

			add_nop(buf + nop, next - nop);
382 DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
383 }
384 }
385}
386
387/*
388 * In this context, "source" is where the instructions are placed in the
389 * section .altinstr_replacement, for example during kernel build by the
390 * toolchain.
391 * "Destination" is where the instructions are being patched in by this
392 * machinery.
393 *
394 * The source offset is:
395 *
396 * src_imm = target - src_next_ip (1)
397 *
398 * and the target offset is:
399 *
400 * dst_imm = target - dst_next_ip (2)
401 *
402 * so rework (1) as an expression for target like:
403 *
404 * target = src_imm + src_next_ip (1a)
405 *
406 * and substitute in (2) to get:
407 *
408 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
409 *
410 * Now, since the instruction stream is 'identical' at src and dst (it
411 * is being copied after all) it can be stated that:
412 *
413 * src_next_ip = src + ip_offset
414 * dst_next_ip = dst + ip_offset (4)
415 *
416 * Substitute (4) in (3) and observe ip_offset being cancelled out to
417 * obtain:
418 *
419 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
420 * = src_imm + src - dst + ip_offset - ip_offset
421 * = src_imm + src - dst (5)
422 *
423 * IOW, only the relative displacement of the code block matters.
424 */
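/*
 * Worked example for (5), with made-up addresses: a CALL in the replacement
 * at src == 0x1000 targeting 0x5000 has src_imm == 0x3ffb (0x5000 - 0x1005).
 * Copied to dst == 0x2000, the new immediate becomes
 * src_imm + src - dst == 0x3ffb + 0x1000 - 0x2000 == 0x2ffb, and indeed
 * 0x2005 + 0x2ffb == 0x5000 -- the original target is preserved.
 */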
425
426#define apply_reloc_n(n_, p_, d_) \
427 do { \
428 s32 v = *(s##n_ *)(p_); \
429 v += (d_); \
430 BUG_ON((v >> 31) != (v >> (n_-1))); \
431 *(s##n_ *)(p_) = (s##n_)v; \
432 } while (0)
433
434
435static __always_inline
436void apply_reloc(int n, void *ptr, uintptr_t diff)
437{
438 switch (n) {
439 case 1: apply_reloc_n(8, ptr, diff); break;
440 case 2: apply_reloc_n(16, ptr, diff); break;
441 case 4: apply_reloc_n(32, ptr, diff); break;
442 default: BUG();
443 }
444}
445
446static __always_inline
447bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
448{
449 u8 *target = src + offset;
450 /*
451 * If the target is inside the patched block, it's relative to the
452 * block itself and does not need relocation.
453 */
454 return (target < src || target > src + src_len);
455}
456
457static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
458{
459 for (int next, i = 0; i < instrlen; i = next) {
460 struct insn insn;
461
462 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
463 return;
464
465 next = i + insn.length;
466
467 switch (insn.opcode.bytes[0]) {
468 case 0x0f:
469 if (insn.opcode.bytes[1] < 0x80 ||
470 insn.opcode.bytes[1] > 0x8f)
471 break;
472
473 fallthrough; /* Jcc.d32 */
474 case 0x70 ... 0x7f: /* Jcc.d8 */
475 case JMP8_INSN_OPCODE:
476 case JMP32_INSN_OPCODE:
477 case CALL_INSN_OPCODE:
			if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
				apply_reloc(insn.immediate.nbytes,
					    buf + i + insn_offset_immediate(&insn),
					    repl - instr);
482 }
483
484 /*
485 * Where possible, convert JMP.d32 into JMP.d8.
486 */
487 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
488 s32 imm = insn.immediate.value;
489 imm += repl - instr;
490 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
491 if ((imm >> 31) == (imm >> 7)) {
492 buf[i+0] = JMP8_INSN_OPCODE;
493 buf[i+1] = (s8)imm;
494
495 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
496 }
497 }
498 break;
499 }
500
		if (insn_rip_relative(&insn)) {
			if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
				apply_reloc(insn.displacement.nbytes,
					    buf + i + insn_offset_displacement(&insn),
					    repl - instr);
506 }
507 }
508 }
509}
510
511void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
512{
513 __apply_relocation(buf, instr, instrlen, repl, repl_len);
	optimize_nops(instr, buf, instrlen);
515}
516
517/* Low-level backend functions usable from alternative code replacements. */
518DEFINE_ASM_FUNC(nop_func, "", .entry.text);
519EXPORT_SYMBOL_GPL(nop_func);
520
521noinstr void BUG_func(void)
522{
523 BUG();
524}
525EXPORT_SYMBOL(BUG_func);
526
527#define CALL_RIP_REL_OPCODE 0xff
528#define CALL_RIP_REL_MODRM 0x15
529
530/*
531 * Rewrite the "call BUG_func" replacement to point to the target of the
532 * indirect pv_ops call "call *disp(%ip)".
533 */
534static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
535{
536 void *target, *bug = &BUG_func;
537 s32 disp;
538
539 if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
540 pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
541 BUG();
542 }
543
544 if (a->instrlen != 6 ||
545 instr[0] != CALL_RIP_REL_OPCODE ||
546 instr[1] != CALL_RIP_REL_MODRM) {
547 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
548 BUG();
549 }
550
551 /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
552 disp = *(s32 *)(instr + 2);
553#ifdef CONFIG_X86_64
554 /* ff 15 00 00 00 00 call *0x0(%rip) */
555 /* target address is stored at "next instruction + disp". */
556 target = *(void **)(instr + a->instrlen + disp);
557#else
558 /* ff 15 00 00 00 00 call *0x0 */
559 /* target address is stored at disp. */
560 target = *(void **)disp;
561#endif
562 if (!target)
563 target = bug;
564
565 /* (BUG_func - .) + (target - BUG_func) := target - . */
566 *(s32 *)(insn_buff + 1) += target - bug;
567
568 if (target == &nop_func)
569 return 0;
570
571 return 5;
572}
573
574static inline u8 * instr_va(struct alt_instr *i)
575{
576 return (u8 *)&i->instr_offset + i->instr_offset;
577}
578
579/*
580 * Replace instructions with better alternatives for this CPU type. This runs
581 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
583 * the boot processor are not handled. Tough. Make sure you disable such
584 * features by hand.
585 *
586 * Marked "noinline" to cause control flow change and thus insn cache
587 * to refetch changed I$ lines.
588 */
589void __init_or_module noinline apply_alternatives(struct alt_instr *start,
590 struct alt_instr *end)
591{
592 u8 insn_buff[MAX_PATCH_LEN];
593 u8 *instr, *replacement;
594 struct alt_instr *a, *b;
595
596 DPRINTK(ALT, "alt table %px, -> %px", start, end);
597
598 /*
599 * KASAN_SHADOW_START is defined using
600 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
601 * During the process, KASAN becomes confused seeing partial LA57
602 * conversion and triggers a false-positive out-of-bound report.
603 *
604 * Disable KASAN until the patching is complete.
605 */
606 kasan_disable_current();
607
608 /*
609 * The scan order should be from start to end. A later scanned
610 * alternative code can overwrite previously scanned alternative code.
611 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
612 * patch code.
613 *
614 * So be careful if you want to change the scan order to any other
615 * order.
616 */
617 for (a = start; a < end; a++) {
618 int insn_buff_sz = 0;
619
620 /*
621 * In case of nested ALTERNATIVE()s the outer alternative might
622 * add more padding. To ensure consistent patching find the max
623 * padding for all alt_instr entries for this site (nested
624 * alternatives result in consecutive entries).
625 */
		for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
627 u8 len = max(a->instrlen, b->instrlen);
628 a->instrlen = b->instrlen = len;
629 }
630
		instr = instr_va(a);
632 replacement = (u8 *)&a->repl_offset + a->repl_offset;
633 BUG_ON(a->instrlen > sizeof(insn_buff));
634 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
635
636 /*
637 * Patch if either:
638 * - feature is present
639 * - feature not present but ALT_FLAG_NOT is set to mean,
640 * patch if feature is *NOT* present.
641 */
642 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
643 memcpy(insn_buff, instr, a->instrlen);
			optimize_nops(instr, insn_buff, a->instrlen);
			text_poke_early(instr, insn_buff, a->instrlen);
646 continue;
647 }
648
649 DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
650 a->cpuid >> 5,
651 a->cpuid & 0x1f,
652 instr, instr, a->instrlen,
653 replacement, a->replacementlen, a->flags);
654
655 memcpy(insn_buff, replacement, a->replacementlen);
656 insn_buff_sz = a->replacementlen;
657
658 if (a->flags & ALT_FLAG_DIRECT_CALL) {
659 insn_buff_sz = alt_replace_call(instr, insn_buff, a);
660 if (insn_buff_sz < 0)
661 continue;
662 }
663
664 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
665 insn_buff[insn_buff_sz] = 0x90;
666
		text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
668
669 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
670 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
671 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
672
		text_poke_early(instr, insn_buff, insn_buff_sz);
674 }
675
676 kasan_enable_current();
677}
678
679static inline bool is_jcc32(struct insn *insn)
680{
681 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
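	/* e.g. 0f 84 <rel32> is JE.d32 and 0f 8f <rel32> is JG.d32 */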
682 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
683}
684
685#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
686
687/*
688 * CALL/JMP *%\reg
689 */
690static int emit_indirect(int op, int reg, u8 *bytes)
691{
692 int i = 0;
693 u8 modrm;
694
695 switch (op) {
696 case CALL_INSN_OPCODE:
697 modrm = 0x10; /* Reg = 2; CALL r/m */
698 break;
699
700 case JMP32_INSN_OPCODE:
701 modrm = 0x20; /* Reg = 4; JMP r/m */
702 break;
703
704 default:
705 WARN_ON_ONCE(1);
706 return -1;
707 }
708
709 if (reg >= 8) {
710 bytes[i++] = 0x41; /* REX.B prefix */
711 reg -= 8;
712 }
713
714 modrm |= 0xc0; /* Mod = 3 */
715 modrm += reg;
716
717 bytes[i++] = 0xff; /* opcode */
718 bytes[i++] = modrm;
719
720 return i;
721}
722
723static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
724 void *call_dest, void *jmp_dest)
725{
726 u8 op = insn->opcode.bytes[0];
727 int i = 0;
728
729 /*
730 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
731 * tail-calls. Deal with them.
732 */
733 if (is_jcc32(insn)) {
734 bytes[i++] = op;
735 op = insn->opcode.bytes[1];
736 goto clang_jcc;
737 }
738
739 if (insn->length == 6)
740 bytes[i++] = 0x2e; /* CS-prefix */
741
742 switch (op) {
743 case CALL_INSN_OPCODE:
		__text_gen_insn(bytes+i, op, addr+i,
				call_dest,
				CALL_INSN_SIZE);
747 i += CALL_INSN_SIZE;
748 break;
749
750 case JMP32_INSN_OPCODE:
751clang_jcc:
		__text_gen_insn(bytes+i, op, addr+i,
				jmp_dest,
				JMP32_INSN_SIZE);
755 i += JMP32_INSN_SIZE;
756 break;
757
758 default:
759 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
760 return -1;
761 }
762
763 WARN_ON_ONCE(i != insn->length);
764
765 return i;
766}
767
768static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
769{
	return __emit_trampoline(addr, insn, bytes,
				 __x86_indirect_call_thunk_array[reg],
				 __x86_indirect_jump_thunk_array[reg]);
773}
774
775#ifdef CONFIG_MITIGATION_ITS
776static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
777{
778 u8 *thunk = __x86_indirect_its_thunk_array[reg];
779 u8 *tmp = its_allocate_thunk(reg);
780
781 if (tmp)
782 thunk = tmp;
783
	return __emit_trampoline(addr, insn, bytes, thunk, thunk);
785}
786
787/* Check if an indirect branch is at ITS-unsafe address */
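/*
 * E.g. a 3-byte "jmp *%r11" whose last byte sits at cacheline offset 0x1f
 * (lower half) wants the ITS thunk; one whose last byte lands at offset
 * 0x20 or later in the line (upper half) does not.
 */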
788static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
789{
790 if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
791 return false;
792
793 /* Indirect branch opcode is 2 or 3 bytes depending on reg */
794 addr += 1 + reg / 8;
795
796 /* Lower-half of the cacheline? */
797 return !(addr & 0x20);
798}
799#else /* CONFIG_MITIGATION_ITS */
800
801#ifdef CONFIG_FINEIBT
802static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
803{
804 return false;
805}
806#endif
807
808#endif /* CONFIG_MITIGATION_ITS */
809
810/*
811 * Rewrite the compiler generated retpoline thunk calls.
812 *
813 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
814 * indirect instructions, avoiding the extra indirection.
815 *
816 * For example, convert:
817 *
818 * CALL __x86_indirect_thunk_\reg
819 *
820 * into:
821 *
822 * CALL *%\reg
823 *
824 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
825 */
826static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
827{
828 retpoline_thunk_t *target;
829 int reg, ret, i = 0;
830 u8 op, cc;
831
832 target = addr + insn->length + insn->immediate.value;
833 reg = target - __x86_indirect_thunk_array;
834
835 if (WARN_ON_ONCE(reg & ~0xf))
836 return -1;
837
838 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
839 BUG_ON(reg == 4);
840
841 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
842 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
843 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
844 return emit_call_track_retpoline(addr, insn, reg, bytes);
845
846 return -1;
847 }
848
849 op = insn->opcode.bytes[0];
850
851 /*
852 * Convert:
853 *
854 * Jcc.d32 __x86_indirect_thunk_\reg
855 *
856 * into:
857 *
858 * Jncc.d8 1f
859 * [ LFENCE ]
860 * JMP *%\reg
861 * [ NOP ]
862 * 1:
863 */
864 if (is_jcc32(insn)) {
865 cc = insn->opcode.bytes[1] & 0xf;
866 cc ^= 1; /* invert condition */
867
868 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
869 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
870
871 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
872 op = JMP32_INSN_OPCODE;
873 }
874
875 /*
876 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
877 */
878 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
879 bytes[i++] = 0x0f;
880 bytes[i++] = 0xae;
881 bytes[i++] = 0xe8; /* LFENCE */
882 }
883
884#ifdef CONFIG_MITIGATION_ITS
885 /*
886 * Check if the address of last byte of emitted-indirect is in
887 * lower-half of the cacheline. Such branches need ITS mitigation.
888 */
	if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
890 return emit_its_trampoline(addr, insn, reg, bytes);
891#endif
892
	ret = emit_indirect(op, reg, bytes + i);
894 if (ret < 0)
895 return ret;
896 i += ret;
897
898 /*
899 * The compiler is supposed to EMIT an INT3 after every unconditional
900 * JMP instruction due to AMD BTC. However, if the compiler is too old
901 * or MITIGATION_SLS isn't enabled, we still need an INT3 after
902 * indirect JMPs even on Intel.
903 */
904 if (op == JMP32_INSN_OPCODE && i < insn->length)
905 bytes[i++] = INT3_INSN_OPCODE;
906
907 for (; i < insn->length;)
908 bytes[i++] = BYTES_NOP1;
909
910 return i;
911}
912
913/*
914 * Generated by 'objtool --retpoline'.
915 */
916void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
917{
918 s32 *s;
919
920 for (s = start; s < end; s++) {
921 void *addr = (void *)s + *s;
922 struct insn insn;
923 int len, ret;
924 u8 bytes[16];
925 u8 op1, op2;
926 u8 *dest;
927
928 ret = insn_decode_kernel(&insn, addr);
929 if (WARN_ON_ONCE(ret < 0))
930 continue;
931
932 op1 = insn.opcode.bytes[0];
933 op2 = insn.opcode.bytes[1];
934
935 switch (op1) {
936 case 0x70 ... 0x7f: /* Jcc.d8 */
937 /* See cfi_paranoid. */
938 WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
939 continue;
940
941 case CALL_INSN_OPCODE:
942 case JMP32_INSN_OPCODE:
943 /* Check for cfi_paranoid + ITS */
944 dest = addr + insn.length + insn.immediate.value;
945 if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
946 WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
947 continue;
948 }
949 break;
950
951 case 0x0f: /* escape */
952 if (op2 >= 0x80 && op2 <= 0x8f)
953 break;
954 fallthrough;
955 default:
956 WARN_ON_ONCE(1);
957 continue;
958 }
959
960 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
961 addr, addr, insn.length,
962 addr + insn.length + insn.immediate.value);
963
		len = patch_retpoline(addr, &insn, bytes);
		if (len == insn.length) {
			optimize_nops(addr, bytes, len);
			DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
970 }
971 }
972}
973
974#ifdef CONFIG_MITIGATION_RETHUNK
975
976bool cpu_wants_rethunk(void)
977{
978 return cpu_feature_enabled(X86_FEATURE_RETHUNK);
979}
980
981bool cpu_wants_rethunk_at(void *addr)
982{
983 if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
984 return false;
985 if (x86_return_thunk != its_return_thunk)
986 return true;
987
988 return !((unsigned long)addr & 0x20);
989}
990
991/*
992 * Rewrite the compiler generated return thunk tail-calls.
993 *
994 * For example, convert:
995 *
996 * JMP __x86_return_thunk
997 *
998 * into:
999 *
1000 * RET
1001 */
1002static int patch_return(void *addr, struct insn *insn, u8 *bytes)
1003{
1004 int i = 0;
1005
1006 /* Patch the custom return thunks... */
1007 if (cpu_wants_rethunk_at(addr)) {
1008 i = JMP32_INSN_SIZE;
		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
1010 } else {
1011 /* ... or patch them out if not needed. */
1012 bytes[i++] = RET_INSN_OPCODE;
1013 }
1014
1015 for (; i < insn->length;)
1016 bytes[i++] = INT3_INSN_OPCODE;
1017 return i;
1018}
1019
1020void __init_or_module noinline apply_returns(s32 *start, s32 *end)
1021{
1022 s32 *s;
1023
1024 if (cpu_wants_rethunk())
1025 static_call_force_reinit();
1026
1027 for (s = start; s < end; s++) {
1028 void *dest = NULL, *addr = (void *)s + *s;
1029 struct insn insn;
1030 int len, ret;
1031 u8 bytes[16];
1032 u8 op;
1033
1034 ret = insn_decode_kernel(&insn, addr);
1035 if (WARN_ON_ONCE(ret < 0))
1036 continue;
1037
1038 op = insn.opcode.bytes[0];
1039 if (op == JMP32_INSN_OPCODE)
1040 dest = addr + insn.length + insn.immediate.value;
1041
		if (__static_call_fixup(addr, op, dest) ||
1043 WARN_ONCE(dest != &__x86_return_thunk,
1044 "missing return thunk: %pS-%pS: %*ph",
1045 addr, dest, 5, addr))
1046 continue;
1047
1048 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
1049 addr, addr, insn.length,
1050 addr + insn.length + insn.immediate.value);
1051
		len = patch_return(addr, &insn, bytes);
		if (len == insn.length) {
			DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
1057 }
1058 }
1059}
1060#else /* !CONFIG_MITIGATION_RETHUNK: */
1061void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
1062#endif /* !CONFIG_MITIGATION_RETHUNK */
1063
1064#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1065
1066void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
1067void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
1068
1069#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1070
1071#ifdef CONFIG_X86_KERNEL_IBT
1072
1073__noendbr bool is_endbr(u32 *val)
1074{
1075 u32 endbr;
1076
1077 __get_kernel_nofault(&endbr, val, u32, Efault);
	return __is_endbr(endbr);
1079
1080Efault:
1081 return false;
1082}
1083
1084#ifdef CONFIG_FINEIBT
1085
1086static __noendbr bool exact_endbr(u32 *val)
1087{
1088 u32 endbr;
1089
1090 __get_kernel_nofault(&endbr, val, u32, Efault);
1091 return endbr == gen_endbr();
1092
1093Efault:
1094 return false;
1095}
1096
1097#endif
1098
1099static void poison_cfi(void *addr);
1100
1101static void __init_or_module poison_endbr(void *addr)
1102{
1103 u32 poison = gen_endbr_poison();
1104
1105 if (WARN_ON_ONCE(!is_endbr(addr)))
1106 return;
1107
1108 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
1109
1110 /*
1111 * When we have IBT, the lack of ENDBR will trigger #CP
1112 */
1113 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
1114 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
	text_poke_early(addr, &poison, 4);
1116}
1117
1118/*
1119 * Generated by: objtool --ibt
1120 *
1121 * Seal the functions for indirect calls by clobbering the ENDBR instructions
1122 * and the kCFI hash value.
1123 */
1124void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
1125{
1126 s32 *s;
1127
1128 for (s = start; s < end; s++) {
1129 void *addr = (void *)s + *s;
1130
1131 poison_endbr(addr);
1132 if (IS_ENABLED(CONFIG_FINEIBT))
			poison_cfi(addr - 16);
1134 }
1135}
1136
1137#else /* !CONFIG_X86_KERNEL_IBT: */
1138
1139void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
1140
1141#endif /* !CONFIG_X86_KERNEL_IBT */
1142
1143#ifdef CONFIG_CFI_AUTO_DEFAULT
1144# define __CFI_DEFAULT CFI_AUTO
1145#elif defined(CONFIG_CFI_CLANG)
1146# define __CFI_DEFAULT CFI_KCFI
1147#else
1148# define __CFI_DEFAULT CFI_OFF
1149#endif
1150
1151enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
1152
1153#ifdef CONFIG_FINEIBT_BHI
1154bool cfi_bhi __ro_after_init = false;
1155#endif
1156
1157#ifdef CONFIG_CFI_CLANG
1158struct bpf_insn;
1159
1160/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
1161extern unsigned int __bpf_prog_runX(const void *ctx,
1162 const struct bpf_insn *insn);
1163
1164KCFI_REFERENCE(__bpf_prog_runX);
1165
1166/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
1167asm (
1168" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
1169" .type cfi_bpf_hash,@object \n"
1170" .globl cfi_bpf_hash \n"
1171" .p2align 2, 0x0 \n"
1172"cfi_bpf_hash: \n"
1173" .long __kcfi_typeid___bpf_prog_runX \n"
1174" .size cfi_bpf_hash, 4 \n"
1175" .popsection \n"
1176);
1177
1178/* Must match bpf_callback_t */
1179extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
1180
1181KCFI_REFERENCE(__bpf_callback_fn);
1182
1183/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
1184asm (
1185" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
1186" .type cfi_bpf_subprog_hash,@object \n"
1187" .globl cfi_bpf_subprog_hash \n"
1188" .p2align 2, 0x0 \n"
1189"cfi_bpf_subprog_hash: \n"
1190" .long __kcfi_typeid___bpf_callback_fn \n"
1191" .size cfi_bpf_subprog_hash, 4 \n"
1192" .popsection \n"
1193);
1194
1195u32 cfi_get_func_hash(void *func)
1196{
1197 u32 hash;
1198
1199 func -= cfi_get_offset();
1200 switch (cfi_mode) {
1201 case CFI_FINEIBT:
1202 func += 7;
1203 break;
1204 case CFI_KCFI:
1205 func += 1;
1206 break;
1207 default:
1208 return 0;
1209 }
1210
1211 if (get_kernel_nofault(hash, func))
1212 return 0;
1213
1214 return hash;
1215}
1216
1217int cfi_get_func_arity(void *func)
1218{
1219 bhi_thunk *target;
1220 s32 disp;
1221
1222 if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
1223 return 0;
1224
1225 if (get_kernel_nofault(disp, func - 4))
1226 return 0;
1227
1228 target = func + disp;
1229 return target - __bhi_args;
1230}
1231#endif
1232
1233#ifdef CONFIG_FINEIBT
1234
1235static bool cfi_rand __ro_after_init = true;
1236static u32 cfi_seed __ro_after_init;
1237
1238/*
1239 * Re-hash the CFI hash with a boot-time seed while making sure the result is
1240 * not a valid ENDBR instruction.
1241 */
1242static u32 cfi_rehash(u32 hash)
1243{
1244 hash ^= cfi_seed;
1245 while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
1246 bool lsb = hash & 1;
1247 hash >>= 1;
1248 if (lsb)
1249 hash ^= 0x80200003;
1250 }
1251 return hash;
1252}
1253
1254static __init int cfi_parse_cmdline(char *str)
1255{
1256 if (!str)
1257 return -EINVAL;
1258
1259 while (str) {
1260 char *next = strchr(str, ',');
1261 if (next) {
1262 *next = 0;
1263 next++;
1264 }
1265
1266 if (!strcmp(str, "auto")) {
1267 cfi_mode = CFI_AUTO;
1268 } else if (!strcmp(str, "off")) {
1269 cfi_mode = CFI_OFF;
1270 cfi_rand = false;
1271 } else if (!strcmp(str, "kcfi")) {
1272 cfi_mode = CFI_KCFI;
1273 } else if (!strcmp(str, "fineibt")) {
1274 cfi_mode = CFI_FINEIBT;
1275 } else if (!strcmp(str, "norand")) {
1276 cfi_rand = false;
1277 } else if (!strcmp(str, "warn")) {
1278 pr_alert("CFI mismatch non-fatal!\n");
1279 cfi_warn = true;
1280 } else if (!strcmp(str, "paranoid")) {
1281 if (cfi_mode == CFI_FINEIBT) {
1282 cfi_paranoid = true;
1283 } else {
1284 pr_err("Ignoring paranoid; depends on fineibt.\n");
1285 }
1286 } else if (!strcmp(str, "bhi")) {
1287#ifdef CONFIG_FINEIBT_BHI
1288 if (cfi_mode == CFI_FINEIBT) {
1289 cfi_bhi = true;
1290 } else {
1291 pr_err("Ignoring bhi; depends on fineibt.\n");
1292 }
1293#else
1294 pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n");
1295#endif
1296 } else {
1297 pr_err("Ignoring unknown cfi option (%s).", str);
1298 }
1299
1300 str = next;
1301 }
1302
1303 return 0;
1304}
1305early_param("cfi", cfi_parse_cmdline);
1306
1307/*
1308 * kCFI FineIBT
1309 *
1310 * __cfi_\func: __cfi_\func:
1311 * movl $0x12345678,%eax // 5 endbr64 // 4
1312 * nop subl $0x12345678,%r10d // 7
1313 * nop jne __cfi_\func+6 // 2
1314 * nop nop3 // 3
1315 * nop
1316 * nop
1317 * nop
1318 * nop
1319 * nop
1320 * nop
1321 * nop
1322 * nop
1323 *
1324 *
1325 * caller: caller:
1326 * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
1327 * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
1328 * je 1f // 2 nop4 // 4
1329 * ud2 // 2
1330 * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
1331 *
1332 */
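/*
 * In the kCFI caller above, %r10d holds the negated hash and -15(%r11)
 * points at the hash immediate inside the callee's "movl $0x12345678,%eax"
 * (%r11 is the call target, i.e. __cfi_\func+16); the ADDL therefore yields
 * zero and sets ZF only when caller and callee hashes match, otherwise the
 * JE falls through to the UD2.
 */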
1333
1334/*
1335 * <fineibt_preamble_start>:
1336 * 0: f3 0f 1e fa endbr64
1337 * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
1338 * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
1339 * d: 0f 1f 00 nopl (%rax)
1340 *
1341 * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
1342 * (bad) on x86_64 and raises #UD.
1343 */
1344asm( ".pushsection .rodata \n"
1345 "fineibt_preamble_start: \n"
1346 " endbr64 \n"
1347 " subl $0x12345678, %r10d \n"
1348 "fineibt_preamble_bhi: \n"
1349 " jne fineibt_preamble_start+6 \n"
1350 ASM_NOP3
1351 "fineibt_preamble_end: \n"
1352 ".popsection\n"
1353);
1354
1355extern u8 fineibt_preamble_start[];
1356extern u8 fineibt_preamble_bhi[];
1357extern u8 fineibt_preamble_end[];
1358
1359#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1360#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
1361#define fineibt_preamble_ud 6
1362#define fineibt_preamble_hash 7
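/*
 * Per the byte listing above: offset 6 is the 0xEA byte inside the SUB
 * (the #UD site the JNE branches back to) and offset 7 is where the
 * 32-bit hash immediate lives.
 */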
1363
1364/*
1365 * <fineibt_caller_start>:
1366 * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
1367 * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
1368 * a: 0f 1f 40 00 nopl 0x0(%rax)
1369 */
1370asm( ".pushsection .rodata \n"
1371 "fineibt_caller_start: \n"
1372 " movl $0x12345678, %r10d \n"
1373 " lea -0x10(%r11), %r11 \n"
1374 ASM_NOP4
1375 "fineibt_caller_end: \n"
1376 ".popsection \n"
1377);
1378
1379extern u8 fineibt_caller_start[];
1380extern u8 fineibt_caller_end[];
1381
1382#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1383#define fineibt_caller_hash 2
1384
1385#define fineibt_caller_jmp (fineibt_caller_size - 2)
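/*
 * With the 14-byte caller sequence above (6-byte MOV + 4-byte LEA + NOP4),
 * fineibt_caller_jmp is 12 (0x0c): a JMP.d8 patched over the first two
 * bytes skips the rest of the sequence, see cfi_disable_callers().
 */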
1386
1387/*
1388 * Since FineIBT does hash validation on the callee side it is prone to
1389 * circumvention attacks where a 'naked' ENDBR instruction exists that
1390 * is not part of the fineibt_preamble sequence.
1391 *
1392 * Notably the x86 entry points must be ENDBR and equally cannot be
1393 * fineibt_preamble.
1394 *
1395 * The fineibt_paranoid caller sequence adds additional caller side
1396 * hash validation. This stops such circumvention attacks dead, but at the cost
1397 * of adding a load.
1398 *
1399 * <fineibt_paranoid_start>:
1400 * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
1401 * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
1402 * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11
1403 * e: 75 fd jne d <fineibt_paranoid_start+0xd>
1404 * 10: 41 ff d3 call *%r11
1405 * 13: 90 nop
1406 *
1407 * Notably LEA does not modify flags and can be reordered with the CMP,
1408 * avoiding a dependency. Again, using a non-taken (backwards) branch
1409 * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
1410 * Jcc.d8, causing #UD.
1411 */
1412asm( ".pushsection .rodata \n"
1413 "fineibt_paranoid_start: \n"
1414 " movl $0x12345678, %r10d \n"
1415 " cmpl -9(%r11), %r10d \n"
1416 " lea -0x10(%r11), %r11 \n"
1417 " jne fineibt_paranoid_start+0xd \n"
1418 "fineibt_paranoid_ind: \n"
1419 " call *%r11 \n"
1420 " nop \n"
1421 "fineibt_paranoid_end: \n"
1422 ".popsection \n"
1423);
1424
1425extern u8 fineibt_paranoid_start[];
1426extern u8 fineibt_paranoid_ind[];
1427extern u8 fineibt_paranoid_end[];
1428
1429#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
1430#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
1431#define fineibt_paranoid_ud 0xd
1432
1433static u32 decode_preamble_hash(void *addr, int *reg)
1434{
1435 u8 *p = addr;
1436
1437 /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
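	/*
	 * Callers treat the MOV register as the function arity, see
	 * cfi_rewrite_preamble(); plain kCFI uses %eax (reg 0).
	 */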
1438 if (p[0] >= 0xb8 && p[0] < 0xc0) {
1439 if (reg)
1440 *reg = p[0] - 0xb8;
1441 return *(u32 *)(addr + 1);
1442 }
1443
1444 return 0; /* invalid hash value */
1445}
1446
1447static u32 decode_caller_hash(void *addr)
1448{
1449 u8 *p = addr;
1450
1451 /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
1452 if (p[0] == 0x41 && p[1] == 0xba)
1453 return -*(u32 *)(addr + 2);
1454
	/* eb 0c 88 a9 cb ed	   jmp.d8  +12 */
1456 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1457 return -*(u32 *)(addr + 2);
1458
1459 return 0; /* invalid hash value */
1460}
1461
1462/* .retpoline_sites */
1463static int cfi_disable_callers(s32 *start, s32 *end)
1464{
1465 /*
	 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
	 * intact for later use. Also see decode_caller_hash() and
1468 * cfi_rewrite_callers().
1469 */
1470 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1471 s32 *s;
1472
1473 for (s = start; s < end; s++) {
1474 void *addr = (void *)s + *s;
1475 u32 hash;
1476
1477 addr -= fineibt_caller_size;
1478 hash = decode_caller_hash(addr);
1479 if (!hash) /* nocfi callers */
1480 continue;
1481
1482 text_poke_early(addr, jmp, 2);
1483 }
1484
1485 return 0;
1486}
1487
1488static int cfi_enable_callers(s32 *start, s32 *end)
1489{
1490 /*
1491 * Re-enable kCFI, undo what cfi_disable_callers() did.
1492 */
1493 const u8 mov[] = { 0x41, 0xba };
1494 s32 *s;
1495
1496 for (s = start; s < end; s++) {
1497 void *addr = (void *)s + *s;
1498 u32 hash;
1499
1500 addr -= fineibt_caller_size;
1501 hash = decode_caller_hash(addr);
1502 if (!hash) /* nocfi callers */
1503 continue;
1504
1505 text_poke_early(addr, mov, 2);
1506 }
1507
1508 return 0;
1509}
1510
1511/* .cfi_sites */
1512static int cfi_rand_preamble(s32 *start, s32 *end)
1513{
1514 s32 *s;
1515
1516 for (s = start; s < end; s++) {
1517 void *addr = (void *)s + *s;
1518 u32 hash;
1519
1520 hash = decode_preamble_hash(addr, NULL);
1521 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1522 addr, addr, 5, addr))
1523 return -EINVAL;
1524
1525 hash = cfi_rehash(hash);
1526 text_poke_early(addr + 1, &hash, 4);
1527 }
1528
1529 return 0;
1530}
1531
1532static void cfi_fineibt_bhi_preamble(void *addr, int arity)
1533{
1534 if (!arity)
1535 return;
1536
1537 if (!cfi_warn && arity == 1) {
1538 /*
1539 * Crazy scheme to allow arity-1 inline:
1540 *
1541 * __cfi_foo:
1542 * 0: f3 0f 1e fa endbr64
1543 * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d
1544 * b: 49 0f 45 fa cmovne %r10, %rdi
1545 * f: 75 f5 jne __cfi_foo+6
1546 * 11: 0f 1f 00 nopl (%rax)
1547 *
		 * Code that direct-calls foo()+0 decodes the tail end as:
1549 *
1550 * foo:
1551 * 0: f5 cmc
1552 * 1: 0f 1f 00 nopl (%rax)
1553 *
1554 * which clobbers CF, but does not affect anything ABI
1555 * wise.
1556 *
1557 * Notably, this scheme is incompatible with permissive CFI
1558 * because the CMOVcc is unconditional and RDI will have been
1559 * clobbered.
1560 */
1561 const u8 magic[9] = {
1562 0x49, 0x0f, 0x45, 0xfa,
1563 0x75, 0xf5,
1564 BYTES_NOP3,
1565 };
1566
1567 text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
1568
1569 return;
1570 }
1571
1572 text_poke_early(addr + fineibt_preamble_bhi,
1573 text_gen_insn(CALL_INSN_OPCODE,
1574 addr + fineibt_preamble_bhi,
1575 __bhi_args[arity]),
1576 CALL_INSN_SIZE);
1577}
1578
1579static int cfi_rewrite_preamble(s32 *start, s32 *end)
1580{
1581 s32 *s;
1582
1583 for (s = start; s < end; s++) {
1584 void *addr = (void *)s + *s;
1585 int arity;
1586 u32 hash;
1587
1588 /*
1589 * When the function doesn't start with ENDBR the compiler will
1590 * have determined there are no indirect calls to it and we
		 * don't need CFI either.
1592 */
1593 if (!is_endbr(addr + 16))
1594 continue;
1595
1596 hash = decode_preamble_hash(addr, &arity);
1597 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1598 addr, addr, 5, addr))
1599 return -EINVAL;
1600
1601 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1602 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1603 text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
1604
1605 WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
1606 "kCFI preamble has wrong register at: %pS %*ph\n",
1607 addr, 5, addr);
1608
1609 if (cfi_bhi)
1610 cfi_fineibt_bhi_preamble(addr, arity);
1611 }
1612
1613 return 0;
1614}
1615
1616static void cfi_rewrite_endbr(s32 *start, s32 *end)
1617{
1618 s32 *s;
1619
1620 for (s = start; s < end; s++) {
1621 void *addr = (void *)s + *s;
1622
1623 if (!exact_endbr(addr + 16))
1624 continue;
1625
1626 poison_endbr(addr + 16);
1627 }
1628}
1629
1630/* .retpoline_sites */
1631static int cfi_rand_callers(s32 *start, s32 *end)
1632{
1633 s32 *s;
1634
1635 for (s = start; s < end; s++) {
1636 void *addr = (void *)s + *s;
1637 u32 hash;
1638
1639 addr -= fineibt_caller_size;
1640 hash = decode_caller_hash(addr);
1641 if (hash) {
1642 hash = -cfi_rehash(hash);
1643 text_poke_early(addr + 2, &hash, 4);
1644 }
1645 }
1646
1647 return 0;
1648}
1649
1650static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
1651{
1652 u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
1653
1654#ifdef CONFIG_MITIGATION_ITS
1655 u8 *tmp = its_allocate_thunk(reg);
1656 if (tmp)
1657 thunk = tmp;
1658#endif
1659
1660 return __emit_trampoline(addr, insn, bytes, thunk, thunk);
1661}
1662
1663static int cfi_rewrite_callers(s32 *start, s32 *end)
1664{
1665 s32 *s;
1666
1667 BUG_ON(fineibt_paranoid_size != 20);
1668
1669 for (s = start; s < end; s++) {
1670 void *addr = (void *)s + *s;
1671 struct insn insn;
1672 u8 bytes[20];
1673 u32 hash;
1674 int ret;
1675 u8 op;
1676
1677 addr -= fineibt_caller_size;
1678 hash = decode_caller_hash(addr);
1679 if (!hash)
1680 continue;
1681
1682 if (!cfi_paranoid) {
1683 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1684 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1685 text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1686 /* rely on apply_retpolines() */
1687 continue;
1688 }
1689
1690 /* cfi_paranoid */
1691 ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
1692 if (WARN_ON_ONCE(ret < 0))
1693 continue;
1694
1695 op = insn.opcode.bytes[0];
1696 if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
1697 WARN_ON_ONCE(1);
1698 continue;
1699 }
1700
1701 memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
1702 memcpy(bytes + fineibt_caller_hash, &hash, 4);
1703
1704 if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
1705 emit_paranoid_trampoline(addr + fineibt_caller_size,
1706 &insn, 11, bytes + fineibt_caller_size);
1707 } else {
1708 ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
1709 if (WARN_ON_ONCE(ret != 3))
1710 continue;
1711 }
1712
1713 text_poke_early(addr, bytes, fineibt_paranoid_size);
1714 }
1715
1716 return 0;
1717}
1718
1719static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1720 s32 *start_cfi, s32 *end_cfi, bool builtin)
1721{
1722 int ret;
1723
1724 if (WARN_ONCE(fineibt_preamble_size != 16,
1725 "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1726 return;
1727
1728 if (cfi_mode == CFI_AUTO) {
1729 cfi_mode = CFI_KCFI;
1730 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
1731 /*
1732 * FRED has much saner context on exception entry and
1733 * is less easy to take advantage of.
1734 */
1735 if (!cpu_feature_enabled(X86_FEATURE_FRED))
1736 cfi_paranoid = true;
1737 cfi_mode = CFI_FINEIBT;
1738 }
1739 }
1740
1741 /*
1742 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1743 * rewrite them. This disables all CFI. If this succeeds but any of the
1744 * later stages fails, we're without CFI.
1745 */
1746 ret = cfi_disable_callers(start_retpoline, end_retpoline);
1747 if (ret)
1748 goto err;
1749
1750 if (cfi_rand) {
1751 if (builtin) {
1752 cfi_seed = get_random_u32();
1753 cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1754 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1755 }
1756
1757 ret = cfi_rand_preamble(start_cfi, end_cfi);
1758 if (ret)
1759 goto err;
1760
1761 ret = cfi_rand_callers(start_retpoline, end_retpoline);
1762 if (ret)
1763 goto err;
1764 }
1765
1766 switch (cfi_mode) {
1767 case CFI_OFF:
1768 if (builtin)
1769 pr_info("Disabling CFI\n");
1770 return;
1771
1772 case CFI_KCFI:
1773 ret = cfi_enable_callers(start_retpoline, end_retpoline);
1774 if (ret)
1775 goto err;
1776
1777 if (builtin)
1778 pr_info("Using kCFI\n");
1779 return;
1780
1781 case CFI_FINEIBT:
1782 /* place the FineIBT preamble at func()-16 */
1783 ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1784 if (ret)
1785 goto err;
1786
1787 /* rewrite the callers to target func()-16 */
1788 ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1789 if (ret)
1790 goto err;
1791
1792 /* now that nobody targets func()+0, remove ENDBR there */
1793 cfi_rewrite_endbr(start_cfi, end_cfi);
1794
1795 if (builtin) {
1796 pr_info("Using %sFineIBT%s CFI\n",
1797 cfi_paranoid ? "paranoid " : "",
1798 cfi_bhi ? "+BHI" : "");
1799 }
1800 return;
1801
1802 default:
1803 break;
1804 }
1805
1806err:
1807 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1808}
1809
1810static inline void poison_hash(void *addr)
1811{
1812 *(u32 *)addr = 0;
1813}
1814
1815static void poison_cfi(void *addr)
1816{
1817 /*
	 * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes;
	 * some (static) functions for which they can determine the address
	 * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
1821 *
1822 * As such, these functions will get sealed, but we need to be careful
1823 * to not unconditionally scribble the previous function.
1824 */
1825 switch (cfi_mode) {
1826 case CFI_FINEIBT:
1827 /*
1828 * FineIBT prefix should start with an ENDBR.
1829 */
1830 if (!is_endbr(addr))
1831 break;
1832
1833 /*
1834 * __cfi_\func:
1835 * osp nopl (%rax)
1836 * subl $0, %r10d
1837 * jz 1f
1838 * ud2
1839 * 1: nop
1840 */
1841 poison_endbr(addr);
1842 poison_hash(addr + fineibt_preamble_hash);
1843 break;
1844
1845 case CFI_KCFI:
1846 /*
1847 * kCFI prefix should start with a valid hash.
1848 */
1849 if (!decode_preamble_hash(addr, NULL))
1850 break;
1851
1852 /*
1853 * __cfi_\func:
1854 * movl $0, %eax
1855 * .skip 11, 0x90
1856 */
1857 poison_hash(addr + 1);
1858 break;
1859
1860 default:
1861 break;
1862 }
1863}
1864
1865/*
1866 * When regs->ip points to a 0xEA byte in the FineIBT preamble,
1867 * return true and fill out target and type.
1868 *
 * The preamble is validated by checking for the ENDBR instruction at its
 * known offset relative to the 0xEA byte.
1871 */
1872static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
1873{
1874 unsigned long addr = regs->ip - fineibt_preamble_ud;
1875 u32 hash;
1876
1877 if (!exact_endbr((void *)addr))
1878 return false;
1879
1880 *target = addr + fineibt_preamble_size;
1881
1882 __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1883 *type = (u32)regs->r10 + hash;
1884
1885 /*
1886 * Since regs->ip points to the middle of an instruction; it cannot
1887 * continue with the normal fixup.
1888 */
1889 regs->ip = *target;
1890
1891 return true;
1892
1893Efault:
1894 return false;
1895}
1896
1897/*
1898 * regs->ip points to one of the UD2 in __bhi_args[].
1899 */
1900static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
1901{
1902 unsigned long addr;
1903 u32 hash;
1904
1905 if (!cfi_bhi)
1906 return false;
1907
1908 if (regs->ip < (unsigned long)__bhi_args ||
1909 regs->ip >= (unsigned long)__bhi_args_end)
1910 return false;
1911
1912 /*
1913 * Fetch the return address from the stack, this points to the
1914 * FineIBT preamble. Since the CALL instruction is in the 5 last
1915 * bytes of the preamble, the return address is in fact the target
1916 * address.
1917 */
1918 __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
1919 *target = addr;
1920
1921 addr -= fineibt_preamble_size;
1922 if (!exact_endbr((void *)addr))
1923 return false;
1924
1925 __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1926 *type = (u32)regs->r10 + hash;
1927
1928 /*
1929 * The UD2 sites are constructed with a RET immediately following,
1930 * as such the non-fatal case can use the regular fixup.
1931 */
1932 return true;
1933
1934Efault:
1935 return false;
1936}
1937
1938static bool is_paranoid_thunk(unsigned long addr)
1939{
1940 u32 thunk;
1941
1942 __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
1943 return (thunk & 0x00FFFFFF) == 0xfd75ea;
1944
1945Efault:
1946 return false;
1947}
1948
1949/*
1950 * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
1951 * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
1952 * thunk.
1953 */
1954static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
1955{
1956 unsigned long addr = regs->ip - fineibt_paranoid_ud;
1957
1958 if (!cfi_paranoid)
1959 return false;
1960
1961 if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
1962 *target = regs->r11 + fineibt_preamble_size;
1963 *type = regs->r10;
1964
1965 /*
1966 * Since the trapping instruction is the exact, but LOCK prefixed,
1967 * Jcc.d8 that got us here, the normal fixup will work.
1968 */
1969 return true;
1970 }
1971
1972 /*
1973 * The cfi_paranoid + ITS thunk combination results in:
1974 *
1975 * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
1976 * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
1977 * a: 4d 8d 5b f0 lea -0x10(%r11), %r11
1978 * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
1979 *
1980 * Where the paranoid_thunk looks like:
1981 *
1982 * 1d: <ea> (bad)
1983 * __x86_indirect_paranoid_thunk_r11:
1984 * 1e: 75 fd jne 1d
1985 * __x86_indirect_its_thunk_r11:
	 *   20: 41 ff e3             jmp    *%r11
1987 * 23: cc int3
1988 *
1989 */
1990 if (is_paranoid_thunk(regs->ip)) {
1991 *target = regs->r11 + fineibt_preamble_size;
1992 *type = regs->r10;
1993
1994 regs->ip = *target;
1995 return true;
1996 }
1997
1998 return false;
1999}
2000
2001bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
2002{
2003 if (decode_fineibt_paranoid(regs, target, type))
2004 return true;
2005
2006 if (decode_fineibt_bhi(regs, target, type))
2007 return true;
2008
2009 return decode_fineibt_preamble(regs, target, type);
2010}
2011
2012#else /* !CONFIG_FINEIBT: */
2013
2014static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2015 s32 *start_cfi, s32 *end_cfi, bool builtin)
2016{
2017}
2018
2019#ifdef CONFIG_X86_KERNEL_IBT
2020static void poison_cfi(void *addr) { }
2021#endif
2022
2023#endif /* !CONFIG_FINEIBT */
2024
2025void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2026 s32 *start_cfi, s32 *end_cfi)
2027{
2028 return __apply_fineibt(start_retpoline, end_retpoline,
2029 start_cfi, end_cfi,
2030 /* .builtin = */ false);
2031}
2032
2033#ifdef CONFIG_SMP
2034static void alternatives_smp_lock(const s32 *start, const s32 *end,
2035 u8 *text, u8 *text_end)
2036{
2037 const s32 *poff;
2038
2039 for (poff = start; poff < end; poff++) {
2040 u8 *ptr = (u8 *)poff + *poff;
2041
2042 if (!*poff || ptr < text || ptr >= text_end)
2043 continue;
2044 /* turn DS segment override prefix into lock prefix */
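		/*
		 * e.g. "3e 0f b1 16" (ds cmpxchg %edx,(%rsi)) becomes
		 * "f0 0f b1 16" (lock cmpxchg %edx,(%rsi)).
		 */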
2045 if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
2047 }
2048}
2049
2050static void alternatives_smp_unlock(const s32 *start, const s32 *end,
2051 u8 *text, u8 *text_end)
2052{
2053 const s32 *poff;
2054
2055 for (poff = start; poff < end; poff++) {
2056 u8 *ptr = (u8 *)poff + *poff;
2057
2058 if (!*poff || ptr < text || ptr >= text_end)
2059 continue;
2060 /* turn lock prefix into DS segment override prefix */
2061 if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
2063 }
2064}
2065
2066struct smp_alt_module {
2067 /* what is this ??? */
2068 struct module *mod;
2069 char *name;
2070
2071 /* ptrs to lock prefixes */
2072 const s32 *locks;
2073 const s32 *locks_end;
2074
2075 /* .text segment, needed to avoid patching init code ;) */
2076 u8 *text;
2077 u8 *text_end;
2078
2079 struct list_head next;
2080};
2081static LIST_HEAD(smp_alt_modules);
2082static bool uniproc_patched = false; /* protected by text_mutex */
2083
2084void __init_or_module alternatives_smp_module_add(struct module *mod,
2085 char *name,
2086 void *locks, void *locks_end,
2087 void *text, void *text_end)
2088{
2089 struct smp_alt_module *smp;
2090
2091 mutex_lock(&text_mutex);
2092 if (!uniproc_patched)
2093 goto unlock;
2094
2095 if (num_possible_cpus() == 1)
2096 /* Don't bother remembering, we'll never have to undo it. */
2097 goto smp_unlock;
2098
2099 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
2100 if (NULL == smp)
2101 /* we'll run the (safe but slow) SMP code then ... */
2102 goto unlock;
2103
2104 smp->mod = mod;
2105 smp->name = name;
2106 smp->locks = locks;
2107 smp->locks_end = locks_end;
2108 smp->text = text;
2109 smp->text_end = text_end;
2110 DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
2111 smp->locks, smp->locks_end,
2112 smp->text, smp->text_end, smp->name);
2113
	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
2119}
2120
2121void __init_or_module alternatives_smp_module_del(struct module *mod)
2122{
2123 struct smp_alt_module *item;
2124
2125 mutex_lock(&text_mutex);
2126 list_for_each_entry(item, &smp_alt_modules, next) {
2127 if (mod != item->mod)
2128 continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
2134}
2135
2136void alternatives_enable_smp(void)
2137{
2138 struct smp_alt_module *mod;
2139
2140 /* Why bother if there are no other CPUs? */
2141 BUG_ON(num_possible_cpus() == 1);
2142
2143 mutex_lock(&text_mutex);
2144
2145 if (uniproc_patched) {
2146 pr_info("switching to SMP code\n");
2147 BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
2153 uniproc_patched = false;
2154 }
	mutex_unlock(&text_mutex);
2156}
2157
2158/*
2159 * Return 1 if the address range is reserved for SMP-alternatives.
2160 * Must hold text_mutex.
2161 */
2162int alternatives_text_reserved(void *start, void *end)
2163{
2164 struct smp_alt_module *mod;
2165 const s32 *poff;
2166 u8 *text_start = start;
2167 u8 *text_end = end;
2168
2169 lockdep_assert_held(&text_mutex);
2170
2171 list_for_each_entry(mod, &smp_alt_modules, next) {
2172 if (mod->text > text_end || mod->text_end < text_start)
2173 continue;
2174 for (poff = mod->locks; poff < mod->locks_end; poff++) {
2175 const u8 *ptr = (const u8 *)poff + *poff;
2176
2177 if (text_start <= ptr && text_end > ptr)
2178 return 1;
2179 }
2180 }
2181
2182 return 0;
2183}
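/*
 * Sketch of a caller that must not patch on top of a recorded lock-prefix
 * site (addr and insn_len are hypothetical; text_mutex is assumed held):
 *
 *	lockdep_assert_held(&text_mutex);
 *	if (alternatives_text_reserved(addr, addr + insn_len))
 *		return -EBUSY;
 *
 * i.e. refuse when the range overlaps an SMP-alternatives lock byte.
 */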
2184#endif /* CONFIG_SMP */
2185
2186/*
2187 * Self-test for the INT3 based CALL emulation code.
2188 *
2189 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
2190 * properly and that there is a stack gap between the INT3 frame and the
2191 * previous context. Without this gap doing a virtual PUSH on the interrupted
2192 * stack would corrupt the INT3 IRET frame.
2193 *
2194 * See entry_{32,64}.S for more details.
2195 */
2196
2197/*
2198 * We define the int3_magic() function in assembly to control the calling
2199 * convention such that we can 'call' it from assembly.
2200 */
2201
2202extern void int3_magic(unsigned int *ptr); /* defined in asm */
2203
2204asm (
2205" .pushsection .init.text, \"ax\", @progbits\n"
2206" .type int3_magic, @function\n"
2207"int3_magic:\n"
2208 ANNOTATE_NOENDBR
2209" movl $1, (%" _ASM_ARG1 ")\n"
2210 ASM_RET
2211" .size int3_magic, .-int3_magic\n"
2212" .popsection\n"
2213);
2214
2215extern void int3_selftest_ip(void); /* defined in asm below */
2216
2217static int __init
2218int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
2219{
2220 unsigned long selftest = (unsigned long)&int3_selftest_ip;
2221 struct die_args *args = data;
2222 struct pt_regs *regs = args->regs;
2223
2224 OPTIMIZER_HIDE_VAR(selftest);
2225
2226 if (!regs || user_mode(regs))
2227 return NOTIFY_DONE;
2228
2229 if (val != DIE_INT3)
2230 return NOTIFY_DONE;
2231
2232 if (regs->ip - INT3_INSN_SIZE != selftest)
2233 return NOTIFY_DONE;
2234
2235 int3_emulate_call(regs, func: (unsigned long)&int3_magic);
2236 return NOTIFY_STOP;
2237}
2238
2239/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
2240static noinline void __init int3_selftest(void)
2241{
2242 static __initdata struct notifier_block int3_exception_nb = {
2243 .notifier_call = int3_exception_notify,
2244 .priority = INT_MAX-1, /* last */
2245 };
2246 unsigned int val = 0;
2247
2248 BUG_ON(register_die_notifier(&int3_exception_nb));
2249
2250 /*
2251 * Basically: int3_magic(&val); but really complicated :-)
2252 *
2253 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
2254 * notifier above will emulate CALL for us.
2255 */
2256 asm volatile ("int3_selftest_ip:\n\t"
2257 ANNOTATE_NOENDBR
2258 " int3; nop; nop; nop; nop\n\t"
2259 : ASM_CALL_CONSTRAINT
2260 : __ASM_SEL_RAW(a, D) (&val)
2261 : "memory");
2262
2263 BUG_ON(val != 1);
2264
2265 unregister_die_notifier(nb: &int3_exception_nb);
2266}
2267
2268static __initdata int __alt_reloc_selftest_addr;
2269
2270extern void __init __alt_reloc_selftest(void *arg);
2271__visible noinline void __init __alt_reloc_selftest(void *arg)
2272{
2273 WARN_ON(arg != &__alt_reloc_selftest_addr);
2274}
2275
2276static noinline void __init alt_reloc_selftest(void)
2277{
2278 /*
2279 * Tests text_poke_apply_relocation().
2280 *
2281 * This has a relative immediate (CALL) in a place other than the first
2282 * instruction and additionally on x86_64 we get a RIP-relative LEA:
2283 *
2284 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
2285 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
2286 *
2287 * Getting this wrong will either crash and burn or tickle the WARN
2288 * above.
2289 */
2290 asm_inline volatile (
2291 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
2292 : ASM_CALL_CONSTRAINT
2293 : [mem] "m" (__alt_reloc_selftest_addr)
2294 : _ASM_ARG1
2295 );
2296}
2297
2298void __init alternative_instructions(void)
2299{
2300 u64 ibt;
2301
2302 int3_selftest();
2303
2304 /*
2305 * The patching is not fully atomic, so try to avoid local
2306 * interruptions that might execute the to-be-patched code.
2307 * Other CPUs are not running.
2308 */
2309 stop_nmi();
2310
2311 /*
2312 * Don't stop machine check exceptions while patching.
2313 * MCEs only happen when something got corrupted and in this
2314 * case we must do something about the corruption.
2315 * Ignoring it is worse than an unlikely patching race.
2316 * Also machine checks tend to be broadcast and if one CPU
2317 * goes into machine check the others follow quickly, so we don't
2318 * expect a machine check to cause undue problems during code
2319 * patching.
2320 */
2321
2322 /*
2323 * Make sure to set (artificial) features depending on used paravirt
2324 * functions which can later influence alternative patching.
2325 */
2326 paravirt_set_cap();
2327
2328 /* Keep CET-IBT disabled until caller/callee are patched */
2329 ibt = ibt_save(/*disable*/ true);
2330
2331 __apply_fineibt(start_retpoline: __retpoline_sites, end_retpoline: __retpoline_sites_end,
2332 start_cfi: __cfi_sites, end_cfi: __cfi_sites_end, builtin: true);
2333
2334 /*
2335 * Rewrite the retpolines, must be done before alternatives since
2336 * those can rewrite the retpoline thunks.
2337 */
2338 apply_retpolines(start: __retpoline_sites, end: __retpoline_sites_end);
2339 apply_returns(start: __return_sites, end: __return_sites_end);
2340
2341 /*
2342 * Adjust all CALL instructions to point to func()-10, including
2343 * those in .altinstr_replacement.
2344 */
2345 callthunks_patch_builtin_calls();
2346
2347 apply_alternatives(start: __alt_instructions, end: __alt_instructions_end);
2348
2349 /*
2350 * Seal all functions that do not have their address taken.
2351 */
2352 apply_seal_endbr(start: __ibt_endbr_seal, end: __ibt_endbr_seal_end);
2353
2354 ibt_restore(save: ibt);
2355
2356#ifdef CONFIG_SMP
2357 /* Patch to UP if no other CPUs are imminent. */
2358 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
2359 uniproc_patched = true;
2360 alternatives_smp_module_add(NULL, name: "core kernel",
2361 locks: __smp_locks, locks_end: __smp_locks_end,
2362 text: _text, text_end: _etext);
2363 }
2364
2365 if (!uniproc_patched || num_possible_cpus() == 1) {
2366 free_init_pages(what: "SMP alternatives",
2367 begin: (unsigned long)__smp_locks,
2368 end: (unsigned long)__smp_locks_end);
2369 }
2370#endif
2371
2372 restart_nmi();
2373 alternatives_patched = 1;
2374
2375 alt_reloc_selftest();
2376}
2377
2378/**
2379 * text_poke_early - Update instructions on a live kernel at boot time
2380 * @addr: address to modify
2381 * @opcode: source of the copy
2382 * @len: length to copy
2383 *
2384 * When you use this code to patch more than one byte of an instruction
2385 * you need to make sure that other CPUs cannot execute this code in parallel.
2386 * Also no thread must be currently preempted in the middle of these
2387 * instructions. And on the local CPU you need to be protected against NMI or
2388 * MCE handlers seeing an inconsistent instruction while you patch.
2389 */
2390void __init_or_module text_poke_early(void *addr, const void *opcode,
2391 size_t len)
2392{
2393 unsigned long flags;
2394
2395 if (boot_cpu_has(X86_FEATURE_NX) &&
2396 is_module_text_address(addr: (unsigned long)addr)) {
2397 /*
2398 * Module text is initially marked non-executable, so the
2399 * code cannot be running and speculative code-fetches are
2400 * prevented. Just change the code.
2401 */
2402 memcpy(addr, opcode, len);
2403 } else {
2404 local_irq_save(flags);
2405 memcpy(addr, opcode, len);
2406 sync_core();
2407 local_irq_restore(flags);
2408
2409 /*
2410 * Could also do a CLFLUSH here to speed up CPU recovery; but
2411 * that causes hangs on some VIA CPUs.
2412 */
2413 }
2414}
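/*
 * Minimal boot-time usage sketch, assuming a hypothetical 5-byte patch site
 * and that no other CPU (and no preempted task) can be executing it:
 *
 *	text_poke_early(addr, x86_nops[5], 5);
 */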
2415
2416__ro_after_init struct mm_struct *text_poke_mm;
2417__ro_after_init unsigned long text_poke_mm_addr;
2418
2419static void text_poke_memcpy(void *dst, const void *src, size_t len)
2420{
2421 memcpy(dst, src, len);
2422}
2423
2424static void text_poke_memset(void *dst, const void *src, size_t len)
2425{
2426 int c = *(const int *)src;
2427
2428 memset(dst, c, len);
2429}
2430
2431typedef void text_poke_f(void *dst, const void *src, size_t len);
2432
2433static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
2434{
2435 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
2436 struct page *pages[2] = {NULL};
2437 struct mm_struct *prev_mm;
2438 unsigned long flags;
2439 pte_t pte, *ptep;
2440 spinlock_t *ptl;
2441 pgprot_t pgprot;
2442
2443 /*
2444 * While the boot memory allocator is running we cannot use struct pages as
2445 * they are not yet initialized. There is no way to recover.
2446 */
2447 BUG_ON(!after_bootmem);
2448
2449 if (!core_kernel_text(addr: (unsigned long)addr)) {
2450 pages[0] = vmalloc_to_page(addr);
2451 if (cross_page_boundary)
2452 pages[1] = vmalloc_to_page(addr: addr + PAGE_SIZE);
2453 } else {
2454 pages[0] = virt_to_page(addr);
2455 WARN_ON(!PageReserved(pages[0]));
2456 if (cross_page_boundary)
2457 pages[1] = virt_to_page(addr + PAGE_SIZE);
2458 }
2459 /*
2460 * If something went wrong, crash and burn since recovery paths are not
2461 * implemented.
2462 */
2463 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
2464
2465 /*
2466 * Map the page without the global bit, as TLB flushing is done with
2467 * flush_tlb_mm_range(), which is intended for non-global PTEs.
2468 */
2469 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
2470
2471 /*
2472 * The lock is not really needed, but it allows us to avoid open-coding this.
2473 */
2474 ptep = get_locked_pte(mm: text_poke_mm, addr: text_poke_mm_addr, ptl: &ptl);
2475
2476 /*
2477 * This must not fail; preallocated in poking_init().
2478 */
2479 VM_BUG_ON(!ptep);
2480
2481 local_irq_save(flags);
2482
2483 pte = mk_pte(page: pages[0], pgprot);
2484 set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
2485
2486 if (cross_page_boundary) {
2487 pte = mk_pte(page: pages[1], pgprot);
2488 set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
2489 }
2490
2491 /*
2492 * Loading the temporary mm behaves as a compiler barrier, which
2493 * guarantees that the PTE will be set at the time memcpy() is done.
2494 */
2495 prev_mm = use_temporary_mm(temp_mm: text_poke_mm);
2496
2497 kasan_disable_current();
2498 func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
2499 kasan_enable_current();
2500
2501 /*
2502 * Use a compiler barrier to ensure that the PTE is only cleared after
2503 * the copy's stores have been issued.
2504 */
2505 barrier();
2506
2507 pte_clear(mm: text_poke_mm, addr: text_poke_mm_addr, ptep);
2508 if (cross_page_boundary)
2509 pte_clear(mm: text_poke_mm, addr: text_poke_mm_addr + PAGE_SIZE, ptep: ptep + 1);
2510
2511 /*
2512 * Loading the previous page-table hierarchy requires a serializing
2513 * instruction that already allows the core to see the updated version.
2514 * Xen-PV is assumed to serialize execution in a similar manner.
2515 */
2516 unuse_temporary_mm(prev_mm);
2517
2518 /*
2519 * Flushing the TLB might involve IPIs, which would require enabled
2520 * IRQs, but no IPIs are needed here since the mm is not in use on any other CPU.
2521 */
2522 flush_tlb_mm_range(mm: text_poke_mm, start: text_poke_mm_addr, end: text_poke_mm_addr +
2523 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
2524 PAGE_SHIFT, freed_tables: false);
2525
2526 if (func == text_poke_memcpy) {
2527 /*
2528 * If the text does not match what we just wrote then something is
2529 * fundamentally screwy; there's nothing we can really do about that.
2530 */
2531 BUG_ON(memcmp(addr, src, len));
2532 }
2533
2534 local_irq_restore(flags);
2535 pte_unmap_unlock(ptep, ptl);
2536 return addr;
2537}
2538
2539/**
2540 * text_poke - Update instructions on a live kernel
2541 * @addr: address to modify
2542 * @opcode: source of the copy
2543 * @len: length to copy
2544 *
2545 * Only atomic text poke/set should be allowed when not doing early patching.
2546 * It means the size must be writable atomically and the address must be aligned
2547 * in a way that permits an atomic write. It also makes sure we fit on a single
2548 * page.
2549 *
2550 * Note that the caller must ensure that if the modified code is part of a
2551 * module, the module would not be removed during poking. This can be achieved
2552 * by registering a module notifier, and ordering module removal and patching
2553 * through a mutex.
2554 */
2555void *text_poke(void *addr, const void *opcode, size_t len)
2556{
2557 lockdep_assert_held(&text_mutex);
2558
2559 return __text_poke(func: text_poke_memcpy, addr, src: opcode, len);
2560}
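/*
 * Minimal usage sketch (addr is hypothetical); the caller must hold
 * text_mutex and obey the constraints documented above:
 *
 *	u8 int3 = INT3_INSN_OPCODE;
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(addr, &int3, INT3_INSN_SIZE);
 *	mutex_unlock(&text_mutex);
 */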
2561
2562/**
2563 * text_poke_kgdb - Update instructions on a live kernel by kgdb
2564 * @addr: address to modify
2565 * @opcode: source of the copy
2566 * @len: length to copy
2567 *
2568 * Only atomic text poke/set should be allowed when not doing early patching.
2569 * It means the size must be writable atomically and the address must be aligned
2570 * in a way that permits an atomic write. It also makes sure we fit on a single
2571 * page.
2572 *
2573 * Context: should only be used by kgdb, which ensures no other core is running,
2574 * despite the fact it does not hold the text_mutex.
2575 */
2576void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2577{
2578 return __text_poke(func: text_poke_memcpy, addr, src: opcode, len);
2579}
2580
2581void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2582 bool core_ok)
2583{
2584 unsigned long start = (unsigned long)addr;
2585 size_t patched = 0;
2586
2587 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2588 return NULL;
2589
2590 while (patched < len) {
2591 unsigned long ptr = start + patched;
2592 size_t s;
2593
2594 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2595
2596 __text_poke(func: text_poke_memcpy, addr: (void *)ptr, src: opcode + patched, len: s);
2597 patched += s;
2598 }
2599 return addr;
2600}
2601
2602/**
2603 * text_poke_copy - Copy instructions into (an unused part of) RX memory
2604 * @addr: address to modify
2605 * @opcode: source of the copy
2606 * @len: length to copy, could be more than 2x PAGE_SIZE
2607 *
2608 * Not safe against concurrent execution; useful for JITs to dump
2609 * new code blocks into unused regions of RX memory. Can be used in
2610 * conjunction with synchronize_rcu_tasks() to wait for existing
2611 * execution to quiesce after having made sure no existing function
2612 * pointers are live.
2613 */
2614void *text_poke_copy(void *addr, const void *opcode, size_t len)
2615{
2616 mutex_lock(&text_mutex);
2617 addr = text_poke_copy_locked(addr, opcode, len, core_ok: false);
2618 mutex_unlock(lock: &text_mutex);
2619 return addr;
2620}
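/*
 * JIT-style usage sketch as described above; rx_buf, image and image_size
 * are hypothetical, with rx_buf being an executable region owned by the JIT:
 *
 *	if (!text_poke_copy(rx_buf, image, image_size))
 *		return -EINVAL;
 *
 * A NULL return means the destination was core kernel text, which this
 * helper refuses to touch.
 */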
2621
2622/**
2623 * text_poke_set - memset into (an unused part of) RX memory
2624 * @addr: address to modify
2625 * @c: the byte to fill the area with
2626 * @len: length to copy, could be more than 2x PAGE_SIZE
2627 *
2628 * This is useful to overwrite unused regions of RX memory with illegal
2629 * instructions.
2630 */
2631void *text_poke_set(void *addr, int c, size_t len)
2632{
2633 unsigned long start = (unsigned long)addr;
2634 size_t patched = 0;
2635
2636 if (WARN_ON_ONCE(core_kernel_text(start)))
2637 return NULL;
2638
2639 mutex_lock(&text_mutex);
2640 while (patched < len) {
2641 unsigned long ptr = start + patched;
2642 size_t s;
2643
2644 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2645
2646 __text_poke(func: text_poke_memset, addr: (void *)ptr, src: (void *)&c, len: s);
2647 patched += s;
2648 }
2649 mutex_unlock(lock: &text_mutex);
2650 return addr;
2651}
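/*
 * Usage sketch for the case described above: poisoning a no-longer-used RX
 * region (rx_buf and size are hypothetical) with INT3 bytes:
 *
 *	text_poke_set(rx_buf, INT3_INSN_OPCODE, size);
 */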
2652
2653static void do_sync_core(void *info)
2654{
2655 sync_core();
2656}
2657
2658void smp_text_poke_sync_each_cpu(void)
2659{
2660 on_each_cpu(func: do_sync_core, NULL, wait: 1);
2661}
2662
2663/*
2664 * NOTE: scheme to allow patching Jcc.d32 without increasing the size of
2665 * this struct: when len == 6 the instruction is prefixed with 0x0f and the
2666 * opcode field holds the corresponding Jcc.d8 opcode, using len to distinguish.
2667 */
2668struct smp_text_poke_loc {
2669 /* addr := _stext + rel_addr */
2670 s32 rel_addr;
2671 s32 disp;
2672 u8 len;
2673 u8 opcode;
2674 const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
2675 /* see smp_text_poke_batch_finish() */
2676 u8 old;
2677};
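/*
 * Worked example of the len == 6 encoding above, assuming a hypothetical
 * "jg" with a 32-bit displacement at the patch site:
 *
 *	new instruction:  0f 8f d2 04 00 00	jg rel32=0x4d2	(Jcc.d32, 6 bytes)
 *	stored ->text:	     8f d2 04 00 00	(leading 0x0f dropped, len == 6)
 *	->opcode:	  0x7f			(Jcc.d8 form, i.e. 0x8f - 0x10)
 *	->disp:		  0x4d2
 */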
2678
2679#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
2680
2681static struct smp_text_poke_array {
2682 struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
2683 int nr_entries;
2684} text_poke_array;
2685
2686static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
2687
2688/*
2689 * These four __always_inline annotations imply noinstr, necessary
2690 * due to smp_text_poke_int3_handler() being noinstr:
2691 */
2692
2693static __always_inline bool try_get_text_poke_array(void)
2694{
2695 atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
2696
2697 if (!raw_atomic_inc_not_zero(v: refs))
2698 return false;
2699
2700 return true;
2701}
2702
2703static __always_inline void put_text_poke_array(void)
2704{
2705 atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
2706
2707 smp_mb__before_atomic();
2708 raw_atomic_dec(v: refs);
2709}
2710
2711static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
2712{
2713 return _stext + tpl->rel_addr;
2714}
2715
2716static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
2717{
2718 if (tpl_a < text_poke_addr(tpl: tpl_b))
2719 return -1;
2720 if (tpl_a > text_poke_addr(tpl: tpl_b))
2721 return 1;
2722 return 0;
2723}
2724
2725noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
2726{
2727 struct smp_text_poke_loc *tpl;
2728 int ret = 0;
2729 void *ip;
2730
2731 if (user_mode(regs))
2732 return 0;
2733
2734 /*
2735 * Having observed our INT3 instruction, we now must observe
2736 * text_poke_array with non-zero refcount:
2737 *
2738 * text_poke_array_refs = 1 INT3
2739 * WMB RMB
2740 * write INT3 if (text_poke_array_refs != 0)
2741 */
2742 smp_rmb();
2743
2744 if (!try_get_text_poke_array())
2745 return 0;
2746
2747 /*
2748 * Discount the INT3. See smp_text_poke_batch_finish().
2749 */
2750 ip = (void *) regs->ip - INT3_INSN_SIZE;
2751
2752 /*
2753 * Skip the binary search if there is a single member in the vector.
2754 */
2755 if (unlikely(text_poke_array.nr_entries > 1)) {
2756 tpl = __inline_bsearch(key: ip, base: text_poke_array.vec, num: text_poke_array.nr_entries,
2757 size: sizeof(struct smp_text_poke_loc),
2758 cmp: patch_cmp);
2759 if (!tpl)
2760 goto out_put;
2761 } else {
2762 tpl = text_poke_array.vec;
2763 if (text_poke_addr(tpl) != ip)
2764 goto out_put;
2765 }
2766
2767 ip += tpl->len;
2768
2769 switch (tpl->opcode) {
2770 case INT3_INSN_OPCODE:
2771 /*
2772 * Someone poked an explicit INT3, they'll want to handle it,
2773 * do not consume.
2774 */
2775 goto out_put;
2776
2777 case RET_INSN_OPCODE:
2778 int3_emulate_ret(regs);
2779 break;
2780
2781 case CALL_INSN_OPCODE:
2782 int3_emulate_call(regs, func: (long)ip + tpl->disp);
2783 break;
2784
2785 case JMP32_INSN_OPCODE:
2786 case JMP8_INSN_OPCODE:
2787 int3_emulate_jmp(regs, ip: (long)ip + tpl->disp);
2788 break;
2789
2790 case 0x70 ... 0x7f: /* Jcc */
2791 int3_emulate_jcc(regs, cc: tpl->opcode & 0xf, ip: (long)ip, disp: tpl->disp);
2792 break;
2793
2794 default:
2795 BUG();
2796 }
2797
2798 ret = 1;
2799
2800out_put:
2801 put_text_poke_array();
2802 return ret;
2803}
2804
2805/**
2806 * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
2807 *
2808 * Input state:
2809 * text_poke_array.vec: vector of instructions to patch
2810 * text_poke_array.nr_entries: number of entries in the vector
2811 *
2812 * Modify multi-byte instructions by using INT3 breakpoints on SMP.
2813 * We completely avoid using stop_machine() here, and achieve the
2814 * synchronization using INT3 breakpoints and SMP cross-calls.
2815 *
2816 * The way it is done:
2817 * - For each entry in the vector:
2818 * - add an INT3 trap to the address that will be patched
2819 * - SMP sync all CPUs
2820 * - For each entry in the vector:
2821 * - update all but the first byte of the patched range
2822 * - SMP sync all CPUs
2823 * - For each entry in the vector:
2824 * - replace the first byte (INT3) by the first byte of the
2825 * replacing opcode
2826 * - SMP sync all CPUs
2827 */
2828void smp_text_poke_batch_finish(void)
2829{
2830 unsigned char int3 = INT3_INSN_OPCODE;
2831 unsigned int i;
2832 int do_sync;
2833
2834 if (!text_poke_array.nr_entries)
2835 return;
2836
2837 lockdep_assert_held(&text_mutex);
2838
2839 /*
2840 * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
2841 * ensure reading a non-zero refcount provides up-to-date text_poke_array data.
2842 */
2843 for_each_possible_cpu(i)
2844 atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), i: 1);
2845
2846 /*
2847 * Function tracing can enable thousands of places that need to be
2848 * updated. This can take quite some time, and with full kernel debugging
2849 * enabled, this could cause the softlockup watchdog to trigger.
2850 * This function gets called every 256 entries added to be patched.
2851 * Call cond_resched() here to make sure that other tasks can get scheduled
2852 * while processing all the functions being patched.
2853 */
2854 cond_resched();
2855
2856 /*
2857 * Corresponding read barrier in INT3 notifier for making sure the
2858 * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
2859 */
2860 smp_wmb();
2861
2862 /*
2863 * First step: add an INT3 trap to the address that will be patched.
2864 */
2865 for (i = 0; i < text_poke_array.nr_entries; i++) {
2866 text_poke_array.vec[i].old = *(u8 *)text_poke_addr(tpl: &text_poke_array.vec[i]);
2867 text_poke(addr: text_poke_addr(tpl: &text_poke_array.vec[i]), opcode: &int3, INT3_INSN_SIZE);
2868 }
2869
2870 smp_text_poke_sync_each_cpu();
2871
2872 /*
2873 * Second step: update all but the first byte of the patched range.
2874 */
2875 for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
2876 u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
2877 u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
2878 const u8 *new = text_poke_array.vec[i].text;
2879 int len = text_poke_array.vec[i].len;
2880
2881 if (len - INT3_INSN_SIZE > 0) {
2882 memcpy(old + INT3_INSN_SIZE,
2883 text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
2884 len - INT3_INSN_SIZE);
2885
2886 if (len == 6) {
2887 _new[0] = 0x0f;
2888 memcpy(_new + 1, new, 5);
2889 new = _new;
2890 }
2891
2892 text_poke(addr: text_poke_addr(tpl: &text_poke_array.vec[i]) + INT3_INSN_SIZE,
2893 opcode: new + INT3_INSN_SIZE,
2894 len: len - INT3_INSN_SIZE);
2895
2896 do_sync++;
2897 }
2898
2899 /*
2900 * Emit a perf event to record the text poke, primarily to
2901 * support Intel PT decoding which must walk the executable code
2902 * to reconstruct the trace. The flow up to here is:
2903 * - write INT3 byte
2904 * - IPI-SYNC
2905 * - write instruction tail
2906 * At this point the actual control flow will be through the
2907 * INT3 and handler and not hit the old or new instruction.
2908 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2909 * can still be decoded. Subsequently:
2910 * - emit RECORD_TEXT_POKE with the new instruction
2911 * - IPI-SYNC
2912 * - write first byte
2913 * - IPI-SYNC
2914 * So before the text poke event timestamp, the decoder will see
2915 * either the old instruction flow or FUP/TIP of INT3. After the
2916 * text poke event timestamp, the decoder will see either the
2917 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2918 * use the timestamp as the point at which to modify the
2919 * executable code.
2920 * The old instruction is recorded so that the event can be
2921 * processed forwards or backwards.
2922 */
2923 perf_event_text_poke(addr: text_poke_addr(tpl: &text_poke_array.vec[i]), old_bytes: old, old_len: len, new_bytes: new, new_len: len);
2924 }
2925
2926 if (do_sync) {
2927 /*
2928 * According to Intel, this core syncing is very likely
2929 * not necessary and we'd be safe even without it. But
2930 * better safe than sorry (plus there's not only Intel).
2931 */
2932 smp_text_poke_sync_each_cpu();
2933 }
2934
2935 /*
2936 * Third step: replace the first byte (INT3) by the first byte of the
2937 * replacing opcode.
2938 */
2939 for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
2940 u8 byte = text_poke_array.vec[i].text[0];
2941
2942 if (text_poke_array.vec[i].len == 6)
2943 byte = 0x0f;
2944
2945 if (byte == INT3_INSN_OPCODE)
2946 continue;
2947
2948 text_poke(addr: text_poke_addr(tpl: &text_poke_array.vec[i]), opcode: &byte, INT3_INSN_SIZE);
2949 do_sync++;
2950 }
2951
2952 if (do_sync)
2953 smp_text_poke_sync_each_cpu();
2954
2955 /*
2956 * Remove and wait for refs to be zero.
2957 *
2958 * Notably, if after step-3 above the INT3 got removed, then the
2959 * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
2960 * handlers and the below spin-wait will not happen.
2961 *
2962 * IOW. unless the replacement instruction is INT3, this case goes
2963 * unused.
2964 */
2965 for_each_possible_cpu(i) {
2966 atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
2967
2968 if (unlikely(!atomic_dec_and_test(refs)))
2969 atomic_cond_read_acquire(refs, !VAL);
2970 }
2971
2972 /* They are all completed: */
2973 text_poke_array.nr_entries = 0;
2974}
2975
2976static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
2977{
2978 struct smp_text_poke_loc *tpl;
2979 struct insn insn;
2980 int ret, i = 0;
2981
2982 tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
2983
2984 if (len == 6)
2985 i = 1;
2986 memcpy((void *)tpl->text, opcode+i, len-i);
2987 if (!emulate)
2988 emulate = opcode;
2989
2990 ret = insn_decode_kernel(&insn, emulate);
2991 BUG_ON(ret < 0);
2992
2993 tpl->rel_addr = addr - (void *)_stext;
2994 tpl->len = len;
2995 tpl->opcode = insn.opcode.bytes[0];
2996
2997 if (is_jcc32(insn: &insn)) {
2998 /*
2999 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
3000 */
3001 tpl->opcode = insn.opcode.bytes[1] - 0x10;
3002 }
3003
3004 switch (tpl->opcode) {
3005 case RET_INSN_OPCODE:
3006 case JMP32_INSN_OPCODE:
3007 case JMP8_INSN_OPCODE:
3008 /*
3009 * Control flow instructions without implied execution of the
3010 * next instruction can be padded with INT3.
3011 */
3012 for (i = insn.length; i < len; i++)
3013 BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
3014 break;
3015
3016 default:
3017 BUG_ON(len != insn.length);
3018 }
3019
3020 switch (tpl->opcode) {
3021 case INT3_INSN_OPCODE:
3022 case RET_INSN_OPCODE:
3023 break;
3024
3025 case CALL_INSN_OPCODE:
3026 case JMP32_INSN_OPCODE:
3027 case JMP8_INSN_OPCODE:
3028 case 0x70 ... 0x7f: /* Jcc */
3029 tpl->disp = insn.immediate.value;
3030 break;
3031
3032 default: /* assume NOP */
3033 switch (len) {
3034 case 2: /* NOP2 -- emulate as JMP8+0 */
3035 BUG_ON(memcmp(emulate, x86_nops[len], len));
3036 tpl->opcode = JMP8_INSN_OPCODE;
3037 tpl->disp = 0;
3038 break;
3039
3040 case 5: /* NOP5 -- emulate as JMP32+0 */
3041 BUG_ON(memcmp(emulate, x86_nops[len], len));
3042 tpl->opcode = JMP32_INSN_OPCODE;
3043 tpl->disp = 0;
3044 break;
3045
3046 default: /* unknown instruction */
3047 BUG();
3048 }
3049 break;
3050 }
3051}
3052
3053/*
3054 * We rely hard on text_poke_array.vec being ordered by address; ensure this
3055 * is so by flushing early if needed.
3056 */
3057static bool text_poke_addr_ordered(void *addr)
3058{
3059 WARN_ON_ONCE(!addr);
3060
3061 if (!text_poke_array.nr_entries)
3062 return true;
3063
3064 /*
3065 * If the current last entry's address is higher than the address
3066 * of the new entry we'd like to add, then ordering
3067 * is violated and we must first flush all pending patching
3068 * requests:
3069 */
3070 if (text_poke_addr(tpl: text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
3071 return false;
3072
3073 return true;
3074}
3075
3076/**
3077 * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
3078 * @addr: address to patch
3079 * @opcode: opcode of new instruction
3080 * @len: length to copy
3081 * @emulate: instruction to be emulated
3082 *
3083 * Add a new instruction to the current queue of to-be-patched instructions
3084 * the kernel maintains. The patching request will not be executed immediately,
3085 * but becomes part of an array of patching requests, optimized for batched
3086 * execution. All pending patching requests will be executed on the next
3087 * smp_text_poke_batch_finish() call.
3088 */
3089void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
3090{
3091 if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
3092 smp_text_poke_batch_finish();
3093 __smp_text_poke_batch_add(addr, opcode, len, emulate);
3094}
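/*
 * Batched usage sketch, assuming text_mutex is held and hypothetical arrays
 * site[] and new_insn[] describing nr_sites prepared instructions of length
 * len, sorted by address:
 *
 *	for (i = 0; i < nr_sites; i++)
 *		smp_text_poke_batch_add(site[i], new_insn[i], len, NULL);
 *	smp_text_poke_batch_finish();
 */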
3095
3096/**
3097 * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
3098 * @addr: address to patch
3099 * @opcode: opcode of new instruction
3100 * @len: length to copy
3101 * @emulate: instruction to be emulated
3102 *
3103 * Update a single instruction without batching, avoiding dynamically
3104 * allocated memory: the instruction is queued as a single entry in the
3105 * static patching array and flushed immediately via
3106 * smp_text_poke_batch_finish().
3107 */
3108void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
3109{
3110 __smp_text_poke_batch_add(addr, opcode, len, emulate);
3111 smp_text_poke_batch_finish();
3112}
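/*
 * Minimal usage sketch, assuming text_mutex is held and a hypothetical
 * 5-byte site that should become a NOP5 (emulated as JMP32+0, see
 * __smp_text_poke_batch_add() above):
 *
 *	smp_text_poke_single(addr, x86_nops[5], 5, NULL);
 */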
3113
