1 // SPDX-License-Identifier: GPL-2.0-only
2 #define pr_fmt(fmt) "SMP alternatives: " fmt
3
4 #include <linux/module.h>
5 #include <linux/sched.h>
6 #include <linux/perf_event.h>
7 #include <linux/mutex.h>
8 #include <linux/list.h>
9 #include <linux/stringify.h>
10 #include <linux/highmem.h>
11 #include <linux/mm.h>
12 #include <linux/vmalloc.h>
13 #include <linux/memory.h>
14 #include <linux/stop_machine.h>
15 #include <linux/slab.h>
16 #include <linux/kdebug.h>
17 #include <linux/kprobes.h>
18 #include <linux/mmu_context.h>
19 #include <linux/bsearch.h>
20 #include <linux/sync_core.h>
21 #include <asm/text-patching.h>
22 #include <asm/alternative.h>
23 #include <asm/sections.h>
24 #include <asm/mce.h>
25 #include <asm/nmi.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/insn.h>
29 #include <asm/io.h>
30 #include <asm/fixmap.h>
31 #include <asm/paravirt.h>
32 #include <asm/asm-prototypes.h>
33
34 int __read_mostly alternatives_patched;
35
36 EXPORT_SYMBOL_GPL(alternatives_patched);
37
38 #define MAX_PATCH_LEN (255-1)
39
40 static int __initdata_or_module debug_alternative;
41
42 static int __init debug_alt(char *str)
43 {
44 debug_alternative = 1;
45 return 1;
46 }
47 __setup("debug-alternative", debug_alt);
48
49 static int noreplace_smp;
50
51 static int __init setup_noreplace_smp(char *str)
52 {
53 noreplace_smp = 1;
54 return 1;
55 }
56 __setup("noreplace-smp", setup_noreplace_smp);
57
58 #define DPRINTK(fmt, args...) \
59 do { \
60 if (debug_alternative) \
61 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
62 } while (0)
63
64 #define DUMP_BYTES(buf, len, fmt, args...) \
65 do { \
66 if (unlikely(debug_alternative)) { \
67 int j; \
68 \
69 if (!(len)) \
70 break; \
71 \
72 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
73 for (j = 0; j < (len) - 1; j++) \
74 printk(KERN_CONT "%02hhx ", buf[j]); \
75 printk(KERN_CONT "%02hhx\n", buf[j]); \
76 } \
77 } while (0)
78
79 static const unsigned char x86nops[] =
80 {
81 BYTES_NOP1,
82 BYTES_NOP2,
83 BYTES_NOP3,
84 BYTES_NOP4,
85 BYTES_NOP5,
86 BYTES_NOP6,
87 BYTES_NOP7,
88 BYTES_NOP8,
89 };
90
91 const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
92 {
93 NULL,
94 x86nops,
95 x86nops + 1,
96 x86nops + 1 + 2,
97 x86nops + 1 + 2 + 3,
98 x86nops + 1 + 2 + 3 + 4,
99 x86nops + 1 + 2 + 3 + 4 + 5,
100 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
101 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
102 };
103
104 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
105 static void __init_or_module add_nops(void *insns, unsigned int len)
106 {
107 while (len > 0) {
108 unsigned int noplen = len;
109 if (noplen > ASM_NOP_MAX)
110 noplen = ASM_NOP_MAX;
111 memcpy(insns, x86_nops[noplen], noplen);
112 insns += noplen;
113 len -= noplen;
114 }
115 }
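/*
 * For illustration: add_nops() greedily emits the largest NOP it can on each
 * pass, so a 10-byte hole becomes one 8-byte NOP followed by one 2-byte NOP
 * (ASM_NOP_MAX is 8 here).
 */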
116
117 extern s32 __retpoline_sites[], __retpoline_sites_end[];
118 extern s32 __return_sites[], __return_sites_end[];
119 extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
120 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
121 extern s32 __smp_locks[], __smp_locks_end[];
122 void text_poke_early(void *addr, const void *opcode, size_t len);
123
124 /*
125 * Are we looking at a near JMP with a 1 or 4-byte displacement?
126 */
127 static inline bool is_jmp(const u8 opcode)
128 {
129 return opcode == 0xeb || opcode == 0xe9;
130 }
131
132 static void __init_or_module
133 recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
134 {
135 u8 *next_rip, *tgt_rip;
136 s32 n_dspl, o_dspl;
137 int repl_len;
138
139 if (a->replacementlen != 5)
140 return;
141
142 o_dspl = *(s32 *)(insn_buff + 1);
143
144 /* next_rip of the replacement JMP */
145 next_rip = repl_insn + a->replacementlen;
146 /* target rip of the replacement JMP */
147 tgt_rip = next_rip + o_dspl;
148 n_dspl = tgt_rip - orig_insn;
149
150 DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
151
152 if (tgt_rip - orig_insn >= 0) {
153 if (n_dspl - 2 <= 127)
154 goto two_byte_jmp;
155 else
156 goto five_byte_jmp;
157 /* negative offset */
158 } else {
159 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
160 goto two_byte_jmp;
161 else
162 goto five_byte_jmp;
163 }
164
165 two_byte_jmp:
166 n_dspl -= 2;
167
168 insn_buff[0] = 0xeb;
169 insn_buff[1] = (s8)n_dspl;
170 add_nops(insn_buff + 2, 3);
171
172 repl_len = 2;
173 goto done;
174
175 five_byte_jmp:
176 n_dspl -= 5;
177
178 insn_buff[0] = 0xe9;
179 *(s32 *)&insn_buff[1] = n_dspl;
180
181 repl_len = 5;
182
183 done:
184
185 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
186 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
187 }
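/*
 * Example (illustrative): if the replacement holds "e9 xx xx xx xx" (JMP.d32)
 * and the jump target fits within a signed 8-bit displacement of the original
 * site, the buffer is rewritten as "eb disp8" plus one 3-byte NOP; otherwise
 * the 5-byte JMP.d32 is kept, with its displacement recomputed relative to
 * orig_insn.
 */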
188
189 /*
190 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
191 *
192 * @instr: instruction byte stream
193 * @instrlen: length of the above
194 * @off: offset within @instr where the first NOP has been detected
195 *
196 * Return: number of NOPs found (and replaced).
197 */
198 static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
199 {
200 unsigned long flags;
201 int i = off, nnops;
202
203 while (i < instrlen) {
204 if (instr[i] != 0x90)
205 break;
206
207 i++;
208 }
209
210 nnops = i - off;
211
212 if (nnops <= 1)
213 return nnops;
214
215 local_irq_save(flags);
216 add_nops(instr + off, nnops);
217 local_irq_restore(flags);
218
219 DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
220
221 return nnops;
222 }
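/*
 * E.g. a run of four single-byte NOPs "90 90 90 90" in the patched range is
 * rewritten in place as the single 4-byte NOP from x86_nops[4]; a lone 0x90
 * is left untouched (nnops <= 1 above).
 */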
223
224 /*
225 * "noinline" to cause control flow change and thus invalidate I$ and
226 * cause refetch after modification.
227 */
228 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
229 {
230 struct insn insn;
231 int i = 0;
232
233 /*
234 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
235 * ones.
236 */
237 for (;;) {
238 if (insn_decode_kernel(&insn, &instr[i]))
239 return;
240
241 /*
242 * See if this and any potentially following NOPs can be
243 * optimized.
244 */
245 if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
246 i += optimize_nops_range(instr, len, i);
247 else
248 i += insn.length;
249
250 if (i >= len)
251 return;
252 }
253 }
254
255 /*
256 * Replace instructions with better alternatives for this CPU type. This runs
257 * before SMP is initialized to avoid SMP problems with self modifying code.
258 * This implies that asymmetric systems where APs have fewer capabilities than
259 * the boot processor are not handled. Tough. Make sure you disable such
260 * features by hand.
261 *
262 * Marked "noinline" to cause control flow change and thus insn cache
263 * to refetch changed I$ lines.
264 */
265 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
266 struct alt_instr *end)
267 {
268 struct alt_instr *a;
269 u8 *instr, *replacement;
270 u8 insn_buff[MAX_PATCH_LEN];
271
272 DPRINTK("alt table %px, -> %px", start, end);
273 /*
274 * The scan order should be from start to end. A later scanned
275 * alternative code can overwrite previously scanned alternative code.
276 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
277 * patch code.
278 *
279 * So be careful if you want to change the scan order to any other
280 * order.
281 */
282 for (a = start; a < end; a++) {
283 int insn_buff_sz = 0;
284 /* Mask away "NOT" flag bit for feature to test. */
285 u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
286
287 instr = (u8 *)&a->instr_offset + a->instr_offset;
288 replacement = (u8 *)&a->repl_offset + a->repl_offset;
289 BUG_ON(a->instrlen > sizeof(insn_buff));
290 BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
291
292 /*
293 * Patch if either:
294 * - feature is present
295 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
296 * patch if feature is *NOT* present.
297 */
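/*
 * Decision table for the test below (patch when exactly one of the
 * two conditions holds):
 *
 *   feature present | ALTINSTR_FLAG_INV | action
 *   ----------------+-------------------+--------
 *        yes        |       clear       |  patch
 *        no         |       clear       |  skip
 *        yes        |       set         |  skip
 *        no         |       set         |  patch
 */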
298 if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
299 goto next;
300
301 DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
302 (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
303 feature >> 5,
304 feature & 0x1f,
305 instr, instr, a->instrlen,
306 replacement, a->replacementlen);
307
308 DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
309 DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
310
311 memcpy(insn_buff, replacement, a->replacementlen);
312 insn_buff_sz = a->replacementlen;
313
314 /*
315 * 0xe8 is a relative CALL; fix the offset.
316 *
317 * Instruction length is checked before the opcode to avoid
318 * accessing uninitialized bytes for zero-length replacements.
319 */
320 if (a->replacementlen == 5 && *insn_buff == 0xe8) {
321 *(s32 *)(insn_buff + 1) += replacement - instr;
322 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
323 *(s32 *)(insn_buff + 1),
324 (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
325 }
326
327 if (a->replacementlen && is_jmp(replacement[0]))
328 recompute_jump(a, instr, replacement, insn_buff);
329
330 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
331 insn_buff[insn_buff_sz] = 0x90;
332
333 DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
334
335 text_poke_early(instr, insn_buff, insn_buff_sz);
336
337 next:
338 optimize_nops(instr, a->instrlen);
339 }
340 }
341
342 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
343
344 /*
345 * CALL/JMP *%\reg
346 */
347 static int emit_indirect(int op, int reg, u8 *bytes)
348 {
349 int i = 0;
350 u8 modrm;
351
352 switch (op) {
353 case CALL_INSN_OPCODE:
354 modrm = 0x10; /* Reg = 2; CALL r/m */
355 break;
356
357 case JMP32_INSN_OPCODE:
358 modrm = 0x20; /* Reg = 4; JMP r/m */
359 break;
360
361 default:
362 WARN_ON_ONCE(1);
363 return -1;
364 }
365
366 if (reg >= 8) {
367 bytes[i++] = 0x41; /* REX.B prefix */
368 reg -= 8;
369 }
370
371 modrm |= 0xc0; /* Mod = 3 */
372 modrm += reg;
373
374 bytes[i++] = 0xff; /* opcode */
375 bytes[i++] = modrm;
376
377 return i;
378 }
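/*
 * Resulting encodings, for reference: "CALL *%rax" is ff d0, "JMP *%rcx" is
 * ff e1, and r8-r15 gain a REX.B prefix, e.g. "JMP *%r12" is 41 ff e4.
 */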
379
380 /*
381 * Rewrite the compiler generated retpoline thunk calls.
382 *
383 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
384 * indirect instructions, avoiding the extra indirection.
385 *
386 * For example, convert:
387 *
388 * CALL __x86_indirect_thunk_\reg
389 *
390 * into:
391 *
392 * CALL *%\reg
393 *
394 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
395 */
396 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
397 {
398 retpoline_thunk_t *target;
399 int reg, ret, i = 0;
400 u8 op, cc;
401
402 target = addr + insn->length + insn->immediate.value;
403 reg = target - __x86_indirect_thunk_array;
404
405 if (WARN_ON_ONCE(reg & ~0xf))
406 return -1;
407
408 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
409 BUG_ON(reg == 4);
410
411 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
412 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
413 return -1;
414
415 op = insn->opcode.bytes[0];
416
417 /*
418 * Convert:
419 *
420 * Jcc.d32 __x86_indirect_thunk_\reg
421 *
422 * into:
423 *
424 * Jncc.d8 1f
425 * [ LFENCE ]
426 * JMP *%\reg
427 * [ NOP ]
428 * 1:
429 */
430 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
431 if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
432 cc = insn->opcode.bytes[1] & 0xf;
433 cc ^= 1; /* invert condition */
434
435 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
436 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
437
438 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
439 op = JMP32_INSN_OPCODE;
440 }
441
442 /*
443 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
444 */
445 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
446 bytes[i++] = 0x0f;
447 bytes[i++] = 0xae;
448 bytes[i++] = 0xe8; /* LFENCE */
449 }
450
451 ret = emit_indirect(op, reg, bytes + i);
452 if (ret < 0)
453 return ret;
454 i += ret;
455
456 /*
457 * The compiler is supposed to EMIT an INT3 after every unconditional
458 * JMP instruction due to AMD BTC. However, if the compiler is too old
459 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
460 * even on Intel.
461 */
462 if (op == JMP32_INSN_OPCODE && i < insn->length)
463 bytes[i++] = INT3_INSN_OPCODE;
464
465 for (; i < insn->length;)
466 bytes[i++] = BYTES_NOP1;
467
468 return i;
469 }
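/*
 * Worked example (illustrative, spectre_v2=off, i.e. !X86_FEATURE_RETPOLINE):
 * the 5-byte "e8 xx xx xx xx" CALL to __x86_indirect_thunk_r11 becomes
 * "41 ff d3" (CALL *%r11) padded with two 1-byte NOPs. With retpoline,lfence
 * "0f ae e8 ff d0" (LFENCE; CALL *%rax) still fits in the 5 bytes for the low
 * registers, while r8-r15 would need 6 bytes, so those call sites keep
 * calling the thunk.
 */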
470
471 /*
472 * Generated by 'objtool --retpoline'.
473 */
474 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
475 {
476 s32 *s;
477
478 for (s = start; s < end; s++) {
479 void *addr = (void *)s + *s;
480 struct insn insn;
481 int len, ret;
482 u8 bytes[16];
483 u8 op1, op2;
484
485 ret = insn_decode_kernel(&insn, addr);
486 if (WARN_ON_ONCE(ret < 0))
487 continue;
488
489 op1 = insn.opcode.bytes[0];
490 op2 = insn.opcode.bytes[1];
491
492 switch (op1) {
493 case CALL_INSN_OPCODE:
494 case JMP32_INSN_OPCODE:
495 break;
496
497 case 0x0f: /* escape */
498 if (op2 >= 0x80 && op2 <= 0x8f)
499 break;
500 fallthrough;
501 default:
502 WARN_ON_ONCE(1);
503 continue;
504 }
505
506 DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
507 addr, addr, insn.length,
508 addr + insn.length + insn.immediate.value);
509
510 len = patch_retpoline(addr, &insn, bytes);
511 if (len == insn.length) {
512 optimize_nops(bytes, len);
513 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
514 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
515 text_poke_early(addr, bytes, len);
516 }
517 }
518 }
519
520 #ifdef CONFIG_RETHUNK
521 /*
522 * Rewrite the compiler generated return thunk tail-calls.
523 *
524 * For example, convert:
525 *
526 * JMP __x86_return_thunk
527 *
528 * into:
529 *
530 * RET
531 */
532 static int patch_return(void *addr, struct insn *insn, u8 *bytes)
533 {
534 int i = 0;
535
536 if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
537 return -1;
538
539 bytes[i++] = RET_INSN_OPCODE;
540
541 for (; i < insn->length;)
542 bytes[i++] = INT3_INSN_OPCODE;
543
544 return i;
545 }
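/*
 * E.g. the 5-byte tail-call "e9 xx xx xx xx" (JMP __x86_return_thunk) is
 * rewritten as "c3 cc cc cc cc": a bare RET followed by INT3 padding, which
 * also serves as a straight-line-speculation trap behind the RET.
 */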
546
547 void __init_or_module noinline apply_returns(s32 *start, s32 *end)
548 {
549 s32 *s;
550
551 for (s = start; s < end; s++) {
552 void *dest = NULL, *addr = (void *)s + *s;
553 struct insn insn;
554 int len, ret;
555 u8 bytes[16];
556 u8 op;
557
558 ret = insn_decode_kernel(&insn, addr);
559 if (WARN_ON_ONCE(ret < 0))
560 continue;
561
562 op = insn.opcode.bytes[0];
563 if (op == JMP32_INSN_OPCODE)
564 dest = addr + insn.length + insn.immediate.value;
565
566 if (__static_call_fixup(addr, op, dest) ||
567 WARN_ONCE(dest != &__x86_return_thunk,
568 "missing return thunk: %pS-%pS: %*ph",
569 addr, dest, 5, addr))
570 continue;
571
572 DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
573 addr, addr, insn.length,
574 addr + insn.length + insn.immediate.value);
575
576 len = patch_return(addr, &insn, bytes);
577 if (len == insn.length) {
578 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
579 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
580 text_poke_early(addr, bytes, len);
581 }
582 }
583 }
584 #else
585 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
586 #endif /* CONFIG_RETHUNK */
587
588 #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
589
590 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
591 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
592
593 #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
594
595 #ifdef CONFIG_X86_KERNEL_IBT
596
597 /*
598 * Generated by: objtool --ibt
599 */
600 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
601 {
602 s32 *s;
603
604 for (s = start; s < end; s++) {
605 u32 endbr, poison = gen_endbr_poison();
606 void *addr = (void *)s + *s;
607
608 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
609 continue;
610
611 if (WARN_ON_ONCE(!is_endbr(endbr)))
612 continue;
613
614 DPRINTK("ENDBR at: %pS (%px)", addr, addr);
615
616 /*
617 * When we have IBT, the lack of ENDBR will trigger #CP
618 */
619 DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
620 DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
621 text_poke_early(addr, &poison, 4);
622 }
623 }
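/*
 * Concretely: a sealed site has its 4-byte ENDBR64 (f3 0f 1e fa) overwritten
 * with the 4-byte poison from gen_endbr_poison() -- a NOP-like encoding that
 * is no longer a valid indirect-branch target, so the address cannot be used
 * as a gadget to bypass IBT.
 */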
624
625 #else
626
627 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
628
629 #endif /* CONFIG_X86_KERNEL_IBT */
630
631 #ifdef CONFIG_SMP
632 static void alternatives_smp_lock(const s32 *start, const s32 *end,
633 u8 *text, u8 *text_end)
634 {
635 const s32 *poff;
636
637 for (poff = start; poff < end; poff++) {
638 u8 *ptr = (u8 *)poff + *poff;
639
640 if (!*poff || ptr < text || ptr >= text_end)
641 continue;
642 /* turn DS segment override prefix into lock prefix */
643 if (*ptr == 0x3e)
644 text_poke(ptr, ((unsigned char []){0xf0}), 1);
645 }
646 }
647
648 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
649 u8 *text, u8 *text_end)
650 {
651 const s32 *poff;
652
653 for (poff = start; poff < end; poff++) {
654 u8 *ptr = (u8 *)poff + *poff;
655
656 if (!*poff || ptr < text || ptr >= text_end)
657 continue;
658 /* turn lock prefix into DS segment override prefix */
659 if (*ptr == 0xf0)
660 text_poke(ptr, ((unsigned char []){0x3E}), 1);
661 }
662 }
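/*
 * The __smp_locks[] entries come from the LOCK_PREFIX machinery in
 * <asm/alternative.h>: each locked instruction starts out with an 0xf0 LOCK
 * prefix, and the two helpers above simply toggle that first byte between
 * LOCK (0xf0) and a harmless DS segment override (0x3e) -- e.g. "lock; addl"
 * vs "ds; addl" -- depending on whether more than one CPU can be running.
 */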
663
664 struct smp_alt_module {
665 /* the module these lock sites belong to; NULL for the core kernel */
666 struct module *mod;
667 char *name;
668
669 /* ptrs to lock prefixes */
670 const s32 *locks;
671 const s32 *locks_end;
672
673 /* .text segment, needed to avoid patching init code ;) */
674 u8 *text;
675 u8 *text_end;
676
677 struct list_head next;
678 };
679 static LIST_HEAD(smp_alt_modules);
680 static bool uniproc_patched = false; /* protected by text_mutex */
681
682 void __init_or_module alternatives_smp_module_add(struct module *mod,
683 char *name,
684 void *locks, void *locks_end,
685 void *text, void *text_end)
686 {
687 struct smp_alt_module *smp;
688
689 mutex_lock(&text_mutex);
690 if (!uniproc_patched)
691 goto unlock;
692
693 if (num_possible_cpus() == 1)
694 /* Don't bother remembering, we'll never have to undo it. */
695 goto smp_unlock;
696
697 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
698 if (!smp)
699 /* we'll run the (safe but slow) SMP code then ... */
700 goto unlock;
701
702 smp->mod = mod;
703 smp->name = name;
704 smp->locks = locks;
705 smp->locks_end = locks_end;
706 smp->text = text;
707 smp->text_end = text_end;
708 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
709 smp->locks, smp->locks_end,
710 smp->text, smp->text_end, smp->name);
711
712 list_add_tail(&smp->next, &smp_alt_modules);
713 smp_unlock:
714 alternatives_smp_unlock(locks, locks_end, text, text_end);
715 unlock:
716 mutex_unlock(&text_mutex);
717 }
718
719 void __init_or_module alternatives_smp_module_del(struct module *mod)
720 {
721 struct smp_alt_module *item;
722
723 mutex_lock(&text_mutex);
724 list_for_each_entry(item, &smp_alt_modules, next) {
725 if (mod != item->mod)
726 continue;
727 list_del(&item->next);
728 kfree(item);
729 break;
730 }
731 mutex_unlock(&text_mutex);
732 }
733
734 void alternatives_enable_smp(void)
735 {
736 struct smp_alt_module *mod;
737
738 /* Why bother if there are no other CPUs? */
739 BUG_ON(num_possible_cpus() == 1);
740
741 mutex_lock(&text_mutex);
742
743 if (uniproc_patched) {
744 pr_info("switching to SMP code\n");
745 BUG_ON(num_online_cpus() != 1);
746 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
747 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
748 list_for_each_entry(mod, &smp_alt_modules, next)
749 alternatives_smp_lock(mod->locks, mod->locks_end,
750 mod->text, mod->text_end);
751 uniproc_patched = false;
752 }
753 mutex_unlock(&text_mutex);
754 }
755
756 /*
757 * Return 1 if the address range is reserved for SMP-alternatives.
758 * Must hold text_mutex.
759 */
760 int alternatives_text_reserved(void *start, void *end)
761 {
762 struct smp_alt_module *mod;
763 const s32 *poff;
764 u8 *text_start = start;
765 u8 *text_end = end;
766
767 lockdep_assert_held(&text_mutex);
768
769 list_for_each_entry(mod, &smp_alt_modules, next) {
770 if (mod->text > text_end || mod->text_end < text_start)
771 continue;
772 for (poff = mod->locks; poff < mod->locks_end; poff++) {
773 const u8 *ptr = (const u8 *)poff + *poff;
774
775 if (text_start <= ptr && text_end > ptr)
776 return 1;
777 }
778 }
779
780 return 0;
781 }
782 #endif /* CONFIG_SMP */
783
784 #ifdef CONFIG_PARAVIRT
785 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
786 struct paravirt_patch_site *end)
787 {
788 struct paravirt_patch_site *p;
789 char insn_buff[MAX_PATCH_LEN];
790
791 for (p = start; p < end; p++) {
792 unsigned int used;
793
794 BUG_ON(p->len > MAX_PATCH_LEN);
795 /* prep the buffer with the original instructions */
796 memcpy(insn_buff, p->instr, p->len);
797 used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
798
799 BUG_ON(used > p->len);
800
801 /* Pad the rest with nops */
802 add_nops(insn_buff + used, p->len - used);
803 text_poke_early(p->instr, insn_buff, p->len);
804 }
805 }
806 extern struct paravirt_patch_site __start_parainstructions[],
807 __stop_parainstructions[];
808 #endif /* CONFIG_PARAVIRT */
809
810 /*
811 * Self-test for the INT3 based CALL emulation code.
812 *
813 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
814 * properly and that there is a stack gap between the INT3 frame and the
815 * previous context. Without this gap doing a virtual PUSH on the interrupted
816 * stack would corrupt the INT3 IRET frame.
817 *
818 * See entry_{32,64}.S for more details.
819 */
820
821 /*
822 * We define the int3_magic() function in assembly to control the calling
823 * convention such that we can 'call' it from assembly.
824 */
825
826 extern void int3_magic(unsigned int *ptr); /* defined in asm */
827
828 asm (
829 " .pushsection .init.text, \"ax\", @progbits\n"
830 " .type int3_magic, @function\n"
831 "int3_magic:\n"
832 ANNOTATE_NOENDBR
833 " movl $1, (%" _ASM_ARG1 ")\n"
834 ASM_RET
835 " .size int3_magic, .-int3_magic\n"
836 " .popsection\n"
837 );
838
839 extern void int3_selftest_ip(void); /* defined in asm below */
840
841 static int __init
842 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
843 {
844 unsigned long selftest = (unsigned long)&int3_selftest_ip;
845 struct die_args *args = data;
846 struct pt_regs *regs = args->regs;
847
848 OPTIMIZER_HIDE_VAR(selftest);
849
850 if (!regs || user_mode(regs))
851 return NOTIFY_DONE;
852
853 if (val != DIE_INT3)
854 return NOTIFY_DONE;
855
856 if (regs->ip - INT3_INSN_SIZE != selftest)
857 return NOTIFY_DONE;
858
859 int3_emulate_call(regs, (unsigned long)&int3_magic);
860 return NOTIFY_STOP;
861 }
862
863 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */
864 static noinline void __init int3_selftest(void)
865 {
866 static __initdata struct notifier_block int3_exception_nb = {
867 .notifier_call = int3_exception_notify,
868 .priority = INT_MAX-1, /* last */
869 };
870 unsigned int val = 0;
871
872 BUG_ON(register_die_notifier(&int3_exception_nb));
873
874 /*
875 * Basically: int3_magic(&val); but really complicated :-)
876 *
877 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
878 * notifier above will emulate CALL for us.
879 */
880 asm volatile ("int3_selftest_ip:\n\t"
881 ANNOTATE_NOENDBR
882 " int3; nop; nop; nop; nop\n\t"
883 : ASM_CALL_CONSTRAINT
884 : __ASM_SEL_RAW(a, D) (&val)
885 : "memory");
886
887 BUG_ON(val != 1);
888
889 unregister_die_notifier(&int3_exception_nb);
890 }
891
892 void __init alternative_instructions(void)
893 {
894 int3_selftest();
895
896 /*
897 * The patching is not fully atomic, so try to avoid local
898 * interruptions that might execute the code being patched.
899 * Other CPUs are not running.
900 */
901 stop_nmi();
902
903 /*
904 * Don't stop machine check exceptions while patching.
905 * MCEs only happen when something got corrupted and in this
906 * case we must do something about the corruption.
907 * Ignoring it is worse than an unlikely patching race.
908 * Also machine checks tend to be broadcast and if one CPU
909 * goes into machine check the others follow quickly, so we don't
910 * expect a machine check to cause undue problems during code
911 * patching.
912 */
913
914 /*
915 * Paravirt patching and alternative patching can be combined to
916 * replace a function call with a short direct code sequence (e.g.
917 * by setting a constant return value instead of doing that in an
918 * external function).
919 * In order to make this work the following sequence is required:
920 * 1. set (artificial) features depending on used paravirt
921 * functions which can later influence alternative patching
922 * 2. apply paravirt patching (generally replacing an indirect
923 * function call with a direct one)
924 * 3. apply alternative patching (e.g. replacing a direct function
925 * call with a custom code sequence)
926 * Doing paravirt patching after alternative patching would clobber
927 * the optimization of the custom code with a function call again.
928 */
929 paravirt_set_cap();
930
931 /*
932 * First patch paravirt functions, such that we overwrite the indirect
933 * call with the direct call.
934 */
935 apply_paravirt(__parainstructions, __parainstructions_end);
936
937 /*
938 * Rewrite the retpolines, must be done before alternatives since
939 * those can rewrite the retpoline thunks.
940 */
941 apply_retpolines(__retpoline_sites, __retpoline_sites_end);
942 apply_returns(__return_sites, __return_sites_end);
943
944 /*
945 * Then patch alternatives, such that those paravirt calls that are in
946 * alternatives can be overwritten by their immediate fragments.
947 */
948 apply_alternatives(__alt_instructions, __alt_instructions_end);
949
950 apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
951
952 #ifdef CONFIG_SMP
953 /* Patch to UP if other cpus not imminent. */
954 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
955 uniproc_patched = true;
956 alternatives_smp_module_add(NULL, "core kernel",
957 __smp_locks, __smp_locks_end,
958 _text, _etext);
959 }
960
961 if (!uniproc_patched || num_possible_cpus() == 1) {
962 free_init_pages("SMP alternatives",
963 (unsigned long)__smp_locks,
964 (unsigned long)__smp_locks_end);
965 }
966 #endif
967
968 restart_nmi();
969 alternatives_patched = 1;
970 }
971
972 /**
973 * text_poke_early - Update instructions on a live kernel at boot time
974 * @addr: address to modify
975 * @opcode: source of the copy
976 * @len: length to copy
977 *
978 * When you use this code to patch more than one byte of an instruction
979 * you need to make sure that other CPUs cannot execute this code in parallel.
980 * Also no thread must be currently preempted in the middle of these
981 * instructions. And on the local CPU you need to be protected against NMI or
982 * MCE handlers seeing an inconsistent instruction while you patch.
983 */
984 void __init_or_module text_poke_early(void *addr, const void *opcode,
985 size_t len)
986 {
987 unsigned long flags;
988
989 if (boot_cpu_has(X86_FEATURE_NX) &&
990 is_module_text_address((unsigned long)addr)) {
991 /*
992 * Modules text is marked initially as non-executable, so the
993 * code cannot be running and speculative code-fetches are
994 * prevented. Just change the code.
995 */
996 memcpy(addr, opcode, len);
997 } else {
998 local_irq_save(flags);
999 memcpy(addr, opcode, len);
1000 local_irq_restore(flags);
1001 sync_core();
1002
1003 /*
1004 * Could also do a CLFLUSH here to speed up CPU recovery; but
1005 * that causes hangs on some VIA CPUs.
1006 */
1007 }
1008 }
1009
1010 typedef struct {
1011 struct mm_struct *mm;
1012 } temp_mm_state_t;
1013
1014 /*
1015 * Using a temporary mm allows setting temporary mappings that are not accessible
1016 * by other CPUs. Such mappings are needed to perform sensitive memory writes
1017 * that override the kernel memory protections (e.g., W^X), without exposing the
1018 * temporary page-table mappings that are required for these write operations to
1019 * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
1020 * mapping is torn down.
1021 *
1022 * Context: The temporary mm needs to be used exclusively by a single core. To
1023 * harden security, IRQs must be disabled while the temporary mm is
1024 * loaded, thereby preventing interrupt handler bugs from overriding
1025 * the kernel memory protection.
1026 */
1027 static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1028 {
1029 temp_mm_state_t temp_state;
1030
1031 lockdep_assert_irqs_disabled();
1032
1033 /*
1034 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1035 * with a stale address space WITHOUT being in lazy mode after
1036 * restoring the previous mm.
1037 */
1038 if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1039 leave_mm(smp_processor_id());
1040
1041 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1042 switch_mm_irqs_off(NULL, mm, current);
1043
1044 /*
1045 * If breakpoints are enabled, disable them while the temporary mm is
1046 * used. Userspace might set up watchpoints on addresses that are used
1047 * in the temporary mm, which would lead to wrong signals being sent or
1048 * crashes.
1049 *
1050 * Note that breakpoints are not disabled selectively, which also causes
1051 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1052 * undesirable, but still seems reasonable as the code that runs in the
1053 * temporary mm should be short.
1054 */
1055 if (hw_breakpoint_active())
1056 hw_breakpoint_disable();
1057
1058 return temp_state;
1059 }
1060
1061 static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1062 {
1063 lockdep_assert_irqs_disabled();
1064 switch_mm_irqs_off(NULL, prev_state.mm, current);
1065
1066 /*
1067 * Restore the breakpoints if they were disabled before the temporary mm
1068 * was loaded.
1069 */
1070 if (hw_breakpoint_active())
1071 hw_breakpoint_restore();
1072 }
1073
1074 __ro_after_init struct mm_struct *poking_mm;
1075 __ro_after_init unsigned long poking_addr;
1076
1077 static void text_poke_memcpy(void *dst, const void *src, size_t len)
1078 {
1079 memcpy(dst, src, len);
1080 }
1081
1082 static void text_poke_memset(void *dst, const void *src, size_t len)
1083 {
1084 int c = *(const int *)src;
1085
1086 memset(dst, c, len);
1087 }
1088
1089 typedef void text_poke_f(void *dst, const void *src, size_t len);
1090
1091 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1092 {
1093 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1094 struct page *pages[2] = {NULL};
1095 temp_mm_state_t prev;
1096 unsigned long flags;
1097 pte_t pte, *ptep;
1098 spinlock_t *ptl;
1099 pgprot_t pgprot;
1100
1101 /*
1102 * While the boot memory allocator is running we cannot use struct pages as
1103 * they are not yet initialized. There is no way to recover.
1104 */
1105 BUG_ON(!after_bootmem);
1106
1107 if (!core_kernel_text((unsigned long)addr)) {
1108 pages[0] = vmalloc_to_page(addr);
1109 if (cross_page_boundary)
1110 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1111 } else {
1112 pages[0] = virt_to_page(addr);
1113 WARN_ON(!PageReserved(pages[0]));
1114 if (cross_page_boundary)
1115 pages[1] = virt_to_page(addr + PAGE_SIZE);
1116 }
1117 /*
1118 * If something went wrong, crash and burn since recovery paths are not
1119 * implemented.
1120 */
1121 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1122
1123 /*
1124 * Map the page without the global bit, as TLB flushing is done with
1125 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1126 */
1127 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1128
1129 /*
1130 * The lock is not really needed, but this avoids open-coding the PTE lookup.
1131 */
1132 ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1133
1134 /*
1135 * This must not fail; preallocated in poking_init().
1136 */
1137 VM_BUG_ON(!ptep);
1138
1139 local_irq_save(flags);
1140
1141 pte = mk_pte(pages[0], pgprot);
1142 set_pte_at(poking_mm, poking_addr, ptep, pte);
1143
1144 if (cross_page_boundary) {
1145 pte = mk_pte(pages[1], pgprot);
1146 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1147 }
1148
1149 /*
1150 * Loading the temporary mm behaves as a compiler barrier, which
1151 * guarantees that the PTE will be set at the time memcpy() is done.
1152 */
1153 prev = use_temporary_mm(poking_mm);
1154
1155 kasan_disable_current();
1156 func((u8 *)poking_addr + offset_in_page(addr), src, len);
1157 kasan_enable_current();
1158
1159 /*
1160 * Ensure that the PTE is only cleared after the instructions of memcpy
1161 * were issued by using a compiler barrier.
1162 */
1163 barrier();
1164
1165 pte_clear(poking_mm, poking_addr, ptep);
1166 if (cross_page_boundary)
1167 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1168
1169 /*
1170 * Loading the previous page-table hierarchy requires a serializing
1171 * instruction that already allows the core to see the updated version.
1172 * Xen-PV is assumed to serialize execution in a similar manner.
1173 */
1174 unuse_temporary_mm(prev);
1175
1176 /*
1177 * Flushing the TLB might involve IPIs, which would require enabled
1178 * IRQs, but not if the mm is not used, as it is at this point.
1179 */
1180 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1181 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1182 PAGE_SHIFT, false);
1183
1184 if (func == text_poke_memcpy) {
1185 /*
1186 * If the text does not match what we just wrote then something is
1187 * fundamentally screwy; there's nothing we can really do about that.
1188 */
1189 BUG_ON(memcmp(addr, src, len));
1190 }
1191
1192 local_irq_restore(flags);
1193 pte_unmap_unlock(ptep, ptl);
1194 return addr;
1195 }
1196
1197 /**
1198 * text_poke - Update instructions on a live kernel
1199 * @addr: address to modify
1200 * @opcode: source of the copy
1201 * @len: length to copy
1202 *
1203 * Only atomic text poke/set should be allowed when not doing early patching.
1204 * It means the size must be writable atomically and the address must be aligned
1205 * in a way that permits an atomic write. It also makes sure we fit on a single
1206 * page.
1207 *
1208 * Note that the caller must ensure that if the modified code is part of a
1209 * module, the module would not be removed during poking. This can be achieved
1210 * by registering a module notifier, and ordering module removal and patching
1211 * through a mutex.
1212 */
1213 void *text_poke(void *addr, const void *opcode, size_t len)
1214 {
1215 lockdep_assert_held(&text_mutex);
1216
1217 return __text_poke(text_poke_memcpy, addr, opcode, len);
1218 }
1219
1220 /**
1221 * text_poke_kgdb - Update instructions on a live kernel by kgdb
1222 * @addr: address to modify
1223 * @opcode: source of the copy
1224 * @len: length to copy
1225 *
1226 * Only atomic text poke/set should be allowed when not doing early patching.
1227 * It means the size must be writable atomically and the address must be aligned
1228 * in a way that permits an atomic write. It also makes sure we fit on a single
1229 * page.
1230 *
1231 * Context: should only be used by kgdb, which ensures no other core is running,
1232 * despite the fact it does not hold the text_mutex.
1233 */
1234 void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
1235 {
1236 return __text_poke(text_poke_memcpy, addr, opcode, len);
1237 }
1238
1239 /**
1240 * text_poke_copy - Copy instructions into (an unused part of) RX memory
1241 * @addr: address to modify
1242 * @opcode: source of the copy
1243 * @len: length to copy, could be more than 2x PAGE_SIZE
1244 *
1245 * Not safe against concurrent execution; useful for JITs to dump
1246 * new code blocks into unused regions of RX memory. Can be used in
1247 * conjunction with synchronize_rcu_tasks() to wait for existing
1248 * execution to quiesce after having made sure no existing function
1249 * pointers are live.
1250 */
1251 void *text_poke_copy(void *addr, const void *opcode, size_t len)
1252 {
1253 unsigned long start = (unsigned long)addr;
1254 size_t patched = 0;
1255
1256 if (WARN_ON_ONCE(core_kernel_text(start)))
1257 return NULL;
1258
1259 mutex_lock(&text_mutex);
1260 while (patched < len) {
1261 unsigned long ptr = start + patched;
1262 size_t s;
1263
1264 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1265
1266 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
1267 patched += s;
1268 }
1269 mutex_unlock(&text_mutex);
1270 return addr;
1271 }
1272
1273 /**
1274 * text_poke_set - memset into (an unused part of) RX memory
1275 * @addr: address to modify
1276 * @c: the byte to fill the area with
1277 * @len: length to copy, could be more than 2x PAGE_SIZE
1278 *
1279 * This is useful to overwrite unused regions of RX memory with illegal
1280 * instructions.
1281 */
1282 void *text_poke_set(void *addr, int c, size_t len)
1283 {
1284 unsigned long start = (unsigned long)addr;
1285 size_t patched = 0;
1286
1287 if (WARN_ON_ONCE(core_kernel_text(start)))
1288 return NULL;
1289
1290 mutex_lock(&text_mutex);
1291 while (patched < len) {
1292 unsigned long ptr = start + patched;
1293 size_t s;
1294
1295 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1296
1297 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
1298 patched += s;
1299 }
1300 mutex_unlock(&text_mutex);
1301 return addr;
1302 }
1303
1304 static void do_sync_core(void *info)
1305 {
1306 sync_core();
1307 }
1308
1309 void text_poke_sync(void)
1310 {
1311 on_each_cpu(do_sync_core, NULL, 1);
1312 }
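/*
 * text_poke_sync() IPIs every online CPU and runs sync_core() there, forcing
 * a serializing operation so that no core keeps executing a stale, partially
 * patched instruction out of its decode pipeline.
 */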
1313
1314 struct text_poke_loc {
1315 /* addr := _stext + rel_addr */
1316 s32 rel_addr;
1317 s32 disp;
1318 u8 len;
1319 u8 opcode;
1320 const u8 text[POKE_MAX_OPCODE_SIZE];
1321 /* see text_poke_bp_batch() */
1322 u8 old;
1323 };
1324
1325 struct bp_patching_desc {
1326 struct text_poke_loc *vec;
1327 int nr_entries;
1328 atomic_t refs;
1329 };
1330
1331 static struct bp_patching_desc bp_desc;
1332
1333 static __always_inline
1334 struct bp_patching_desc *try_get_desc(void)
1335 {
1336 struct bp_patching_desc *desc = &bp_desc;
1337
1338 if (!arch_atomic_inc_not_zero(&desc->refs))
1339 return NULL;
1340
1341 return desc;
1342 }
1343
1344 static __always_inline void put_desc(void)
1345 {
1346 struct bp_patching_desc *desc = &bp_desc;
1347
1348 smp_mb__before_atomic();
1349 arch_atomic_dec(&desc->refs);
1350 }
1351
1352 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
1353 {
1354 return _stext + tp->rel_addr;
1355 }
1356
1357 static __always_inline int patch_cmp(const void *key, const void *elt)
1358 {
1359 struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1360
1361 if (key < text_poke_addr(tp))
1362 return -1;
1363 if (key > text_poke_addr(tp))
1364 return 1;
1365 return 0;
1366 }
1367
1368 noinstr int poke_int3_handler(struct pt_regs *regs)
1369 {
1370 struct bp_patching_desc *desc;
1371 struct text_poke_loc *tp;
1372 int ret = 0;
1373 void *ip;
1374
1375 if (user_mode(regs))
1376 return 0;
1377
1378 /*
1379 * Having observed our INT3 instruction, we now must observe
1380 * bp_desc with non-zero refcount:
1381 *
1382 * bp_desc.refs = 1 INT3
1383 * WMB RMB
1384 * write INT3 if (bp_desc.refs != 0)
1385 */
1386 smp_rmb();
1387
1388 desc = try_get_desc();
1389 if (!desc)
1390 return 0;
1391
1392 /*
1393 * Discount the INT3. See text_poke_bp_batch().
1394 */
1395 ip = (void *) regs->ip - INT3_INSN_SIZE;
1396
1397 /*
1398 * Skip the binary search if there is a single member in the vector.
1399 */
1400 if (unlikely(desc->nr_entries > 1)) {
1401 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
1402 sizeof(struct text_poke_loc),
1403 patch_cmp);
1404 if (!tp)
1405 goto out_put;
1406 } else {
1407 tp = desc->vec;
1408 if (text_poke_addr(tp) != ip)
1409 goto out_put;
1410 }
1411
1412 ip += tp->len;
1413
1414 switch (tp->opcode) {
1415 case INT3_INSN_OPCODE:
1416 /*
1417 * Someone poked an explicit INT3, they'll want to handle it,
1418 * do not consume.
1419 */
1420 goto out_put;
1421
1422 case RET_INSN_OPCODE:
1423 int3_emulate_ret(regs);
1424 break;
1425
1426 case CALL_INSN_OPCODE:
1427 int3_emulate_call(regs, (long)ip + tp->disp);
1428 break;
1429
1430 case JMP32_INSN_OPCODE:
1431 case JMP8_INSN_OPCODE:
1432 int3_emulate_jmp(regs, (long)ip + tp->disp);
1433 break;
1434
1435 default:
1436 BUG();
1437 }
1438
1439 ret = 1;
1440
1441 out_put:
1442 put_desc();
1443 return ret;
1444 }
1445
1446 #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
1447 static struct text_poke_loc tp_vec[TP_VEC_MAX];
1448 static int tp_vec_nr;
1449
1450 /**
1451 * text_poke_bp_batch() -- update instructions on live kernel on SMP
1452 * @tp: vector of instructions to patch
1453 * @nr_entries: number of entries in the vector
1454 *
1455 * Modify multi-byte instruction by using int3 breakpoint on SMP.
1456 * We completely avoid stop_machine() here, and achieve the
1457 * synchronization using int3 breakpoint.
1458 *
1459 * The way it is done:
1460 * - For each entry in the vector:
1461 * - add an int3 trap to the address that will be patched
1462 * - sync cores
1463 * - For each entry in the vector:
1464 * - update all but the first byte of the patched range
1465 * - sync cores
1466 * - For each entry in the vector:
1467 * - replace the first byte (int3) by the first byte of
1468 * replacing opcode
1469 * - sync cores
1470 */
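/*
 * Worked example (illustrative): turning a 5-byte NOP into a 5-byte CALL
 * (e8 rel32):
 *   1) byte 0 becomes 0xcc (INT3), sync -- any CPU executing the site now
 *      traps into poke_int3_handler(), which emulates the *new* instruction;
 *   2) bytes 1-4 receive the new displacement, sync;
 *   3) byte 0 becomes 0xe8, sync.
 * At no point can a CPU observe a half-old/half-new instruction.
 */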
1471 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
1472 {
1473 unsigned char int3 = INT3_INSN_OPCODE;
1474 unsigned int i;
1475 int do_sync;
1476
1477 lockdep_assert_held(&text_mutex);
1478
1479 bp_desc.vec = tp;
1480 bp_desc.nr_entries = nr_entries;
1481
1482 /*
1483 * Corresponds to the implicit memory barrier in try_get_desc() to
1484 * ensure reading a non-zero refcount provides up to date bp_desc data.
1485 */
1486 atomic_set_release(&bp_desc.refs, 1);
1487
1488 /*
1489 * Corresponding read barrier in int3 notifier for making sure the
1490 * nr_entries and handler are correctly ordered wrt. patching.
1491 */
1492 smp_wmb();
1493
1494 /*
1495 * First step: add an int3 trap to the address that will be patched.
1496 */
1497 for (i = 0; i < nr_entries; i++) {
1498 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
1499 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
1500 }
1501
1502 text_poke_sync();
1503
1504 /*
1505 * Second step: update all but the first byte of the patched range.
1506 */
1507 for (do_sync = 0, i = 0; i < nr_entries; i++) {
1508 u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
1509 int len = tp[i].len;
1510
1511 if (len - INT3_INSN_SIZE > 0) {
1512 memcpy(old + INT3_INSN_SIZE,
1513 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1514 len - INT3_INSN_SIZE);
1515 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1516 (const char *)tp[i].text + INT3_INSN_SIZE,
1517 len - INT3_INSN_SIZE);
1518 do_sync++;
1519 }
1520
1521 /*
1522 * Emit a perf event to record the text poke, primarily to
1523 * support Intel PT decoding which must walk the executable code
1524 * to reconstruct the trace. The flow up to here is:
1525 * - write INT3 byte
1526 * - IPI-SYNC
1527 * - write instruction tail
1528 * At this point the actual control flow will be through the
1529 * INT3 and handler and not hit the old or new instruction.
1530 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
1531 * can still be decoded. Subsequently:
1532 * - emit RECORD_TEXT_POKE with the new instruction
1533 * - IPI-SYNC
1534 * - write first byte
1535 * - IPI-SYNC
1536 * So before the text poke event timestamp, the decoder will see
1537 * either the old instruction flow or FUP/TIP of INT3. After the
1538 * text poke event timestamp, the decoder will see either the
1539 * new instruction flow or FUP/TIP of INT3. Thus decoders can
1540 * use the timestamp as the point at which to modify the
1541 * executable code.
1542 * The old instruction is recorded so that the event can be
1543 * processed forwards or backwards.
1544 */
1545 perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
1546 tp[i].text, len);
1547 }
1548
1549 if (do_sync) {
1550 /*
1551 * According to Intel, this core syncing is very likely
1552 * not necessary and we'd be safe even without it. But
1553 * better safe than sorry (plus there's not only Intel).
1554 */
1555 text_poke_sync();
1556 }
1557
1558 /*
1559 * Third step: replace the first byte (int3) by the first byte of
1560 * replacing opcode.
1561 */
1562 for (do_sync = 0, i = 0; i < nr_entries; i++) {
1563 if (tp[i].text[0] == INT3_INSN_OPCODE)
1564 continue;
1565
1566 text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
1567 do_sync++;
1568 }
1569
1570 if (do_sync)
1571 text_poke_sync();
1572
1573 /*
1574 * Remove and wait for refs to be zero.
1575 */
1576 if (!atomic_dec_and_test(&bp_desc.refs))
1577 atomic_cond_read_acquire(&bp_desc.refs, !VAL);
1578 }
1579
1580 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
1581 const void *opcode, size_t len, const void *emulate)
1582 {
1583 struct insn insn;
1584 int ret, i;
1585
1586 memcpy((void *)tp->text, opcode, len);
1587 if (!emulate)
1588 emulate = opcode;
1589
1590 ret = insn_decode_kernel(&insn, emulate);
1591 BUG_ON(ret < 0);
1592
1593 tp->rel_addr = addr - (void *)_stext;
1594 tp->len = len;
1595 tp->opcode = insn.opcode.bytes[0];
1596
1597 switch (tp->opcode) {
1598 case RET_INSN_OPCODE:
1599 case JMP32_INSN_OPCODE:
1600 case JMP8_INSN_OPCODE:
1601 /*
1602 * Control flow instructions without implied execution of the
1603 * next instruction can be padded with INT3.
1604 */
1605 for (i = insn.length; i < len; i++)
1606 BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
1607 break;
1608
1609 default:
1610 BUG_ON(len != insn.length);
1611 }
1612
1613
1614 switch (tp->opcode) {
1615 case INT3_INSN_OPCODE:
1616 case RET_INSN_OPCODE:
1617 break;
1618
1619 case CALL_INSN_OPCODE:
1620 case JMP32_INSN_OPCODE:
1621 case JMP8_INSN_OPCODE:
1622 tp->disp = insn.immediate.value;
1623 break;
1624
1625 default: /* assume NOP */
1626 switch (len) {
1627 case 2: /* NOP2 -- emulate as JMP8+0 */
1628 BUG_ON(memcmp(emulate, x86_nops[len], len));
1629 tp->opcode = JMP8_INSN_OPCODE;
1630 tp->disp = 0;
1631 break;
1632
1633 case 5: /* NOP5 -- emulate as JMP32+0 */
1634 BUG_ON(memcmp(emulate, x86_nops[len], len));
1635 tp->opcode = JMP32_INSN_OPCODE;
1636 tp->disp = 0;
1637 break;
1638
1639 default: /* unknown instruction */
1640 BUG();
1641 }
1642 break;
1643 }
1644 }
1645
1646 /*
1647 * We hard rely on the tp_vec being ordered; ensure this is so by flushing
1648 * early if needed.
1649 */
1650 static bool tp_order_fail(void *addr)
1651 {
1652 struct text_poke_loc *tp;
1653
1654 if (!tp_vec_nr)
1655 return false;
1656
1657 if (!addr) /* force */
1658 return true;
1659
1660 tp = &tp_vec[tp_vec_nr - 1];
1661 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
1662 return true;
1663
1664 return false;
1665 }
1666
1667 static void text_poke_flush(void *addr)
1668 {
1669 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
1670 text_poke_bp_batch(tp_vec, tp_vec_nr);
1671 tp_vec_nr = 0;
1672 }
1673 }
1674
1675 void text_poke_finish(void)
1676 {
1677 text_poke_flush(NULL);
1678 }
1679
1680 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
1681 {
1682 struct text_poke_loc *tp;
1683
1684 if (unlikely(system_state == SYSTEM_BOOTING)) {
1685 text_poke_early(addr, opcode, len);
1686 return;
1687 }
1688
1689 text_poke_flush(addr);
1690
1691 tp = &tp_vec[tp_vec_nr++];
1692 text_poke_loc_init(tp, addr, opcode, len, emulate);
1693 }
1694
1695 /**
1696 * text_poke_bp() -- update instructions on live kernel on SMP
1697 * @addr: address to patch
1698 * @opcode: opcode of new instruction
1699 * @len: length to copy
1700 * @emulate: instruction to be emulated
1701 *
1702 * Update a single instruction with the vector in the stack, avoiding
1703 * dynamically allocated memory. This function should be used when it is
1704 * not possible to allocate memory.
1705 */
1706 void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
1707 {
1708 struct text_poke_loc tp;
1709
1710 if (unlikely(system_state == SYSTEM_BOOTING)) {
1711 text_poke_early(addr, opcode, len);
1712 return;
1713 }
1714
1715 text_poke_loc_init(&tp, addr, opcode, len, emulate);
1716 text_poke_bp_batch(&tp, 1);
1717 }
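/*
 * A minimal usage sketch (hypothetical call site names; roughly what the
 * static_call/ftrace machinery does when retargeting a direct call):
 *
 *	u8 insn[5] = { CALL_INSN_OPCODE, };
 *
 *	*(s32 *)&insn[1] = (long)new_func - ((long)site + 5);
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(site, insn, 5, NULL);
 *	mutex_unlock(&text_mutex);
 *
 * The caller only provides the final bytes; the INT3-based transition above
 * keeps every other CPU safe throughout the rewrite.
 */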
1718