/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after an interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame; this is
 * only done for syscall tracing, signals or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: Architecture defined interrupt frame from SS to RIP
 *   at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: Like partial stack frame, but all registers saved.
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 *   backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
 *   There are unfortunately lots of special cases where some registers
 *   are not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
 *   Gives a full stack frame.
 * - ENTRY/END - Define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
 *   frame that is otherwise undefined after a SYSCALL
 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
 */

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

	.code64
	.section .entry.text, "ax"

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
	retq
END(mcount)

ENTRY(ftrace_caller)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

GLOBAL(ftrace_call)
	call ftrace_stub

	MCOUNT_RESTORE_FRAME

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)
	jmp ftrace_stub
#endif

GLOBAL(ftrace_stub)
	retq
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */
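/*
 * Without dynamic ftrace every mcount call site branches in here, so this
 * version of mcount has to bail out as cheaply as possible when nothing is
 * traced: it only falls through to "trace" once ftrace_trace_function points
 * at something other than ftrace_stub.
 */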
ENTRY(mcount)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	cmpq $ftrace_stub, ftrace_trace_function
	jnz trace

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	cmpq $ftrace_stub, ftrace_graph_return
	jnz ftrace_graph_caller

	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
	jnz ftrace_graph_caller
#endif

GLOBAL(ftrace_stub)
	retq

trace:
	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

	call *ftrace_trace_function

	MCOUNT_RESTORE_FRAME

	jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
	cmpl $0, function_trace_stop
	jne ftrace_stub

	MCOUNT_SAVE_FRAME

	leaq 8(%rbp), %rdi
	movq 0x38(%rsp), %rsi
	movq (%rbp), %rdx
	subq $MCOUNT_INSN_SIZE, %rsi

	call prepare_ftrace_return

	MCOUNT_RESTORE_FRAME

	retq
END(ftrace_graph_caller)

GLOBAL(return_to_handler)
	subq $24, %rsp

	/* Save the return values */
	movq %rax, (%rsp)
	movq %rdx, 8(%rsp)
	movq %rbp, %rdi

	call ftrace_return_to_handler

	movq %rax, %rdi
	movq 8(%rsp), %rdx
	movq (%rsp), %rax
	addq $24, %rsp
	jmp *%rdi
#endif


#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */


.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
#ifdef CONFIG_TRACE_IRQFLAGS
	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
	jnc  1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * C code is not supposed to know about undefined top of stack. Every time
 * a C function with a pt_regs argument is called from the SYSCALL-based
 * fast path, FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
 * manipulation.
 */
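/*
 * Roughly speaking, FIXUP_TOP_OF_STACK reconstructs the iret frame that
 * SYSCALL never pushed: the saved sp is taken from the per-cpu old_rsp slot,
 * ss/cs are set to the user selectors, the saved rflags is copied out of the
 * r11 save slot, and the saved rcx is simply poisoned with -1.
 */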
	/* %rsp:	at FRAMEEND */
	.macro FIXUP_TOP_OF_STACK tmp offset=0
	movq PER_CPU_VAR(old_rsp),\tmp
	movq \tmp,RSP+\offset(%rsp)
	movq $__USER_DS,SS+\offset(%rsp)
	movq $__USER_CS,CS+\offset(%rsp)
	movq $-1,RCX+\offset(%rsp)
	movq R11+\offset(%rsp),\tmp	/* get eflags */
	movq \tmp,EFLAGS+\offset(%rsp)
	.endm

	.macro RESTORE_TOP_OF_STACK tmp offset=0
	movq RSP+\offset(%rsp),\tmp
	movq \tmp,PER_CPU_VAR(old_rsp)
	movq EFLAGS+\offset(%rsp),\tmp
	movq \tmp,R11+\offset(%rsp)
	.endm

	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl %eax, %eax
	pushq_cfi $__KERNEL_DS	/* ss */
	/*CFI_REL_OFFSET	ss,0*/
	pushq_cfi %rax		/* rsp */
	CFI_REL_OFFSET	rsp,0
	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1)	/* eflags - interrupts on */
	/*CFI_REL_OFFSET	rflags,0*/
	pushq_cfi $__KERNEL_CS	/* cs */
	/*CFI_REL_OFFSET	cs,0*/
	pushq_cfi \child_rip	/* rip */
	CFI_REL_OFFSET	rip,0
	pushq_cfi %rax		/* orig rax */
	.endm

	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro EMPTY_FRAME start=1 offset=0
	.if \start
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA rsp,8+\offset
	.else
	CFI_DEF_CFA_OFFSET 8+\offset
	.endif
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro INTR_FRAME start=1 offset=0
	EMPTY_FRAME \start, SS+8+\offset-RIP
	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
	CFI_REL_OFFSET rsp, RSP+\offset-RIP
	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
	CFI_REL_OFFSET rip, RIP+\offset-RIP
	.endm

/*
 * initial frame state for exceptions with error code (and interrupts
 * with vector already pushed)
 */
	.macro XCPT_FRAME start=1 offset=0
	INTR_FRAME \start, RIP+\offset-ORIG_RAX
	/*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
	.endm

/*
 * frame that enables calling into C.
 */
	.macro PARTIAL_FRAME start=1 offset=0
	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
	.endm

/*
 * frame that enables passing a complete pt_regs to a C function.
 */
	.macro DEFAULT_FRAME start=1 offset=0
	PARTIAL_FRAME \start, R11+\offset-R15
	CFI_REL_OFFSET rbx, RBX+\offset
	CFI_REL_OFFSET rbp, RBP+\offset
	CFI_REL_OFFSET r12, R12+\offset
	CFI_REL_OFFSET r13, R13+\offset
	CFI_REL_OFFSET r14, R14+\offset
	CFI_REL_OFFSET r15, R15+\offset
	.endm

/* save partial stack frame */
	.macro SAVE_ARGS_IRQ
	cld
	/* start from rbp in pt_regs and jump over */
	movq_cfi rdi, RDI-RBP
	movq_cfi rsi, RSI-RBP
	movq_cfi rdx, RDX-RBP
	movq_cfi rcx, RCX-RBP
	movq_cfi rax, RAX-RBP
	movq_cfi  r8,  R8-RBP
	movq_cfi  r9,  R9-RBP
	movq_cfi r10, R10-RBP
	movq_cfi r11, R11-RBP

	/* Save rbp so that we can unwind from get_irq_regs() */
	movq_cfi rbp, 0

	/* Save previous stack value */
	movq %rsp, %rsi

	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
	testl $3, CS-RBP(%rsi)
	je 1f
	SWAPGS
	/*
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
1:	incl PER_CPU_VAR(irq_count)
	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
	CFI_DEF_CFA_REGISTER	rsi

	/* Store previous stack value */
	pushq %rsi
	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
			0x77 /* DW_OP_breg7 */, 0, \
			0x06 /* DW_OP_deref */, \
			0x08 /* DW_OP_const1u */, SS+8-RBP, \
			0x22 /* DW_OP_plus */
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF
	.endm

ENTRY(save_rest)
	PARTIAL_FRAME 1 REST_SKIP+8
	movq 5*8+16(%rsp), %r11	/* save return address */
	movq_cfi rbx, RBX+16
	movq_cfi rbp, RBP+16
	movq_cfi r12, R12+16
	movq_cfi r13, R13+16
	movq_cfi r14, R14+16
	movq_cfi r15, R15+16
	movq %r11, 8(%rsp)	/* return address */
	FIXUP_TOP_OF_STACK %r11, 16
	ret
	CFI_ENDPROC
END(save_rest)

/* save complete stack frame */
	.pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
	XCPT_FRAME 1 RDI+8
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi r8, R8+8
	movq_cfi r9, R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	movl $1,%ebx
	movl $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js 1f	/* negative -> in kernel */
	SWAPGS
	xorl %ebx,%ebx
1:	ret
	CFI_ENDPROC
END(save_paranoid)
	.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	DEFAULT_FRAME

	LOCK ; btr $TIF_FORK,TI_flags(%r8)

	pushq_cfi kernel_eflags(%rip)
	popfq_cfi				# reset kernel eflags

	call schedule_tail			# rdi: 'prev' task parameter

	GET_THREAD_INFO(%rcx)

	RESTORE_REST

	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
	jz   retint_restore_args

	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
	jnz  int_ret_from_sys_call

	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
	jmp ret_from_sys_call			# go to the SYSRET fastpath

	CFI_ENDPROC
END(ret_from_fork)
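/*
 * In rough C terms, the fast path in system_call below ends up doing
 *
 *	regs->ax = sys_call_table[nr](rdi, rsi, rdx, r10, r8, r9);
 *
 * with %r10 carrying what the C ABI would pass in %rcx, because SYSCALL
 * has already clobbered %rcx with the user return address.
 */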
/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 */

/*
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the stack
 *	frame and report it properly in ps. Unfortunately we don't.
 *
 * When the user can change the frames, always force IRET. That is because
 * it deals with non-canonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)

	movq	%rsp,PER_CPU_VAR(old_rsp)
	movq	PER_CPU_VAR(kernel_stack),%rsp
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_ARGS 8,0
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja badsys
	movq %r10,%rcx
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path)
 * Has incomplete stack frame and undefined top of stack.
 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER	rip,rcx
	RESTORE_ARGS 1,-ARG_SKIP,0
	/*CFI_REGISTER	rflags,r11*/
	movq	PER_CPU_VAR(old_rsp), %rsp
	USERGS_SYSRET64

	CFI_RESTORE_STATE
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call schedule
	popq_cfi %rdi
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/*
	 * We have a signal, or exit tracing or single-step.
	 * These all wind up with the iret return path anyway,
	 * so just join that path right now.
	 */
	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
	jmp int_check_syscall_exit_work

badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call

#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call __audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call __audit_syscall_entry
	LOAD_ARGS 0			/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
	cmpq $-MAX_ERRNO,%rsi		/* is it < -MAX_ERRNO? */
	setbe %al			/* 1 if so, 0 if not */
	movzbl %al,%edi			/* zero-extend that into %edi */
	call __audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL */

	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx			/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */

/*
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi: mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz   int_careful
	andl  $~TS_COMPAT,TI_status(%rcx)
	jmp   retint_swapgs
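	/*
	 * Each of the work handlers below ends by jumping back to
	 * int_with_check, so the TI_flags test above is repeated, with
	 * interrupts off, until no work bits remain.
	 */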
	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call schedule
	popq_cfi %rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
	SAVE_REST
	/* Check for syscall exit trace */
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
	jz int_signal
	pushq_cfi %rdi
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
	call syscall_trace_leave
	popq_cfi %rdi
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp int_restore_rest

int_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)

/*
 * Certain special system calls need to save a complete full stack frame.
 */
	.macro PTREGSCALL label,func,arg
ENTRY(\label)
	PARTIAL_FRAME 1 8		/* offset 8: return address */
	subq $REST_SKIP, %rsp
	CFI_ADJUST_CFA_OFFSET REST_SKIP
	call save_rest
	DEFAULT_FRAME 0 8		/* offset 8: return address */
	leaq 8(%rsp), \arg		/* pt_regs pointer */
	call \func
	jmp ptregscall_common
	CFI_ENDPROC
END(\label)
	.endm

	PTREGSCALL stub_clone, sys_clone, %r8
	PTREGSCALL stub_fork, sys_fork, %rdi
	PTREGSCALL stub_vfork, sys_vfork, %rdi
	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
	PTREGSCALL stub_iopl, sys_iopl, %rsi

ENTRY(ptregscall_common)
	DEFAULT_FRAME 1 8	/* offset 8: return address */
	RESTORE_TOP_OF_STACK %r11, 8
	movq_cfi_restore R15+8, r15
	movq_cfi_restore R14+8, r14
	movq_cfi_restore R13+8, r13
	movq_cfi_restore R12+8, r12
	movq_cfi_restore RBP+8, rbp
	movq_cfi_restore RBX+8, rbx
	ret $REST_SKIP		/* pop extended registers */
	CFI_ENDPROC
END(ptregscall_common)

ENTRY(stub_execve)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_execve)
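/*
 * Note the addq $8,%rsp at the top of the execve and sigreturn stubs: it
 * discards the return address pushed by the syscall-table call, since these
 * paths never come back to the fast path and leave through
 * int_ret_from_sys_call instead.
 */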
/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys_rt_sigreturn
	movq %rax,RAX(%rsp)	# fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_rt_sigreturn)

#ifdef CONFIG_X86_X32_ABI
	PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx

ENTRY(stub_x32_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys32_x32_rt_sigreturn
	movq %rax,RAX(%rsp)	# fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_x32_rt_sigreturn)

ENTRY(stub_x32_execve)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys32_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_x32_execve)

#endif

/*
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
 */
	.section .init.rodata,"a"
ENTRY(interrupt)
	.section .entry.text
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
	INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
      .if vector <> FIRST_EXTERNAL_VECTOR
	CFI_ADJUST_CFA_OFFSET -8
      .endif
1:	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
	jmp 2f
      .endif
      .previous
	.quad 1b
      .section .entry.text
vector=vector+1
    .endif
  .endr
2:	jmp common_interrupt
.endr
	CFI_ENDPROC
END(irq_entries_start)

.previous
END(interrupt)
.previous

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only the call-clobbered registers in the
 * fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	/* reserve pt_regs for scratch regs and rbp */
	subq $ORIG_RAX-RBP, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
	SAVE_ARGS_IRQ
	call \func
	.endm

/*
 * Interrupt entry/exit should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"
	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	XCPT_FRAME
	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
	interrupt do_IRQ
	/* 0(%rsp): old_rsp-ARGOFFSET */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq %rsi
	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
	leaq ARGOFFSET-RBP(%rsi), %rsp
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
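	/*
	 * At this point irq_count has been dropped and %rsp is back on the
	 * stack that was in use when the interrupt arrived; all that is
	 * left is deciding how to return.
	 */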
exit_intr:
	GET_THREAD_INFO(%rcx)
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_kernel

	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame
	 * %rcx: thread info. Interrupts off.
	 */
retint_with_reschedule:
	movl $_TIF_WORK_MASK,%edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	CFI_REMEMBER_STATE
	jnz  retint_careful

retint_swapgs:		/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp restore_args

retint_restore_args:	/* return to kernel space */
	DISABLE_INTERRUPTS(CLBR_ANY)
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:
	RESTORE_ARGS 1,8,1

irq_return:
	INTERRUPT_RETURN

	.section __ex_table, "a"
	.quad irq_return, bad_iret
	.previous

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iretq

	.section __ex_table,"a"
	.quad native_iret, bad_iret
	.previous
#endif

	.section .fixup,"ax"
bad_iret:
	/*
	 * The iret traps when the %cs or %ss being restored is bogus.
	 * We've lost the original trap vector and error code.
	 * #GPF is the most likely one to get for an invalid selector.
	 * So pretend we completed the iret and took the #GPF in user mode.
	 *
	 * We are now running with the kernel GS after exception recovery.
	 * But error_entry expects us to have user GS to match the user %cs,
	 * so swap back.
	 */
	pushq $0

	SWAPGS
	jmp general_protection

	.previous

	/* edi: workmask, edx: work */
retint_careful:
	CFI_RESTORE_STATE
	bt    $TIF_NEED_RESCHED,%edx
	jnc   retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call  schedule
	popq_cfi %rdi
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp retint_check

retint_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz    retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_REST
	movq $-1,ORIG_RAX(%rsp)
	xorl %esi,%esi		# oldset
	movq %rsp,%rdi		# &pt_regs
	call do_notify_resume
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp retint_with_reschedule

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption */
	/* rcx:	 threadinfo. interrupts off. */
ENTRY(retint_kernel)
	cmpl $0,TI_preempt_count(%rcx)
	jnz  retint_restore_args
	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
	jnc  retint_restore_args
	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc  retint_restore_args
	call preempt_schedule_irq
	jmp exit_intr
#endif

	CFI_ENDPROC
END(common_interrupt)
/*
 * End of kprobes section
 */
	.popsection

/*
 * APIC interrupts.
 */
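/*
 * Each apicinterrupt invocation below expands into a tiny stub that pushes
 * the (inverted) vector number and funnels into the common "interrupt"
 * machinery, so e.g. apic_timer_interrupt ends up calling
 * smp_apic_timer_interrupt() and returning through ret_from_intr.
 */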
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
	INTR_FRAME
	pushq_cfi $~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp ret_from_intr
	CFI_ENDPROC
END(\sym)
.endm

#ifdef CONFIG_SMP
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
apicinterrupt REBOOT_VECTOR \
	reboot_interrupt smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt UV_BAU_MESSAGE \
	uv_bau_message_intr1 uv_bau_message_interrupt
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
	apic_timer_interrupt smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR \
	x86_platform_ipi smp_x86_platform_ipi

#ifdef CONFIG_SMP
	ALIGN
	INTR_FRAME
.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.if NUM_INVALIDATE_TLB_VECTORS > \idx
ENTRY(invalidate_interrupt\idx)
	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
	jmp .Lcommon_invalidate_interrupt0
	CFI_ADJUST_CFA_OFFSET -8
END(invalidate_interrupt\idx)
.endif
.endr
	CFI_ENDPROC
apicinterrupt INVALIDATE_TLB_VECTOR_START, \
	invalidate_interrupt0, smp_invalidate_interrupt
#endif

apicinterrupt THRESHOLD_APIC_VECTOR \
	threshold_interrupt smp_threshold_interrupt
apicinterrupt THERMAL_APIC_VECTOR \
	thermal_interrupt smp_thermal_interrupt

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
	call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR \
	call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR \
	reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR \
	error_interrupt smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR \
	spurious_interrupt smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR \
	irq_work_interrupt smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
 */
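/*
 * Three flavours of entry stub follow: zeroentry for exceptions that push
 * no error code, errorentry for those where the CPU pushes one, and the
 * paranoid variants for entries (such as debug, int3, machine check and
 * stack fault) that cannot trust the current GS base and therefore go
 * through save_paranoid instead of error_entry.
 */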
.macro zeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp error_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

.macro paranoidzeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
	call \do_sym
	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

.macro errorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp error_exit			/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

	/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	DEFAULT_FRAME 0
	TRACE_IRQS_OFF
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp paranoid_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

zeroentry divide_error do_divide_error
zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
paranoiderrorentry double_fault do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error


	/* Reload gs selector with exception handling */
	/* edi:	new selector */
ENTRY(native_load_gs_index)
	CFI_STARTPROC
	pushfq_cfi
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl %edi,%gs
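	/*
	 * If the selector in %edi is bogus, the movl above faults; the
	 * __ex_table entry further down redirects that fault to bad_gs,
	 * which loads a null selector instead and jumps back to 2:.
	 */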
2:	mfence		/* workaround */
	SWAPGS
	popfq_cfi
	ret
	CFI_ENDPROC
END(native_load_gs_index)

	.section __ex_table,"a"
	.align 8
	.quad gs_change,bad_gs
	.previous
	.section .fixup,"ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS			/* switch back to user gs */
	xorl %eax,%eax
	movl %eax,%gs
	jmp  2b
	.previous

ENTRY(kernel_thread_helper)
	pushq $0		# fake return address
	CFI_STARTPROC
	/*
	 * Here we are in the child and the registers are set as they were
	 * at kernel_thread() invocation in the parent.
	 */
	call *%rsi
	# exit
	mov %eax, %edi
	call do_exit
	ud2			# padding for call trace
	CFI_ENDPROC
END(kernel_thread_helper)

/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
 *
 * C extern interface:
 *	extern long execve(const char *name, char **argv, char **envp)
 *
 * asm input arguments:
 *	rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
 *	extern long sys_execve(const char *name, char **argv, char **envp, struct pt_regs *regs)
 *
 * do_sys_execve asm fallback arguments:
 *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
 */
ENTRY(kernel_execve)
	CFI_STARTPROC
	FAKE_STACK_FRAME $0
	SAVE_ALL
	movq %rsp,%rcx
	call sys_execve
	movq %rax, RAX(%rsp)
	RESTORE_REST
	testq %rax,%rax
	je int_ret_from_sys_call
	RESTORE_ARGS
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
END(kernel_execve)

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
	CFI_STARTPROC
	pushq_cfi %rbp
	CFI_REL_OFFSET rbp,0
	mov  %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	incl PER_CPU_VAR(irq_count)
	cmove PER_CPU_VAR(irq_stack_ptr),%rsp
	push  %rbp			# backlink for old unwinder
	call __do_softirq
	leaveq
	CFI_RESTORE		rbp
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	-8
	decl PER_CPU_VAR(irq_count)
	ret
	CFI_ENDPROC
END(call_softirq)

#ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
	CFI_STARTPROC
/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
	movq %rdi, %rsp			# we don't return, adjust the stack frame
	CFI_ENDPROC
	DEFAULT_FRAME
11:	incl PER_CPU_VAR(irq_count)
	movq %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
	pushq %rbp			# backlink for old unwinder
	call xen_evtchn_do_upcall
	popq %rsp
	CFI_DEF_CFA_REGISTER rsp
	decl PER_CPU_VAR(irq_count)
	jmp  error_exit
	CFI_ENDPROC
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	INTR_FRAME 1 (6*8)
	/*CFI_REL_OFFSET gs,GS*/
	/*CFI_REL_OFFSET fs,FS*/
	/*CFI_REL_OFFSET es,ES*/
	/*CFI_REL_OFFSET ds,DS*/
	CFI_REL_OFFSET r11,8
	CFI_REL_OFFSET rcx,0
	movw %ds,%cx
	cmpw %cx,0x10(%rsp)
	CFI_REMEMBER_STATE
	jne 1f
	movw %es,%cx
	cmpw %cx,0x18(%rsp)
	jne 1f
	movw %fs,%cx
	cmpw %cx,0x20(%rsp)
	jne 1f
	movw %gs,%cx
	cmpw %cx,0x28(%rsp)
	jne 1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq_cfi $0	/* RIP */
	pushq_cfi %r11
	pushq_cfi %rcx
	jmp general_protection
	CFI_RESTORE_STATE
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq_cfi $-1	/* orig_ax = -1 => not a system call */
	SAVE_ALL
	jmp error_exit
	CFI_ENDPROC
END(xen_failsafe_callback)

apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

/*
 * Some functions should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"

paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
#ifdef CONFIG_XEN
zeroentry xen_debug do_debug
zeroentry xen_int3 do_int3
errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_KVM_GUEST
errorentry async_page_fault do_async_page_fault
#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
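	/*
	 * Note that machine_check is dispatched through the
	 * machine_check_vector function pointer rather than a fixed do_
	 * handler, so its stub above performs an indirect call.
	 */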
	/*
	 * "Paranoid" exit path from exception stack.
	 * Paranoid because this is used by NMIs and cannot take
	 * any kernel state for granted.
	 * We don't do kernel preemption checks here, because only
	 * NMI should be common and it does not enable IRQs and
	 * cannot get reschedule ticks.
	 *
	 * "trace" is 0 for the NMI handler only, because irq-tracing
	 * is fundamentally NMI-unsafe. (we cannot change the soft and
	 * hard flags at once, atomically)
	 */

	/* ebx:	no swapgs flag */
ENTRY(paranoid_exit)
	DEFAULT_FRAME
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl %ebx,%ebx				/* swapgs needed? */
	jnz paranoid_restore
	testl $3,CS(%rsp)
	jnz   paranoid_userspace
paranoid_swapgs:
	TRACE_IRQS_IRETQ 0
	SWAPGS_UNSAFE_STACK
	RESTORE_ALL 8
	jmp irq_return
paranoid_restore:
	TRACE_IRQS_IRETQ 0
	RESTORE_ALL 8
	jmp irq_return
paranoid_userspace:
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%ebx
	andl $_TIF_WORK_MASK,%ebx
	jz paranoid_swapgs
	movq %rsp,%rdi			/* &pt_regs */
	call sync_regs
	movq %rax,%rsp			/* switch stack for scheduling */
	testl $_TIF_NEED_RESCHED,%ebx
	jnz paranoid_schedule
	movl %ebx,%edx			/* arg3: thread flags */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	xorl %esi,%esi			/* arg2: oldset */
	movq %rsp,%rdi			/* arg1: &pt_regs */
	call do_notify_resume
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp paranoid_userspace
paranoid_schedule:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_ANY)
	call schedule
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	jmp paranoid_userspace
	CFI_ENDPROC
END(paranoid_exit)

/*
 * Exception entry point. This expects an error code/orig_rax on the stack.
 * It returns the "no swapgs" flag in %ebx.
 */
ENTRY(error_entry)
	XCPT_FRAME
	CFI_ADJUST_CFA_OFFSET 15*8
	/* oldrax contains error code */
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi  r8,  R8+8
	movq_cfi  r9,  R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	xorl %ebx,%ebx
	testl $3,CS+8(%rsp)
	je error_kernelspace
error_swapgs:
	SWAPGS
error_sti:
	TRACE_IRQS_OFF
	ret

/*
 * There are two places in the kernel that can potentially fault with
 * usergs. Handle them here.  The exception handlers after iret run with
 * kernel gs again, so don't set the user space flag. B stepping K8s
 * sometimes report a truncated RIP for IRET exceptions returning to
 * compat mode. Check for these here too.
 */
error_kernelspace:
	incl %ebx
	leaq irq_return(%rip),%rcx
	cmpq %rcx,RIP+8(%rsp)
	je error_swapgs
	movl %ecx,%eax	/* zero extend */
	cmpq %rax,RIP+8(%rsp)
	je bstep_iret
	cmpq $gs_change,RIP+8(%rsp)
	je error_swapgs
	jmp error_sti

bstep_iret:
	/* Fix truncated RIP */
	movq %rcx,RIP+8(%rsp)
	jmp error_swapgs
	CFI_ENDPROC
END(error_entry)


/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
	DEFAULT_FRAME
	movl %ebx,%eax
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	testl %eax,%eax
	jne retint_kernel
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	movl $_TIF_WORK_MASK,%edi
	andl %edi,%edx
	jnz retint_careful
	jmp retint_swapgs
	CFI_ENDPROC
END(error_exit)

/*
 * Test if a given stack is an NMI stack or not.
 */
	.macro test_in_nmi reg stack nmi_ret normal_ret
	cmpq %\reg, \stack
	ja \normal_ret
	subq $EXCEPTION_STKSZ, %\reg
	cmpq %\reg, \stack
	jb \normal_ret
	jmp \nmi_ret
	.endm

	/* runs on exception stack */
ENTRY(nmi)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into a "saved" location on the stack
	 *    o Copy the interrupt frame into a "copy" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "copy" location to jump to repeat_nmi
	 *    o Return to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq_cfi %rdx
	CFI_REL_OFFSET rdx, 0

	/*
	 * If %cs was not the kernel segment, then the NMI triggered in user
	 * space, which means it is definitely not nested.
	 */
	cmpl $__KERNEL_CS, 16(%rsp)
	jne first_nmi

	/*
	 * Check the special variable on the stack to see if NMIs are
	 * executing.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.
	 * We need the double check. We check the NMI stack to satisfy the
	 * race when the first NMI clears the variable before returning.
	 * We check the variable because the first NMI could be in a
	 * breakpoint routine using a breakpoint stack.
	 */
	lea 6*8(%rsp), %rdx
	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
	CFI_REMEMBER_STATE

nested_nmi:
	/*
	 * Do nothing if we interrupted the fixup in repeat_nmi.
	 * It's about to repeat the NMI handler, so we are fine
	 * with ignoring this one.
	 */
	movq $repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja 1f
	movq $end_repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja nested_nmi_out

1:
	/* Set up the interrupted NMI's stack to jump to repeat_nmi */
	leaq -6*8(%rsp), %rdx
	movq %rdx, %rsp
	CFI_ADJUST_CFA_OFFSET 6*8
	pushq_cfi $__KERNEL_DS
	pushq_cfi %rdx
	pushfq_cfi
	pushq_cfi $__KERNEL_CS
	pushq_cfi $repeat_nmi

	/* Put stack back */
	addq $(11*8), %rsp
	CFI_ADJUST_CFA_OFFSET -11*8

nested_nmi_out:
	popq_cfi %rdx
	CFI_RESTORE rdx

	/* No need to check faults here */
	INTERRUPT_RETURN

	CFI_RESTORE_STATE
first_nmi:
	/*
	 * Because nested NMIs will use the pushed location that we
	 * stored in rdx, we must keep that space available.
	 * Here's what our stack frame will look like:
	 * +-------------------------+
	 * | original SS             |
	 * | original Return RSP     |
	 * | original RFLAGS         |
	 * | original CS             |
	 * | original RIP            |
	 * +-------------------------+
	 * | temp storage for rdx    |
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
	 * | Saved SS                |
	 * | Saved Return RSP        |
	 * | Saved RFLAGS            |
	 * | Saved CS                |
	 * | Saved RIP               |
	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
	 * The saved stack frame is used to fix up the copied stack frame
	 * that a nested NMI may change to make the interrupted NMI iret jump
	 * to the repeat_nmi. The original stack frame and the temp storage
	 * are also used by nested NMIs and cannot be trusted on exit.
	 */
	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
	movq (%rsp), %rdx
	CFI_RESTORE rdx

	/* Set the NMI executing variable on the stack. */
	pushq_cfi $1

	/* Copy the stack frame to the Saved frame */
	.rept 5
	pushq_cfi 6*8(%rsp)
	.endr
	CFI_DEF_CFA_OFFSET SS+8-RIP

	/* Everything up to here is safe from nested NMIs */

	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 */
repeat_nmi:
	/*
	 * Update the stack variable to say we are still in NMI (the update
	 * is benign for the non-repeat case, where 1 was pushed just above
	 * to this very stack slot).
	 */
	movq $1, 5*8(%rsp)

	/* Make another copy, this one may be modified by nested NMIs */
	.rept 5
	pushq_cfi 4*8(%rsp)
	.endr
	CFI_DEF_CFA_OFFSET SS+8-RIP
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested
	 * NMI if the first NMI took an exception and reset our iret stack
	 * so that we repeat another NMI.
	 */
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	/*
	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call save_paranoid
	DEFAULT_FRAME 0
	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq %rsp,%rdi
	movq $-1,%rsi
	call do_nmi
	testl %ebx,%ebx				/* swapgs needed? */
	jnz nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_ALL 8
	/* Clear the NMI executing stack variable */
	movq $0, 10*8(%rsp)
	jmp irq_return
	CFI_ENDPROC
END(nmi)

ENTRY(ignore_sysret)
	CFI_STARTPROC
	mov $-ENOSYS,%eax
	sysret
	CFI_ENDPROC
END(ignore_sysret)

/*
 * End of kprobes section
 */
	.popsection