/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * NOTE: This code handles signal recognition, which happens on every
 * return from an interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame; this is
 * only done for syscall tracing, signals, or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: Architecture-defined interrupt frame from SS to RIP
 * at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: Like partial stack frame, but all registers saved.
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 * backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
 * There are unfortunately lots of special cases where some registers
 * are not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
 * Gives a full stack frame.
 * - ENTRY/END - Define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
 * frame that is otherwise undefined after a SYSCALL.
 * - TRACE_IRQS_* - Trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
 */
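
/*
 * As used throughout this file, the "args" half of the frame is the
 * call-clobbered set rdi, rsi, rdx, rcx, rax, r8-r11 (see SAVE_ARGS_IRQ
 * below), and the "rest" half is rbx, rbp, r12-r15 (see save_rest and
 * save_paranoid).  Together they make up a full pt_regs.
 */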

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT 0x80000000
#define __AUDIT_ARCH_LE	   0x40000000

	.code64
	.section .entry.text, "ax"

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
	retq
END(mcount)
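
/*
 * With CONFIG_DYNAMIC_FTRACE the mcount call sites are patched at run
 * time, so the bare retq above is all that runs for any call site that
 * has not currently been redirected to ftrace_caller below.
 */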

ENTRY(ftrace_caller)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

GLOBAL(ftrace_call)
	call ftrace_stub

	MCOUNT_RESTORE_FRAME

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)
	jmp ftrace_stub
#endif

GLOBAL(ftrace_stub)
	retq
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	cmpq $ftrace_stub, ftrace_trace_function
	jnz trace

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	cmpq $ftrace_stub, ftrace_graph_return
	jnz ftrace_graph_caller

	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
	jnz ftrace_graph_caller
#endif

GLOBAL(ftrace_stub)
	retq

trace:
	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

	call   *ftrace_trace_function

	MCOUNT_RESTORE_FRAME

	jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
	cmpl $0, function_trace_stop
	jne ftrace_stub

	MCOUNT_SAVE_FRAME

	leaq 8(%rbp), %rdi
	movq 0x38(%rsp), %rsi
	movq (%rbp), %rdx
	subq $MCOUNT_INSN_SIZE, %rsi

	call	prepare_ftrace_return

	MCOUNT_RESTORE_FRAME

	retq
END(ftrace_graph_caller)

GLOBAL(return_to_handler)
	subq  $24, %rsp

	/* Save the return values */
	movq %rax, (%rsp)
	movq %rdx, 8(%rsp)
	movq %rbp, %rdi

	call ftrace_return_to_handler

	movq %rax, %rdi
	movq 8(%rsp), %rdx
	movq (%rsp), %rax
	addq $24, %rsp
	jmp *%rdi
#endif


#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */


.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
#ifdef CONFIG_TRACE_IRQFLAGS
	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
	jnc  1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * C code is not supposed to know about the undefined top of stack. Every time
 * a C function with a pt_regs argument is called from the SYSCALL-based
 * fast path, FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
 * manipulation.
 */

	/* %rsp: at FRAMEEND */
	.macro FIXUP_TOP_OF_STACK tmp offset=0
	movq PER_CPU_VAR(old_rsp),\tmp
	movq \tmp,RSP+\offset(%rsp)
	movq $__USER_DS,SS+\offset(%rsp)
	movq $__USER_CS,CS+\offset(%rsp)
	movq $-1,RCX+\offset(%rsp)
	movq R11+\offset(%rsp),\tmp  /* get eflags */
	movq \tmp,EFLAGS+\offset(%rsp)
	.endm

	.macro RESTORE_TOP_OF_STACK tmp offset=0
	movq RSP+\offset(%rsp),\tmp
	movq \tmp,PER_CPU_VAR(old_rsp)
	movq EFLAGS+\offset(%rsp),\tmp
	movq \tmp,R11+\offset(%rsp)
	.endm
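
	/*
	 * Background for the two macros above: SYSCALL itself saves nothing;
	 * it leaves the return RIP in %rcx and the user rflags in %r11, and
	 * the fast path's SAVE_ARGS fills only the argument slots, so the
	 * SS/CS/RSP/EFLAGS/RCX slots of pt_regs start out undefined here.
	 * FIXUP_TOP_OF_STACK fakes plausible values for C code (user RSP
	 * from old_rsp, user segments, eflags copied from the R11 slot,
	 * RCX set to -1), and RESTORE_TOP_OF_STACK propagates any ptregs
	 * changes back for the SYSRET exit path.
	 */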

	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl %eax, %eax
	pushq_cfi $__KERNEL_DS /* ss */
	/*CFI_REL_OFFSET	ss,0*/
	pushq_cfi %rax /* rsp */
	CFI_REL_OFFSET	rsp,0
	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
	/*CFI_REL_OFFSET	rflags,0*/
	pushq_cfi $__KERNEL_CS /* cs */
	/*CFI_REL_OFFSET	cs,0*/
	pushq_cfi \child_rip /* rip */
	CFI_REL_OFFSET	rip,0
	pushq_cfi %rax /* orig rax */
	.endm

	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm
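
	/*
	 * The 6*8 above matches the six pushes done by FAKE_STACK_FRAME:
	 * ss, rsp, eflags, cs, rip and orig rax.
	 */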

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro EMPTY_FRAME start=1 offset=0
	.if \start
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA rsp,8+\offset
	.else
	CFI_DEF_CFA_OFFSET 8+\offset
	.endif
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro INTR_FRAME start=1 offset=0
	EMPTY_FRAME \start, SS+8+\offset-RIP
	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
	CFI_REL_OFFSET rsp, RSP+\offset-RIP
	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
	CFI_REL_OFFSET rip, RIP+\offset-RIP
	.endm

/*
 * initial frame state for exceptions with error code (and interrupts
 * with vector already pushed)
 */
	.macro XCPT_FRAME start=1 offset=0
	INTR_FRAME \start, RIP+\offset-ORIG_RAX
	/*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
	.endm

/*
 * frame that enables calling into C.
 */
	.macro PARTIAL_FRAME start=1 offset=0
	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
	.endm

/*
 * frame that enables passing a complete pt_regs to a C function.
 */
	.macro DEFAULT_FRAME start=1 offset=0
	PARTIAL_FRAME \start, R11+\offset-R15
	CFI_REL_OFFSET rbx, RBX+\offset
	CFI_REL_OFFSET rbp, RBP+\offset
	CFI_REL_OFFSET r12, R12+\offset
	CFI_REL_OFFSET r13, R13+\offset
	CFI_REL_OFFSET r14, R14+\offset
	CFI_REL_OFFSET r15, R15+\offset
	.endm
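
/*
 * The *_FRAME macros above expand purely to CFI directives: they annotate
 * where each register lives relative to the CFA for the dwarf2 unwinder
 * and emit no instructions at all (see the CFI note at the top of the file).
 */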

/* save partial stack frame */
	.macro SAVE_ARGS_IRQ
	cld
	/* start from rbp in pt_regs and jump over */
	movq_cfi rdi, RDI-RBP
	movq_cfi rsi, RSI-RBP
	movq_cfi rdx, RDX-RBP
	movq_cfi rcx, RCX-RBP
	movq_cfi rax, RAX-RBP
	movq_cfi  r8,  R8-RBP
	movq_cfi  r9,  R9-RBP
	movq_cfi r10, R10-RBP
	movq_cfi r11, R11-RBP

	/* Save rbp so that we can unwind from get_irq_regs() */
	movq_cfi rbp, 0

	/* Save previous stack value */
	movq %rsp, %rsi

	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
	testl $3, CS-RBP(%rsi)
	je 1f
	SWAPGS
	/*
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count, it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
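	/*
	 * irq_count starts at -1, so the incl below leaves ZF set only on
	 * the first (outermost) irq entry; the cmovzq that follows then
	 * switches %rsp to the per-cpu interrupt stack, while nested irqs
	 * stay on the stack they are already on.
	 */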
1:	incl PER_CPU_VAR(irq_count)
	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
	CFI_DEF_CFA_REGISTER	rsi

	/* Store previous stack value */
	pushq %rsi
	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
			0x77 /* DW_OP_breg7 */, 0, \
			0x06 /* DW_OP_deref */, \
			0x08 /* DW_OP_const1u */, SS+8-RBP, \
			0x22 /* DW_OP_plus */
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF
	.endm

ENTRY(save_rest)
	PARTIAL_FRAME 1 REST_SKIP+8
	movq 5*8+16(%rsp), %r11	/* save return address */
	movq_cfi rbx, RBX+16
	movq_cfi rbp, RBP+16
	movq_cfi r12, R12+16
	movq_cfi r13, R13+16
	movq_cfi r14, R14+16
	movq_cfi r15, R15+16
	movq %r11, 8(%rsp)	/* return address */
	FIXUP_TOP_OF_STACK %r11, 16
	ret
	CFI_ENDPROC
END(save_rest)

/* save complete stack frame */
	.pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
	XCPT_FRAME 1 RDI+8
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi r8, R8+8
	movq_cfi r9, R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	movl $1,%ebx
	movl $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js 1f	/* negative -> in kernel */
	SWAPGS
	xorl %ebx,%ebx
1:	ret
	CFI_ENDPROC
END(save_paranoid)
	.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	DEFAULT_FRAME

	LOCK ; btr $TIF_FORK,TI_flags(%r8)

	pushq_cfi kernel_eflags(%rip)
	popfq_cfi				# reset kernel eflags

	call schedule_tail			# rdi: 'prev' task parameter

	GET_THREAD_INFO(%rcx)

	RESTORE_REST

	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
	jz   retint_restore_args

	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
	jnz  int_ret_from_sys_call

	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
	jmp ret_from_sys_call			# go to the SYSRET fastpath

	CFI_ENDPROC
END(ret_from_fork)

/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 */

/*
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the stack frame
 *      and report it properly in ps. Unfortunately we don't have one.
 *
 * When the user can change the frames, always force IRET. That is because
 * it deals with non-canonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
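
/*
 * Concretely (an illustrative example, not part of the ABI text above):
 * a call such as write(fd, buf, count) arrives here with
 *	rax = __NR_write, rdi = fd, rsi = buf, rdx = count
 * and is dispatched below via call *sys_call_table(,%rax,8) after
 * r10 has been copied into rcx for the C calling convention.
 */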

ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)

	movq	%rsp,PER_CPU_VAR(old_rsp)
	movq	PER_CPU_VAR(kernel_stack),%rsp
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_ARGS 8,0
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja badsys
	movq %r10,%rcx
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path)
 * Has incomplete stack frame and undefined top of stack.
 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER	rip,rcx
	RESTORE_ARGS 1,-ARG_SKIP,0
	/*CFI_REGISTER	rflags,r11*/
	movq	PER_CPU_VAR(old_rsp), %rsp
	USERGS_SYSRET64

	CFI_RESTORE_STATE
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call schedule
	popq_cfi %rdi
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/*
	 * We have a signal, or exit tracing or single-step.
	 * These all wind up with the iret return path anyway,
	 * so just join that path right now.
	 */
	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
	jmp int_check_syscall_exit_work

badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call

#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call __audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call __audit_syscall_entry
	LOAD_ARGS 0		/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
	setbe %al		/* 1 if so, 0 if not */
	movzbl %al,%edi		/* zero-extend that into %edi */
	call __audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL */

	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx	/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */

/*
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz   int_careful
	andl    $~TS_COMPAT,TI_status(%rcx)
	jmp   retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call schedule
	popq_cfi %rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
	SAVE_REST
	/* Check for syscall exit trace */
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
	jz int_signal
	pushq_cfi %rdi
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
	call syscall_trace_leave
	popq_cfi %rdi
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp int_restore_rest

int_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)

/*
 * Certain special system calls that need to save a full stack frame.
 */
	.macro PTREGSCALL label,func,arg
ENTRY(\label)
	PARTIAL_FRAME 1 8		/* offset 8: return address */
	subq $REST_SKIP, %rsp
	CFI_ADJUST_CFA_OFFSET REST_SKIP
	call save_rest
	DEFAULT_FRAME 0 8		/* offset 8: return address */
	leaq 8(%rsp), \arg	/* pt_regs pointer */
	call \func
	jmp ptregscall_common
	CFI_ENDPROC
END(\label)
	.endm

	PTREGSCALL stub_clone, sys_clone, %r8
	PTREGSCALL stub_fork, sys_fork, %rdi
	PTREGSCALL stub_vfork, sys_vfork, %rdi
	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
	PTREGSCALL stub_iopl, sys_iopl, %rsi

ENTRY(ptregscall_common)
	DEFAULT_FRAME 1 8	/* offset 8: return address */
	RESTORE_TOP_OF_STACK %r11, 8
	movq_cfi_restore R15+8, r15
	movq_cfi_restore R14+8, r14
	movq_cfi_restore R13+8, r13
	movq_cfi_restore R12+8, r12
	movq_cfi_restore RBP+8, rbp
	movq_cfi_restore RBX+8, rbx
	ret $REST_SKIP		/* pop extended registers */
	CFI_ENDPROC
END(ptregscall_common)

ENTRY(stub_execve)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_execve)

/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys_rt_sigreturn
	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_rt_sigreturn)

#ifdef CONFIG_X86_X32_ABI
	PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx

ENTRY(stub_x32_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys32_x32_rt_sigreturn
	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_x32_rt_sigreturn)

ENTRY(stub_x32_execve)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys32_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_x32_execve)

#endif

/*
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
 */
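/*
 * Each stub is just a push of the (inverted, biased) vector number plus a
 * short jump to the shared "jmp common_interrupt" at the end of its block,
 * so seven of them fit in one 32-byte balign'd chunk.  The .previous and
 * .section pairs interleave emitting the stubs into .entry.text with
 * emitting the matching pointer table (the 'interrupt' array) into
 * .init.rodata.
 */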
	.section .init.rodata,"a"
ENTRY(interrupt)
	.section .entry.text
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
	INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
      .if vector <> FIRST_EXTERNAL_VECTOR
	CFI_ADJUST_CFA_OFFSET -8
      .endif
1:	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
	jmp 2f
      .endif
      .previous
	.quad 1b
      .section .entry.text
vector=vector+1
    .endif
  .endr
2:	jmp common_interrupt
.endr
	CFI_ENDPROC
END(irq_entries_start)

.previous
END(interrupt)
.previous

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee-clobbered registers in the fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	/* reserve pt_regs for scratch regs and rbp */
	subq $ORIG_RAX-RBP, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
	SAVE_ARGS_IRQ
	call \func
	.endm

/*
 * Interrupt entry/exit should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"
	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	XCPT_FRAME
	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
	interrupt do_IRQ
	/* 0(%rsp): old_rsp-ARGOFFSET */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq %rsi
	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
	leaq ARGOFFSET-RBP(%rsi), %rsp
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET

exit_intr:
	GET_THREAD_INFO(%rcx)
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_kernel

	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame
	 * %rcx: thread info. Interrupts off.
	 */
retint_with_reschedule:
	movl $_TIF_WORK_MASK,%edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	CFI_REMEMBER_STATE
	jnz  retint_careful

retint_swapgs:		/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp restore_args

retint_restore_args:	/* return to kernel space */
	DISABLE_INTERRUPTS(CLBR_ANY)
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:
	RESTORE_ARGS 1,8,1

irq_return:
	INTERRUPT_RETURN

	.section __ex_table, "a"
	.quad irq_return, bad_iret
	.previous

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iretq

	.section __ex_table,"a"
	.quad native_iret, bad_iret
	.previous
#endif

	.section .fixup,"ax"
bad_iret:
	/*
	 * The iret traps when the %cs or %ss being restored is bogus.
	 * We've lost the original trap vector and error code.
	 * #GPF is the most likely one to get for an invalid selector.
	 * So pretend we completed the iret and took the #GPF in user mode.
	 *
	 * We are now running with the kernel GS after exception recovery.
	 * But error_entry expects us to have user GS to match the user %cs,
	 * so swap back.
	 */
	pushq $0

	SWAPGS
	jmp general_protection

	.previous

	/* edi: workmask, edx: work */
retint_careful:
	CFI_RESTORE_STATE
	bt    $TIF_NEED_RESCHED,%edx
	jnc   retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call  schedule
	popq_cfi %rdi
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp retint_check

retint_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz    retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_REST
	movq $-1,ORIG_RAX(%rsp)
	xorl %esi,%esi		# oldset
	movq %rsp,%rdi		# &pt_regs
	call do_notify_resume
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp retint_with_reschedule

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption */
	/* rcx:	 threadinfo. interrupts off. */
ENTRY(retint_kernel)
	cmpl $0,TI_preempt_count(%rcx)
	jnz  retint_restore_args
	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
	jnc  retint_restore_args
	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc  retint_restore_args
	call preempt_schedule_irq
	jmp exit_intr
#endif

	CFI_ENDPROC
END(common_interrupt)
/*
 * End of kprobes section
 */
       .popsection

/*
 * APIC interrupts.
 */
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
	INTR_FRAME
	pushq_cfi $~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp ret_from_intr
	CFI_ENDPROC
END(\sym)
.endm

#ifdef CONFIG_SMP
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
apicinterrupt REBOOT_VECTOR \
	reboot_interrupt smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt UV_BAU_MESSAGE \
	uv_bau_message_intr1 uv_bau_message_interrupt
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
	apic_timer_interrupt smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR \
	x86_platform_ipi smp_x86_platform_ipi

#ifdef CONFIG_SMP
	ALIGN
	INTR_FRAME
.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.if NUM_INVALIDATE_TLB_VECTORS > \idx
ENTRY(invalidate_interrupt\idx)
	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
	jmp .Lcommon_invalidate_interrupt0
	CFI_ADJUST_CFA_OFFSET -8
END(invalidate_interrupt\idx)
.endif
.endr
	CFI_ENDPROC
apicinterrupt INVALIDATE_TLB_VECTOR_START, \
	invalidate_interrupt0, smp_invalidate_interrupt
#endif

apicinterrupt THRESHOLD_APIC_VECTOR \
	threshold_interrupt smp_threshold_interrupt
apicinterrupt THERMAL_APIC_VECTOR \
	thermal_interrupt smp_thermal_interrupt

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
	call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR \
	call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR \
	reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR \
	error_interrupt smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR \
	spurious_interrupt smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR \
	irq_work_interrupt smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
 */
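/*
 * Three flavours of entry macro follow.  zeroentry is for exceptions that
 * push no error code (a -1 is stored as ORIG_RAX instead); errorentry is
 * for exceptions where the CPU pushed an error code, which is handed to
 * the C handler as the second argument; the paranoid* variants go through
 * save_paranoid, which inspects MSR_GS_BASE to decide whether swapgs is
 * needed, for entries (NMI, machine check, debug, double fault, ...) that
 * may fire while the GS state is ambiguous.
 */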
.macro zeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp error_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

.macro paranoidzeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
	call \do_sym
	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm
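
/*
 * The subq/addq pair around the handler call above lowers the TSS IST
 * pointer for the duration of the handler, so that a recursive exception
 * on the same IST vector (e.g. a #DB hit from inside do_debug) lands on a
 * fresh EXCEPTION_STKSZ-sized area instead of overwriting the frame that
 * is still in use.
 */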

.macro errorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp error_exit			/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

	/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	DEFAULT_FRAME 0
	TRACE_IRQS_OFF
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp paranoid_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

zeroentry divide_error do_divide_error
zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
paranoiderrorentry double_fault do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error


	/* Reload gs selector with exception handling */
	/* edi:  new selector */
ENTRY(native_load_gs_index)
	CFI_STARTPROC
	pushfq_cfi
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl %edi,%gs
2:	mfence		/* workaround */
	SWAPGS
	popfq_cfi
	ret
	CFI_ENDPROC
END(native_load_gs_index)

	.section __ex_table,"a"
	.align 8
	.quad gs_change,bad_gs
	.previous
	.section .fixup,"ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS			/* switch back to user gs */
	xorl %eax,%eax
	movl %eax,%gs
	jmp  2b
	.previous

ENTRY(kernel_thread_helper)
	pushq $0		# fake return address
	CFI_STARTPROC
	/*
	 * Here we are in the child and the registers are set as they were
	 * at kernel_thread() invocation in the parent.
	 */
	call *%rsi
	# exit
	mov %eax, %edi
	call do_exit
	ud2			# padding for call trace
	CFI_ENDPROC
END(kernel_thread_helper)

/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
 *
 * C extern interface:
 *	 extern long execve(const char *name, char **argv, char **envp)
 *
 * asm input arguments:
 *	rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
 *	extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
 *
 * do_sys_execve asm fallback arguments:
 *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
 */
ENTRY(kernel_execve)
	CFI_STARTPROC
	FAKE_STACK_FRAME $0
	SAVE_ALL
	movq %rsp,%rcx
	call sys_execve
	movq %rax, RAX(%rsp)
	RESTORE_REST
	testq %rax,%rax
	je int_ret_from_sys_call
	RESTORE_ARGS
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
END(kernel_execve)

/* Call softirq on interrupt stack. Interrupts are off. */
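/*
 * This uses the same irq_count bump plus cmov trick as SAVE_ARGS_IRQ above,
 * switching to the per-cpu irq stack only when we are not already on it.
 */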
ENTRY(call_softirq)
	CFI_STARTPROC
	pushq_cfi %rbp
	CFI_REL_OFFSET rbp,0
	mov  %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	incl PER_CPU_VAR(irq_count)
	cmove PER_CPU_VAR(irq_stack_ptr),%rsp
	push  %rbp			# backlink for old unwinder
	call __do_softirq
	leaveq
	CFI_RESTORE		rbp
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET   -8
	decl PER_CPU_VAR(irq_count)
	ret
	CFI_ENDPROC
END(call_softirq)

#ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
	CFI_STARTPROC
/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
	movq %rdi, %rsp            # we don't return, adjust the stack frame
	CFI_ENDPROC
	DEFAULT_FRAME
11:	incl PER_CPU_VAR(irq_count)
	movq %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
	pushq %rbp			# backlink for old unwinder
	call xen_evtchn_do_upcall
	popq %rsp
	CFI_DEF_CFA_REGISTER rsp
	decl PER_CPU_VAR(irq_count)
	jmp  error_exit
	CFI_ENDPROC
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	INTR_FRAME 1 (6*8)
	/*CFI_REL_OFFSET gs,GS*/
	/*CFI_REL_OFFSET fs,FS*/
	/*CFI_REL_OFFSET es,ES*/
	/*CFI_REL_OFFSET ds,DS*/
	CFI_REL_OFFSET r11,8
	CFI_REL_OFFSET rcx,0
	movw %ds,%cx
	cmpw %cx,0x10(%rsp)
	CFI_REMEMBER_STATE
	jne 1f
	movw %es,%cx
	cmpw %cx,0x18(%rsp)
	jne 1f
	movw %fs,%cx
	cmpw %cx,0x20(%rsp)
	jne 1f
	movw %gs,%cx
	cmpw %cx,0x28(%rsp)
	jne 1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq_cfi $0	/* RIP */
	pushq_cfi %r11
	pushq_cfi %rcx
	jmp general_protection
	CFI_RESTORE_STATE
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq_cfi $-1 /* orig_ax = -1 => not a system call */
	SAVE_ALL
	jmp error_exit
	CFI_ENDPROC
END(xen_failsafe_callback)

apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

/*
 * Some functions should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"

paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
#ifdef CONFIG_XEN
zeroentry xen_debug do_debug
zeroentry xen_int3 do_int3
errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_KVM_GUEST
errorentry async_page_fault do_async_page_fault
#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif

	/*
	 * "Paranoid" exit path from exception stack.
	 * Paranoid because this is used by NMIs and cannot take
	 * any kernel state for granted.
	 * We don't do kernel preemption checks here, because only
	 * NMI should be common and it does not enable IRQs and
	 * cannot get reschedule ticks.
	 *
	 * "trace" is 0 for the NMI handler only, because irq-tracing
	 * is fundamentally NMI-unsafe. (we cannot change the soft and
	 * hard flags at once, atomically)
	 */

	/* ebx:	no swapgs flag */
ENTRY(paranoid_exit)
	DEFAULT_FRAME
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl %ebx,%ebx				/* swapgs needed? */
	jnz paranoid_restore
	testl $3,CS(%rsp)
	jnz   paranoid_userspace
paranoid_swapgs:
	TRACE_IRQS_IRETQ 0
	SWAPGS_UNSAFE_STACK
	RESTORE_ALL 8
	jmp irq_return
paranoid_restore:
	TRACE_IRQS_IRETQ 0
	RESTORE_ALL 8
	jmp irq_return
paranoid_userspace:
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%ebx
	andl $_TIF_WORK_MASK,%ebx
	jz paranoid_swapgs
	movq %rsp,%rdi			/* &pt_regs */
	call sync_regs
	movq %rax,%rsp			/* switch stack for scheduling */
	testl $_TIF_NEED_RESCHED,%ebx
	jnz paranoid_schedule
	movl %ebx,%edx			/* arg3: thread flags */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	xorl %esi,%esi 			/* arg2: oldset */
	movq %rsp,%rdi 			/* arg1: &pt_regs */
	call do_notify_resume
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp paranoid_userspace
paranoid_schedule:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_ANY)
	call schedule
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	jmp paranoid_userspace
	CFI_ENDPROC
END(paranoid_exit)

/*
 * Exception entry point. This expects an error code/orig_rax on the stack.
 * It returns the "no swapgs" flag in %ebx.
 */
ENTRY(error_entry)
	XCPT_FRAME
	CFI_ADJUST_CFA_OFFSET 15*8
	/* oldrax contains error code */
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi  r8,  R8+8
	movq_cfi  r9,  R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	xorl %ebx,%ebx
	testl $3,CS+8(%rsp)
	je error_kernelspace
error_swapgs:
	SWAPGS
error_sti:
	TRACE_IRQS_OFF
	ret

/*
 * There are two places in the kernel that can potentially fault with
 * usergs. Handle them here. The exception handlers after iret run with
 * kernel gs again, so don't set the user space flag. B-stepping K8s
 * sometimes report a truncated RIP for IRET exceptions returning to
 * compat mode. Check for these here too.
 */
error_kernelspace:
	incl %ebx
	leaq irq_return(%rip),%rcx
	cmpq %rcx,RIP+8(%rsp)
	je error_swapgs
	movl %ecx,%eax	/* zero extend */
	cmpq %rax,RIP+8(%rsp)
	je bstep_iret
	cmpq $gs_change,RIP+8(%rsp)
	je error_swapgs
	jmp error_sti

bstep_iret:
	/* Fix truncated RIP */
	movq %rcx,RIP+8(%rsp)
	jmp error_swapgs
	CFI_ENDPROC
END(error_entry)


/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
	DEFAULT_FRAME
	movl %ebx,%eax
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	testl %eax,%eax
	jne retint_kernel
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	movl $_TIF_WORK_MASK,%edi
	andl %edi,%edx
	jnz retint_careful
	jmp retint_swapgs
	CFI_ENDPROC
END(error_exit)

/*
 * Test if a given stack is an NMI stack or not.
 */
	.macro test_in_nmi reg stack nmi_ret normal_ret
	cmpq %\reg, \stack
	ja \normal_ret
	subq $EXCEPTION_STKSZ, %\reg
	cmpq %\reg, \stack
	jb \normal_ret
	jmp \nmi_ret
	.endm
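
	/*
	 * I.e. with \reg holding the top of the NMI stack, jump to \nmi_ret
	 * if \stack lies within the EXCEPTION_STKSZ bytes below that top,
	 * and to \normal_ret otherwise.
	 */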

	/* runs on exception stack */
ENTRY(nmi)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into a "saved" location on the stack
	 *    o Copy the interrupt frame into a "copy" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "copy" location to jump to repeat_nmi
	 *    o Return to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq_cfi %rdx
	CFI_REL_OFFSET rdx, 0

	/*
	 * If %cs was not the kernel segment, then the NMI triggered in user
	 * space, which means it is definitely not nested.
	 */
	cmpl $__KERNEL_CS, 16(%rsp)
	jne first_nmi

	/*
	 * Check the special variable on the stack to see if NMIs are
	 * executing.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.
	 * We need the double check. We check the NMI stack to satisfy the
	 * race when the first NMI clears the variable before returning.
	 * We check the variable because the first NMI could be in a
	 * breakpoint routine using a breakpoint stack.
	 */
	lea 6*8(%rsp), %rdx
	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
	CFI_REMEMBER_STATE

nested_nmi:
	/*
	 * Do nothing if we interrupted the fixup in repeat_nmi.
	 * It's about to repeat the NMI handler, so we are fine
	 * with ignoring this one.
	 */
	movq $repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja 1f
	movq $end_repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja nested_nmi_out

1:
	/* Set up the interrupted NMI's stack to jump to repeat_nmi */
	leaq -6*8(%rsp), %rdx
	movq %rdx, %rsp
	CFI_ADJUST_CFA_OFFSET 6*8
	pushq_cfi $__KERNEL_DS
	pushq_cfi %rdx
	pushfq_cfi
	pushq_cfi $__KERNEL_CS
	pushq_cfi $repeat_nmi

	/* Put stack back */
	addq $(11*8), %rsp
	CFI_ADJUST_CFA_OFFSET -11*8

nested_nmi_out:
	popq_cfi %rdx
	CFI_RESTORE rdx

	/* No need to check faults here */
	INTERRUPT_RETURN

	CFI_RESTORE_STATE
first_nmi:
	/*
	 * Because nested NMIs will use the pushed location that we
	 * stored in rdx, we must keep that space available.
	 * Here's what our stack frame will look like:
	 * +-------------------------+
	 * | original SS             |
	 * | original Return RSP     |
	 * | original RFLAGS         |
	 * | original CS             |
	 * | original RIP            |
	 * +-------------------------+
	 * | temp storage for rdx    |
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
	 * | Saved SS                |
	 * | Saved Return RSP        |
	 * | Saved RFLAGS            |
	 * | Saved CS                |
	 * | Saved RIP               |
	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
	 * The saved stack frame is used to fix up the copied stack frame
	 * that a nested NMI may change to make the interrupted NMI iret jump
	 * to repeat_nmi. The original stack frame and the temp storage
	 * are also used by nested NMIs and cannot be trusted on exit.
	 */
	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
	movq (%rsp), %rdx
	CFI_RESTORE rdx

	/* Set the NMI executing variable on the stack. */
	pushq_cfi $1

	/* Copy the stack frame to the Saved frame */
	.rept 5
	pushq_cfi 6*8(%rsp)
	.endr
	CFI_DEF_CFA_OFFSET SS+8-RIP

	/* Everything up to here is safe from nested NMIs */

	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 */
repeat_nmi:
	/*
	 * Update the stack variable to say we are still in NMI (the update
	 * is benign for the non-repeat case, where 1 was pushed just above
	 * to this very stack slot).
	 */
	movq $1, 5*8(%rsp)

	/* Make another copy, this one may be modified by nested NMIs */
	.rept 5
	pushq_cfi 4*8(%rsp)
	.endr
	CFI_DEF_CFA_OFFSET SS+8-RIP
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested
	 * NMI if the first NMI took an exception and reset our iret stack
	 * so that we repeat another NMI.
	 */
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	/*
	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context, even with
	 * normal interrupts enabled. An NMI should not be setting NEED_RESCHED
	 * or anything that normal interrupts and exceptions might do.
	 */
	call save_paranoid
	DEFAULT_FRAME 0
	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq %rsp,%rdi
	movq $-1,%rsi
	call do_nmi
	testl %ebx,%ebx				/* swapgs needed? */
	jnz nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_ALL 8
	/* Clear the NMI executing stack variable */
	movq $0, 10*8(%rsp)
	jmp irq_return
	CFI_ENDPROC
END(nmi)

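/*
 * As set up elsewhere (the syscall MSR initialization in the C code), this
 * is used as the SYSCALL target for modes this kernel does not handle, so
 * a stray SYSCALL simply gets -ENOSYS back.
 */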
ENTRY(ignore_sysret)
	CFI_STARTPROC
	mov $-ENOSYS,%eax
	sysret
	CFI_ENDPROC
END(ignore_sysret)

/*
 * End of kprobes section
 */
	.popsection