[3.13.y-ckt,05/53] x86/nmi/64: Switch stacks on userspace NMI entry

Message ID 1438893498-17157-6-git-send-email-kamal@canonical.com

Commit Message

Kamal Mostafa Aug. 6, 2015, 8:37 p.m. UTC
3.13.11-ckt25 -stable review patch.  If anyone has any objections, please let me know.

------------------

From: Andy Lutomirski <luto@kernel.org>

commit 9b6e6a8334d56354853f9c255d1395c2ba570e0a upstream.

Returning to userspace is tricky: IRET can fail, and ESPFIX can
rearrange the stack prior to IRET.

The NMI nesting fixup relies on a precise stack layout and atomic
IRET.  Rather than trying to teach the NMI nesting fixup to handle
ESPFIX and failed IRET, punt: run NMIs that came from user mode on
the normal kernel stack.

This will make some nested NMIs visible to C code, but the C code is
okay with that.

As a side effect, this should speed up perf: it eliminates an RDMSR
when NMIs come from user mode.
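
As a reference point for reviewers: the test that gates the new path
checks the privilege-level bits of the saved CS rather than comparing
against __KERNEL_CS.  A minimal standalone C sketch of that test
(came_from_user_mode is an illustrative name, not a kernel function;
the real check is the "testb $3, CS-RIP+8(%rsp)" in the patch below):

    #include <stdio.h>

    /*
     * The low two bits of a saved CS selector hold the privilege
     * level (CPL); CPL 3 means the NMI interrupted user mode.
     */
    static int came_from_user_mode(unsigned long cs)
    {
        return (cs & 3) != 0;
    }

    int main(void)
    {
        printf("CS 0x10 (kernel): %d\n", came_from_user_mode(0x10));
        printf("CS 0x33 (user):   %d\n", came_from_user_mode(0x33));
        return 0;
    }

Testing the CPL instead of one specific selector value means any
non-zero privilege level takes the new user-mode path.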

Fixes CVE-2015-3290.

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
[bwh: Backported to 4.0:
 - Adjust filename, context
 - s/restore_c_regs_and_iret/restore_args/
 - Use kernel_stack + KERNEL_STACK_OFFSET instead of cpu_current_top_of_stack]
[luto: Open-coded return path to avoid dependency on partial pt_regs details]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Andy Whitcroft <apw@canonical.com>
CVE-2015-3290, CVE-2015-5157
Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kamal Mostafa <kamal@canonical.com>
---
 arch/x86/kernel/entry_64.S | 79 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)
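
For orientation before the hunk itself: the new user-mode path copies
the five words the CPU pushed onto the NMI stack (ss, rsp, flags, cs,
rip) over to the thread's kernel stack, pushes the general-purpose
registers to complete a pt_regs layout, and only then calls do_nmi.
A rough standalone C model of that copy, assuming the standard x86-64
pt_regs ordering (hw_frame, fake_pt_regs, and switch_stacks are
made-up illustrative names; the authoritative version is the assembly
below):

    #include <stdio.h>

    /* Hardware exception frame as the CPU pushes it on the NMI
     * stack, lowest address first (rip sits at 1*8(%rdx) in the
     * patch, ss at 5*8(%rdx), since 0(%rdx) is the saved rdx). */
    struct hw_frame {
        unsigned long rip, cs, flags, rsp, ss;
    };

    /* The pt_regs-shaped frame the entry code builds by hand on
     * the thread stack, lowest address first. */
    struct fake_pt_regs {
        unsigned long r15, r14, r13, r12, rbp, rbx;
        unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi;
        unsigned long orig_ax;
        struct hw_frame frame;
    };

    static void switch_stacks(const struct hw_frame *nmi_stack,
                              struct fake_pt_regs *thread_stack)
    {
        /* The five "pushq N*8(%rdx)" instructions: copy ss, rsp,
         * flags, cs, rip from the NMI stack to the thread stack. */
        thread_stack->frame = *nmi_stack;
        /* "pushq $-1": an NMI has no real syscall number. */
        thread_stack->orig_ax = -1UL;
        /* The GPR pushes would fill in r15..rdi here. */
    }

    int main(void)
    {
        struct hw_frame f = {
            .rip = 0x401000, .cs = 0x33, .flags = 0x202,
            .rsp = 0x7ffdf000, .ss = 0x2b,
        };
        struct fake_pt_regs regs = { 0 };

        switch_stacks(&f, &regs);
        printf("copied rip=%#lx cs=%#lx ss=%#lx\n",
               regs.frame.rip, regs.frame.cs, regs.frame.ss);
        return 0;
    }

On the return side, the open-coded exit can skip the six callee-saved
slots (addq $6*8, %rsp) because the C ABI guarantees do_nmi preserved
those registers; it then pops the caller-saved registers, discards
orig_ax, and issues IRET with interrupts still disabled.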

Patch

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2cd55a8..2991779 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1740,19 +1740,90 @@  ENTRY(nmi)
 	 * a nested NMI that updated the copy interrupt stack frame, a
 	 * jump will be made to the repeat_nmi code that will handle the second
 	 * NMI.
+	 *
+	 * However, espfix prevents us from directly returning to userspace
+	 * with a single IRET instruction.  Similarly, IRET to user mode
+	 * can fault.  We therefore handle NMIs from user space like
+	 * other IST entries.
 	 */
 
 	/* Use %rdx as our temp variable throughout */
 	pushq_cfi %rdx
 	CFI_REL_OFFSET rdx, 0
 
+	testb	$3, CS-RIP+8(%rsp)
+	jz	.Lnmi_from_kernel
+
 	/*
-	 * If %cs was not the kernel segment, then the NMI triggered in user
-	 * space, which means it is definitely not nested.
+	 * NMI from user mode.  We need to run on the thread stack, but we
+	 * can't go through the normal entry paths: NMIs are masked, and
+	 * we don't want to enable interrupts, because then we'll end
+	 * up in an awkward situation in which IRQs are on but NMIs
+	 * are off.
 	 */
-	cmpl $__KERNEL_CS, 16(%rsp)
-	jne first_nmi
 
+	SWAPGS
+	cld
+	movq	%rsp, %rdx
+	movq	PER_CPU_VAR(kernel_stack), %rsp
+	addq	$KERNEL_STACK_OFFSET, %rsp
+	pushq	5*8(%rdx)	/* pt_regs->ss */
+	pushq	4*8(%rdx)	/* pt_regs->rsp */
+	pushq	3*8(%rdx)	/* pt_regs->flags */
+	pushq	2*8(%rdx)	/* pt_regs->cs */
+	pushq	1*8(%rdx)	/* pt_regs->rip */
+	pushq   $-1		/* pt_regs->orig_ax */
+	pushq   %rdi		/* pt_regs->di */
+	pushq   %rsi		/* pt_regs->si */
+	pushq   (%rdx)		/* pt_regs->dx */
+	pushq   %rcx		/* pt_regs->cx */
+	pushq   %rax		/* pt_regs->ax */
+	pushq   %r8		/* pt_regs->r8 */
+	pushq   %r9		/* pt_regs->r9 */
+	pushq   %r10		/* pt_regs->r10 */
+	pushq   %r11		/* pt_regs->r11 */
+	pushq	%rbx		/* pt_regs->rbx */
+	pushq	%rbp		/* pt_regs->rbp */
+	pushq	%r12		/* pt_regs->r12 */
+	pushq	%r13		/* pt_regs->r13 */
+	pushq	%r14		/* pt_regs->r14 */
+	pushq	%r15		/* pt_regs->r15 */
+
+	/*
+	 * At this point we no longer need to worry about stack damage
+	 * due to nesting -- we're on the normal thread stack and we're
+	 * done with the NMI stack.
+	 */
+
+	movq	%rsp, %rdi
+	movq	$-1, %rsi
+	call	do_nmi
+
+	/*
+	 * Return back to user mode.  We must *not* do the normal exit
+	 * work, because we don't want to enable interrupts.  Fortunately,
+	 * do_nmi doesn't modify pt_regs.
+	 */
+	SWAPGS
+
+	/*
+	 * Open-code the entire return process for compatibility with varying
+	 * register layouts across different kernel versions.
+	 */
+	addq	$6*8, %rsp	/* skip bx, bp, and r12-r15 */
+	popq	%r11		/* pt_regs->r11 */
+	popq	%r10		/* pt_regs->r10 */
+	popq	%r9		/* pt_regs->r9 */
+	popq	%r8		/* pt_regs->r8 */
+	popq	%rax		/* pt_regs->ax */
+	popq	%rcx		/* pt_regs->cx */
+	popq	%rdx		/* pt_regs->dx */
+	popq	%rsi		/* pt_regs->si */
+	popq	%rdi		/* pt_regs->di */
+	addq	$8, %rsp	/* skip orig_ax */
+	INTERRUPT_RETURN
+
+.Lnmi_from_kernel:
 	/*
 	 * Check the special variable on the stack to see if NMIs are
 	 * executing.