diff mbox

E5-2620v2 - emulation stop error

Message ID 20150331134512.GG13271@potion.brq.redhat.com
State New
Headers show

Commit Message

Radim Krčmář March 31, 2015, 1:45 p.m. UTC
2015-03-30 22:32+0300, Andrey Korolyov:
> On Mon, Mar 30, 2015 at 9:56 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> > 2015-03-27 13:16+0300, Andrey Korolyov:
> >> On Fri, Mar 27, 2015 at 12:03 AM, Bandan Das <bsd@redhat.com> wrote:
> >> > Radim Krčmář <rkrcmar@redhat.com> writes:
> >> >> I second Bandan -- checking that it reproduces on other machine would be
> >> >> great for sanity :)  (Although a bug in our APICv is far more likely.)
> >> >
> >> > If it's APICv related, a run without apicv enabled could give more hints.
> >> >
> >> > Your "devices not getting reset" hypothesis makes the most sense to me,
> >> > maybe the timer vector in the error message is just one part of
> >> > the whole story. Another misbehaving interrupt from the dark comes in at the
> >> > same time and leads to a double fault.
> >>
> >> Default trace (APICv enabled, first reboot introduced the issue):
> >> http://xdel.ru/downloads/kvm-e5v2-issue/hanged-reboot-apic-on.dat.gz
> >
> > The relevant part is here,
> > prefixed with "qemu-system-x86-4180  [002]   697.111550:"
> >
> >   kvm_exit:             reason CR_ACCESS rip 0xd272 info 0 0
> >   kvm_cr:               cr_write 0 = 0x10
> >   kvm_mmu_get_page:     existing sp gfn 0 0/4 q0 direct --- !pge !nxe root 0 sync
> >   kvm_entry:            vcpu 0
> >   kvm_emulate_insn:     f0000:d275: ea 7a d2 00 f0
> >   kvm_emulate_insn:     f0000:d27a: 2e 0f 01 1e f0 6c
> >   kvm_emulate_insn:     f0000:d280: 31 c0
> >   kvm_emulate_insn:     f0000:d282: 8e e0
> >   kvm_emulate_insn:     f0000:d284: 8e e8
> >   kvm_emulate_insn:     f0000:d286: 8e c0
> >   kvm_emulate_insn:     f0000:d288: 8e d8
> >   kvm_emulate_insn:     f0000:d28a: 8e d0
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xd28f info 0 800000f6
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EPT_VIOLATION rip 0x8dd0 info 184 0
> >   kvm_page_fault:       address f8dd0 error_code 184
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0x8dd0 info 0 800000f6
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EPT_VIOLATION rip 0x76d6 info 184 0
> >   kvm_page_fault:       address f76d6 error_code 184
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0x76d6 info 0 800000f6
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason PENDING_INTERRUPT rip 0xd331 info 0 0
> >   kvm_inj_virq:         irq 8
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xfea5 info 0 800000f6
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EPT_VIOLATION rip 0xfea5 info 184 0
> >   kvm_page_fault:       address ffea5 error_code 184
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xfea5 info 0 800000f6
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EPT_VIOLATION rip 0xe990 info 184 0
> >   kvm_page_fault:       address fe990 error_code 184
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xe990 info 0 800000f6
> >   kvm_entry:            vcpu 0
> >   kvm_exit:             reason EXCEPTION_NMI rip 0xd334 info 0 80000b0d
> >   kvm_userspace_exit:   reason KVM_EXIT_INTERNAL_ERROR (17)
> >
> >> Trace without APICv (three reboots, just to make sure to hit the
> >> problematic condition of supposed DF, as it still have not one hundred
> >> percent reproducibility):
> >> http://xdel.ru/downloads/kvm-e5v2-issue/apic-off.dat.gz
> >
> > The trace here contains a well matching excerpt, just instead of the
> > EXCEPTION_NMI, it does
> >
> >  169.905098: kvm_exit:             reason EPT_VIOLATION rip 0xd334 info 181 0
> >  169.905102: kvm_page_fault:       address feffd066 error_code 181
> >
> > and works.  Page fault says we tried to read 0xfeffd066 -- probably IOPB
> > of TSS.  (I guess it is pre-fetch for following IO instruction.)
> >
> > Nothing strikes me when looking at it, but some APICv boots don't fail,
> > so it would be interesting to compare them ... hosts's 0xf6 interrupt
> > (IRQ_WORK_VECTOR) is a possible source of races.  (We could look more
> > closely.  It is fired too often for my liking as well.)
> 
> Thanks Radim, http://xdel.ru/downloads/kvm-e5v2-issue/no-fail-with-apicv.dat.gz
> 
> The related bits looks the same as with enable_apicv=0 for me.

Yeah,

 qemu-system-x86-4201  [007]   159.297337:
  kvm_exit:             reason CR_ACCESS rip 0xd272 info 0 0
  kvm_cr:               cr_write 0 = 0x10
  kvm_mmu_get_page:     existing sp gfn 0 0/4 q0 direct --- !pge !nxe root 0 sync
  kvm_entry:            vcpu 0
  kvm_emulate_insn:     f0000:d275: ea 7a d2 00 f0
  kvm_emulate_insn:     f0000:d27a: 2e 0f 01 1e f0 6c
  kvm_emulate_insn:     f0000:d280: 31 c0
  kvm_emulate_insn:     f0000:d282: 8e e0
  kvm_emulate_insn:     f0000:d284: 8e e8
  kvm_emulate_insn:     f0000:d286: 8e c0
  kvm_emulate_insn:     f0000:d288: 8e d8
  kvm_emulate_insn:     f0000:d28a: 8e d0
  kvm_entry:            vcpu 0
  kvm_exit:             reason EPT_VIOLATION rip 0x8dd0 info 184 0
  kvm_page_fault:       address f8dd0 error_code 184
  kvm_entry:            vcpu 0
  kvm_exit:             reason EPT_VIOLATION rip 0x76d6 info 184 0
  kvm_page_fault:       address f76d6 error_code 184
  kvm_entry:            vcpu 0
  kvm_exit:             reason EPT_VIOLATION rip 0xd331 info 181 0
  kvm_page_fault:       address feffd066 error_code 181
  kvm_inj_virq:         irq 25
  kvm_entry:            vcpu 0
  kvm_exit:             reason EPT_VIOLATION rip 0xe6f2 info 184 0
  kvm_page_fault:       address fe6f2 error_code 184
  kvm_entry:            vcpu 0
  kvm_exit:             reason IO_INSTRUCTION rip 0xd202 info 700040 0
  kvm_pio:              pio_write at 0x70 size 1 count 1 val 0x8f
  kvm_userspace_exit:   reason KVM_EXIT_IO (2)

I noticed three differences from the failing version:
 1) Page fault 0xfeffd066 @ 0xd331, KVM also injects vector 0x19 there
 2) No PENDING_INTERRUPT irq 8  (not sure why it happened on APICv)
 3) No EXTERNAL_INTERRUPT vector 0xf6 (lucky race)

Chasing the culprit this way could take a long time, so a new tracepoint
that shows if 0xef is set on entry would let us guess the bug faster ...

Please provide a failing trace with the following patch:

---

Comments

Andrey Korolyov March 31, 2015, 2:56 p.m. UTC | #1
On Tue, Mar 31, 2015 at 4:45 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> 2015-03-30 22:32+0300, Andrey Korolyov:
>> On Mon, Mar 30, 2015 at 9:56 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>> > 2015-03-27 13:16+0300, Andrey Korolyov:
>> >> On Fri, Mar 27, 2015 at 12:03 AM, Bandan Das <bsd@redhat.com> wrote:
>> >> > Radim Krčmář <rkrcmar@redhat.com> writes:
>> >> >> I second Bandan -- checking that it reproduces on other machine would be
>> >> >> great for sanity :)  (Although a bug in our APICv is far more likely.)
>> >> >
>> >> > If it's APICv related, a run without apicv enabled could give more hints.
>> >> >
>> >> > Your "devices not getting reset" hypothesis makes the most sense to me,
>> >> > maybe the timer vector in the error message is just one part of
>> >> > the whole story. Another misbehaving interrupt from the dark comes in at the
>> >> > same time and leads to a double fault.
>> >>
>> >> Default trace (APICv enabled, first reboot introduced the issue):
>> >> http://xdel.ru/downloads/kvm-e5v2-issue/hanged-reboot-apic-on.dat.gz
>> >
>> > The relevant part is here,
>> > prefixed with "qemu-system-x86-4180  [002]   697.111550:"
>> >
>> >   kvm_exit:             reason CR_ACCESS rip 0xd272 info 0 0
>> >   kvm_cr:               cr_write 0 = 0x10
>> >   kvm_mmu_get_page:     existing sp gfn 0 0/4 q0 direct --- !pge !nxe root 0 sync
>> >   kvm_entry:            vcpu 0
>> >   kvm_emulate_insn:     f0000:d275: ea 7a d2 00 f0
>> >   kvm_emulate_insn:     f0000:d27a: 2e 0f 01 1e f0 6c
>> >   kvm_emulate_insn:     f0000:d280: 31 c0
>> >   kvm_emulate_insn:     f0000:d282: 8e e0
>> >   kvm_emulate_insn:     f0000:d284: 8e e8
>> >   kvm_emulate_insn:     f0000:d286: 8e c0
>> >   kvm_emulate_insn:     f0000:d288: 8e d8
>> >   kvm_emulate_insn:     f0000:d28a: 8e d0
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xd28f info 0 800000f6
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EPT_VIOLATION rip 0x8dd0 info 184 0
>> >   kvm_page_fault:       address f8dd0 error_code 184
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0x8dd0 info 0 800000f6
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EPT_VIOLATION rip 0x76d6 info 184 0
>> >   kvm_page_fault:       address f76d6 error_code 184
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0x76d6 info 0 800000f6
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason PENDING_INTERRUPT rip 0xd331 info 0 0
>> >   kvm_inj_virq:         irq 8
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xfea5 info 0 800000f6
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EPT_VIOLATION rip 0xfea5 info 184 0
>> >   kvm_page_fault:       address ffea5 error_code 184
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xfea5 info 0 800000f6
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EPT_VIOLATION rip 0xe990 info 184 0
>> >   kvm_page_fault:       address fe990 error_code 184
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXTERNAL_INTERRUPT rip 0xe990 info 0 800000f6
>> >   kvm_entry:            vcpu 0
>> >   kvm_exit:             reason EXCEPTION_NMI rip 0xd334 info 0 80000b0d
>> >   kvm_userspace_exit:   reason KVM_EXIT_INTERNAL_ERROR (17)
>> >
>> >> Trace without APICv (three reboots, just to make sure to hit the
>> >> problematic condition of supposed DF, as it still have not one hundred
>> >> percent reproducibility):
>> >> http://xdel.ru/downloads/kvm-e5v2-issue/apic-off.dat.gz
>> >
>> > The trace here contains a well matching excerpt, just instead of the
>> > EXCEPTION_NMI, it does
>> >
>> >  169.905098: kvm_exit:             reason EPT_VIOLATION rip 0xd334 info 181 0
>> >  169.905102: kvm_page_fault:       address feffd066 error_code 181
>> >
>> > and works.  Page fault says we tried to read 0xfeffd066 -- probably IOPB
>> > of TSS.  (I guess it is pre-fetch for following IO instruction.)
>> >
>> > Nothing strikes me when looking at it, but some APICv boots don't fail,
>> > so it would be interesting to compare them ... hosts's 0xf6 interrupt
>> > (IRQ_WORK_VECTOR) is a possible source of races.  (We could look more
>> > closely.  It is fired too often for my liking as well.)
>>
>> Thanks Radim, http://xdel.ru/downloads/kvm-e5v2-issue/no-fail-with-apicv.dat.gz
>>
>> The related bits looks the same as with enable_apicv=0 for me.
>
> Yeah,
>
>  qemu-system-x86-4201  [007]   159.297337:
>   kvm_exit:             reason CR_ACCESS rip 0xd272 info 0 0
>   kvm_cr:               cr_write 0 = 0x10
>   kvm_mmu_get_page:     existing sp gfn 0 0/4 q0 direct --- !pge !nxe root 0 sync
>   kvm_entry:            vcpu 0
>   kvm_emulate_insn:     f0000:d275: ea 7a d2 00 f0
>   kvm_emulate_insn:     f0000:d27a: 2e 0f 01 1e f0 6c
>   kvm_emulate_insn:     f0000:d280: 31 c0
>   kvm_emulate_insn:     f0000:d282: 8e e0
>   kvm_emulate_insn:     f0000:d284: 8e e8
>   kvm_emulate_insn:     f0000:d286: 8e c0
>   kvm_emulate_insn:     f0000:d288: 8e d8
>   kvm_emulate_insn:     f0000:d28a: 8e d0
>   kvm_entry:            vcpu 0
>   kvm_exit:             reason EPT_VIOLATION rip 0x8dd0 info 184 0
>   kvm_page_fault:       address f8dd0 error_code 184
>   kvm_entry:            vcpu 0
>   kvm_exit:             reason EPT_VIOLATION rip 0x76d6 info 184 0
>   kvm_page_fault:       address f76d6 error_code 184
>   kvm_entry:            vcpu 0
>   kvm_exit:             reason EPT_VIOLATION rip 0xd331 info 181 0
>   kvm_page_fault:       address feffd066 error_code 181
>   kvm_inj_virq:         irq 25
>   kvm_entry:            vcpu 0
>   kvm_exit:             reason EPT_VIOLATION rip 0xe6f2 info 184 0
>   kvm_page_fault:       address fe6f2 error_code 184
>   kvm_entry:            vcpu 0
>   kvm_exit:             reason IO_INSTRUCTION rip 0xd202 info 700040 0
>   kvm_pio:              pio_write at 0x70 size 1 count 1 val 0x8f
>   kvm_userspace_exit:   reason KVM_EXIT_IO (2)
>
> I noticed three differences from the failing version:
>  1) Page fault 0xfeffd066 @ 0xd331, KVM also injects vector 0x19 there
>  2) No PENDING_INTERRUPT irq 8  (not sure why it happened on APICv)
>  3) No EXTERNAL_INTERRUPT vector 0xf6 (lucky race)
>
> Chasing the culprit this way could take a long time, so a new tracepoint
> that shows if 0xef is set on entry would let us guess the bug faster ...
>
> Please provide a failing trace with the following patch:
>
> ---
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 7c7bc8bef21f..8cf85dcaa1ee 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -742,6 +742,29 @@ TRACE_EVENT(kvm_emulate_insn,
>  #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
>  #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
>
> +TRACE_EVENT(kvm_0xef,
> +       TP_PROTO(bool irr, bool isr, u32 vmcs),
> +       TP_ARGS(irr, isr, vmcs),
> +
> +       TP_STRUCT__entry(
> +               __field(bool, irr )
> +               __field(bool, isr )
> +               __field(u32,  vmcs)
> +               ),
> +
> +       TP_fast_assign(
> +               __entry->irr  = irr;
> +               __entry->isr  = isr;
> +               __entry->vmcs = vmcs;
> +               ),
> +
> +       TP_printk("irr %s, isr %s, vmcs 0x%x",
> +                 __entry->irr ? "set  " : "clear",
> +                 __entry->isr ? "set  " : "clear",
> +                 __entry->vmcs
> +                )
> +       );
> +
>  TRACE_EVENT(
>         vcpu_match_mmio,
>         TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index eee63dc33d89..254cca12c2ec 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -8129,6 +8129,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
>                                         msrs[i].host);
>  }
>
> +#define VEC_POS(v) ((v) & (32 - 1))
> +#define REG_POS(v) (((v) >> 5) << 4)
> +static inline int apic_test_vector(int vec, void *bitmap)
> +{
> +       return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
> +}
> +
>  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  {
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -8143,6 +8150,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>         if (vmx->emulation_required)
>                 return;
>
> +       trace_kvm_0xef(apic_test_vector(0xef, vcpu->arch.apic->regs + APIC_IRR),
> +                      apic_test_vector(0xef, vcpu->arch.apic->regs + APIC_ISR),
> +                      vmcs_read32(VM_ENTRY_INTR_INFO_FIELD));
> +
>         if (vmx->ple_window_dirty) {
>                 vmx->ple_window_dirty = false;
>                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 32bf19ef3115..a45fa01bd354 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7881,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_0xef);

Thanks, please see below:

http://xdel.ru/downloads/kvm-e5v2-issue/new-tracepoint-fail-with-apicv.dat.gz
diff mbox

Patch

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c7bc8bef21f..8cf85dcaa1ee 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -742,6 +742,29 @@  TRACE_EVENT(kvm_emulate_insn,
 #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
 #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
 
+TRACE_EVENT(kvm_0xef,
+	TP_PROTO(bool irr, bool isr, u32 vmcs),
+	TP_ARGS(irr, isr, vmcs),
+
+	TP_STRUCT__entry(
+		__field(bool, irr )
+		__field(bool, isr )
+		__field(u32,  vmcs)
+		),
+
+	TP_fast_assign(
+		__entry->irr  = irr;
+		__entry->isr  = isr;
+		__entry->vmcs = vmcs;
+		),
+
+	TP_printk("irr %s, isr %s, vmcs 0x%x",
+	          __entry->irr ? "set  " : "clear",
+	          __entry->isr ? "set  " : "clear",
+	          __entry->vmcs
+	         )
+	);
+
 TRACE_EVENT(
 	vcpu_match_mmio,
 	TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eee63dc33d89..254cca12c2ec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8129,6 +8129,13 @@  static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 					msrs[i].host);
 }
 
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8143,6 +8150,10 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required)
 		return;
 
+	trace_kvm_0xef(apic_test_vector(0xef, vcpu->arch.apic->regs + APIC_IRR),
+	               apic_test_vector(0xef, vcpu->arch.apic->regs + APIC_ISR),
+	               vmcs_read32(VM_ENTRY_INTR_INFO_FIELD));
+
 	if (vmx->ple_window_dirty) {
 		vmx->ple_window_dirty = false;
 		vmcs_write32(PLE_WINDOW, vmx->ple_window);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32bf19ef3115..a45fa01bd354 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7881,3 +7881,4 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_0xef);