
[3/7] KVM: i386: Add support for KVM_CAP_EXCEPTION_PAYLOAD

Message ID 20190615004256.16367-4-pbonzini@redhat.com
State New
Series target-i386/kvm: live migration support for nested VMX

Commit Message

Paolo Bonzini June 15, 2019, 12:42 a.m. UTC
From: Liran Alon <liran.alon@oracle.com>

Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
introduced a new KVM capability which allows userspace to correctly
distinguish between pending and injected exceptions.

This distinction is important in nested virtualization scenarios,
because an L2 pending exception can still be intercepted by the L1 hypervisor
while an L2 injected exception cannot.

Furthermore, when QEMU attempts to inject an exception, it should
specify the exception payload (CR2 in case of #PF or
DR6 in case of #DB) instead of having the payload already delivered in
the respective vCPU register. This is because, if the exception is injected
into an L2 guest and intercepted by the L1 hypervisor, the payload needs to be
reported to the L1 intercept (VMExit handler) while the
respective vCPU register is left unchanged.

This commit adds support for QEMU to properly utilise this new KVM
capability (KVM_CAP_EXCEPTION_PAYLOAD).

Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c        | 10 ++---
 target/i386/cpu.h        | 13 +++++-
 target/i386/hvf/hvf.c    | 10 +++--
 target/i386/hvf/x86hvf.c |  4 +-
 target/i386/kvm.c        | 95 +++++++++++++++++++++++++++++++++-------
 target/i386/machine.c    | 61 +++++++++++++++++++++++++-
 6 files changed, 163 insertions(+), 30 deletions(-)

Comments

Liran Alon June 15, 2019, 12:57 a.m. UTC | #1
> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> From: Liran Alon <liran.alon@oracle.com>
> 
> Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
> introduced a new KVM capability which allows userspace to correctly
> distinguish between pending and injected exceptions.
> 
> This distinction is important in nested virtualization scenarios,
> because an L2 pending exception can still be intercepted by the L1 hypervisor
> while an L2 injected exception cannot.
> 
> Furthermore, when QEMU attempts to inject an exception, it should
> specify the exception payload (CR2 in case of #PF or
> DR6 in case of #DB) instead of having the payload already delivered in
> the respective vCPU register. This is because, if the exception is injected
> into an L2 guest and intercepted by the L1 hypervisor, the payload needs to be
> reported to the L1 intercept (VMExit handler) while the
> respective vCPU register is left unchanged.
> 
> This commit adds support for QEMU to properly utilise this new KVM
> capability (KVM_CAP_EXCEPTION_PAYLOAD).
> 
> Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
> Signed-off-by: Liran Alon <liran.alon@oracle.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> target/i386/cpu.c        | 10 ++---
> target/i386/cpu.h        | 13 +++++-
> target/i386/hvf/hvf.c    | 10 +++--
> target/i386/hvf/x86hvf.c |  4 +-
> target/i386/kvm.c        | 95 +++++++++++++++++++++++++++++++++-------
> target/i386/machine.c    | 61 +++++++++++++++++++++++++-
> 6 files changed, 163 insertions(+), 30 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index c1ab86d63e..4e19969111 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -4777,7 +4777,9 @@ static void x86_cpu_reset(CPUState *s)
>     memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
> 
>     env->interrupt_injected = -1;
> -    env->exception_injected = -1;
> +    env->exception_nr = -1;
> +    env->exception_pending = 0;
> +    env->exception_injected = 0;

Note: In the patch-series I will submit I will add here:
+    env->exception_has_payload = false;
+    env->exception_payload = 0;

>     env->nmi_injected = false;
> #if !defined(CONFIG_USER_ONLY)
>     /* We hard-wire the BSP to the first CPU. */
> @@ -5173,12 +5175,6 @@ static int x86_cpu_filter_features(X86CPU *cpu)
>     return rv;
> }
> 
> -#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
> -                           (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
> -                           (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
> -#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> -                         (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> -                         (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
> {
>     CPUState *cs = CPU(dev);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index bd06523a53..bbeb7a9521 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -729,6 +729,13 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
> 
> #define CPUID_VENDOR_HYGON    "HygonGenuine"
> 
> +#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
> +                           (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
> +                           (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
> +#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
> +                         (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
> +                         (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
> +
> #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
> #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
> 
> @@ -1332,10 +1339,14 @@ typedef struct CPUX86State {
> 
>     /* For KVM */
>     uint32_t mp_state;
> -    int32_t exception_injected;
> +    int32_t exception_nr;
>     int32_t interrupt_injected;
>     uint8_t soft_interrupt;
> +    uint8_t exception_pending;
> +    uint8_t exception_injected;
>     uint8_t has_error_code;
> +    uint8_t exception_has_payload;
> +    uint64_t exception_payload;
>     uint32_t ins_len;
>     uint32_t sipi_vector;
>     bool tsc_valid;
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index 2751c8125c..dc4bb63536 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
>     X86CPU *x86_cpu = X86_CPU(cpu);
>     CPUX86State *env = &x86_cpu->env;
> 
> -    env->exception_injected = -1;
> +    env->exception_nr = -1;
> +    env->exception_pending = 0;
> +    env->exception_injected = 0;
>     env->interrupt_injected = -1;
>     env->nmi_injected = false;
>     if (idtvec_info & VMCS_IDT_VEC_VALID) {
> @@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
>             break;
>         case VMCS_IDT_VEC_HWEXCEPTION:
>         case VMCS_IDT_VEC_SWEXCEPTION:
> -            env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
> +            env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
> +            env->exception_injected = 1;
>             break;
>         case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
>         default:
> @@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu)
>             macvm_set_rip(cpu, rip + ins_len);
>             break;
>         case VMX_REASON_VMCALL:
> -            env->exception_injected = EXCP0D_GPF;
> +            env->exception_nr = EXCP0D_GPF;
> +            env->exception_injected = 1;
>             env->has_error_code = true;
>             env->error_code = 0;
>             break;
> diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
> index df8e946fbc..e0ea02d631 100644
> --- a/target/i386/hvf/x86hvf.c
> +++ b/target/i386/hvf/x86hvf.c
> @@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state)
>     if (env->interrupt_injected != -1) {
>         vector = env->interrupt_injected;
>         intr_type = VMCS_INTR_T_SWINTR;
> -    } else if (env->exception_injected != -1) {
> -        vector = env->exception_injected;
> +    } else if (env->exception_nr != -1) {
> +        vector = env->exception_nr;
>         if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
>             intr_type = VMCS_INTR_T_SWEXCEPTION;
>         } else {
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 53f95b02a0..dca76830ec 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters;
> static int has_xsave;
> static int has_xcrs;
> static int has_pit_state2;
> +static int has_exception_payload;
> 
> static bool has_msr_mcg_ext_ctl;
> 
> @@ -584,15 +585,51 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>     /* Hope we are lucky for AO MCE */
> }
> 
> +static void kvm_reset_exception(CPUX86State *env)
> +{
> +    env->exception_nr = -1;
> +    env->exception_pending = 0;
> +    env->exception_injected = 0;

Note: In the patch-series I will submit I will add here:
+       env->exception_has_payload = false;
+       env->exception_payload = 0;

> +}
> +
> +static void kvm_queue_exception(CPUX86State *env,
> +                                int32_t exception_nr,
> +                                uint8_t exception_has_payload,
> +                                uint64_t exception_payload)
> +{
> +    assert(env->exception_nr == -1);
> +    assert(!env->exception_pending);
> +    assert(!env->exception_injected);

Note: In the patch-series I will submit I will add here:
+    assert(!env->exception_has_payload);

> +
> +    env->exception_nr = exception_nr;
> +
> +    if (has_exception_payload) {
> +        env->exception_pending = 1;
> +
> +        env->exception_has_payload = exception_has_payload;
> +        env->exception_payload = exception_payload;
> +    } else {
> +        env->exception_injected = 1;
> +
> +        if (exception_has_payload) {
> +            if (exception_nr == EXCP01_DB) {
> +                env->dr[6] = exception_payload;
> +            } else if (exception_nr == EXCP0E_PAGE) {
> +                env->cr[2] = exception_payload;
> +            }
> +        }

Note: In the patch-series I will submit, I have changed this
to verify that #DB & #PF always have env->exception_has_payload set to true
and other cases have it set to false.
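
For illustration, a sketch of what that stricter legacy path could look like (my guess at the shape, not necessarily the exact code from the later series):

+        if (exception_nr == EXCP01_DB) {
+            assert(exception_has_payload);
+            env->dr[6] = exception_payload;
+        } else if (exception_nr == EXCP0E_PAGE) {
+            assert(exception_has_payload);
+            env->cr[2] = exception_payload;
+        } else {
+            assert(!exception_has_payload);
+        }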

> +    }
> +}
> +
> static int kvm_inject_mce_oldstyle(X86CPU *cpu)
> {
>     CPUX86State *env = &cpu->env;
> 
> -    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
> +    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
>         unsigned int bank, bank_num = env->mcg_cap & 0xff;
>         struct kvm_x86_mce mce;
> 
> -        env->exception_injected = -1;
> +        kvm_reset_exception(env);
> 
>         /*
>          * There must be at least one bank in use if an MCE is pending.
> @@ -1573,6 +1610,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
> 
>     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
> 
> +    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
> +    if (has_exception_payload) {
> +        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
> +        if (ret < 0) {
> +            error_report("kvm: Failed to enable exception payload cap: %s",
> +                         strerror(-ret));
> +            return ret;
> +        }
> +    }
> +
>     ret = kvm_get_supported_msrs(s);
>     if (ret < 0) {
>         return ret;
> @@ -2877,8 +2924,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
>         return 0;
>     }
> 
> -    events.exception.injected = (env->exception_injected >= 0);
> -    events.exception.nr = env->exception_injected;
> +    events.flags = 0;
> +
> +    if (has_exception_payload) {
> +        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
> +        events.exception.pending = env->exception_pending;
> +        events.exception_has_payload = env->exception_has_payload;
> +        events.exception_payload = env->exception_payload;
> +    }
> +    events.exception.nr = env->exception_nr;
> +    events.exception.injected = env->exception_injected;
>     events.exception.has_error_code = env->has_error_code;
>     events.exception.error_code = env->error_code;
> 
> @@ -2891,7 +2946,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
>     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
> 
>     events.sipi_vector = env->sipi_vector;
> -    events.flags = 0;
> 
>     if (has_msr_smbase) {
>         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
> @@ -2941,8 +2995,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
>     if (ret < 0) {
>        return ret;
>     }
> -    env->exception_injected =
> -       events.exception.injected ? events.exception.nr : -1;
> +
> +    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
> +        env->exception_pending = events.exception.pending;
> +        env->exception_has_payload = events.exception_has_payload;
> +        env->exception_payload = events.exception_payload;
> +    } else {
> +        env->exception_pending = 0;
> +        env->exception_has_payload = false;
> +    }
> +    env->exception_injected = events.exception.injected;
> +    env->exception_nr =
> +        (env->exception_pending || env->exception_injected) ?
> +        events.exception.nr : -1;
>     env->has_error_code = events.exception.has_error_code;
>     env->error_code = events.exception.error_code;
> 
> @@ -2994,12 +3059,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu)
>     unsigned long reinject_trap = 0;
> 
>     if (!kvm_has_vcpu_events()) {
> -        if (env->exception_injected == EXCP01_DB) {
> +        if (env->exception_nr == EXCP01_DB) {
>             reinject_trap = KVM_GUESTDBG_INJECT_DB;
>         } else if (env->exception_injected == EXCP03_INT3) {
>             reinject_trap = KVM_GUESTDBG_INJECT_BP;
>         }
> -        env->exception_injected = -1;
> +        kvm_reset_exception(env);
>     }
> 
>     /*
> @@ -3320,13 +3385,13 @@ int kvm_arch_process_async_events(CPUState *cs)
> 
>         kvm_cpu_synchronize_state(cs);
> 
> -        if (env->exception_injected == EXCP08_DBLE) {
> +        if (env->exception_nr == EXCP08_DBLE) {
>             /* this means triple fault */
>             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
>             cs->exit_request = 1;
>             return 0;
>         }
> -        env->exception_injected = EXCP12_MCHK;
> +        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
>         env->has_error_code = 0;
> 
>         cs->halted = 0;
> @@ -3541,14 +3606,12 @@ static int kvm_handle_debug(X86CPU *cpu,
>     }
>     if (ret == 0) {
>         cpu_synchronize_state(cs);
> -        assert(env->exception_injected == -1);
> +        assert(env->exception_nr == -1);
> 
>         /* pass to guest */
> -        env->exception_injected = arch_info->exception;
> +        kvm_queue_exception(env, arch_info->exception,
> +                            EXCP01_DB, arch_info->dr6);

There is a bug here I will fix in my patch-series:
Third argument should be (arch_info->exception == EXCP01_DB).
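That is, the corrected call would read:

    kvm_queue_exception(env, arch_info->exception,
                        arch_info->exception == EXCP01_DB,
                        arch_info->dr6);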

>         env->has_error_code = 0;
> -        if (arch_info->exception == EXCP01_DB) {
> -            env->dr[6] = arch_info->dr6;
> -        }
>     }
> 
>     return ret;
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index 225b5d433b..41460be54b 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -199,6 +199,21 @@ static const VMStateDescription vmstate_fpreg = {
>     }
> };
> 
> +static bool is_vmx_enabled(CPUX86State *env)
> +{
> +    return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK));
> +}
> +
> +static bool is_svm_enabled(CPUX86State *env)
> +{
> +    return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME));
> +}
> +
> +static bool is_nested_virt_enabled(CPUX86State *env)
> +{
> +    return (is_vmx_enabled(env) || is_svm_enabled(env));
> +}

I have later realised that this nested_virt_enabled() function is not enough to determine whether nested_state needs to be sent.
This is because the vCPU may be running L2 but have momentarily entered SMM due to an SMI.
In that case, CR4 & MSR_EFER are saved in SMRAM and are set to 0 on entry to SMM.
This means that when (env->hflags & HF_SMM_MASK) is set, we theoretically should read the saved CR4 & MSR_EFER from SMRAM.
However, because we cannot reference guest memory at this point (not valid in case we migrate the guest with post-copy), I should change the
code to simply assume that nested-state is needed whenever (env->hflags & HF_SMM_MASK) is set.
This should break backwards-compatible migration only in very rare cases and therefore I think it should be sufficient.
Any objections to this idea?
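
As a rough sketch of that idea (hypothetical shape, not the exact code from the next series), the check could become:

+static bool is_nested_virt_enabled(CPUX86State *env)
+{
+    /*
+     * In SMM, CR4 & MSR_EFER were saved to SMRAM and cannot be inspected
+     * here, so conservatively assume nested-state is needed.
+     */
+    return ((env->hflags & HF_SMM_MASK) ||
+            is_vmx_enabled(env) || is_svm_enabled(env));
+}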

> +
> static int cpu_pre_save(void *opaque)
> {

In my next patch-series I have added logic to cpu_pre_save that converts
a pending exception to an injected exception when nested-virtualization
is not enabled. During this conversion, I also make sure
to apply the exception payload to the respective vCPU register (i.e. CR2 in case of #PF
or DR6 in case of #DB).
I have realised this is required because otherwise the exception-info VMState subsection
would be deemed not needed and the exception payload would be lost.
Something like this:

+    /*
+     * When the vCPU is running L2 and an exception is still pending,
+     * it can potentially be intercepted by the L1 hypervisor, in
+     * contrast to an injected exception, which cannot be
+     * intercepted anymore.
+     *
+     * Furthermore, when an L2 exception is intercepted by the L1
+     * hypervisor, its exception payload (CR2/DR6 on #PF/#DB)
+     * should not be set yet in the respective vCPU register.
+     * Thus, in case an exception is pending, it is
+     * important to save the exception payload separately.
+     *
+     * Therefore, if an exception is not in a pending state
+     * or the guest is not running a nested hypervisor, it is
+     * not important to distinguish between a pending and an
+     * injected exception, and we don't need to store the
+     * exception payload separately.
+     *
+     * In order to better preserve backwards-compatible migration,
+     * convert a pending exception to an injected exception in
+     * case it is not important to distinguish between them,
+     * as described above.
+     */
+    if (!is_nested_virt_enabled(env) && env->exception_pending) {
+        env->exception_pending = 0;
+        env->exception_injected = 1;
+
+        if (env->exception_nr == EXCP01_DB) {
+            assert(env->exception_has_payload);
+            env->dr[6] = env->exception_payload;
+        } else if (env->exception_nr == EXCP0E_PAGE) {
+            assert(env->exception_has_payload);
+            env->cr[2] = env->exception_payload;
+        } else {
+            assert(!env->exception_has_payload);
+        }
+    }
+

>     X86CPU *cpu = opaque;
> @@ -278,6 +293,23 @@ static int cpu_post_load(void *opaque, int version_id)
>     env->hflags &= ~HF_CPL_MASK;
>     env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
> 
> +    /*
> +     * There are cases where we can get a valid exception_nr with both
> +     * exception_pending and exception_injected cleared. This can happen in
> +     * one of the following scenarios:
> +     * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
> +     * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
> +     * 3) "cpu/exception_info" subsection not sent because there is no exception
> +     *    pending or guest wasn't running L2.
> +     *
> +     * In those cases, we can just deduce that a valid exception_nr means
> +     * we can treat the exception as already injected.
> +     */
> +    if ((env->exception_nr != -1) &&
> +        !env->exception_pending && !env->exception_injected) {
> +        env->exception_injected = 1;
> +    }
> +
>     env->fpstt = (env->fpus_vmstate >> 11) & 7;
>     env->fpus = env->fpus_vmstate & ~0x3800;
>     env->fptag_vmstate ^= 0xff;
> @@ -323,6 +355,32 @@ static bool steal_time_msr_needed(void *opaque)
>     return cpu->env.steal_time_msr != 0;
> }
> 
> +static bool exception_info_needed(void *opaque)
> +{
> +    X86CPU *cpu = opaque;
> +
> +    /*
> +     * Differentiating between pending and injected exceptions
> +     * is only important when running L2.
> +     */
> +    return (cpu->env.exception_pending &&
> +            is_nested_virt_enabled(&cpu->env));
> +}
> +
> +static const VMStateDescription vmstate_exception_info = {
> +    .name = "cpu/exception_info",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = exception_info_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT8(env.exception_pending, X86CPU),
> +        VMSTATE_UINT8(env.exception_injected, X86CPU),
> +        VMSTATE_UINT8(env.exception_has_payload, X86CPU),
> +        VMSTATE_UINT64(env.exception_payload, X86CPU),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> static const VMStateDescription vmstate_steal_time_msr = {
>     .name = "cpu/steal_time_msr",
>     .version_id = 1,
> @@ -1035,7 +1093,7 @@ VMStateDescription vmstate_x86_cpu = {
>         VMSTATE_INT32(env.interrupt_injected, X86CPU),
>         VMSTATE_UINT32(env.mp_state, X86CPU),
>         VMSTATE_UINT64(env.tsc, X86CPU),
> -        VMSTATE_INT32(env.exception_injected, X86CPU),
> +        VMSTATE_INT32(env.exception_nr, X86CPU),
>         VMSTATE_UINT8(env.soft_interrupt, X86CPU),
>         VMSTATE_UINT8(env.nmi_injected, X86CPU),
>         VMSTATE_UINT8(env.nmi_pending, X86CPU),
> @@ -1059,6 +1117,7 @@ VMStateDescription vmstate_x86_cpu = {
>         /* The above list is not sorted /wrt version numbers, watch out! */
>     },
>     .subsections = (const VMStateDescription*[]) {
> +        &vmstate_exception_info,
>         &vmstate_async_pf_msr,
>         &vmstate_pv_eoi_msr,
>         &vmstate_steal_time_msr,
> -- 
> 2.21.0
> 
>
Liran Alon June 16, 2019, 12:38 p.m. UTC | #2
> On 15 Jun 2019, at 3:57, Liran Alon <liran.alon@oracle.com> wrote:
> 
>> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> 
>> From: Liran Alon <liran.alon@oracle.com>
>> 
>> +static bool is_vmx_enabled(CPUX86State *env)
>> +{
>> +    return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK));
>> +}
>> +
>> +static bool is_svm_enabled(CPUX86State *env)
>> +{
>> +    return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME));
>> +}
>> +
>> +static bool is_nested_virt_enabled(CPUX86State *env)
>> +{
>> +    return (is_vmx_enabled(env) || is_svm_enabled(env));
>> +}
> 
> I have later realised that this nested_virt_enabled() function is not enough to determine whether nested_state needs to be sent.
> This is because the vCPU may be running L2 but have momentarily entered SMM due to an SMI.
> In that case, CR4 & MSR_EFER are saved in SMRAM and are set to 0 on entry to SMM.
> This means that when (env->hflags & HF_SMM_MASK) is set, we theoretically should read the saved CR4 & MSR_EFER from SMRAM.
> However, because we cannot reference guest memory at this point (not valid in case we migrate the guest with post-copy), I should change the
> code to simply assume that nested-state is needed whenever (env->hflags & HF_SMM_MASK) is set.
> This should break backwards-compatible migration only in very rare cases and therefore I think it should be sufficient.
> Any objections to this idea?
> 

Actually, this is even worse than I originally thought.
Even when the guest is not currently in SMM mode, if it’s in VMX non-root mode, the CR4 read here is L2’s CR4, not L1’s CR4.
Therefore, CR4.VMXE doesn’t necessarily indicate whether the guest has nested-virtualization enabled. The same is true for MSR_EFER in the case of SVM.

Putting this all together: if the kernel doesn’t support extracting nested-state, there is no decent way to know whether the guest is running nested-virtualization.
This means that in theory we always need to fail migration when the kernel doesn’t support KVM_CAP_NESTED_STATE or KVM_CAP_EXCEPTION_PAYLOAD
and the vCPU is exposed with the VMX/SVM capability.
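
A rough sketch of that failure condition (kvm_has_nested_state, cpu_has_vmx and cpu_has_svm are hypothetical helper names, not code from any posted series):

    /* Hypothetical pre-migration check; helper names are illustrative only. */
    if (!kvm_has_nested_state && (cpu_has_vmx(env) || cpu_has_svm(env))) {
        error_setg(errp, "Kernel lacks KVM_CAP_NESTED_STATE and the vCPU "
                   "exposes VMX/SVM; nested guest state cannot be migrated");
        return -EINVAL;
    }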

I can condition this behaviour on a flag that can be manipulated using QMP, to allow the user to indicate that they wish to migrate the guest anyway in this case.
This however brings me back to the entire discussion I had with Dr. David Alan Gilbert about migration backwards compatibility in general, and to my belief that
we should have a generic QMP command which allows providing a list of VMState subsections that can be ignored in migration…
See: https://www.mail-archive.com/qemu-devel@nongnu.org/msg622274.html

Paolo, What are your thoughts on how I would proceed with this?

-Liran
Liran Alon June 17, 2019, 11:34 a.m. UTC | #3
> On 16 Jun 2019, at 15:38, Liran Alon <liran.alon@oracle.com> wrote:
> 
> 
> 
>> On 15 Jun 2019, at 3:57, Liran Alon <liran.alon@oracle.com> wrote:
>> 
>>> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> 
>>> From: Liran Alon <liran.alon@oracle.com>
>>> 
>>> +static bool is_vmx_enabled(CPUX86State *env)
>>> +{
>>> +    return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK));
>>> +}
>>> +
>>> +static bool is_svm_enabled(CPUX86State *env)
>>> +{
>>> +    return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME));
>>> +}
>>> +
>>> +static bool is_nested_virt_enabled(CPUX86State *env)
>>> +{
>>> +    return (is_vmx_enabled(env) || is_svm_enabled(env));
>>> +}
>> 
>> I have later realised that this nested_virt_enabled() function is not enough to determine whether nested_state needs to be sent.
>> This is because the vCPU may be running L2 but have momentarily entered SMM due to an SMI.
>> In that case, CR4 & MSR_EFER are saved in SMRAM and are set to 0 on entry to SMM.
>> This means that when (env->hflags & HF_SMM_MASK) is set, we theoretically should read the saved CR4 & MSR_EFER from SMRAM.
>> However, because we cannot reference guest memory at this point (not valid in case we migrate the guest with post-copy), I should change the
>> code to simply assume that nested-state is needed whenever (env->hflags & HF_SMM_MASK) is set.
>> This should break backwards-compatible migration only in very rare cases and therefore I think it should be sufficient.
>> Any objections to this idea?
>> 
> 
> Actually, this is even worse than I originally thought.
> Even when the guest is not currently in SMM mode, if it’s in VMX non-root mode, the CR4 read here is L2’s CR4, not L1’s CR4.
> Therefore, CR4.VMXE doesn’t necessarily indicate whether the guest has nested-virtualization enabled. The same is true for MSR_EFER in the case of SVM.
> 
> Putting this all together: if the kernel doesn’t support extracting nested-state, there is no decent way to know whether the guest is running nested-virtualization.
> This means that in theory we always need to fail migration when the kernel doesn’t support KVM_CAP_NESTED_STATE or KVM_CAP_EXCEPTION_PAYLOAD
> and the vCPU is exposed with the VMX/SVM capability.
> 
> I can condition this behaviour on a flag that can be manipulated using QMP, to allow the user to indicate that they wish to migrate the guest anyway in this case.
> This however brings me back to the entire discussion I had with Dr. David Alan Gilbert about migration backwards compatibility in general, and to my belief that
> we should have a generic QMP command which allows providing a list of VMState subsections that can be ignored in migration…
> See: https://www.mail-archive.com/qemu-devel@nongnu.org/msg622274.html
> 
> Paolo, What are your thoughts on how I would proceed with this?
> 
> -Liran
> 

Paolo, can you also elaborate on what your original intent was in this QEMU commit you made:
f8dc4c645ec2 ("target/i386: rename HF_SVMI_MASK to HF_GUEST_MASK")

How did you expect this flag to be used in nVMX migration?
Currently, the HF_GUEST_MASK flag that KVM sets in vcpu->arch.hflags is never propagated to userspace.
Did you expect to set HF_GUEST_MASK in QEMU’s hflags based on the nested_state returned by the KVM_GET_NESTED_STATE ioctl?
What is the value of this hflag in QEMU, given that we have all the information we need in env->nested_state?
(E.g. we can deduce that the vCPU is in VMX non-root mode by testing for (env->nested_state.flags & KVM_STATE_NESTED_GUEST_MODE).)
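
For example, a sketch of what that propagation could look like (illustrative only, after KVM_GET_NESTED_STATE has filled env->nested_state):

    if (env->nested_state.flags & KVM_STATE_NESTED_GUEST_MODE) {
        env->hflags |= HF_GUEST_MASK;    /* vCPU is in VMX non-root mode */
    } else {
        env->hflags &= ~HF_GUEST_MASK;
    }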

-Liran
Paolo Bonzini June 17, 2019, 5:27 p.m. UTC | #4
On 17/06/19 13:34, Liran Alon wrote:
> Putting this all together: if the kernel doesn’t support extracting
> nested-state, there is no decent way to know whether the guest is running
> nested-virtualization. This means that in theory we always need to
> fail migration when the kernel doesn’t support KVM_CAP_NESTED_STATE or
> KVM_CAP_EXCEPTION_PAYLOAD and the vCPU is exposed with the VMX/SVM
> capability.

For VMX this would be okay because we had a blocker before this series,
and this wouldn't be any worse.

For SVM we can ignore the case and fix it when we have
KVM_CAP_NESTED_STATE, as again that wouldn't be any worse.

Paolo

> I can condition this behaviour on a flag that can be manipulated
> using QMP, to allow the user to indicate that they wish to migrate the guest
> anyway in this case. This however brings me back to the entire discussion I
> had with Dr. David Alan Gilbert about migration backwards compatibility
> in general, and to my belief that we should have a generic QMP
> command which allows providing a list of VMState subsections that can
> be ignored in migration… See:
> https://www.mail-archive.com/qemu-devel@nongnu.org/msg622274.html
> 
> Paolo, What are your thoughts on how I would proceed with this?
Maran Wilson June 18, 2019, 10:38 p.m. UTC | #5
On 6/17/2019 10:27 AM, Paolo Bonzini wrote:
> On 17/06/19 13:34, Liran Alon wrote:
>> Putting this all together: if the kernel doesn’t support extracting
>> nested-state, there is no decent way to know whether the guest is running
>> nested-virtualization. This means that in theory we always need to
>> fail migration when the kernel doesn’t support KVM_CAP_NESTED_STATE or
>> KVM_CAP_EXCEPTION_PAYLOAD and the vCPU is exposed with the VMX/SVM
>> capability.
> For VMX this would be okay because we had a blocker before this series,
> and this wouldn't be any worse.

I agree it shouldn't be a gating issue for this patch series, but I'd 
hate to see this discussion thread die off.

I'm still pretty interested in hearing whether anyone has any good ideas 
for how to conclusively determine whether a given L1 VM has created a 
nested L2 or not when the host is running an older kernel that doesn't 
support KVM_CAP_NESTED_STATE. That would be a very useful capability, 
especially for CSP use cases. If anyone has any suggestions about where 
to look, I don't mind spending some time digging into it and possibly 
testing out a few ideas. Again, separate from this particular patch 
series. So far I've been drawing a blank since Liran pointed out the 
corner-case problems associated with env->cr[4] & CR4_VMXE_MASK.

Thanks,
-Maran

> For SVM we can ignore the case and fix it when we have
> KVM_CAP_NESTED_STATE, as again that wouldn't be any worse.
>
> Paolo
>
>> I can condition this behaviour on a flag that can be manipulated
>> using QMP, to allow the user to indicate that they wish to migrate the guest
>> anyway in this case. This however brings me back to the entire discussion I
>> had with Dr. David Alan Gilbert about migration backwards compatibility
>> in general, and to my belief that we should have a generic QMP
>> command which allows providing a list of VMState subsections that can
>> be ignored in migration… See:
>> https://www.mail-archive.com/qemu-devel@nongnu.org/msg622274.html
>>
>> Paolo, What are your thoughts on how I would proceed with this?

Patch

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index c1ab86d63e..4e19969111 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4777,7 +4777,9 @@  static void x86_cpu_reset(CPUState *s)
     memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
 
     env->interrupt_injected = -1;
-    env->exception_injected = -1;
+    env->exception_nr = -1;
+    env->exception_pending = 0;
+    env->exception_injected = 0;
     env->nmi_injected = false;
 #if !defined(CONFIG_USER_ONLY)
     /* We hard-wire the BSP to the first CPU. */
@@ -5173,12 +5175,6 @@  static int x86_cpu_filter_features(X86CPU *cpu)
     return rv;
 }
 
-#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
-                           (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
-                           (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
-#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
-                         (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
-                         (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
 static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
 {
     CPUState *cs = CPU(dev);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index bd06523a53..bbeb7a9521 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -729,6 +729,13 @@  typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 
 #define CPUID_VENDOR_HYGON    "HygonGenuine"
 
+#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
+                           (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
+                           (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
+#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
+                         (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
+                         (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
+
 #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
 #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
 
@@ -1332,10 +1339,14 @@  typedef struct CPUX86State {
 
     /* For KVM */
     uint32_t mp_state;
-    int32_t exception_injected;
+    int32_t exception_nr;
     int32_t interrupt_injected;
     uint8_t soft_interrupt;
+    uint8_t exception_pending;
+    uint8_t exception_injected;
     uint8_t has_error_code;
+    uint8_t exception_has_payload;
+    uint64_t exception_payload;
     uint32_t ins_len;
     uint32_t sipi_vector;
     bool tsc_valid;
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index 2751c8125c..dc4bb63536 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -605,7 +605,9 @@  static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
 
-    env->exception_injected = -1;
+    env->exception_nr = -1;
+    env->exception_pending = 0;
+    env->exception_injected = 0;
     env->interrupt_injected = -1;
     env->nmi_injected = false;
     if (idtvec_info & VMCS_IDT_VEC_VALID) {
@@ -619,7 +621,8 @@  static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in
             break;
         case VMCS_IDT_VEC_HWEXCEPTION:
         case VMCS_IDT_VEC_SWEXCEPTION:
-            env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
+            env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
+            env->exception_injected = 1;
             break;
         case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
         default:
@@ -912,7 +915,8 @@  int hvf_vcpu_exec(CPUState *cpu)
             macvm_set_rip(cpu, rip + ins_len);
             break;
         case VMX_REASON_VMCALL:
-            env->exception_injected = EXCP0D_GPF;
+            env->exception_nr = EXCP0D_GPF;
+            env->exception_injected = 1;
             env->has_error_code = true;
             env->error_code = 0;
             break;
diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
index df8e946fbc..e0ea02d631 100644
--- a/target/i386/hvf/x86hvf.c
+++ b/target/i386/hvf/x86hvf.c
@@ -362,8 +362,8 @@  bool hvf_inject_interrupts(CPUState *cpu_state)
     if (env->interrupt_injected != -1) {
         vector = env->interrupt_injected;
         intr_type = VMCS_INTR_T_SWINTR;
-    } else if (env->exception_injected != -1) {
-        vector = env->exception_injected;
+    } else if (env->exception_nr != -1) {
+        vector = env->exception_nr;
         if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
             intr_type = VMCS_INTR_T_SWEXCEPTION;
         } else {
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 53f95b02a0..dca76830ec 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -104,6 +104,7 @@  static uint32_t num_architectural_pmu_fixed_counters;
 static int has_xsave;
 static int has_xcrs;
 static int has_pit_state2;
+static int has_exception_payload;
 
 static bool has_msr_mcg_ext_ctl;
 
@@ -584,15 +585,51 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
     /* Hope we are lucky for AO MCE */
 }
 
+static void kvm_reset_exception(CPUX86State *env)
+{
+    env->exception_nr = -1;
+    env->exception_pending = 0;
+    env->exception_injected = 0;
+}
+
+static void kvm_queue_exception(CPUX86State *env,
+                                int32_t exception_nr,
+                                uint8_t exception_has_payload,
+                                uint64_t exception_payload)
+{
+    assert(env->exception_nr == -1);
+    assert(!env->exception_pending);
+    assert(!env->exception_injected);
+
+    env->exception_nr = exception_nr;
+
+    if (has_exception_payload) {
+        env->exception_pending = 1;
+
+        env->exception_has_payload = exception_has_payload;
+        env->exception_payload = exception_payload;
+    } else {
+        env->exception_injected = 1;
+
+        if (exception_has_payload) {
+            if (exception_nr == EXCP01_DB) {
+                env->dr[6] = exception_payload;
+            } else if (exception_nr == EXCP0E_PAGE) {
+                env->cr[2] = exception_payload;
+            }
+        }
+    }
+}
+
 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
 {
     CPUX86State *env = &cpu->env;
 
-    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
+    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
         unsigned int bank, bank_num = env->mcg_cap & 0xff;
         struct kvm_x86_mce mce;
 
-        env->exception_injected = -1;
+        kvm_reset_exception(env);
 
         /*
          * There must be at least one bank in use if an MCE is pending.
@@ -1573,6 +1610,16 @@  int kvm_arch_init(MachineState *ms, KVMState *s)
 
     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
 
+    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
+    if (has_exception_payload) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
+        if (ret < 0) {
+            error_report("kvm: Failed to enable exception payload cap: %s",
+                         strerror(-ret));
+            return ret;
+        }
+    }
+
     ret = kvm_get_supported_msrs(s);
     if (ret < 0) {
         return ret;
@@ -2877,8 +2924,16 @@  static int kvm_put_vcpu_events(X86CPU *cpu, int level)
         return 0;
     }
 
-    events.exception.injected = (env->exception_injected >= 0);
-    events.exception.nr = env->exception_injected;
+    events.flags = 0;
+
+    if (has_exception_payload) {
+        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+        events.exception.pending = env->exception_pending;
+        events.exception_has_payload = env->exception_has_payload;
+        events.exception_payload = env->exception_payload;
+    }
+    events.exception.nr = env->exception_nr;
+    events.exception.injected = env->exception_injected;
     events.exception.has_error_code = env->has_error_code;
     events.exception.error_code = env->error_code;
 
@@ -2891,7 +2946,6 @@  static int kvm_put_vcpu_events(X86CPU *cpu, int level)
     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
 
     events.sipi_vector = env->sipi_vector;
-    events.flags = 0;
 
     if (has_msr_smbase) {
         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
@@ -2941,8 +2995,19 @@  static int kvm_get_vcpu_events(X86CPU *cpu)
     if (ret < 0) {
        return ret;
     }
-    env->exception_injected =
-       events.exception.injected ? events.exception.nr : -1;
+
+    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+        env->exception_pending = events.exception.pending;
+        env->exception_has_payload = events.exception_has_payload;
+        env->exception_payload = events.exception_payload;
+    } else {
+        env->exception_pending = 0;
+        env->exception_has_payload = false;
+    }
+    env->exception_injected = events.exception.injected;
+    env->exception_nr =
+        (env->exception_pending || env->exception_injected) ?
+        events.exception.nr : -1;
     env->has_error_code = events.exception.has_error_code;
     env->error_code = events.exception.error_code;
 
@@ -2994,12 +3059,12 @@  static int kvm_guest_debug_workarounds(X86CPU *cpu)
     unsigned long reinject_trap = 0;
 
     if (!kvm_has_vcpu_events()) {
-        if (env->exception_injected == EXCP01_DB) {
+        if (env->exception_nr == EXCP01_DB) {
             reinject_trap = KVM_GUESTDBG_INJECT_DB;
         } else if (env->exception_injected == EXCP03_INT3) {
             reinject_trap = KVM_GUESTDBG_INJECT_BP;
         }
-        env->exception_injected = -1;
+        kvm_reset_exception(env);
     }
 
     /*
@@ -3320,13 +3385,13 @@  int kvm_arch_process_async_events(CPUState *cs)
 
         kvm_cpu_synchronize_state(cs);
 
-        if (env->exception_injected == EXCP08_DBLE) {
+        if (env->exception_nr == EXCP08_DBLE) {
             /* this means triple fault */
             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
             cs->exit_request = 1;
             return 0;
         }
-        env->exception_injected = EXCP12_MCHK;
+        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
         env->has_error_code = 0;
 
         cs->halted = 0;
@@ -3541,14 +3606,12 @@  static int kvm_handle_debug(X86CPU *cpu,
     }
     if (ret == 0) {
         cpu_synchronize_state(cs);
-        assert(env->exception_injected == -1);
+        assert(env->exception_nr == -1);
 
         /* pass to guest */
-        env->exception_injected = arch_info->exception;
+        kvm_queue_exception(env, arch_info->exception,
+                            EXCP01_DB, arch_info->dr6);
         env->has_error_code = 0;
-        if (arch_info->exception == EXCP01_DB) {
-            env->dr[6] = arch_info->dr6;
-        }
     }
 
     return ret;
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 225b5d433b..41460be54b 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -199,6 +199,21 @@  static const VMStateDescription vmstate_fpreg = {
     }
 };
 
+static bool is_vmx_enabled(CPUX86State *env)
+{
+    return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK));
+}
+
+static bool is_svm_enabled(CPUX86State *env)
+{
+    return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME));
+}
+
+static bool is_nested_virt_enabled(CPUX86State *env)
+{
+    return (is_vmx_enabled(env) || is_svm_enabled(env));
+}
+
 static int cpu_pre_save(void *opaque)
 {
     X86CPU *cpu = opaque;
@@ -278,6 +293,23 @@  static int cpu_post_load(void *opaque, int version_id)
     env->hflags &= ~HF_CPL_MASK;
     env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
 
+    /*
+     * There are cases where we can get a valid exception_nr with both
+     * exception_pending and exception_injected cleared. This can happen in
+     * one of the following scenarios:
+     * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support.
+     * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support.
+     * 3) "cpu/exception_info" subsection not sent because there is no exception
+     *    pending or guest wasn't running L2.
+     *
+     * In those cases, we can just deduce that a valid exception_nr means
+     * we can treat the exception as already injected.
+     */
+    if ((env->exception_nr != -1) &&
+        !env->exception_pending && !env->exception_injected) {
+        env->exception_injected = 1;
+    }
+
     env->fpstt = (env->fpus_vmstate >> 11) & 7;
     env->fpus = env->fpus_vmstate & ~0x3800;
     env->fptag_vmstate ^= 0xff;
@@ -323,6 +355,32 @@  static bool steal_time_msr_needed(void *opaque)
     return cpu->env.steal_time_msr != 0;
 }
 
+static bool exception_info_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+
+    /*
+     * Differentiating between pending and injected exceptions
+     * is only important when running L2.
+     */
+    return (cpu->env.exception_pending &&
+            is_nested_virt_enabled(&cpu->env));
+}
+
+static const VMStateDescription vmstate_exception_info = {
+    .name = "cpu/exception_info",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = exception_info_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(env.exception_pending, X86CPU),
+        VMSTATE_UINT8(env.exception_injected, X86CPU),
+        VMSTATE_UINT8(env.exception_has_payload, X86CPU),
+        VMSTATE_UINT64(env.exception_payload, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_steal_time_msr = {
     .name = "cpu/steal_time_msr",
     .version_id = 1,
@@ -1035,7 +1093,7 @@  VMStateDescription vmstate_x86_cpu = {
         VMSTATE_INT32(env.interrupt_injected, X86CPU),
         VMSTATE_UINT32(env.mp_state, X86CPU),
         VMSTATE_UINT64(env.tsc, X86CPU),
-        VMSTATE_INT32(env.exception_injected, X86CPU),
+        VMSTATE_INT32(env.exception_nr, X86CPU),
         VMSTATE_UINT8(env.soft_interrupt, X86CPU),
         VMSTATE_UINT8(env.nmi_injected, X86CPU),
         VMSTATE_UINT8(env.nmi_pending, X86CPU),
@@ -1059,6 +1117,7 @@  VMStateDescription vmstate_x86_cpu = {
         /* The above list is not sorted /wrt version numbers, watch out! */
     },
     .subsections = (const VMStateDescription*[]) {
+        &vmstate_exception_info,
         &vmstate_async_pf_msr,
         &vmstate_pv_eoi_msr,
         &vmstate_steal_time_msr,