[3/7] KVM: i386: Add support for KVM_CAP_EXCEPTION_PAYLOAD

Message ID	20190615004256.16367-4-pbonzini@redhat.com
State	New
Headers	show Return-Path: <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org> From: Paolo Bonzini <pbonzini@redhat.com> To: qemu-devel@nongnu.org Date: Sat, 15 Jun 2019 02:42:52 +0200 Message-Id: <20190615004256.16367-4-pbonzini@redhat.com> In-Reply-To: <20190615004256.16367-1-pbonzini@redhat.com> References: <20190615004256.16367-1-pbonzini@redhat.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Qemu-devel] [PATCH 3/7] KVM: i386: Add support for KVM_CAP_EXCEPTION_PAYLOAD Precedence: list Cc: liran.alon@oracle.com, nikita.leshchenko@oracle.com Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>
Series	target-i386/kvm: live migration support for nested VMX \| expand [preliminary,0/7] target-i386/kvm: live migration support for nested VMX [1/7] KVM: i386: Use symbolic constant for #DB/#BP exception constants [2/7] KVM: i386: Re-inject #DB to guest with updated DR6 [3/7] KVM: i386: Add support for KVM_CAP_EXCEPTION_PAYLOAD [4/7] linux-headers: import improved definition of KVM_GET/SET_NESTED_STATE structs [6/7] KVM: i386: Add support for save and restore nested state [7/7] Revert "target/i386: kvm: add VMX migration blocker"

> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonzini@redhat.com> wrote: > > From: Liran Alon <liran.alon@oracle.com> > > Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD") > introduced a new KVM capability which allows userspace to correctly > distinguish between pending and injected exceptions. > > This distinguish is important in case of nested virtualization scenarios > because a L2 pending exception can still be intercepted by the L1 hypervisor > while a L2 injected exception cannot. > > Furthermore, when an exception is attempted to be injected by QEMU, > QEMU should specify the exception payload (CR2 in case of #PF or > DR6 in case of #DB) instead of having the payload already delivered in > the respective vCPU register. Because in case exception is injected to > L2 guest and is intercepted by L1 hypervisor, then payload needs to be > reported to L1 intercept (VMExit handler) while still preserving > respective vCPU register unchanged. > > This commit adds support for QEMU to properly utilise this new KVM > capability (KVM_CAP_EXCEPTION_PAYLOAD). > > Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com> > Signed-off-by: Liran Alon <liran.alon@oracle.com> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > target/i386/cpu.c | 10 ++--- > target/i386/cpu.h | 13 +++++- > target/i386/hvf/hvf.c | 10 +++-- > target/i386/hvf/x86hvf.c | 4 +- > target/i386/kvm.c | 95 +++++++++++++++++++++++++++++++++------- > target/i386/machine.c | 61 +++++++++++++++++++++++++- > 6 files changed, 163 insertions(+), 30 deletions(-) > > diff --git a/target/i386/cpu.c b/target/i386/cpu.c > index c1ab86d63e..4e19969111 100644 > --- a/target/i386/cpu.c > +++ b/target/i386/cpu.c > @@ -4777,7 +4777,9 @@ static void x86_cpu_reset(CPUState *s) > memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed)); > > env->interrupt_injected = -1; > - env->exception_injected = -1; > + env->exception_nr = -1; > + env->exception_pending = 0; > + env->exception_injected = 0; Note: I the patch-series I will submit I will add here: + env->exception_has_payload = false; + env->exception_payload = 0; > env->nmi_injected = false; > #if !defined(CONFIG_USER_ONLY) > /* We hard-wire the BSP to the first CPU. */ > @@ -5173,12 +5175,6 @@ static int x86_cpu_filter_features(X86CPU *cpu) > return rv; > } > > -#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \ > - (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \ > - (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3) > -#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \ > - (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \ > - (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3) > static void x86_cpu_realizefn(DeviceState *dev, Error **errp) > { > CPUState *cs = CPU(dev); > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index bd06523a53..bbeb7a9521 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -729,6 +729,13 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; > > #define CPUID_VENDOR_HYGON "HygonGenuine" > > +#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \ > + (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \ > + (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3) > +#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \ > + (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \ > + (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3) > + > #define CPUID_MWAIT_IBE (1U << 1) /* Interrupts can exit capability */ > #define CPUID_MWAIT_EMX (1U << 0) /* enumeration supported */ > > @@ -1332,10 +1339,14 @@ typedef struct CPUX86State { > > /* For KVM */ > uint32_t mp_state; > - int32_t exception_injected; > + int32_t exception_nr; > int32_t interrupt_injected; > uint8_t soft_interrupt; > + uint8_t exception_pending; > + uint8_t exception_injected; > uint8_t has_error_code; > + uint8_t exception_has_payload; > + uint64_t exception_payload; > uint32_t ins_len; > uint32_t sipi_vector; > bool tsc_valid; > diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c > index 2751c8125c..dc4bb63536 100644 > --- a/target/i386/hvf/hvf.c > +++ b/target/i386/hvf/hvf.c > @@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in > X86CPU *x86_cpu = X86_CPU(cpu); > CPUX86State *env = &x86_cpu->env; > > - env->exception_injected = -1; > + env->exception_nr = -1; > + env->exception_pending = 0; > + env->exception_injected = 0; > env->interrupt_injected = -1; > env->nmi_injected = false; > if (idtvec_info & VMCS_IDT_VEC_VALID) { > @@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in > break; > case VMCS_IDT_VEC_HWEXCEPTION: > case VMCS_IDT_VEC_SWEXCEPTION: > - env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM; > + env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM; > + env->exception_injected = 1; > break; > case VMCS_IDT_VEC_PRIV_SWEXCEPTION: > default: > @@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu) > macvm_set_rip(cpu, rip + ins_len); > break; > case VMX_REASON_VMCALL: > - env->exception_injected = EXCP0D_GPF; > + env->exception_nr = EXCP0D_GPF; > + env->exception_injected = 1; > env->has_error_code = true; > env->error_code = 0; > break; > diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c > index df8e946fbc..e0ea02d631 100644 > --- a/target/i386/hvf/x86hvf.c > +++ b/target/i386/hvf/x86hvf.c > @@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state) > if (env->interrupt_injected != -1) { > vector = env->interrupt_injected; > intr_type = VMCS_INTR_T_SWINTR; > - } else if (env->exception_injected != -1) { > - vector = env->exception_injected; > + } else if (env->exception_nr != -1) { > + vector = env->exception_nr; > if (vector == EXCP03_INT3 || vector == EXCP04_INTO) { > intr_type = VMCS_INTR_T_SWEXCEPTION; > } else { > diff --git a/target/i386/kvm.c b/target/i386/kvm.c > index 53f95b02a0..dca76830ec 100644 > --- a/target/i386/kvm.c > +++ b/target/i386/kvm.c > @@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters; > static int has_xsave; > static int has_xcrs; > static int has_pit_state2; > +static int has_exception_payload; > > static bool has_msr_mcg_ext_ctl; > > @@ -584,15 +585,51 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) > /* Hope we are lucky for AO MCE */ > } > > +static void kvm_reset_exception(CPUX86State *env) > +{ > + env->exception_nr = -1; > + env->exception_pending = 0; > + env->exception_injected = 0; Note: In the patch-series I will submit I will add here: + env->exception_has_payload = false; + env->exception_payload = 0; > +} > + > +static void kvm_queue_exception(CPUX86State *env, > + int32_t exception_nr, > + uint8_t exception_has_payload, > + uint64_t exception_payload) > +{ > + assert(env->exception_nr == -1); > + assert(!env->exception_pending); > + assert(!env->exception_injected); Note: In the patch-series I will submit I will add here: + assert(!env->exception_has_payload); > + > + env->exception_nr = exception_nr; > + > + if (has_exception_payload) { > + env->exception_pending = 1; > + > + env->exception_has_payload = exception_has_payload; > + env->exception_payload = exception_payload; > + } else { > + env->exception_injected = 1; > + > + if (exception_has_payload) { > + if (exception_nr == EXCP01_DB) { > + env->dr[6] = exception_payload; > + } else if (exception_nr == EXCP0E_PAGE) { > + env->cr[2] = exception_payload; > + } > + } Note: In the patch-series I will submit, I have changed this to verify that #DB & #PF always have env->exception_has_payload set to true and other cases have it set to false. > + } > +} > + > static int kvm_inject_mce_oldstyle(X86CPU *cpu) > { > CPUX86State *env = &cpu->env; > > - if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) { > + if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) { > unsigned int bank, bank_num = env->mcg_cap & 0xff; > struct kvm_x86_mce mce; > > - env->exception_injected = -1; > + kvm_reset_exception(env); > > /* > * There must be at least one bank in use if an MCE is pending. > @@ -1573,6 +1610,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s) > > hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); > > + has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD); > + if (has_exception_payload) { > + ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true); > + if (ret < 0) { > + error_report("kvm: Failed to enable exception payload cap: %s", > + strerror(-ret)); > + return ret; > + } > + } > + > ret = kvm_get_supported_msrs(s); > if (ret < 0) { > return ret; > @@ -2877,8 +2924,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) > return 0; > } > > - events.exception.injected = (env->exception_injected >= 0); > - events.exception.nr = env->exception_injected; > + events.flags = 0; > + > + if (has_exception_payload) { > + events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD; > + events.exception.pending = env->exception_pending; > + events.exception_has_payload = env->exception_has_payload; > + events.exception_payload = env->exception_payload; > + } > + events.exception.nr = env->exception_nr; > + events.exception.injected = env->exception_injected; > events.exception.has_error_code = env->has_error_code; > events.exception.error_code = env->error_code; > > @@ -2891,7 +2946,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) > events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK); > > events.sipi_vector = env->sipi_vector; > - events.flags = 0; > > if (has_msr_smbase) { > events.smi.smm = !!(env->hflags & HF_SMM_MASK); > @@ -2941,8 +2995,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu) > if (ret < 0) { > return ret; > } > - env->exception_injected = > - events.exception.injected ? events.exception.nr : -1; > + > + if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) { > + env->exception_pending = events.exception.pending; > + env->exception_has_payload = events.exception_has_payload; > + env->exception_payload = events.exception_payload; > + } else { > + env->exception_pending = 0; > + env->exception_has_payload = false; > + } > + env->exception_injected = events.exception.injected; > + env->exception_nr = > + (env->exception_pending || env->exception_injected) ? > + events.exception.nr : -1; > env->has_error_code = events.exception.has_error_code; > env->error_code = events.exception.error_code; > > @@ -2994,12 +3059,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu) > unsigned long reinject_trap = 0; > > if (!kvm_has_vcpu_events()) { > - if (env->exception_injected == EXCP01_DB) { > + if (env->exception_nr == EXCP01_DB) { > reinject_trap = KVM_GUESTDBG_INJECT_DB; > } else if (env->exception_injected == EXCP03_INT3) { > reinject_trap = KVM_GUESTDBG_INJECT_BP; > } > - env->exception_injected = -1; > + kvm_reset_exception(env); > } > > /* > @@ -3320,13 +3385,13 @@ int kvm_arch_process_async_events(CPUState *cs) > > kvm_cpu_synchronize_state(cs); > > - if (env->exception_injected == EXCP08_DBLE) { > + if (env->exception_nr == EXCP08_DBLE) { > /* this means triple fault */ > qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); > cs->exit_request = 1; > return 0; > } > - env->exception_injected = EXCP12_MCHK; > + kvm_queue_exception(env, EXCP12_MCHK, 0, 0); > env->has_error_code = 0; > > cs->halted = 0; > @@ -3541,14 +3606,12 @@ static int kvm_handle_debug(X86CPU *cpu, > } > if (ret == 0) { > cpu_synchronize_state(cs); > - assert(env->exception_injected == -1); > + assert(env->exception_nr == -1); > > /* pass to guest */ > - env->exception_injected = arch_info->exception; > + kvm_queue_exception(env, arch_info->exception, > + EXCP01_DB, arch_info->dr6); There is a bug here I will fix in my patch-series: Third argument should be (arch_info->exception == EXCP01_DB). > env->has_error_code = 0; > - if (arch_info->exception == EXCP01_DB) { > - env->dr[6] = arch_info->dr6; > - } > } > > return ret; > diff --git a/target/i386/machine.c b/target/i386/machine.c > index 225b5d433b..41460be54b 100644 > --- a/target/i386/machine.c > +++ b/target/i386/machine.c > @@ -199,6 +199,21 @@ static const VMStateDescription vmstate_fpreg = { > } > }; > > +static bool is_vmx_enabled(CPUX86State *env) > +{ > + return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK)); > +} > + > +static bool is_svm_enabled(CPUX86State *env) > +{ > + return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME)); > +} > + > +static bool is_nested_virt_enabled(CPUX86State *env) > +{ > + return (is_vmx_enabled(env) || is_svm_enabled(env)); > +} I have later realised that this nested_virt_enabled() function is not enough to determine if nested_state is required to be sent. This is because it may be that vCPU is running L2 but have momentarily entered SMM due to SMI. In this case, CR4 & MSR_EFER are saved in SMRAM and are set to 0 on entering to SMM. This means that in case (env->hflags & HF_SMM_MASK), we theoretically should have read saved CR4 & MSR_EFER from SMRAM. However, because we cannot reference guest memory at this point (Not valid in case we migrate guest in post-copy), I should change code to assume that in case (env->hflags & HF_SMM_MASK), we need to assume that nested-state is needed. This should break backwards-compatability migration only in very rare cases and therefore I think it should be sufficient. Any objections to this idea? > + > static int cpu_pre_save(void *opaque) > { In my next patch-series I have added logic to cpu_pre_save that converts a pending exception to an injected exception in case there is an exception pending and nested-virtualization is not enabled. During this conversion, I also make sure to apply the exception payload to the respective vCPU register (i.e. CR2 in case of #PF or DR6 in case of #DB). I have realised this is required because otherwise exception-info VMState subsection will be deemed not needed and exception-payload will be lost. Something like this: + /* + * When vCPU is running L2 and exception is still pending, + * it can potentially be intercepted by L1 hypervisor. + * In contrast to an injected exception which cannot be + * intercepted anymore. + * + * Furthermore, when a L2 exception is intercepted by L1 + * hypervisor, it's exception payload (CR2/DR6 on #PF/#DB) + * should not be set yet in the respective vCPU register. + * Thus, in case an exception is pending, it is + * important to save the exception payload seperately. + * + * Therefore, if an exception is not in a pending state + * or guest is not running a nested hypervisor, it is + * not important to distinguish between a pending and + * injected exception and we don't need to store seperately + * the exception payload. + * + * In order to preserve better backwards-compatabile migration, + * convert a pending exception to an injected exception in + * case it is not important to distingiush between them + * as described above. + */ + if (!is_nested_virt_enabled(env) && env->exception_pending) { + env->exception_pending = 0; + env->exception_injected = 1; + + if (env->exception_nr == EXCP01_DB) { + assert(env->exception_has_payload); + env->dr[6] = env->exception_payload; + } else if (env->exception_nr == EXCP0E_PAGE) { + assert(env->exception_has_payload); + env->cr[2] = env->exception_payload; + } else { + assert(!env->exception_has_payload); + } + } + > X86CPU *cpu = opaque; > @@ -278,6 +293,23 @@ static int cpu_post_load(void *opaque, int version_id) > env->hflags &= ~HF_CPL_MASK; > env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK; > > + /* > + * There are cases that we can get valid exception_nr with both > + * exception_pending and exception_clear being cleared. This can happen in > + * one of the following scenarios: > + * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support. > + * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support. > + * 3) "cpu/exception_info" subsection not sent because there is no exception > + * pending or guest wasn't running L2. > + * > + * In those cases, we can just deduce that a valid exception_nr means > + * we can treat the exception as already injected. > + */ > + if ((env->exception_nr != -1) && > + !env->exception_pending && !env->exception_injected) { > + env->exception_injected = 1; > + } > + > env->fpstt = (env->fpus_vmstate >> 11) & 7; > env->fpus = env->fpus_vmstate & ~0x3800; > env->fptag_vmstate ^= 0xff; > @@ -323,6 +355,32 @@ static bool steal_time_msr_needed(void *opaque) > return cpu->env.steal_time_msr != 0; > } > > +static bool exception_info_needed(void *opaque) > +{ > + X86CPU *cpu = opaque; > + > + /* > + * Differenting between pending and injected exceptions > + * is only important when running L2. > + */ > + return (cpu->env.exception_pending && > + is_nested_virt_enabled(&cpu->env)); > +} > + > +static const VMStateDescription vmstate_exception_info = { > + .name = "cpu/exception_info", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = exception_info_needed, > + .fields = (VMStateField[]) { > + VMSTATE_UINT8(env.exception_pending, X86CPU), > + VMSTATE_UINT8(env.exception_injected, X86CPU), > + VMSTATE_UINT8(env.exception_has_payload, X86CPU), > + VMSTATE_UINT64(env.exception_payload, X86CPU), > + VMSTATE_END_OF_LIST() > + } > +}; > + > static const VMStateDescription vmstate_steal_time_msr = { > .name = "cpu/steal_time_msr", > .version_id = 1, > @@ -1035,7 +1093,7 @@ VMStateDescription vmstate_x86_cpu = { > VMSTATE_INT32(env.interrupt_injected, X86CPU), > VMSTATE_UINT32(env.mp_state, X86CPU), > VMSTATE_UINT64(env.tsc, X86CPU), > - VMSTATE_INT32(env.exception_injected, X86CPU), > + VMSTATE_INT32(env.exception_nr, X86CPU), > VMSTATE_UINT8(env.soft_interrupt, X86CPU), > VMSTATE_UINT8(env.nmi_injected, X86CPU), > VMSTATE_UINT8(env.nmi_pending, X86CPU), > @@ -1059,6 +1117,7 @@ VMStateDescription vmstate_x86_cpu = { > /* The above list is not sorted /wrt version numbers, watch out! */ > }, > .subsections = (const VMStateDescription*[]) { > + &vmstate_exception_info, > &vmstate_async_pf_msr, > &vmstate_pv_eoi_msr, > &vmstate_steal_time_msr, > -- > 2.21.0 > >

diff --git a/target/i386/cpu.c b/target/i386/cpu.c index c1ab86d63e..4e19969111 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4777,7 +4777,9 @@ static void x86_cpu_reset(CPUState *s) memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed)); env->interrupt_injected = -1; - env->exception_injected = -1; + env->exception_nr = -1; + env->exception_pending = 0; + env->exception_injected = 0; env->nmi_injected = false; #if !defined(CONFIG_USER_ONLY) /* We hard-wire the BSP to the first CPU. */ @@ -5173,12 +5175,6 @@ static int x86_cpu_filter_features(X86CPU *cpu) return rv; } -#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \ - (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \ - (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3) -#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \ - (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \ - (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3) static void x86_cpu_realizefn(DeviceState *dev, Error **errp) { CPUState *cs = CPU(dev); diff --git a/target/i386/cpu.h b/target/i386/cpu.h index bd06523a53..bbeb7a9521 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -729,6 +729,13 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_VENDOR_HYGON "HygonGenuine" +#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \ + (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \ + (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3) +#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \ + (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \ + (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3) + #define CPUID_MWAIT_IBE (1U << 1) /* Interrupts can exit capability */ #define CPUID_MWAIT_EMX (1U << 0) /* enumeration supported */ @@ -1332,10 +1339,14 @@ typedef struct CPUX86State { /* For KVM */ uint32_t mp_state; - int32_t exception_injected; + int32_t exception_nr; int32_t interrupt_injected; uint8_t soft_interrupt; + uint8_t exception_pending; + uint8_t exception_injected; uint8_t has_error_code; + uint8_t exception_has_payload; + uint64_t exception_payload; uint32_t ins_len; uint32_t sipi_vector; bool tsc_valid; diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c index 2751c8125c..dc4bb63536 100644 --- a/target/i386/hvf/hvf.c +++ b/target/i386/hvf/hvf.c @@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in X86CPU *x86_cpu = X86_CPU(cpu); CPUX86State *env = &x86_cpu->env; - env->exception_injected = -1; + env->exception_nr = -1; + env->exception_pending = 0; + env->exception_injected = 0; env->interrupt_injected = -1; env->nmi_injected = false; if (idtvec_info & VMCS_IDT_VEC_VALID) { @@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t ins_len, uint64_t idtvec_in break; case VMCS_IDT_VEC_HWEXCEPTION: case VMCS_IDT_VEC_SWEXCEPTION: - env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM; + env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM; + env->exception_injected = 1; break; case VMCS_IDT_VEC_PRIV_SWEXCEPTION: default: @@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu) macvm_set_rip(cpu, rip + ins_len); break; case VMX_REASON_VMCALL: - env->exception_injected = EXCP0D_GPF; + env->exception_nr = EXCP0D_GPF; + env->exception_injected = 1; env->has_error_code = true; env->error_code = 0; break; diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c index df8e946fbc..e0ea02d631 100644 --- a/target/i386/hvf/x86hvf.c +++ b/target/i386/hvf/x86hvf.c @@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state) if (env->interrupt_injected != -1) { vector = env->interrupt_injected; intr_type = VMCS_INTR_T_SWINTR; - } else if (env->exception_injected != -1) { - vector = env->exception_injected; + } else if (env->exception_nr != -1) { + vector = env->exception_nr; if (vector == EXCP03_INT3 || vector == EXCP04_INTO) { intr_type = VMCS_INTR_T_SWEXCEPTION; } else { diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 53f95b02a0..dca76830ec 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters; static int has_xsave; static int has_xcrs; static int has_pit_state2; +static int has_exception_payload; static bool has_msr_mcg_ext_ctl; @@ -584,15 +585,51 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) /* Hope we are lucky for AO MCE */ } +static void kvm_reset_exception(CPUX86State *env) +{ + env->exception_nr = -1; + env->exception_pending = 0; + env->exception_injected = 0; +} + +static void kvm_queue_exception(CPUX86State *env, + int32_t exception_nr, + uint8_t exception_has_payload, + uint64_t exception_payload) +{ + assert(env->exception_nr == -1); + assert(!env->exception_pending); + assert(!env->exception_injected); + + env->exception_nr = exception_nr; + + if (has_exception_payload) { + env->exception_pending = 1; + + env->exception_has_payload = exception_has_payload; + env->exception_payload = exception_payload; + } else { + env->exception_injected = 1; + + if (exception_has_payload) { + if (exception_nr == EXCP01_DB) { + env->dr[6] = exception_payload; + } else if (exception_nr == EXCP0E_PAGE) { + env->cr[2] = exception_payload; + } + } + } +} + static int kvm_inject_mce_oldstyle(X86CPU *cpu) { CPUX86State *env = &cpu->env; - if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) { + if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) { unsigned int bank, bank_num = env->mcg_cap & 0xff; struct kvm_x86_mce mce; - env->exception_injected = -1; + kvm_reset_exception(env); /* * There must be at least one bank in use if an MCE is pending. @@ -1573,6 +1610,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s) hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); + has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD); + if (has_exception_payload) { + ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true); + if (ret < 0) { + error_report("kvm: Failed to enable exception payload cap: %s", + strerror(-ret)); + return ret; + } + } + ret = kvm_get_supported_msrs(s); if (ret < 0) { return ret; @@ -2877,8 +2924,16 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) return 0; } - events.exception.injected = (env->exception_injected >= 0); - events.exception.nr = env->exception_injected; + events.flags = 0; + + if (has_exception_payload) { + events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + events.exception.pending = env->exception_pending; + events.exception_has_payload = env->exception_has_payload; + events.exception_payload = env->exception_payload; + } + events.exception.nr = env->exception_nr; + events.exception.injected = env->exception_injected; events.exception.has_error_code = env->has_error_code; events.exception.error_code = env->error_code; @@ -2891,7 +2946,6 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK); events.sipi_vector = env->sipi_vector; - events.flags = 0; if (has_msr_smbase) { events.smi.smm = !!(env->hflags & HF_SMM_MASK); @@ -2941,8 +2995,19 @@ static int kvm_get_vcpu_events(X86CPU *cpu) if (ret < 0) { return ret; } - env->exception_injected = - events.exception.injected ? events.exception.nr : -1; + + if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) { + env->exception_pending = events.exception.pending; + env->exception_has_payload = events.exception_has_payload; + env->exception_payload = events.exception_payload; + } else { + env->exception_pending = 0; + env->exception_has_payload = false; + } + env->exception_injected = events.exception.injected; + env->exception_nr = + (env->exception_pending || env->exception_injected) ? + events.exception.nr : -1; env->has_error_code = events.exception.has_error_code; env->error_code = events.exception.error_code; @@ -2994,12 +3059,12 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu) unsigned long reinject_trap = 0; if (!kvm_has_vcpu_events()) { - if (env->exception_injected == EXCP01_DB) { + if (env->exception_nr == EXCP01_DB) { reinject_trap = KVM_GUESTDBG_INJECT_DB; } else if (env->exception_injected == EXCP03_INT3) { reinject_trap = KVM_GUESTDBG_INJECT_BP; } - env->exception_injected = -1; + kvm_reset_exception(env); } /* @@ -3320,13 +3385,13 @@ int kvm_arch_process_async_events(CPUState *cs) kvm_cpu_synchronize_state(cs); - if (env->exception_injected == EXCP08_DBLE) { + if (env->exception_nr == EXCP08_DBLE) { /* this means triple fault */ qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); cs->exit_request = 1; return 0; } - env->exception_injected = EXCP12_MCHK; + kvm_queue_exception(env, EXCP12_MCHK, 0, 0); env->has_error_code = 0; cs->halted = 0; @@ -3541,14 +3606,12 @@ static int kvm_handle_debug(X86CPU *cpu, } if (ret == 0) { cpu_synchronize_state(cs); - assert(env->exception_injected == -1); + assert(env->exception_nr == -1); /* pass to guest */ - env->exception_injected = arch_info->exception; + kvm_queue_exception(env, arch_info->exception, + EXCP01_DB, arch_info->dr6); env->has_error_code = 0; - if (arch_info->exception == EXCP01_DB) { - env->dr[6] = arch_info->dr6; - } } return ret; diff --git a/target/i386/machine.c b/target/i386/machine.c index 225b5d433b..41460be54b 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -199,6 +199,21 @@ static const VMStateDescription vmstate_fpreg = { } }; +static bool is_vmx_enabled(CPUX86State *env) +{ + return (IS_INTEL_CPU(env) && (env->cr[4] & CR4_VMXE_MASK)); +} + +static bool is_svm_enabled(CPUX86State *env) +{ + return (IS_AMD_CPU(env) && (env->efer & MSR_EFER_SVME)); +} + +static bool is_nested_virt_enabled(CPUX86State *env) +{ + return (is_vmx_enabled(env) || is_svm_enabled(env)); +} + static int cpu_pre_save(void *opaque) { X86CPU *cpu = opaque; @@ -278,6 +293,23 @@ static int cpu_post_load(void *opaque, int version_id) env->hflags &= ~HF_CPL_MASK; env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK; + /* + * There are cases that we can get valid exception_nr with both + * exception_pending and exception_clear being cleared. This can happen in + * one of the following scenarios: + * 1) Source is older QEMU without KVM_CAP_EXCEPTION_PAYLOAD support. + * 2) Source is running on kernel without KVM_CAP_EXCEPTION_PAYLOAD support. + * 3) "cpu/exception_info" subsection not sent because there is no exception + * pending or guest wasn't running L2. + * + * In those cases, we can just deduce that a valid exception_nr means + * we can treat the exception as already injected. + */ + if ((env->exception_nr != -1) && + !env->exception_pending && !env->exception_injected) { + env->exception_injected = 1; + } + env->fpstt = (env->fpus_vmstate >> 11) & 7; env->fpus = env->fpus_vmstate & ~0x3800; env->fptag_vmstate ^= 0xff; @@ -323,6 +355,32 @@ static bool steal_time_msr_needed(void *opaque) return cpu->env.steal_time_msr != 0; } +static bool exception_info_needed(void *opaque) +{ + X86CPU *cpu = opaque; + + /* + * Differenting between pending and injected exceptions + * is only important when running L2. + */ + return (cpu->env.exception_pending && + is_nested_virt_enabled(&cpu->env)); +} + +static const VMStateDescription vmstate_exception_info = { + .name = "cpu/exception_info", + .version_id = 1, + .minimum_version_id = 1, + .needed = exception_info_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT8(env.exception_pending, X86CPU), + VMSTATE_UINT8(env.exception_injected, X86CPU), + VMSTATE_UINT8(env.exception_has_payload, X86CPU), + VMSTATE_UINT64(env.exception_payload, X86CPU), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_steal_time_msr = { .name = "cpu/steal_time_msr", .version_id = 1, @@ -1035,7 +1093,7 @@ VMStateDescription vmstate_x86_cpu = { VMSTATE_INT32(env.interrupt_injected, X86CPU), VMSTATE_UINT32(env.mp_state, X86CPU), VMSTATE_UINT64(env.tsc, X86CPU), - VMSTATE_INT32(env.exception_injected, X86CPU), + VMSTATE_INT32(env.exception_nr, X86CPU), VMSTATE_UINT8(env.soft_interrupt, X86CPU), VMSTATE_UINT8(env.nmi_injected, X86CPU), VMSTATE_UINT8(env.nmi_pending, X86CPU), @@ -1059,6 +1117,7 @@ VMStateDescription vmstate_x86_cpu = { /* The above list is not sorted /wrt version numbers, watch out! */ }, .subsections = (const VMStateDescription*[]) { + &vmstate_exception_info, &vmstate_async_pf_msr, &vmstate_pv_eoi_msr, &vmstate_steal_time_msr,

[3/7] KVM: i386: Add support for KVM_CAP_EXCEPTION_PAYLOAD

Commit Message

Comments

Patch