| Message ID | 20190615004256.16367-7-pbonzini@redhat.com |
|---|---|
| State | New |
| Headers | show |
| Series | target-i386/kvm: live migration support for nested VMX (expand) |
> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonzini@redhat.com> wrote: > > From: Liran Alon <liran.alon@oracle.com> > > Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE") > introduced new IOCTLs to extract and restore KVM internal state used to > run a VM that is in VMX operation. > > Utilize these IOCTLs to add support of migration of VMs which are > running nested hypervisors. > > Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com> > Signed-off-by: Liran Alon <liran.alon@oracle.com> > [Simplified subsection needed functions and computation of > kvm_min_nested_state_len(); adjusted for upstream kernel field > names; fixed !CONFIG_KVM compilation. - Paolo] > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > accel/kvm/kvm-all.c | 8 ++ > include/sysemu/kvm.h | 1 + > target/i386/cpu.h | 3 + > target/i386/kvm.c | 52 ++++++++++++ > target/i386/machine.c | 182 ++++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 246 insertions(+) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index e4ac3386cb..e1c6c067e8 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -88,6 +88,7 @@ struct KVMState > #ifdef KVM_CAP_SET_GUEST_DEBUG > QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; > #endif > + uint32_t max_nested_state_len; Note: In my next patch-series I have changed this to be “int”. 
> int many_ioeventfds; > int intx_set_mask; > bool sync_mmu; > @@ -1677,6 +1678,8 @@ static int kvm_init(MachineState *ms) > s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); > #endif > > + s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); > + > #ifdef KVM_CAP_IRQ_ROUTING > kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); > #endif > @@ -2244,6 +2247,11 @@ int kvm_has_debugregs(void) > return kvm_state->debugregs; > } > > +uint32_t kvm_max_nested_state_length(void) > +{ > + return kvm_state->max_nested_state_len; > +} > + > int kvm_has_many_ioeventfds(void) > { > if (!kvm_enabled()) { > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h > index a6d1cd190f..5eb79b594c 100644 > --- a/include/sysemu/kvm.h > +++ b/include/sysemu/kvm.h > @@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void); > int kvm_has_vcpu_events(void); > int kvm_has_robust_singlestep(void); > int kvm_has_debugregs(void); > +uint32_t kvm_max_nested_state_length(void); > int kvm_has_pit_state2(void); > int kvm_has_many_ioeventfds(void); > int kvm_has_gsi_routing(void); > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index bbeb7a9521..550d397807 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -1355,6 +1355,9 @@ typedef struct CPUX86State { > #if defined(CONFIG_KVM) || defined(CONFIG_HVF) > void *xsave_buf; > #endif > +#if defined(CONFIG_KVM) > + struct kvm_nested_state *nested_state; > +#endif Nice catch regarding CONFIG_KVM. Thanks for that. 
:) > #if defined(CONFIG_HVF) > HVFX86EmulatorState *hvf_emul; > #endif > diff --git a/target/i386/kvm.c b/target/i386/kvm.c > index dca76830ec..d48fafa22b 100644 > --- a/target/i386/kvm.c > +++ b/target/i386/kvm.c > @@ -968,6 +968,7 @@ int kvm_arch_init_vcpu(CPUState *cs) > struct kvm_cpuid_entry2 *c; > uint32_t signature[3]; > int kvm_base = KVM_CPUID_SIGNATURE; > + uint32_t nested_state_len; > int r; > Error *local_err = NULL; > > @@ -1368,6 +1369,13 @@ int kvm_arch_init_vcpu(CPUState *cs) > if (has_xsave) { > env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave)); > } > + > + nested_state_len = kvm_max_nested_state_length(); > + if (nested_state_len > 0) { > + assert(nested_state_len >= offsetof(struct kvm_nested_state, data)); > + env->nested_state = g_malloc0(nested_state_len); Paolo, why have you removed setting “env->nested_state->size = max_nested_state_len;”? In addition, in my next patch-series I also added the following code here which is required: + if (IS_INTEL_CPU(env)) { + struct kvm_vmx_nested_state_hdr *vmx_hdr = + &env->nested_state->hdr.vmx_hdr; + + vmx_hdr->vmxon_pa = -1ull; + vmx_hdr->vmcs12_pa = -1ull; + } > + } > + > cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE); Note: In my next patch-series I have also added a new kvm_arch_destroy_vcpu() method which is called from kvm_destroy_vcpu(). Similar to how kvm_arch_init_vcpu() is called from kvm_init_vcpu(). I use it to free both cpu->kvm_msr_buf and env->nested_state. 
> > if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) { > @@ -3125,6 +3133,41 @@ static int kvm_get_debugregs(X86CPU *cpu) > return 0; > } > > +static int kvm_put_nested_state(X86CPU *cpu) > +{ > + CPUX86State *env = &cpu->env; > + uint32_t nested_state_len = kvm_max_nested_state_length(); > + > + if (nested_state_len == 0) { > + return 0; > + } > + > + assert(env->nested_state->size <= nested_state_len); > + return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state); > +} > + > +static int kvm_get_nested_state(X86CPU *cpu) > +{ > + CPUX86State *env = &cpu->env; > + uint32_t nested_state_len = kvm_max_nested_state_length(); > + > + if (nested_state_len == 0) { > + return 0; > + } > + > + /* > + * It is possible that migration restored a smaller size into > + * nested_state->size than what our kernel supports. > + * We preserve migration origin nested_state->size for > + * the call to KVM_SET_NESTED_STATE but wish that our next call > + * to KVM_GET_NESTED_STATE will use the maximum size supported by > + * the kernel we're running on. 
> + */ > + env->nested_state->size = nested_state_len; > + > + return kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state); > +} > + > int kvm_arch_put_registers(CPUState *cpu, int level) > { > X86CPU *x86_cpu = X86_CPU(cpu); > @@ -3132,6 +3175,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level) > > assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); > > + ret = kvm_put_nested_state(x86_cpu); > + if (ret < 0) { > + return ret; > + } > + > if (level >= KVM_PUT_RESET_STATE) { > ret = kvm_put_msr_feature_control(x86_cpu); > if (ret < 0) { > @@ -3247,6 +3295,10 @@ int kvm_arch_get_registers(CPUState *cs) > if (ret < 0) { > goto out; > } > + ret = kvm_get_nested_state(cpu); > + if (ret < 0) { > + goto out; > + } > ret = 0; > out: > cpu_sync_bndcs_hflags(&cpu->env); > diff --git a/target/i386/machine.c b/target/i386/machine.c > index 41460be54b..45dbae6054 100644 > --- a/target/i386/machine.c > +++ b/target/i386/machine.c > @@ -246,6 +246,15 @@ static int cpu_pre_save(void *opaque) > env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK); > } > > +#ifdef CONFIG_KVM > + /* Verify we have nested virtualization state from kernel if required */ > + if (is_nested_virt_enabled(env) && !env->nested_state) { > + error_report("Guest enabled nested virtualization but kernel " > + "do not support saving nested state"); > + return -EINVAL; > + } > +#endif > + > return 0; > } > > @@ -909,6 +918,176 @@ static const VMStateDescription vmstate_tsc_khz = { > } > }; > > +#ifdef CONFIG_KVM > +static bool vmx_vmcs12_needed(void *opaque) > +{ > + struct kvm_nested_state *nested_state = opaque; > + return (nested_state->size > offsetof(struct kvm_nested_state, > + vmx.data[0].vmcs12)); Do you prefer this compared to checking explicitly? i.e. 
by: return (nested_state->vmx.vmcs12_pa != -1ull); > +} > + > +static const VMStateDescription vmstate_vmx_vmcs12_state = { > + .name = "cpu/kvm_nested_state/vmx/vmcs12", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = vmx_vmcs12_needed, > + .fields = (VMStateField[]) { > + VMSTATE_UINT8_ARRAY(vmx.data[0].vmcs12, > + struct kvm_nested_state, 0x1000), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static bool vmx_shadow_vmcs12_needed(void *opaque) > +{ > + struct kvm_nested_state *nested_state = opaque; > + return (nested_state->size > offsetof(struct kvm_nested_state, > + vmx.data[0].shadow_vmcs12)); Nice trick on how to determine if to send shadow_vmcs12 without requiring to check if vmcs12 indeed have VMCS-shadowing enabled and a valid vmcs-link-ptr. :) > +} > + > +static const VMStateDescription vmstate_vmx_shadow_vmcs12_state = { > + .name = "cpu/kvm_nested_state/vmx/shadow_vmcs12", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = vmx_shadow_vmcs12_needed, > + .fields = (VMStateField[]) { > + VMSTATE_UINT8_ARRAY(vmx.data[0].shadow_vmcs12, > + struct kvm_nested_state, 0x1000), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static bool vmx_nested_state_needed(void *opaque) > +{ > + struct kvm_nested_state *nested_state = opaque; > + > + return ((nested_state->format == KVM_STATE_NESTED_FORMAT_VMX) && > + (nested_state->vmx.vmxon_pa != -1ull)); > +} > + > +static const VMStateDescription vmstate_vmx_nested_state = { > + .name = "cpu/kvm_nested_state/vmx", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = vmx_nested_state_needed, > + .fields = (VMStateField[]) { > + VMSTATE_U64(vmx.vmxon_pa, struct kvm_nested_state), > + VMSTATE_U64(vmx.vmcs_pa, struct kvm_nested_state), > + VMSTATE_U16(vmx.smm.flags, struct kvm_nested_state), > + VMSTATE_END_OF_LIST() > + }, > + .subsections = (const VMStateDescription*[]) { > + &vmstate_vmx_vmcs12_state, > + &vmstate_vmx_shadow_vmcs12_state, > + NULL, > + } > +}; > + > +static bool 
svm_nested_state_needed(void *opaque) > +{ > + struct kvm_nested_state *nested_state = opaque; > + > + return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM); > +} > + > +static const VMStateDescription vmstate_svm_nested_state = { > + .name = "cpu/kvm_nested_state/svm", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = svm_nested_state_needed, > + .fields = (VMStateField[]) { > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static bool nested_state_needed(void *opaque) > +{ > + X86CPU *cpu = opaque; > + CPUX86State *env = &cpu->env; > + > + return (is_vmx_enabled(env) && vmx_nested_state_needed(env->nested_state)) || > + (is_svm_enabled(env) && svm_nested_state_needed(env->nested_state)); > +} As I specified in an earlier email in this patch-series, this is not entirely accurate. In case vCPU is running L2 and entered SMM, then is_vmx_enabled() will return false because CR4 is set to 0 on entering SMM. I consider deeming nested_state needed in case hflags specifies guest is in SMM mode. Any objection? 
> + > +static int nested_state_post_load(void *opaque, int version_id) > +{ > + X86CPU *cpu = opaque; > + CPUX86State *env = &cpu->env; > + struct kvm_nested_state *nested_state = env->nested_state; > + uint32_t min_nested_state_len = offsetof(struct kvm_nested_state, data); > + uint32_t max_nested_state_len = kvm_max_nested_state_length(); > + > + /* > + * If our kernel don't support setting nested state > + * and we have received nested state from migration stream, > + * we need to fail migration > + */ > + if (max_nested_state_len == 0) { > + error_report("Received nested state when kernel cannot restore it"); > + return -EINVAL; > + } > + > + /* > + * Verify that the size of received nested_state struct > + * at least cover required header and is not larger > + * than the max size that our kernel support > + */ > + if (nested_state->size < min_nested_state_len) { > + error_report("Received nested state size less than min: " > + "len=%d, min=%d", > + nested_state->size, min_nested_state_len); > + return -EINVAL; > + } > + if (nested_state->size > max_nested_state_len) { > + error_report("Recieved unsupported nested state size: " > + "nested_state->size=%d, max=%d", > + nested_state->size, max_nested_state_len); > + return -EINVAL; > + } > + > + /* Verify format is valid */ > + if ((nested_state->format != KVM_STATE_NESTED_FORMAT_VMX) && > + (nested_state->format != KVM_STATE_NESTED_FORMAT_SVM)) { > + error_report("Received invalid nested state format: %d", > + nested_state->format); > + return -EINVAL; > + } > + > + return 0; > +} > + > +static const VMStateDescription vmstate_kvm_nested_state = { > + .name = "cpu/kvm_nested_state", > + .version_id = 1, > + .minimum_version_id = 1, > + .fields = (VMStateField[]) { > + VMSTATE_U16(flags, struct kvm_nested_state), > + VMSTATE_U16(format, struct kvm_nested_state), > + VMSTATE_U32(size, struct kvm_nested_state), > + VMSTATE_END_OF_LIST() > + }, > + .subsections = (const VMStateDescription*[]) { > + 
&vmstate_vmx_nested_state, > + &vmstate_svm_nested_state, > + NULL > + } > +}; > + > +static const VMStateDescription vmstate_nested_state = { > + .name = "cpu/nested_state", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = nested_state_needed, > + .post_load = nested_state_post_load, > + .fields = (VMStateField[]) { > + VMSTATE_STRUCT_POINTER(env.nested_state, X86CPU, > + vmstate_kvm_nested_state, > + struct kvm_nested_state), > + VMSTATE_END_OF_LIST() > + } > +}; > +#endif > + > static bool mcg_ext_ctl_needed(void *opaque) > { > X86CPU *cpu = opaque; > @@ -1148,6 +1327,9 @@ VMStateDescription vmstate_x86_cpu = { > &vmstate_msr_intel_pt, > &vmstate_msr_virt_ssbd, > &vmstate_svm_npt, > +#ifdef CONFIG_KVM > + &vmstate_nested_state, > +#endif > NULL > } > }; > -- > 2.21.0 > >
On 15/06/19 03:14, Liran Alon wrote:
>> @@ -1368,6 +1369,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
>>      if (has_xsave) {
>>          env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>>      }
>> +
>> +    nested_state_len = kvm_max_nested_state_length();
>> +    if (nested_state_len > 0) {
>> +        assert(nested_state_len >= offsetof(struct kvm_nested_state, data));
>> +        env->nested_state = g_malloc0(nested_state_len);
>
> Paolo, why have you removed setting “env->nested_state->size = max_nested_state_len;”?

Because I confused the "nested_state_len == 0" check in kvm_put_nested_state with "env->nested_state->size == 0".

> In addition, in my next patch-series I also added the following code here which is required:
>
> +        if (IS_INTEL_CPU(env)) {
> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
> +                &env->nested_state->hdr.vmx_hdr;
> +
> +            vmx_hdr->vmxon_pa = -1ull;
> +            vmx_hdr->vmcs12_pa = -1ull;
> +        }

Looks good.

>> +    }
>> +
>>      cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
>
> Note: In my next patch-series I have also added a new kvm_arch_destroy_vcpu() method which is called from kvm_destroy_vcpu().
> Similar to how kvm_arch_init_vcpu() is called from kvm_init_vcpu().
> I use it to free both cpu->kvm_msr_buf and env->nested_state.

Looks good too.
>> >> if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) { >> @@ -3125,6 +3133,41 @@ static int kvm_get_debugregs(X86CPU *cpu) >> return 0; >> } >> >> +static int kvm_put_nested_state(X86CPU *cpu) >> +{ >> + CPUX86State *env = &cpu->env; >> + uint32_t nested_state_len = kvm_max_nested_state_length(); >> + >> + if (nested_state_len == 0) { >> + return 0; >> + } >> + >> + assert(env->nested_state->size <= nested_state_len); >> + return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state); >> +} >> + >> +static int kvm_get_nested_state(X86CPU *cpu) >> +{ >> + CPUX86State *env = &cpu->env; >> + uint32_t nested_state_len = kvm_max_nested_state_length(); >> + >> + if (nested_state_len == 0) { >> + return 0; >> + } >> + >> + /* >> + * It is possible that migration restored a smaller size into >> + * nested_state->size than what our kernel supports. >> + * We preserve migration origin nested_state->size for >> + * the call to KVM_SET_NESTED_STATE but wish that our next call >> + * to KVM_GET_NESTED_STATE will use the maximum size supported by >> + * the kernel we're running on. 
>> + */ >> + env->nested_state->size = nested_state_len; >> + >> + return kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state); >> +} >> + >> int kvm_arch_put_registers(CPUState *cpu, int level) >> { >> X86CPU *x86_cpu = X86_CPU(cpu); >> @@ -3132,6 +3175,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level) >> >> assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); >> >> + ret = kvm_put_nested_state(x86_cpu); >> + if (ret < 0) { >> + return ret; >> + } >> + >> if (level >= KVM_PUT_RESET_STATE) { >> ret = kvm_put_msr_feature_control(x86_cpu); >> if (ret < 0) { >> @@ -3247,6 +3295,10 @@ int kvm_arch_get_registers(CPUState *cs) >> if (ret < 0) { >> goto out; >> } >> + ret = kvm_get_nested_state(cpu); >> + if (ret < 0) { >> + goto out; >> + } >> ret = 0; >> out: >> cpu_sync_bndcs_hflags(&cpu->env); >> diff --git a/target/i386/machine.c b/target/i386/machine.c >> index 41460be54b..45dbae6054 100644 >> --- a/target/i386/machine.c >> +++ b/target/i386/machine.c >> @@ -246,6 +246,15 @@ static int cpu_pre_save(void *opaque) >> env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK); >> } >> >> +#ifdef CONFIG_KVM >> + /* Verify we have nested virtualization state from kernel if required */ >> + if (is_nested_virt_enabled(env) && !env->nested_state) { >> + error_report("Guest enabled nested virtualization but kernel " >> + "do not support saving nested state"); >> + return -EINVAL; >> + } >> +#endif >> + >> return 0; >> } >> >> @@ -909,6 +918,176 @@ static const VMStateDescription vmstate_tsc_khz = { >> } >> }; >> >> +#ifdef CONFIG_KVM >> +static bool vmx_vmcs12_needed(void *opaque) >> +{ >> + struct kvm_nested_state *nested_state = opaque; >> + return (nested_state->size > offsetof(struct kvm_nested_state, >> + vmx.data[0].vmcs12)); > > Do you prefer this compared to checking explicitly? i.e. 
by:
> return (nested_state->vmx.vmcs12_pa != -1ull);

I think I do, it guarantees that we don't serialize gibberish from vmx.data[0] and it's consistent with the vmx_shadow_vmcs12_needed check.

>> +static bool nested_state_needed(void *opaque)
>> +{
>> +    X86CPU *cpu = opaque;
>> +    CPUX86State *env = &cpu->env;
>> +
>> +    return (is_vmx_enabled(env) && vmx_nested_state_needed(env->nested_state)) ||
>> +        (is_svm_enabled(env) && svm_nested_state_needed(env->nested_state));
>> +}
>
> As I specified in an earlier email in this patch-series, this is not entirely accurate.
> In case vCPU is running L2 and entered SMM, then is_vmx_enabled() will return false because CR4 is set to 0 on entering SMM.
> I consider deeming nested_state needed in case hflags specifies guest is in SMM mode. Any objection?

See other answer, let's fix it in patch 7 instead.

Paolo
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e4ac3386cb..e1c6c067e8 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -88,6 +88,7 @@ struct KVMState #ifdef KVM_CAP_SET_GUEST_DEBUG QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; #endif + uint32_t max_nested_state_len; int many_ioeventfds; int intx_set_mask; bool sync_mmu; @@ -1677,6 +1678,8 @@ static int kvm_init(MachineState *ms) s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); #endif + s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); + #ifdef KVM_CAP_IRQ_ROUTING kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); #endif @@ -2244,6 +2247,11 @@ int kvm_has_debugregs(void) return kvm_state->debugregs; } +uint32_t kvm_max_nested_state_length(void) +{ + return kvm_state->max_nested_state_len; +} + int kvm_has_many_ioeventfds(void) { if (!kvm_enabled()) { diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index a6d1cd190f..5eb79b594c 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void); int kvm_has_vcpu_events(void); int kvm_has_robust_singlestep(void); int kvm_has_debugregs(void); +uint32_t kvm_max_nested_state_length(void); int kvm_has_pit_state2(void); int kvm_has_many_ioeventfds(void); int kvm_has_gsi_routing(void); diff --git a/target/i386/cpu.h b/target/i386/cpu.h index bbeb7a9521..550d397807 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1355,6 +1355,9 @@ typedef struct CPUX86State { #if defined(CONFIG_KVM) || defined(CONFIG_HVF) void *xsave_buf; #endif +#if defined(CONFIG_KVM) + struct kvm_nested_state *nested_state; +#endif #if defined(CONFIG_HVF) HVFX86EmulatorState *hvf_emul; #endif diff --git a/target/i386/kvm.c b/target/i386/kvm.c index dca76830ec..d48fafa22b 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -968,6 +968,7 @@ int kvm_arch_init_vcpu(CPUState *cs) struct kvm_cpuid_entry2 *c; uint32_t signature[3]; int kvm_base = 
KVM_CPUID_SIGNATURE; + uint32_t nested_state_len; int r; Error *local_err = NULL; @@ -1368,6 +1369,13 @@ int kvm_arch_init_vcpu(CPUState *cs) if (has_xsave) { env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave)); } + + nested_state_len = kvm_max_nested_state_length(); + if (nested_state_len > 0) { + assert(nested_state_len >= offsetof(struct kvm_nested_state, data)); + env->nested_state = g_malloc0(nested_state_len); + } + cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE); if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) { @@ -3125,6 +3133,41 @@ static int kvm_get_debugregs(X86CPU *cpu) return 0; } +static int kvm_put_nested_state(X86CPU *cpu) +{ + CPUX86State *env = &cpu->env; + uint32_t nested_state_len = kvm_max_nested_state_length(); + + if (nested_state_len == 0) { + return 0; + } + + assert(env->nested_state->size <= nested_state_len); + return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state); +} + +static int kvm_get_nested_state(X86CPU *cpu) +{ + CPUX86State *env = &cpu->env; + uint32_t nested_state_len = kvm_max_nested_state_length(); + + if (nested_state_len == 0) { + return 0; + } + + /* + * It is possible that migration restored a smaller size into + * nested_state->size than what our kernel supports. + * We preserve migration origin nested_state->size for + * the call to KVM_SET_NESTED_STATE but wish that our next call + * to KVM_GET_NESTED_STATE will use the maximum size supported by + * the kernel we're running on. 
+ */ + env->nested_state->size = nested_state_len; + + return kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state); +} + int kvm_arch_put_registers(CPUState *cpu, int level) { X86CPU *x86_cpu = X86_CPU(cpu); @@ -3132,6 +3175,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level) assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); + ret = kvm_put_nested_state(x86_cpu); + if (ret < 0) { + return ret; + } + if (level >= KVM_PUT_RESET_STATE) { ret = kvm_put_msr_feature_control(x86_cpu); if (ret < 0) { @@ -3247,6 +3295,10 @@ int kvm_arch_get_registers(CPUState *cs) if (ret < 0) { goto out; } + ret = kvm_get_nested_state(cpu); + if (ret < 0) { + goto out; + } ret = 0; out: cpu_sync_bndcs_hflags(&cpu->env); diff --git a/target/i386/machine.c b/target/i386/machine.c index 41460be54b..45dbae6054 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -246,6 +246,15 @@ static int cpu_pre_save(void *opaque) env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK); } +#ifdef CONFIG_KVM + /* Verify we have nested virtualization state from kernel if required */ + if (is_nested_virt_enabled(env) && !env->nested_state) { + error_report("Guest enabled nested virtualization but kernel " + "do not support saving nested state"); + return -EINVAL; + } +#endif + return 0; } @@ -909,6 +918,176 @@ static const VMStateDescription vmstate_tsc_khz = { } }; +#ifdef CONFIG_KVM +static bool vmx_vmcs12_needed(void *opaque) +{ + struct kvm_nested_state *nested_state = opaque; + return (nested_state->size > offsetof(struct kvm_nested_state, + vmx.data[0].vmcs12)); +} + +static const VMStateDescription vmstate_vmx_vmcs12_state = { + .name = "cpu/kvm_nested_state/vmx/vmcs12", + .version_id = 1, + .minimum_version_id = 1, + .needed = vmx_vmcs12_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT8_ARRAY(vmx.data[0].vmcs12, + struct kvm_nested_state, 0x1000), + VMSTATE_END_OF_LIST() + } +}; + +static bool vmx_shadow_vmcs12_needed(void *opaque) +{ + struct 
kvm_nested_state *nested_state = opaque; + return (nested_state->size > offsetof(struct kvm_nested_state, + vmx.data[0].shadow_vmcs12)); +} + +static const VMStateDescription vmstate_vmx_shadow_vmcs12_state = { + .name = "cpu/kvm_nested_state/vmx/shadow_vmcs12", + .version_id = 1, + .minimum_version_id = 1, + .needed = vmx_shadow_vmcs12_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT8_ARRAY(vmx.data[0].shadow_vmcs12, + struct kvm_nested_state, 0x1000), + VMSTATE_END_OF_LIST() + } +}; + +static bool vmx_nested_state_needed(void *opaque) +{ + struct kvm_nested_state *nested_state = opaque; + + return ((nested_state->format == KVM_STATE_NESTED_FORMAT_VMX) && + (nested_state->vmx.vmxon_pa != -1ull)); +} + +static const VMStateDescription vmstate_vmx_nested_state = { + .name = "cpu/kvm_nested_state/vmx", + .version_id = 1, + .minimum_version_id = 1, + .needed = vmx_nested_state_needed, + .fields = (VMStateField[]) { + VMSTATE_U64(vmx.vmxon_pa, struct kvm_nested_state), + VMSTATE_U64(vmx.vmcs_pa, struct kvm_nested_state), + VMSTATE_U16(vmx.smm.flags, struct kvm_nested_state), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_vmx_vmcs12_state, + &vmstate_vmx_shadow_vmcs12_state, + NULL, + } +}; + +static bool svm_nested_state_needed(void *opaque) +{ + struct kvm_nested_state *nested_state = opaque; + + return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM); +} + +static const VMStateDescription vmstate_svm_nested_state = { + .name = "cpu/kvm_nested_state/svm", + .version_id = 1, + .minimum_version_id = 1, + .needed = svm_nested_state_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +static bool nested_state_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return (is_vmx_enabled(env) && vmx_nested_state_needed(env->nested_state)) || + (is_svm_enabled(env) && svm_nested_state_needed(env->nested_state)); +} + +static int nested_state_post_load(void *opaque, int 
version_id) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + struct kvm_nested_state *nested_state = env->nested_state; + uint32_t min_nested_state_len = offsetof(struct kvm_nested_state, data); + uint32_t max_nested_state_len = kvm_max_nested_state_length(); + + /* + * If our kernel don't support setting nested state + * and we have received nested state from migration stream, + * we need to fail migration + */ + if (max_nested_state_len == 0) { + error_report("Received nested state when kernel cannot restore it"); + return -EINVAL; + } + + /* + * Verify that the size of received nested_state struct + * at least cover required header and is not larger + * than the max size that our kernel support + */ + if (nested_state->size < min_nested_state_len) { + error_report("Received nested state size less than min: " + "len=%d, min=%d", + nested_state->size, min_nested_state_len); + return -EINVAL; + } + if (nested_state->size > max_nested_state_len) { + error_report("Recieved unsupported nested state size: " + "nested_state->size=%d, max=%d", + nested_state->size, max_nested_state_len); + return -EINVAL; + } + + /* Verify format is valid */ + if ((nested_state->format != KVM_STATE_NESTED_FORMAT_VMX) && + (nested_state->format != KVM_STATE_NESTED_FORMAT_SVM)) { + error_report("Received invalid nested state format: %d", + nested_state->format); + return -EINVAL; + } + + return 0; +} + +static const VMStateDescription vmstate_kvm_nested_state = { + .name = "cpu/kvm_nested_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_U16(flags, struct kvm_nested_state), + VMSTATE_U16(format, struct kvm_nested_state), + VMSTATE_U32(size, struct kvm_nested_state), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_vmx_nested_state, + &vmstate_svm_nested_state, + NULL + } +}; + +static const VMStateDescription vmstate_nested_state = { + .name = "cpu/nested_state", + .version_id = 1, + 
.minimum_version_id = 1, + .needed = nested_state_needed, + .post_load = nested_state_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_POINTER(env.nested_state, X86CPU, + vmstate_kvm_nested_state, + struct kvm_nested_state), + VMSTATE_END_OF_LIST() + } +}; +#endif + static bool mcg_ext_ctl_needed(void *opaque) { X86CPU *cpu = opaque; @@ -1148,6 +1327,9 @@ VMStateDescription vmstate_x86_cpu = { &vmstate_msr_intel_pt, &vmstate_msr_virt_ssbd, &vmstate_svm_npt, +#ifdef CONFIG_KVM + &vmstate_nested_state, +#endif NULL } };