diff mbox series

[6/7] KVM: i386: Add support for save and restore nested state

Message ID 20190615004256.16367-7-pbonzini@redhat.com
State New
Headers show
Series target-i386/kvm: live migration support for nested VMX | expand

Commit Message

Paolo Bonzini June 15, 2019, 12:42 a.m. UTC
From: Liran Alon <liran.alon@oracle.com>

Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
introduced new IOCTLs to extract and restore KVM internal state used to
run a VM that is in VMX operation.

Utilize these IOCTLs to add support of migration of VMs which are
running nested hypervisors.

Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
[Simplified subsection needed functions and computation of
 kvm_min_nested_state_len(); adjusted for upstream kernel field
 names; fixed !CONFIG_KVM compilation. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 accel/kvm/kvm-all.c   |   8 ++
 include/sysemu/kvm.h  |   1 +
 target/i386/cpu.h     |   3 +
 target/i386/kvm.c     |  52 ++++++++++++
 target/i386/machine.c | 182 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 246 insertions(+)

Comments

Liran Alon June 15, 2019, 1:14 a.m. UTC | #1
> On 15 Jun 2019, at 3:42, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> From: Liran Alon <liran.alon@oracle.com>
> 
> Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
> introduced new IOCTLs to extract and restore KVM internal state used to
> run a VM that is in VMX operation.
> 
> Utilize these IOCTLs to add support of migration of VMs which are
> running nested hypervisors.
> 
> Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
> Signed-off-by: Liran Alon <liran.alon@oracle.com>
> [Simplified subsection needed functions and computation of
> kvm_min_nested_state_len(); adjusted for upstream kernel field
> names; fixed !CONFIG_KVM compilation. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> accel/kvm/kvm-all.c   |   8 ++
> include/sysemu/kvm.h  |   1 +
> target/i386/cpu.h     |   3 +
> target/i386/kvm.c     |  52 ++++++++++++
> target/i386/machine.c | 182 ++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 246 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index e4ac3386cb..e1c6c067e8 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -88,6 +88,7 @@ struct KVMState
> #ifdef KVM_CAP_SET_GUEST_DEBUG
>     QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
> #endif
> +    uint32_t max_nested_state_len;

Note: In my next patch-series I have changed this to be “int”.

>     int many_ioeventfds;
>     int intx_set_mask;
>     bool sync_mmu;
> @@ -1677,6 +1678,8 @@ static int kvm_init(MachineState *ms)
>     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
> #endif
> 
> +    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
> +
> #ifdef KVM_CAP_IRQ_ROUTING
>     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
> #endif
> @@ -2244,6 +2247,11 @@ int kvm_has_debugregs(void)
>     return kvm_state->debugregs;
> }
> 
> +uint32_t kvm_max_nested_state_length(void)
> +{
> +    return kvm_state->max_nested_state_len;
> +}
> +
> int kvm_has_many_ioeventfds(void)
> {
>     if (!kvm_enabled()) {
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index a6d1cd190f..5eb79b594c 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void);
> int kvm_has_vcpu_events(void);
> int kvm_has_robust_singlestep(void);
> int kvm_has_debugregs(void);
> +uint32_t kvm_max_nested_state_length(void);
> int kvm_has_pit_state2(void);
> int kvm_has_many_ioeventfds(void);
> int kvm_has_gsi_routing(void);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index bbeb7a9521..550d397807 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1355,6 +1355,9 @@ typedef struct CPUX86State {
> #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
>     void *xsave_buf;
> #endif
> +#if defined(CONFIG_KVM)
> +    struct kvm_nested_state *nested_state;
> +#endif

Nice catch regarding CONFIG_KVM. Thanks for that. :)

> #if defined(CONFIG_HVF)
>     HVFX86EmulatorState *hvf_emul;
> #endif
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index dca76830ec..d48fafa22b 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -968,6 +968,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>     struct kvm_cpuid_entry2 *c;
>     uint32_t signature[3];
>     int kvm_base = KVM_CPUID_SIGNATURE;
> +    uint32_t nested_state_len;
>     int r;
>     Error *local_err = NULL;
> 
> @@ -1368,6 +1369,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
>     if (has_xsave) {
>         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>     }
> +
> +    nested_state_len = kvm_max_nested_state_length();
> +    if (nested_state_len > 0) {
> +        assert(nested_state_len >= offsetof(struct kvm_nested_state, data));
> +        env->nested_state = g_malloc0(nested_state_len);

Paolo, why have you removed setting “env->nested_state->size = max_nested_state_len;”?

In addition, in my next patch-series I also added the following code here which is required:

+        if (IS_INTEL_CPU(env)) {
+            struct kvm_vmx_nested_state_hdr *vmx_hdr =
+                &env->nested_state->hdr.vmx_hdr;
+
+            vmx_hdr->vmxon_pa = -1ull;
+            vmx_hdr->vmcs12_pa = -1ull;
+        }

> +    }
> +
>     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);

Note: In my next patch-series I have also added a new kvm_arch_destroy_vcpu() method which is called from kvm_destroy_vcpu().
Similar to how kvm_arch_init_vcpu() is called from kvm_init_vcpu().
I use it to free both cpu->kvm_msr_buf and env->nested_state.

> 
>     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
> @@ -3125,6 +3133,41 @@ static int kvm_get_debugregs(X86CPU *cpu)
>     return 0;
> }
> 
> +static int kvm_put_nested_state(X86CPU *cpu)
> +{
> +    CPUX86State *env = &cpu->env;
> +    uint32_t nested_state_len = kvm_max_nested_state_length();
> +
> +    if (nested_state_len == 0) {
> +        return 0;
> +    }
> +
> +    assert(env->nested_state->size <= nested_state_len);
> +    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
> +}
> +
> +static int kvm_get_nested_state(X86CPU *cpu)
> +{
> +    CPUX86State *env = &cpu->env;
> +    uint32_t nested_state_len = kvm_max_nested_state_length();
> +
> +    if (nested_state_len == 0) {
> +        return 0;
> +    }
> +
> +    /*
> +     * It is possible that migration restored a smaller size into
> +     * nested_state->size than what our kernel supports.
> +     * We preserve migration origin nested_state->size for
> +     * the call to KVM_SET_NESTED_STATE but wish that our next call
> +     * to KVM_GET_NESTED_STATE will use the maximum size supported by
> +     * the kernel we're running on.
> +     */
> +    env->nested_state->size = nested_state_len;
> +
> +    return kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
> +}
> +
> int kvm_arch_put_registers(CPUState *cpu, int level)
> {
>     X86CPU *x86_cpu = X86_CPU(cpu);
> @@ -3132,6 +3175,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> 
>     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
> 
> +    ret = kvm_put_nested_state(x86_cpu);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
>     if (level >= KVM_PUT_RESET_STATE) {
>         ret = kvm_put_msr_feature_control(x86_cpu);
>         if (ret < 0) {
> @@ -3247,6 +3295,10 @@ int kvm_arch_get_registers(CPUState *cs)
>     if (ret < 0) {
>         goto out;
>     }
> +    ret = kvm_get_nested_state(cpu);
> +    if (ret < 0) {
> +        goto out;
> +    }
>     ret = 0;
>  out:
>     cpu_sync_bndcs_hflags(&cpu->env);
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index 41460be54b..45dbae6054 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -246,6 +246,15 @@ static int cpu_pre_save(void *opaque)
>         env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
>     }
> 
> +#ifdef CONFIG_KVM
> +    /* Verify we have nested virtualization state from kernel if required */
> +    if (is_nested_virt_enabled(env) && !env->nested_state) {
> +        error_report("Guest enabled nested virtualization but kernel "
> +                     "do not support saving nested state");
> +        return -EINVAL;
> +    }
> +#endif
> +
>     return 0;
> }
> 
> @@ -909,6 +918,176 @@ static const VMStateDescription vmstate_tsc_khz = {
>     }
> };
> 
> +#ifdef CONFIG_KVM
> +static bool vmx_vmcs12_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +    return (nested_state->size > offsetof(struct kvm_nested_state,
> +                                          vmx.data[0].vmcs12));

Do you prefer this compared to checking explicitly? i.e. by:
return (nested_state->vmx.vmcs12_pa != -1ull);

> +}
> +
> +static const VMStateDescription vmstate_vmx_vmcs12_state = {
> +	.name = "cpu/kvm_nested_state/vmx/vmcs12",
> +	.version_id = 1,
> +	.minimum_version_id = 1,
> +	.needed = vmx_vmcs12_needed,
> +	.fields = (VMStateField[]) {
> +	    VMSTATE_UINT8_ARRAY(vmx.data[0].vmcs12,
> +	                        struct kvm_nested_state, 0x1000),
> +	    VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static bool vmx_shadow_vmcs12_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +    return (nested_state->size > offsetof(struct kvm_nested_state,
> +                                          vmx.data[0].shadow_vmcs12));

Nice trick for determining whether to send shadow_vmcs12 without requiring a check
that vmcs12 indeed has VMCS-shadowing enabled and a valid vmcs-link-ptr. :)

> +}
> +
> +static const VMStateDescription vmstate_vmx_shadow_vmcs12_state = {
> +	.name = "cpu/kvm_nested_state/vmx/shadow_vmcs12",
> +	.version_id = 1,
> +	.minimum_version_id = 1,
> +	.needed = vmx_shadow_vmcs12_needed,
> +	.fields = (VMStateField[]) {
> +	    VMSTATE_UINT8_ARRAY(vmx.data[0].shadow_vmcs12,
> +	                        struct kvm_nested_state, 0x1000),
> +	    VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static bool vmx_nested_state_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +
> +    return ((nested_state->format == KVM_STATE_NESTED_FORMAT_VMX) &&
> +            (nested_state->vmx.vmxon_pa != -1ull));
> +}
> +
> +static const VMStateDescription vmstate_vmx_nested_state = {
> +	.name = "cpu/kvm_nested_state/vmx",
> +	.version_id = 1,
> +	.minimum_version_id = 1,
> +	.needed = vmx_nested_state_needed,
> +	.fields = (VMStateField[]) {
> +	    VMSTATE_U64(vmx.vmxon_pa, struct kvm_nested_state),
> +	    VMSTATE_U64(vmx.vmcs_pa, struct kvm_nested_state),
> +	    VMSTATE_U16(vmx.smm.flags, struct kvm_nested_state),
> +	    VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription*[]) {
> +        &vmstate_vmx_vmcs12_state,
> +        &vmstate_vmx_shadow_vmcs12_state,
> +        NULL,
> +    }
> +};
> +
> +static bool svm_nested_state_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +
> +    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM);
> +}
> +
> +static const VMStateDescription vmstate_svm_nested_state = {
> +	.name = "cpu/kvm_nested_state/svm",
> +	.version_id = 1,
> +	.minimum_version_id = 1,
> +	.needed = svm_nested_state_needed,
> +	.fields = (VMStateField[]) {
> +	    VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static bool nested_state_needed(void *opaque)
> +{
> +    X86CPU *cpu = opaque;
> +    CPUX86State *env = &cpu->env;
> +
> +    return (is_vmx_enabled(env) && vmx_nested_state_needed(env->nested_state)) ||
> +           (is_svm_enabled(env) && svm_nested_state_needed(env->nested_state));
> +}

As I specified in an earlier email in this patch-series, this is not entirely accurate.
In case the vCPU is running L2 and has entered SMM, is_vmx_enabled() will return false because CR4 is cleared to 0 on entering SMM.
I am considering deeming nested_state needed in case hflags specifies that the guest is in SMM mode. Any objection?

> +
> +static int nested_state_post_load(void *opaque, int version_id)
> +{
> +    X86CPU *cpu = opaque;
> +    CPUX86State *env = &cpu->env;
> +    struct kvm_nested_state *nested_state = env->nested_state;
> +    uint32_t min_nested_state_len = offsetof(struct kvm_nested_state, data);
> +    uint32_t max_nested_state_len = kvm_max_nested_state_length();
> +
> +    /*
> +     * If our kernel don't support setting nested state
> +     * and we have received nested state from migration stream,
> +     * we need to fail migration
> +     */
> +    if (max_nested_state_len == 0) {
> +        error_report("Received nested state when kernel cannot restore it");
> +        return -EINVAL;
> +    }
> +
> +    /*
> +     * Verify that the size of received nested_state struct
> +     * at least cover required header and is not larger
> +     * than the max size that our kernel support
> +     */
> +    if (nested_state->size < min_nested_state_len) {
> +        error_report("Received nested state size less than min: "
> +                     "len=%d, min=%d",
> +                     nested_state->size, min_nested_state_len);
> +        return -EINVAL;
> +    }
> +    if (nested_state->size > max_nested_state_len) {
> +        error_report("Recieved unsupported nested state size: "
> +                     "nested_state->size=%d, max=%d",
> +                     nested_state->size, max_nested_state_len);
> +        return -EINVAL;
> +    }
> +
> +    /* Verify format is valid */
> +    if ((nested_state->format != KVM_STATE_NESTED_FORMAT_VMX) &&
> +        (nested_state->format != KVM_STATE_NESTED_FORMAT_SVM)) {
> +        error_report("Received invalid nested state format: %d",
> +                     nested_state->format);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vmstate_kvm_nested_state = {
> +    .name = "cpu/kvm_nested_state",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_U16(flags, struct kvm_nested_state),
> +        VMSTATE_U16(format, struct kvm_nested_state),
> +        VMSTATE_U32(size, struct kvm_nested_state),
> +        VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription*[]) {
> +        &vmstate_vmx_nested_state,
> +        &vmstate_svm_nested_state,
> +        NULL
> +    }
> +};
> +
> +static const VMStateDescription vmstate_nested_state = {
> +    .name = "cpu/nested_state",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = nested_state_needed,
> +    .post_load = nested_state_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_STRUCT_POINTER(env.nested_state, X86CPU,
> +                               vmstate_kvm_nested_state,
> +                               struct kvm_nested_state),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +#endif
> +
> static bool mcg_ext_ctl_needed(void *opaque)
> {
>     X86CPU *cpu = opaque;
> @@ -1148,6 +1327,9 @@ VMStateDescription vmstate_x86_cpu = {
>         &vmstate_msr_intel_pt,
>         &vmstate_msr_virt_ssbd,
>         &vmstate_svm_npt,
> +#ifdef CONFIG_KVM
> +        &vmstate_nested_state,
> +#endif
>         NULL
>     }
> };
> -- 
> 2.21.0
> 
>
Paolo Bonzini June 17, 2019, 5:31 p.m. UTC | #2
On 15/06/19 03:14, Liran Alon wrote:
>> @@ -1368,6 +1369,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
>>     if (has_xsave) {
>>         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>>     }
>> +
>> +    nested_state_len = kvm_max_nested_state_length();
>> +    if (nested_state_len > 0) {
>> +        assert(nested_state_len >= offsetof(struct kvm_nested_state, data));
>> +        env->nested_state = g_malloc0(nested_state_len);
> 
> Paolo, why have you removed setting “env->nested_state->size = max_nested_state_len;”?

Because I confused the "nested_state_len == 0" check in
kvm_put_nested_state with "env->nested_state->size == 0".

> In addition, in my next patch-series I also added the following code here which is required:
> 
> +        if (IS_INTEL_CPU(env)) {
> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
> +                &env->nested_state->hdr.vmx_hdr;
> +
> +            vmx_hdr->vmxon_pa = -1ull;
> +            vmx_hdr->vmcs12_pa = -1ull;
> +        }

Looks good.

>> +    }
>> +
>>     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
> 
> Note: In my next patch-series I have also added a new kvm_arch_destroy_vcpu() method which is called from kvm_destroy_vcpu().
> Similar to how kvm_arch_init_vcpu() is called from kvm_init_vcpu().
> I use it to free both cpu->kvm_msr_buf and env->nested_state.

Looks good too.

>>
>>     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
>> @@ -3125,6 +3133,41 @@ static int kvm_get_debugregs(X86CPU *cpu)
>>     return 0;
>> }
>>
>> +static int kvm_put_nested_state(X86CPU *cpu)
>> +{
>> +    CPUX86State *env = &cpu->env;
>> +    uint32_t nested_state_len = kvm_max_nested_state_length();
>> +
>> +    if (nested_state_len == 0) {
>> +        return 0;
>> +    }
>> +
>> +    assert(env->nested_state->size <= nested_state_len);
>> +    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
>> +}
>> +
>> +static int kvm_get_nested_state(X86CPU *cpu)
>> +{
>> +    CPUX86State *env = &cpu->env;
>> +    uint32_t nested_state_len = kvm_max_nested_state_length();
>> +
>> +    if (nested_state_len == 0) {
>> +        return 0;
>> +    }
>> +
>> +    /*
>> +     * It is possible that migration restored a smaller size into
>> +     * nested_state->size than what our kernel supports.
>> +     * We preserve migration origin nested_state->size for
>> +     * the call to KVM_SET_NESTED_STATE but wish that our next call
>> +     * to KVM_GET_NESTED_STATE will use the maximum size supported by
>> +     * the kernel we're running on.
>> +     */
>> +    env->nested_state->size = nested_state_len;
>> +
>> +    return kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
>> +}
>> +
>> int kvm_arch_put_registers(CPUState *cpu, int level)
>> {
>>     X86CPU *x86_cpu = X86_CPU(cpu);
>> @@ -3132,6 +3175,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>>
>>     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
>>
>> +    ret = kvm_put_nested_state(x86_cpu);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>>     if (level >= KVM_PUT_RESET_STATE) {
>>         ret = kvm_put_msr_feature_control(x86_cpu);
>>         if (ret < 0) {
>> @@ -3247,6 +3295,10 @@ int kvm_arch_get_registers(CPUState *cs)
>>     if (ret < 0) {
>>         goto out;
>>     }
>> +    ret = kvm_get_nested_state(cpu);
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>>     ret = 0;
>>  out:
>>     cpu_sync_bndcs_hflags(&cpu->env);
>> diff --git a/target/i386/machine.c b/target/i386/machine.c
>> index 41460be54b..45dbae6054 100644
>> --- a/target/i386/machine.c
>> +++ b/target/i386/machine.c
>> @@ -246,6 +246,15 @@ static int cpu_pre_save(void *opaque)
>>         env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
>>     }
>>
>> +#ifdef CONFIG_KVM
>> +    /* Verify we have nested virtualization state from kernel if required */
>> +    if (is_nested_virt_enabled(env) && !env->nested_state) {
>> +        error_report("Guest enabled nested virtualization but kernel "
>> +                     "do not support saving nested state");
>> +        return -EINVAL;
>> +    }
>> +#endif
>> +
>>     return 0;
>> }
>>
>> @@ -909,6 +918,176 @@ static const VMStateDescription vmstate_tsc_khz = {
>>     }
>> };
>>
>> +#ifdef CONFIG_KVM
>> +static bool vmx_vmcs12_needed(void *opaque)
>> +{
>> +    struct kvm_nested_state *nested_state = opaque;
>> +    return (nested_state->size > offsetof(struct kvm_nested_state,
>> +                                          vmx.data[0].vmcs12));
> 
> Do you prefer this compared to checking explicitly? i.e. by:
> return (nested_state->vmx.vmcs12_pa != -1ull);

I think I do, it guarantees that we don't serialize gibberish from
vmx.data[0] and it's consistent with the vmx_shadow_vmcs12_needed check.

>> +static bool nested_state_needed(void *opaque)
>> +{
>> +    X86CPU *cpu = opaque;
>> +    CPUX86State *env = &cpu->env;
>> +
>> +    return (is_vmx_enabled(env) && vmx_nested_state_needed(env->nested_state)) ||
>> +           (is_svm_enabled(env) && svm_nested_state_needed(env->nested_state));
>> +}
> 
> As I specified in an earlier email in this patch-series, this is not entirely accurate.
> In case vCPU is running L2 and entered SMM, then is_vmx_enabled() will return false because CR4 is set to 0 on entering SMM.
> I consider deeming nested_state needed in case hflags specifies guest is in SMM mode. Any objection?

See other answer, let's fix it in patch 7 instead.

Paolo
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index e4ac3386cb..e1c6c067e8 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -88,6 +88,7 @@  struct KVMState
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
 #endif
+    uint32_t max_nested_state_len;
     int many_ioeventfds;
     int intx_set_mask;
     bool sync_mmu;
@@ -1677,6 +1678,8 @@  static int kvm_init(MachineState *ms)
     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
 #endif
 
+    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
+
 #ifdef KVM_CAP_IRQ_ROUTING
     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
 #endif
@@ -2244,6 +2247,11 @@  int kvm_has_debugregs(void)
     return kvm_state->debugregs;
 }
 
+uint32_t kvm_max_nested_state_length(void)
+{
+    return kvm_state->max_nested_state_len;
+}
+
 int kvm_has_many_ioeventfds(void)
 {
     if (!kvm_enabled()) {
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index a6d1cd190f..5eb79b594c 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -210,6 +210,7 @@  bool kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
 int kvm_has_robust_singlestep(void);
 int kvm_has_debugregs(void);
+uint32_t kvm_max_nested_state_length(void);
 int kvm_has_pit_state2(void);
 int kvm_has_many_ioeventfds(void);
 int kvm_has_gsi_routing(void);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index bbeb7a9521..550d397807 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1355,6 +1355,9 @@  typedef struct CPUX86State {
 #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
     void *xsave_buf;
 #endif
+#if defined(CONFIG_KVM)
+    struct kvm_nested_state *nested_state;
+#endif
 #if defined(CONFIG_HVF)
     HVFX86EmulatorState *hvf_emul;
 #endif
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index dca76830ec..d48fafa22b 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -968,6 +968,7 @@  int kvm_arch_init_vcpu(CPUState *cs)
     struct kvm_cpuid_entry2 *c;
     uint32_t signature[3];
     int kvm_base = KVM_CPUID_SIGNATURE;
+    uint32_t nested_state_len;
     int r;
     Error *local_err = NULL;
 
@@ -1368,6 +1369,13 @@  int kvm_arch_init_vcpu(CPUState *cs)
     if (has_xsave) {
         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
     }
+
+    nested_state_len = kvm_max_nested_state_length();
+    if (nested_state_len > 0) {
+        assert(nested_state_len >= offsetof(struct kvm_nested_state, data));
+        env->nested_state = g_malloc0(nested_state_len);
+    }
+
     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
 
     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
@@ -3125,6 +3133,41 @@  static int kvm_get_debugregs(X86CPU *cpu)
     return 0;
 }
 
+static int kvm_put_nested_state(X86CPU *cpu)
+{
+    CPUX86State *env = &cpu->env;
+    uint32_t nested_state_len = kvm_max_nested_state_length();
+
+    if (nested_state_len == 0) {
+        return 0;
+    }
+
+    assert(env->nested_state->size <= nested_state_len);
+    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
+}
+
+static int kvm_get_nested_state(X86CPU *cpu)
+{
+    CPUX86State *env = &cpu->env;
+    uint32_t nested_state_len = kvm_max_nested_state_length();
+
+    if (nested_state_len == 0) {
+        return 0;
+    }
+
+    /*
+     * It is possible that migration restored a smaller size into
+     * nested_state->size than what our kernel supports.
+     * We preserve migration origin nested_state->size for
+     * the call to KVM_SET_NESTED_STATE but wish that our next call
+     * to KVM_GET_NESTED_STATE will use the maximum size supported by
+     * the kernel we're running on.
+     */
+    env->nested_state->size = nested_state_len;
+
+    return kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
+}
+
 int kvm_arch_put_registers(CPUState *cpu, int level)
 {
     X86CPU *x86_cpu = X86_CPU(cpu);
@@ -3132,6 +3175,11 @@  int kvm_arch_put_registers(CPUState *cpu, int level)
 
     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
 
+    ret = kvm_put_nested_state(x86_cpu);
+    if (ret < 0) {
+        return ret;
+    }
+
     if (level >= KVM_PUT_RESET_STATE) {
         ret = kvm_put_msr_feature_control(x86_cpu);
         if (ret < 0) {
@@ -3247,6 +3295,10 @@  int kvm_arch_get_registers(CPUState *cs)
     if (ret < 0) {
         goto out;
     }
+    ret = kvm_get_nested_state(cpu);
+    if (ret < 0) {
+        goto out;
+    }
     ret = 0;
  out:
     cpu_sync_bndcs_hflags(&cpu->env);
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 41460be54b..45dbae6054 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -246,6 +246,15 @@  static int cpu_pre_save(void *opaque)
         env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
     }
 
+#ifdef CONFIG_KVM
+    /* Verify we have nested virtualization state from kernel if required */
+    if (is_nested_virt_enabled(env) && !env->nested_state) {
+        error_report("Guest enabled nested virtualization but kernel "
+                     "do not support saving nested state");
+        return -EINVAL;
+    }
+#endif
+
     return 0;
 }
 
@@ -909,6 +918,176 @@  static const VMStateDescription vmstate_tsc_khz = {
     }
 };
 
+#ifdef CONFIG_KVM
+static bool vmx_vmcs12_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+    return (nested_state->size > offsetof(struct kvm_nested_state,
+                                          vmx.data[0].vmcs12));
+}
+
+static const VMStateDescription vmstate_vmx_vmcs12_state = {
+	.name = "cpu/kvm_nested_state/vmx/vmcs12",
+	.version_id = 1,
+	.minimum_version_id = 1,
+	.needed = vmx_vmcs12_needed,
+	.fields = (VMStateField[]) {
+	    VMSTATE_UINT8_ARRAY(vmx.data[0].vmcs12,
+	                        struct kvm_nested_state, 0x1000),
+	    VMSTATE_END_OF_LIST()
+    }
+};
+
+static bool vmx_shadow_vmcs12_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+    return (nested_state->size > offsetof(struct kvm_nested_state,
+                                          vmx.data[0].shadow_vmcs12));
+}
+
+static const VMStateDescription vmstate_vmx_shadow_vmcs12_state = {
+	.name = "cpu/kvm_nested_state/vmx/shadow_vmcs12",
+	.version_id = 1,
+	.minimum_version_id = 1,
+	.needed = vmx_shadow_vmcs12_needed,
+	.fields = (VMStateField[]) {
+	    VMSTATE_UINT8_ARRAY(vmx.data[0].shadow_vmcs12,
+	                        struct kvm_nested_state, 0x1000),
+	    VMSTATE_END_OF_LIST()
+    }
+};
+
+static bool vmx_nested_state_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+
+    return ((nested_state->format == KVM_STATE_NESTED_FORMAT_VMX) &&
+            (nested_state->vmx.vmxon_pa != -1ull));
+}
+
+static const VMStateDescription vmstate_vmx_nested_state = {
+	.name = "cpu/kvm_nested_state/vmx",
+	.version_id = 1,
+	.minimum_version_id = 1,
+	.needed = vmx_nested_state_needed,
+	.fields = (VMStateField[]) {
+	    VMSTATE_U64(vmx.vmxon_pa, struct kvm_nested_state),
+	    VMSTATE_U64(vmx.vmcs_pa, struct kvm_nested_state),
+	    VMSTATE_U16(vmx.smm.flags, struct kvm_nested_state),
+	    VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_vmx_vmcs12_state,
+        &vmstate_vmx_shadow_vmcs12_state,
+        NULL,
+    }
+};
+
+static bool svm_nested_state_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+
+    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM);
+}
+
+static const VMStateDescription vmstate_svm_nested_state = {
+	.name = "cpu/kvm_nested_state/svm",
+	.version_id = 1,
+	.minimum_version_id = 1,
+	.needed = svm_nested_state_needed,
+	.fields = (VMStateField[]) {
+	    VMSTATE_END_OF_LIST()
+    }
+};
+
+static bool nested_state_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return (is_vmx_enabled(env) && vmx_nested_state_needed(env->nested_state)) ||
+           (is_svm_enabled(env) && svm_nested_state_needed(env->nested_state));
+}
+
+static int nested_state_post_load(void *opaque, int version_id)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+    struct kvm_nested_state *nested_state = env->nested_state;
+    uint32_t min_nested_state_len = offsetof(struct kvm_nested_state, data);
+    uint32_t max_nested_state_len = kvm_max_nested_state_length();
+
+    /*
+     * If our kernel don't support setting nested state
+     * and we have received nested state from migration stream,
+     * we need to fail migration
+     */
+    if (max_nested_state_len == 0) {
+        error_report("Received nested state when kernel cannot restore it");
+        return -EINVAL;
+    }
+
+    /*
+     * Verify that the size of received nested_state struct
+     * at least cover required header and is not larger
+     * than the max size that our kernel support
+     */
+    if (nested_state->size < min_nested_state_len) {
+        error_report("Received nested state size less than min: "
+                     "len=%d, min=%d",
+                     nested_state->size, min_nested_state_len);
+        return -EINVAL;
+    }
+    if (nested_state->size > max_nested_state_len) {
+        error_report("Recieved unsupported nested state size: "
+                     "nested_state->size=%d, max=%d",
+                     nested_state->size, max_nested_state_len);
+        return -EINVAL;
+    }
+
+    /* Verify format is valid */
+    if ((nested_state->format != KVM_STATE_NESTED_FORMAT_VMX) &&
+        (nested_state->format != KVM_STATE_NESTED_FORMAT_SVM)) {
+        error_report("Received invalid nested state format: %d",
+                     nested_state->format);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_kvm_nested_state = {
+    .name = "cpu/kvm_nested_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_U16(flags, struct kvm_nested_state),
+        VMSTATE_U16(format, struct kvm_nested_state),
+        VMSTATE_U32(size, struct kvm_nested_state),
+        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_vmx_nested_state,
+        &vmstate_svm_nested_state,
+        NULL
+    }
+};
+
+static const VMStateDescription vmstate_nested_state = {
+    .name = "cpu/nested_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = nested_state_needed,
+    .post_load = nested_state_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_STRUCT_POINTER(env.nested_state, X86CPU,
+                               vmstate_kvm_nested_state,
+                               struct kvm_nested_state),
+        VMSTATE_END_OF_LIST()
+    }
+};
+#endif
+
 static bool mcg_ext_ctl_needed(void *opaque)
 {
     X86CPU *cpu = opaque;
@@ -1148,6 +1327,9 @@  VMStateDescription vmstate_x86_cpu = {
         &vmstate_msr_intel_pt,
         &vmstate_msr_virt_ssbd,
         &vmstate_svm_npt,
+#ifdef CONFIG_KVM
+        &vmstate_nested_state,
+#endif
         NULL
     }
 };