diff mbox

[3/5] x86: fill high bits of mtrr mask

Message ID 1466097133-5489-4-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert June 16, 2016, 5:12 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Fill the bits between 51..number-of-physical-address-bits in the
MTRR_PHYSMASKn variable range mtrr masks so that they're consistent
in the migration stream irrespective of the physical address space
of the source VM in a migration.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/hw/i386/pc.h |  5 +++++
 target-i386/cpu.c    |  1 +
 target-i386/cpu.h    |  3 +++
 target-i386/kvm.c    | 25 ++++++++++++++++++++++++-
 4 files changed, 33 insertions(+), 1 deletion(-)

Comments

Eduardo Habkost June 16, 2016, 8:14 p.m. UTC | #1
On Thu, Jun 16, 2016 at 06:12:11PM +0100, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Fill the bits between 51..number-of-physical-address-bits in the
> MTRR_PHYSMASKn variable range mtrr masks so that they're consistent
> in the migration stream irrespective of the physical address space
> of the source VM in a migration.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  include/hw/i386/pc.h |  5 +++++
>  target-i386/cpu.c    |  1 +
>  target-i386/cpu.h    |  3 +++
>  target-i386/kvm.c    | 25 ++++++++++++++++++++++++-
>  4 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 49566c8..3883d04 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -362,6 +362,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>          .driver   = TYPE_X86_CPU,\
>          .property = "cpuid-0xb",\
>          .value    = "off",\
> +    },\
> +    {\
> +        .driver = TYPE_X86_CPU,\
> +        .property = "fill-mtrr-mask",\
> +        .value = "off",\
>      },
>  
>  #define PC_COMPAT_2_5 \
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index f9302f9..4111240 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -3272,6 +3272,7 @@ static Property x86_cpu_properties[] = {
>      DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
>      DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
>      DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true),
> +    DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),

This is necessary only when phys_bits is higher on the
destination, right?

Should we really default this to true? I would like to enable
this hack only when really necessary. Except when using host's
phys_bits (phys-bits=0), is there any valid reason to expect
higher phys-bits on the destination?



>      DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, 0),
>      DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, 0),
>      DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, 0),
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index 61c06b7..13d8501 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -1181,6 +1181,9 @@ struct X86CPU {
>      /* Compatibility bits for old machine types: */
>      bool enable_cpuid_0xb;
>  
> +    /* if true fill the top bits of the MTRR_PHYSMASKn variable range */
> +    bool fill_mtrr_mask;
> +
>      /* in order to simplify APIC support, we leave this pointer to the
>         user */
>      struct DeviceState *apic_state;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index d95d06b..9b5158e 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -1934,6 +1934,7 @@ static int kvm_get_msrs(X86CPU *cpu)
>      CPUX86State *env = &cpu->env;
>      struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
>      int ret, i;
> +    uint64_t mtrr_top_bits;
>  
>      kvm_msr_buf_reset(cpu);
>  
> @@ -2083,6 +2084,27 @@ static int kvm_get_msrs(X86CPU *cpu)
>      }
>  
>      assert(ret == cpu->kvm_msr_buf->nmsrs);
> +    /*
> +     * MTRR masks: Each mask consists of 5 parts
> +     * a  10..0: must be zero
> +     * b  11   : valid bit
> +     * c n-1.12: actual mask bits
> +     * d  51..n: reserved must be zero
> +     * e  63.52: reserved must be zero
> +     *
> +     * 'n' is the number of physical bits supported by the CPU and is
> +     * apparently always <= 52.   We know our 'n' but don't know what
> +     * the destinations 'n' is; it might be smaller, in which case
> +     * it masks (c) on loading. It might be larger, in which case
> +     * we fill 'd' so that d..c is consistent irrespetive of the 'n'
> +     * we're migrating to.
> +     */
> +    if (cpu->fill_mtrr_mask) {
> +        mtrr_top_bits = BIT_RANGE(51, cpu_x86_guest_phys_address_bits(env));
> +    } else {
> +        mtrr_top_bits = 0;
> +    }
> +
>      for (i = 0; i < ret; i++) {
>          uint32_t index = msrs[i].index;
>          switch (index) {
> @@ -2278,7 +2300,8 @@ static int kvm_get_msrs(X86CPU *cpu)
>              break;
>          case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
>              if (index & 1) {
> -                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data;
> +                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
> +                                                               mtrr_top_bits;
>              } else {
>                  env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
>              }
> -- 
> 2.7.4
>
Paolo Bonzini June 17, 2016, 7:47 a.m. UTC | #2
On 16/06/2016 22:14, Eduardo Habkost wrote:
> This is necessary only when phys_bits is higher on the
> destination, right?
> 
> Should we really default this to true? I would like to enable
> this hack only when really necessary. Except when using host's
> phys_bits (phys-bits=0), is there any valid reason to expect
> higher phys-bits on the destination?

It would need a property even if you did it only for phys-bits==0, and
then it's simpler to just do it always.

The bits are reserved anyway, so we can do whatever we want with them.
In fact I think it's weird for the architecture to make them
must-be-zero, it might even make more sense to make them must-be-one...
It's a mask after all, and there's no way to access out-of-range
physical addresses.

Paolo
Dr. David Alan Gilbert June 17, 2016, 8:53 a.m. UTC | #3
* Eduardo Habkost (ehabkost@redhat.com) wrote:
> On Thu, Jun 16, 2016 at 06:12:11PM +0100, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Fill the bits between 51..number-of-physical-address-bits in the
> > MTRR_PHYSMASKn variable range mtrr masks so that they're consistent
> > in the migration stream irrespective of the physical address space
> > of the source VM in a migration.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >  include/hw/i386/pc.h |  5 +++++
> >  target-i386/cpu.c    |  1 +
> >  target-i386/cpu.h    |  3 +++
> >  target-i386/kvm.c    | 25 ++++++++++++++++++++++++-
> >  4 files changed, 33 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index 49566c8..3883d04 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -362,6 +362,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "cpuid-0xb",\
> >          .value    = "off",\
> > +    },\
> > +    {\
> > +        .driver = TYPE_X86_CPU,\
> > +        .property = "fill-mtrr-mask",\
> > +        .value = "off",\
> >      },
> >  
> >  #define PC_COMPAT_2_5 \
> > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > index f9302f9..4111240 100644
> > --- a/target-i386/cpu.c
> > +++ b/target-i386/cpu.c
> > @@ -3272,6 +3272,7 @@ static Property x86_cpu_properties[] = {
> >      DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
> >      DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
> >      DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true),
> > +    DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
> 
> This is necessary only when phys_bits is higher on the
> destination, right?

Yes because otherwise the destination will mask it out.

> Should we really default this to true? I would like to enable
> this hack only when really necessary. Except when using host's
> phys_bits (phys-bits=0), is there any valid reason to expect
> higher phys-bits on the destination?

Who is to say? With the current default of 40 we've not
been setting these higher bits when going from 39-46 bit
systems for years; I'm not sure what the consequence has been.

Dave

> 
> 
> 
> >      DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, 0),
> >      DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, 0),
> >      DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, 0),
> > diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> > index 61c06b7..13d8501 100644
> > --- a/target-i386/cpu.h
> > +++ b/target-i386/cpu.h
> > @@ -1181,6 +1181,9 @@ struct X86CPU {
> >      /* Compatibility bits for old machine types: */
> >      bool enable_cpuid_0xb;
> >  
> > +    /* if true fill the top bits of the MTRR_PHYSMASKn variable range */
> > +    bool fill_mtrr_mask;
> > +
> >      /* in order to simplify APIC support, we leave this pointer to the
> >         user */
> >      struct DeviceState *apic_state;
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index d95d06b..9b5158e 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -1934,6 +1934,7 @@ static int kvm_get_msrs(X86CPU *cpu)
> >      CPUX86State *env = &cpu->env;
> >      struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
> >      int ret, i;
> > +    uint64_t mtrr_top_bits;
> >  
> >      kvm_msr_buf_reset(cpu);
> >  
> > @@ -2083,6 +2084,27 @@ static int kvm_get_msrs(X86CPU *cpu)
> >      }
> >  
> >      assert(ret == cpu->kvm_msr_buf->nmsrs);
> > +    /*
> > +     * MTRR masks: Each mask consists of 5 parts
> > +     * a  10..0: must be zero
> > +     * b  11   : valid bit
> > +     * c n-1.12: actual mask bits
> > +     * d  51..n: reserved must be zero
> > +     * e  63.52: reserved must be zero
> > +     *
> > +     * 'n' is the number of physical bits supported by the CPU and is
> > +     * apparently always <= 52.   We know our 'n' but don't know what
> > +     * the destinations 'n' is; it might be smaller, in which case
> > +     * it masks (c) on loading. It might be larger, in which case
> > +     * we fill 'd' so that d..c is consistent irrespetive of the 'n'
> > +     * we're migrating to.
> > +     */
> > +    if (cpu->fill_mtrr_mask) {
> > +        mtrr_top_bits = BIT_RANGE(51, cpu_x86_guest_phys_address_bits(env));
> > +    } else {
> > +        mtrr_top_bits = 0;
> > +    }
> > +
> >      for (i = 0; i < ret; i++) {
> >          uint32_t index = msrs[i].index;
> >          switch (index) {
> > @@ -2278,7 +2300,8 @@ static int kvm_get_msrs(X86CPU *cpu)
> >              break;
> >          case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
> >              if (index & 1) {
> > -                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data;
> > +                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
> > +                                                               mtrr_top_bits;
> >              } else {
> >                  env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
> >              }
> > -- 
> > 2.7.4
> > 
> 
> -- 
> Eduardo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Eduardo Habkost June 17, 2016, 12:46 p.m. UTC | #4
On Fri, Jun 17, 2016 at 09:47:27AM +0200, Paolo Bonzini wrote:
> On 16/06/2016 22:14, Eduardo Habkost wrote:
> > This is necessary only when phys_bits is higher on the
> > destination, right?
> > 
> > Should we really default this to true? I would like to enable
> > this hack only when really necessary. Except when using host's
> > phys_bits (phys-bits=0), is there any valid reason to expect
> > higher phys-bits on the destination?
> 
> It would need a property even if you did it only for phys-bits==0, and
> then it's simpler to just do it always.
> 
> The bits are reserved anyway, so we can do whatever we want with them.
> In fact I think it's weird for the architecture to make them
> must-be-zero, it might even make more sense to make them must-be-one...
> It's a mask after all, and there's no way to access out-of-range
> physical addresses.

If we always fill the bits on the source, the destination can't
differentiate between a 40-bit source that set the MTRR to
0xffffffffff from a 36-bit source that set the MTRR to
0xfffffffff.

I really want to print a warning if the MTRR value or the
phys-bits value is being changed during migration, just in case
this has unintended consequences in the future. We can send the
current phys_bits value in the migration stream, so the
destination can decide how to handle it (and which warnings to
print).
Paolo Bonzini June 17, 2016, 1:01 p.m. UTC | #5
On 17/06/2016 14:46, Eduardo Habkost wrote:
>> > 
>> > The bits are reserved anyway, so we can do whatever we want with them.
>> > In fact I think it's weird for the architecture to make them
>> > must-be-zero, it might even make more sense to make them must-be-one...
>> > It's a mask after all, and there's no way to access out-of-range
>> > physical addresses.
> 
> If we always fill the bits on the source, the destination can't
> differentiate between a 40-bit source that set the MTRR to
> 0xffffffffff from a 36-bit source that set the MTRR to
> 0xfffffffff.

That's not a bug, it's a feature.  MTRRs work by comparing (address &
mtrr_mask) with mtrr_base.

A 40-bit source that sets the MTRR to 0xffffffffff must set higher bits
of mtrr_base to zero, but when you migrate to another destination, you
want the comparison to hold.  There are two ways for it to hold:

- if the physical address space becomes shorter, the base bits must be
zero or writing mtrr_base fails, and the address bits must be zero or
the processor fails to walk EPT page tables.  So it's fine to strip the
top four bits of the mask.

- if the physical address space becomes larger, the base bits must be
zero but the mask bits must become one.  So why not migrate them this
way directly.

(For what it's worth, KVM also stores the mask extended to all ones, and
strips the unnecessary high bits when reading the MSRs).

> I really want to print a warning if the MTRR value or the
> phys-bits value is being changed during migration, just in case
> this has unintended consequences in the future. We can send the
> current phys_bits value in the migration stream, so the
> destination can decide how to handle it (and which warnings to
> print).

The problem with this is that it must be a warning only, because usually
things will work (evidence: they have worked on RHEL6 and RHEL7 since
2010).  No one will read it until it's too late and they have lost a VM.

Note that the failure mode is pretty brutal since KVM reports an
internal error right after restarting on the destination, and who knows
what used to happen before the assertion was introduced.  All MSRs after
MTRRs would be ignored by KVM!

Migrating and checking the configuration complicates the code and,
because it's only for a warning, doesn't save you from massaging the
values to make things work.

Paolo
Eduardo Habkost June 17, 2016, 1:41 p.m. UTC | #6
On Fri, Jun 17, 2016 at 03:01:55PM +0200, Paolo Bonzini wrote:
> 
> 
> On 17/06/2016 14:46, Eduardo Habkost wrote:
> >> > 
> >> > The bits are reserved anyway, so we can do whatever we want with them.
> >> > In fact I think it's weird for the architecture to make them
> >> > must-be-zero, it might even make more sense to make them must-be-one...
> >> > It's a mask after all, and there's no way to access out-of-range
> >> > physical addresses.
> > 
> > If we always fill the bits on the source, the destination can't
> > differentiate between a 40-bit source that set the MTRR to
> > 0xffffffffff from a 36-bit source that set the MTRR to
> > 0xfffffffff.
> 
> That's not a bug, it's a feature.  MTRRs work by comparing (address &
> mtrr_mask) with mtrr_base.
> 
> A 40-bit source that sets the MTRR to 0xffffffffff must set higher bits
> of mtrr_base to zero, but when you migrate to another destination, you
> want the comparison to hold.  There are two ways for it to hold:
> 
> - if the physical address space becomes shorter, the base bits must be
> zero or writing mtrr_base fails, and the address bits must be zero or
> the processor fails to walk EPT page tables.  So it's fine to strip the
> top four bits of the mask.
> 
> - if the physical address space becomes larger, the base bits must be
> zero but the mask bits must become one.  So why not migrate them this
> way directly.
> 
> (For what it's worth, KVM also stores the mask extended to all ones, and
> strips the unnecessary high bits when reading the MSRs).

So the MSR is changing but the semantics are preserved. That's
good, but we may still have other problems if phys_addr changes
on migration, so:

> 
> > I really want to print a warning if the MTRR value or the
> > phys-bits value is being changed during migration, just in case
> > this has unintended consequences in the future. We can send the
> > current phys_bits value in the migration stream, so the
> > destination can decide how to handle it (and which warnings to
> > print).
> 
> The problem with this is that it must be a warning only, because usually
> things will work (evidence: they have worked on RHEL6 and RHEL7 since
> 2010).  No one will read it until it's too late and they have lost a VM.
> 
> Note that the failure mode is pretty brutal since KVM reports an
> internal error right after restarting on the destination, and who knows
> what used to happen before the assertion was introduced.  All MSRs after
> MTRRs would be ignored by KVM!
> 

It's probably too late for the user to act on it, but I want to
see the warning in the logs of a bug report in case something
else breaks. The MTRR changes may be 100% safe (and may not
deserve a warning), but other things can break when phys_bits
change on migration.

> Migrating and checking the configuration complicates the code and,
> because it's only for a warning, doesn't save you from massaging the
> values to make things work.

I would agree with you if we didn't have the existing
host-phys-bits mode already. In this case, phys_bits is
initialized to a different value depending on the host. You can
argue it is not machine state, but it is not user-specified
configuration either.

In theory, we should never initialize anything on the machine
based on the host we are running. In practice we sometimes do
that, and we know it's unsafe. Sending the value on the migration
stream is a solution to detect when this breaks something. We did
that before, for TSC frequency.

I prefer to add a little extra code, than to waste time debugging
when that stuff breaks.
Dr. David Alan Gilbert June 17, 2016, 1:51 p.m. UTC | #7
* Paolo Bonzini (pbonzini@redhat.com) wrote:

> Note that the failure mode is pretty brutal since KVM reports an
> internal error right after restarting on the destination, and who knows
> what used to happen before the assertion was introduced.  All MSRs after
> MTRRs would be ignored by KVM!

I think we got lucky because the MTRRs were close to the last MSRs
in the list, there was only some MCG/MCE stuff after it which probably
wouldn't have been missed;  if the MTRR wasn't set I'm guessing someone
would have just had a behaviour that looked like dreadful performance
after migration perhaps?

Dave

> Migrating and checking the configuration complicates the code and,
> because it's only for a warning, doesn't save you from massaging the
> values to make things work.
> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini June 17, 2016, 2:19 p.m. UTC | #8
On 17/06/2016 15:51, Dr. David Alan Gilbert wrote:
> 
>> > Note that the failure mode is pretty brutal since KVM reports an
>> > internal error right after restarting on the destination, and who knows
>> > what used to happen before the assertion was introduced.  All MSRs after
>> > MTRRs would be ignored by KVM!
> I think we got lucky because the MTRRs were close to the last MSRs
> in the list, there was only some MCG/MCE stuff after it which probably
> wouldn't have been missed;  if the MTRR wasn't set I'm guessing someone
> would have just had a behaviour that looked like dreadful performance
> after migration perhaps?

No, MTRRs are only used by KVM if there's an assigned device, and that's
incompatible with migration.

Paolo
Paolo Bonzini June 17, 2016, 2:25 p.m. UTC | #9
On 17/06/2016 15:41, Eduardo Habkost wrote:
> In theory, we should never initialize anything on the machine
> based on the host we are running. In practice we sometimes do
> that, and we know it's unsafe. Sending the value on the migration
> stream is a solution to detect when this breaks something. We did
> that before, for TSC frequency.
> 
> I prefer to add a little extra code, than to waste time debugging
> when that stuff breaks.

Fair enough (but still let's add and strip the 1s in pre_save and
post_load).  It is a good justification for sending configuration over
the wire, that we initialize things based on the host we're running.

But we shouldn't initialize more things based on the host (e.g. new
booleans should be "enforce"-style).  Also I don't really like
introducing sanity checks for those that aren't based on the host (as in
the LMCE thread).  This code will never trigger in practice, so it's
just extra cost for no benefit.

Paolo
Eduardo Habkost June 17, 2016, 3:27 p.m. UTC | #10
On Fri, Jun 17, 2016 at 04:25:57PM +0200, Paolo Bonzini wrote:
> 
> 
> On 17/06/2016 15:41, Eduardo Habkost wrote:
> > In theory, we should never initialize anything on the machine
> > based on the host we are running. In practice we sometimes do
> > that, and we know it's unsafe. Sending the value on the migration
> > stream is a solution to detect when this breaks something. We did
> > that before, for TSC frequency.
> > 
> > I prefer to add a little extra code, than to waste time debugging
> > when that stuff breaks.
> 
> Fair enough (but still let's add and strip the 1s in pre_save and
> post_load).  It is a good justification for sending configuration over
> the wire, that we initialize things based on the host we're running.
> 
> But we shouldn't initialize more things based on the host (e.g. new
> booleans should be "enforce"-style).

Strongly agreed.

> Also I don't really like
> introducing sanity checks for those that aren't based on the host (as in
> the LMCE thread).  This code will never trigger in practice, so it's
> just extra cost for no benefit.

Our experience with guest ABI and compatibility code tells me
otherwise. Keeping guest ABI is hard, so I would always support
adding extra code to help us catch mistakes (either in QEMU,
management software, or user mistakes).
Paolo Bonzini June 17, 2016, 3:29 p.m. UTC | #11
On 17/06/2016 17:27, Eduardo Habkost wrote:
> On Fri, Jun 17, 2016 at 04:25:57PM +0200, Paolo Bonzini wrote:
>>
>>
>> On 17/06/2016 15:41, Eduardo Habkost wrote:
>>> In theory, we should never initialize anything on the machine
>>> based on the host we are running. In practice we sometimes do
>>> that, and we know it's unsafe. Sending the value on the migration
>>> stream is a solution to detect when this breaks something. We did
>>> that before, for TSC frequency.
>>>
>>> I prefer to add a little extra code, than to waste time debugging
>>> when that stuff breaks.
>>
>> Fair enough (but still let's add and strip the 1s in pre_save and
>> post_load).  It is a good justification for sending configuration over
>> the wire, that we initialize things based on the host we're running.
>>
>> But we shouldn't initialize more things based on the host (e.g. new
>> booleans should be "enforce"-style).
> 
> Strongly agreed.
> 
>> Also I don't really like
>> introducing sanity checks for those that aren't based on the host (as in
>> the LMCE thread).  This code will never trigger in practice, so it's
>> just extra cost for no benefit.
> 
> Our experience with guest ABI and compatibility code tells me
> otherwise. Keeping guest ABI is hard, so I would always support
> adding extra code to help us catch mistakes (either in QEMU,
> management software, or user mistakes).

I agree, but I think the "enforce"-style properties are enough (and
count as extra code :)).

Paolo
Eduardo Habkost June 17, 2016, 3:35 p.m. UTC | #12
On Fri, Jun 17, 2016 at 05:29:21PM +0200, Paolo Bonzini wrote:
> 
> 
> On 17/06/2016 17:27, Eduardo Habkost wrote:
> > On Fri, Jun 17, 2016 at 04:25:57PM +0200, Paolo Bonzini wrote:
> >>
> >>
> >> On 17/06/2016 15:41, Eduardo Habkost wrote:
> >>> In theory, we should never initialize anything on the machine
> >>> based on the host we are running. In practice we sometimes do
> >>> that, and we know it's unsafe. Sending the value on the migration
> >>> stream is a solution to detect when this breaks something. We did
> >>> that before, for TSC frequency.
> >>>
> >>> I prefer to add a little extra code, than to waste time debugging
> >>> when that stuff breaks.
> >>
> >> Fair enough (but still let's add and strip the 1s in pre_save and
> >> post_load).  It is a good justification for sending configuration over
> >> the wire, that we initialize things based on the host we're running.
> >>
> >> But we shouldn't initialize more things based on the host (e.g. new
> >> booleans should be "enforce"-style).
> > 
> > Strongly agreed.
> > 
> >> Also I don't really like
> >> introducing sanity checks for those that aren't based on the host (as in
> >> the LMCE thread).  This code will never trigger in practice, so it's
> >> just extra cost for no benefit.
> > 
> > Our experience with guest ABI and compatibility code tells me
> > otherwise. Keeping guest ABI is hard, so I would always support
> > adding extra code to help us catch mistakes (either in QEMU,
> > management software, or user mistakes).
> 
> I agree, but I think the "enforce"-style properties are enough (and
> count as extra code :)).

I agree they are enough when bits have an easy 1-1 correspondence
to command-line options. I was thinking about things like CPUID
bits, that depend on complex CPU model + accelerator +
machine-type interactions.
diff mbox

Patch

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 49566c8..3883d04 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -362,6 +362,11 @@  bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
         .driver   = TYPE_X86_CPU,\
         .property = "cpuid-0xb",\
         .value    = "off",\
+    },\
+    {\
+        .driver = TYPE_X86_CPU,\
+        .property = "fill-mtrr-mask",\
+        .value = "off",\
     },
 
 #define PC_COMPAT_2_5 \
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index f9302f9..4111240 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -3272,6 +3272,7 @@  static Property x86_cpu_properties[] = {
     DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
     DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
     DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true),
+    DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
     DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, 0),
     DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, 0),
     DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, 0),
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 61c06b7..13d8501 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -1181,6 +1181,9 @@  struct X86CPU {
     /* Compatibility bits for old machine types: */
     bool enable_cpuid_0xb;
 
+    /* if true fill the top bits of the MTRR_PHYSMASKn variable range */
+    bool fill_mtrr_mask;
+
     /* in order to simplify APIC support, we leave this pointer to the
        user */
     struct DeviceState *apic_state;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index d95d06b..9b5158e 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1934,6 +1934,7 @@  static int kvm_get_msrs(X86CPU *cpu)
     CPUX86State *env = &cpu->env;
     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
     int ret, i;
+    uint64_t mtrr_top_bits;
 
     kvm_msr_buf_reset(cpu);
 
@@ -2083,6 +2084,27 @@  static int kvm_get_msrs(X86CPU *cpu)
     }
 
     assert(ret == cpu->kvm_msr_buf->nmsrs);
+    /*
+     * MTRR masks: Each mask consists of 5 parts
+     * a  10..0: must be zero
+     * b  11   : valid bit
+     * c n-1.12: actual mask bits
+     * d  51..n: reserved must be zero
+     * e  63.52: reserved must be zero
+     *
+     * 'n' is the number of physical bits supported by the CPU and is
+     * apparently always <= 52.   We know our 'n' but don't know what
+     * the destinations 'n' is; it might be smaller, in which case
+     * it masks (c) on loading. It might be larger, in which case
+     * we fill 'd' so that d..c is consistent irrespetive of the 'n'
+     * we're migrating to.
+     */
+    if (cpu->fill_mtrr_mask) {
+        mtrr_top_bits = BIT_RANGE(51, cpu_x86_guest_phys_address_bits(env));
+    } else {
+        mtrr_top_bits = 0;
+    }
+
     for (i = 0; i < ret; i++) {
         uint32_t index = msrs[i].index;
         switch (index) {
@@ -2278,7 +2300,8 @@  static int kvm_get_msrs(X86CPU *cpu)
             break;
         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
             if (index & 1) {
-                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data;
+                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
+                                                               mtrr_top_bits;
             } else {
                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
             }