diff mbox

[v5,7/7] pc: add PC_I440FX_COMPAT to disable aercap for vifo device

Message ID 18492b2679b3194f92b1ceefdbe4355bb840e660.1426155432.git.chen.fan.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

chenfan March 12, 2015, 10:23 a.m. UTC
for piix4 chipset, we don't need to expose aer, so introduce
PC_I440FX_COMPAT for all piix4 machines to disable aercap,
and add HW_COMPAT_2_2 to disable aercap for all lower
than 2.3.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/i386/pc_piix.c   |  9 +++++++++
 hw/i386/pc_q35.c    |  4 ++++
 include/hw/compat.h | 10 ++++++++++
 3 files changed, 23 insertions(+)

Comments

Alex Williamson March 13, 2015, 10:38 p.m. UTC | #1
On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> for piix4 chipset, we don't need to expose aer, so introduce
> PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> and add HW_COMPAT_2_2 to disable aercap for all lower
> than 2.3.

440FX is not PCIe, so it doesn't seem like we need to do anything there.
Shouldn't this only cover q35 machine types through 2.3?  (QEMU 2.3 is
already in hard freeze afaik, this won't go in until after 2.4
development opens).  Thanks,

Alex

> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/i386/pc_piix.c   |  9 +++++++++
>  hw/i386/pc_q35.c    |  4 ++++
>  include/hw/compat.h | 10 ++++++++++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 8eab4ba..ff9d312 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
>  
>  static void pc_init_pci(MachineState *machine)
>  {
> +    static GlobalProperty pc_compat_props[] = {
> +        PC_I440FX_COMPAT,
> +        { /* end of list */ }
> +    };
> +    qdev_prop_register_global_list(pc_compat_props);
>      pc_init1(machine, 1, 1);
>  }
>  
> @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
>      PC_I440FX_2_2_MACHINE_OPTIONS,
>      .name = "pc-i440fx-2.2",
>      .init = pc_init_pci_2_2,
> +    .compat_props = (GlobalProperty[]) {
> +        HW_COMPAT_2_2,
> +        { /* end of list */ }
> +    },
>  };
>  
>  #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index c0f21fe..97afb7d 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
>      PC_Q35_2_2_MACHINE_OPTIONS,
>      .name = "pc-q35-2.2",
>      .init = pc_q35_init_2_2,
> +    .compat_props = (GlobalProperty[]) {
> +        HW_COMPAT_2_2,
> +        { /* end of list */ }
> +    },
>  };
>  
>  #define PC_Q35_2_1_MACHINE_OPTIONS                      \
> diff --git a/include/hw/compat.h b/include/hw/compat.h
> index 313682a..40c974a 100644
> --- a/include/hw/compat.h
> +++ b/include/hw/compat.h
> @@ -1,7 +1,17 @@
>  #ifndef HW_COMPAT_H
>  #define HW_COMPAT_H
>  
> +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
> +
> +#define PC_I440FX_COMPAT \
> +        {\
> +            .driver   = "vfio-pci",\
> +            .property = "x-aer",\
> +            .value    = "off",\
> +        }
> +
>  #define HW_COMPAT_2_1 \
> +        HW_COMPAT_2_2, \
>          {\
>              .driver   = "intel-hda",\
>              .property = "old_msi_addr",\
chenfan March 16, 2015, 2:48 a.m. UTC | #2
On 03/14/2015 06:38 AM, Alex Williamson wrote:
> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>> for piix4 chipset, we don't need to expose aer, so introduce
>> PC_I440FX_COMPAT for all piix4 machines to disable aercap,
>> and add HW_COMPAT_2_2 to disable aercap for all lower
>> than 2.3.
> 440FX is not PCIe, so it doesn't seem like we need to do anything there.
> Shouldn't this only cover q35 machine types through 2.3?  (QEMU 2.3 is
> already in hard freeze afaik, this won't go in until after 2.4
> development opens).  Thanks,
Yes, 440Fx is not PCIe, so extended cap won't be parsed.
I knew this patches won't be in QEMU 2.3. I want to wait for
the 2.4 opens and then rebase this patch.

Thanks,
Chen


>
> Alex
>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/i386/pc_piix.c   |  9 +++++++++
>>   hw/i386/pc_q35.c    |  4 ++++
>>   include/hw/compat.h | 10 ++++++++++
>>   3 files changed, 23 insertions(+)
>>
>> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
>> index 8eab4ba..ff9d312 100644
>> --- a/hw/i386/pc_piix.c
>> +++ b/hw/i386/pc_piix.c
>> @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
>>   
>>   static void pc_init_pci(MachineState *machine)
>>   {
>> +    static GlobalProperty pc_compat_props[] = {
>> +        PC_I440FX_COMPAT,
>> +        { /* end of list */ }
>> +    };
>> +    qdev_prop_register_global_list(pc_compat_props);
>>       pc_init1(machine, 1, 1);
>>   }
>>   
>> @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
>>       PC_I440FX_2_2_MACHINE_OPTIONS,
>>       .name = "pc-i440fx-2.2",
>>       .init = pc_init_pci_2_2,
>> +    .compat_props = (GlobalProperty[]) {
>> +        HW_COMPAT_2_2,
>> +        { /* end of list */ }
>> +    },
>>   };
>>   
>>   #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
>> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
>> index c0f21fe..97afb7d 100644
>> --- a/hw/i386/pc_q35.c
>> +++ b/hw/i386/pc_q35.c
>> @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
>>       PC_Q35_2_2_MACHINE_OPTIONS,
>>       .name = "pc-q35-2.2",
>>       .init = pc_q35_init_2_2,
>> +    .compat_props = (GlobalProperty[]) {
>> +        HW_COMPAT_2_2,
>> +        { /* end of list */ }
>> +    },
>>   };
>>   
>>   #define PC_Q35_2_1_MACHINE_OPTIONS                      \
>> diff --git a/include/hw/compat.h b/include/hw/compat.h
>> index 313682a..40c974a 100644
>> --- a/include/hw/compat.h
>> +++ b/include/hw/compat.h
>> @@ -1,7 +1,17 @@
>>   #ifndef HW_COMPAT_H
>>   #define HW_COMPAT_H
>>   
>> +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
>> +
>> +#define PC_I440FX_COMPAT \
>> +        {\
>> +            .driver   = "vfio-pci",\
>> +            .property = "x-aer",\
>> +            .value    = "off",\
>> +        }
>> +
>>   #define HW_COMPAT_2_1 \
>> +        HW_COMPAT_2_2, \
>>           {\
>>               .driver   = "intel-hda",\
>>               .property = "old_msi_addr",\
>
>
> .
>
chenfan March 16, 2015, 2:49 a.m. UTC | #3
On 03/14/2015 06:38 AM, Alex Williamson wrote:
> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>> for piix4 chipset, we don't need to expose aer, so introduce
>> PC_I440FX_COMPAT for all piix4 machines to disable aercap,
>> and add HW_COMPAT_2_2 to disable aercap for all lower
>> than 2.3.
> 440FX is not PCIe, so it doesn't seem like we need to do anything there.
> Shouldn't this only cover q35 machine types through 2.3?  (QEMU 2.3 is
> already in hard freeze afaik, this won't go in until after 2.4
> development opens).  Thanks,
Yes, 440Fx is not PCIe, so extended cap won't be parsed.
I knew this patches won't be in QEMU 2.3. I want to wait for
the 2.4 opens and then rebase this patch separately.

Thanks,
Chen


>
> Alex
>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/i386/pc_piix.c   |  9 +++++++++
>>   hw/i386/pc_q35.c    |  4 ++++
>>   include/hw/compat.h | 10 ++++++++++
>>   3 files changed, 23 insertions(+)
>>
>> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
>> index 8eab4ba..ff9d312 100644
>> --- a/hw/i386/pc_piix.c
>> +++ b/hw/i386/pc_piix.c
>> @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
>>   
>>   static void pc_init_pci(MachineState *machine)
>>   {
>> +    static GlobalProperty pc_compat_props[] = {
>> +        PC_I440FX_COMPAT,
>> +        { /* end of list */ }
>> +    };
>> +    qdev_prop_register_global_list(pc_compat_props);
>>       pc_init1(machine, 1, 1);
>>   }
>>   
>> @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
>>       PC_I440FX_2_2_MACHINE_OPTIONS,
>>       .name = "pc-i440fx-2.2",
>>       .init = pc_init_pci_2_2,
>> +    .compat_props = (GlobalProperty[]) {
>> +        HW_COMPAT_2_2,
>> +        { /* end of list */ }
>> +    },
>>   };
>>   
>>   #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
>> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
>> index c0f21fe..97afb7d 100644
>> --- a/hw/i386/pc_q35.c
>> +++ b/hw/i386/pc_q35.c
>> @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
>>       PC_Q35_2_2_MACHINE_OPTIONS,
>>       .name = "pc-q35-2.2",
>>       .init = pc_q35_init_2_2,
>> +    .compat_props = (GlobalProperty[]) {
>> +        HW_COMPAT_2_2,
>> +        { /* end of list */ }
>> +    },
>>   };
>>   
>>   #define PC_Q35_2_1_MACHINE_OPTIONS                      \
>> diff --git a/include/hw/compat.h b/include/hw/compat.h
>> index 313682a..40c974a 100644
>> --- a/include/hw/compat.h
>> +++ b/include/hw/compat.h
>> @@ -1,7 +1,17 @@
>>   #ifndef HW_COMPAT_H
>>   #define HW_COMPAT_H
>>   
>> +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
>> +
>> +#define PC_I440FX_COMPAT \
>> +        {\
>> +            .driver   = "vfio-pci",\
>> +            .property = "x-aer",\
>> +            .value    = "off",\
>> +        }
>> +
>>   #define HW_COMPAT_2_1 \
>> +        HW_COMPAT_2_2, \
>>           {\
>>               .driver   = "intel-hda",\
>>               .property = "old_msi_addr",\
>
>
> .
>
Michael S. Tsirkin March 18, 2015, 1:23 p.m. UTC | #4
typo in subject: vfio, not vifo.

On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> for piix4 chipset, we don't need to expose aer, so introduce
> PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> and add HW_COMPAT_2_2 to disable aercap for all lower
> than 2.3.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

Well vfio is never migrated ATM.
So why is compat code needed at all?

> ---
>  hw/i386/pc_piix.c   |  9 +++++++++
>  hw/i386/pc_q35.c    |  4 ++++
>  include/hw/compat.h | 10 ++++++++++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 8eab4ba..ff9d312 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
>  
>  static void pc_init_pci(MachineState *machine)
>  {
> +    static GlobalProperty pc_compat_props[] = {
> +        PC_I440FX_COMPAT,
> +        { /* end of list */ }
> +    };
> +    qdev_prop_register_global_list(pc_compat_props);
>      pc_init1(machine, 1, 1);
>  }
>  
> @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
>      PC_I440FX_2_2_MACHINE_OPTIONS,
>      .name = "pc-i440fx-2.2",
>      .init = pc_init_pci_2_2,
> +    .compat_props = (GlobalProperty[]) {
> +        HW_COMPAT_2_2,
> +        { /* end of list */ }
> +    },
>  };
>  
>  #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index c0f21fe..97afb7d 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
>      PC_Q35_2_2_MACHINE_OPTIONS,
>      .name = "pc-q35-2.2",
>      .init = pc_q35_init_2_2,
> +    .compat_props = (GlobalProperty[]) {
> +        HW_COMPAT_2_2,
> +        { /* end of list */ }
> +    },
>  };
>  
>  #define PC_Q35_2_1_MACHINE_OPTIONS                      \
> diff --git a/include/hw/compat.h b/include/hw/compat.h
> index 313682a..40c974a 100644
> --- a/include/hw/compat.h
> +++ b/include/hw/compat.h
> @@ -1,7 +1,17 @@
>  #ifndef HW_COMPAT_H
>  #define HW_COMPAT_H
>  
> +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
> +
> +#define PC_I440FX_COMPAT \
> +        {\
> +            .driver   = "vfio-pci",\
> +            .property = "x-aer",\
> +            .value    = "off",\
> +        }
> +
>  #define HW_COMPAT_2_1 \
> +        HW_COMPAT_2_2, \
>          {\
>              .driver   = "intel-hda",\
>              .property = "old_msi_addr",\
> -- 
> 1.9.3
>
Alex Williamson March 18, 2015, 2:02 p.m. UTC | #5
On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> typo in subject: vfio, not vifo.
> 
> On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > for piix4 chipset, we don't need to expose aer, so introduce
> > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > and add HW_COMPAT_2_2 to disable aercap for all lower
> > than 2.3.
> > 
> > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> Well vfio is never migrated ATM.
> So why is compat code needed at all?

It's not for migration, it's to maintain current behavior on existing
platforms.  If someone gets an uncorrected AER error on q35 machine type
today, the VM stops.  With this change, AER would be exposed to the
guest and the guest could handle it.  The compat change therefore
maintains the stop VM behavior on existing q35 machine types.  As I
commented here, the 440fx part of this patch is unnecessary since AER
cannot be exposed to the guest on a conventional PCI chipset.  Thanks,

Alex

> > ---
> >  hw/i386/pc_piix.c   |  9 +++++++++
> >  hw/i386/pc_q35.c    |  4 ++++
> >  include/hw/compat.h | 10 ++++++++++
> >  3 files changed, 23 insertions(+)
> > 
> > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> > index 8eab4ba..ff9d312 100644
> > --- a/hw/i386/pc_piix.c
> > +++ b/hw/i386/pc_piix.c
> > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
> >  
> >  static void pc_init_pci(MachineState *machine)
> >  {
> > +    static GlobalProperty pc_compat_props[] = {
> > +        PC_I440FX_COMPAT,
> > +        { /* end of list */ }
> > +    };
> > +    qdev_prop_register_global_list(pc_compat_props);
> >      pc_init1(machine, 1, 1);
> >  }
> >  
> > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
> >      PC_I440FX_2_2_MACHINE_OPTIONS,
> >      .name = "pc-i440fx-2.2",
> >      .init = pc_init_pci_2_2,
> > +    .compat_props = (GlobalProperty[]) {
> > +        HW_COMPAT_2_2,
> > +        { /* end of list */ }
> > +    },
> >  };
> >  
> >  #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
> > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > index c0f21fe..97afb7d 100644
> > --- a/hw/i386/pc_q35.c
> > +++ b/hw/i386/pc_q35.c
> > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
> >      PC_Q35_2_2_MACHINE_OPTIONS,
> >      .name = "pc-q35-2.2",
> >      .init = pc_q35_init_2_2,
> > +    .compat_props = (GlobalProperty[]) {
> > +        HW_COMPAT_2_2,
> > +        { /* end of list */ }
> > +    },
> >  };
> >  
> >  #define PC_Q35_2_1_MACHINE_OPTIONS                      \
> > diff --git a/include/hw/compat.h b/include/hw/compat.h
> > index 313682a..40c974a 100644
> > --- a/include/hw/compat.h
> > +++ b/include/hw/compat.h
> > @@ -1,7 +1,17 @@
> >  #ifndef HW_COMPAT_H
> >  #define HW_COMPAT_H
> >  
> > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
> > +
> > +#define PC_I440FX_COMPAT \
> > +        {\
> > +            .driver   = "vfio-pci",\
> > +            .property = "x-aer",\
> > +            .value    = "off",\
> > +        }
> > +
> >  #define HW_COMPAT_2_1 \
> > +        HW_COMPAT_2_2, \
> >          {\
> >              .driver   = "intel-hda",\
> >              .property = "old_msi_addr",\
> > -- 
> > 1.9.3
> >
Michael S. Tsirkin March 18, 2015, 2:05 p.m. UTC | #6
On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > typo in subject: vfio, not vifo.
> > 
> > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > for piix4 chipset, we don't need to expose aer, so introduce
> > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > than 2.3.
> > > 
> > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > 
> > Well vfio is never migrated ATM.
> > So why is compat code needed at all?
> 
> It's not for migration, it's to maintain current behavior on existing
> platforms.  If someone gets an uncorrected AER error on q35 machine type
> today, the VM stops.  With this change, AER would be exposed to the
> guest and the guest could handle it.  The compat change therefore
> maintains the stop VM behavior on existing q35 machine types.

If stop VM behaviour is useful, expose it to users.
If not, then don't.
I don't see why does it have to be tied to machine types.

>  As I
> commented here, the 440fx part of this patch is unnecessary since AER
> cannot be exposed to the guest on a conventional PCI chipset.  Thanks,
> 
> Alex
> 
> > > ---
> > >  hw/i386/pc_piix.c   |  9 +++++++++
> > >  hw/i386/pc_q35.c    |  4 ++++
> > >  include/hw/compat.h | 10 ++++++++++
> > >  3 files changed, 23 insertions(+)
> > > 
> > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> > > index 8eab4ba..ff9d312 100644
> > > --- a/hw/i386/pc_piix.c
> > > +++ b/hw/i386/pc_piix.c
> > > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
> > >  
> > >  static void pc_init_pci(MachineState *machine)
> > >  {
> > > +    static GlobalProperty pc_compat_props[] = {
> > > +        PC_I440FX_COMPAT,
> > > +        { /* end of list */ }
> > > +    };
> > > +    qdev_prop_register_global_list(pc_compat_props);
> > >      pc_init1(machine, 1, 1);
> > >  }
> > >  
> > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
> > >      PC_I440FX_2_2_MACHINE_OPTIONS,
> > >      .name = "pc-i440fx-2.2",
> > >      .init = pc_init_pci_2_2,
> > > +    .compat_props = (GlobalProperty[]) {
> > > +        HW_COMPAT_2_2,
> > > +        { /* end of list */ }
> > > +    },
> > >  };
> > >  
> > >  #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
> > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > > index c0f21fe..97afb7d 100644
> > > --- a/hw/i386/pc_q35.c
> > > +++ b/hw/i386/pc_q35.c
> > > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
> > >      PC_Q35_2_2_MACHINE_OPTIONS,
> > >      .name = "pc-q35-2.2",
> > >      .init = pc_q35_init_2_2,
> > > +    .compat_props = (GlobalProperty[]) {
> > > +        HW_COMPAT_2_2,
> > > +        { /* end of list */ }
> > > +    },
> > >  };
> > >  
> > >  #define PC_Q35_2_1_MACHINE_OPTIONS                      \
> > > diff --git a/include/hw/compat.h b/include/hw/compat.h
> > > index 313682a..40c974a 100644
> > > --- a/include/hw/compat.h
> > > +++ b/include/hw/compat.h
> > > @@ -1,7 +1,17 @@
> > >  #ifndef HW_COMPAT_H
> > >  #define HW_COMPAT_H
> > >  
> > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
> > > +
> > > +#define PC_I440FX_COMPAT \
> > > +        {\
> > > +            .driver   = "vfio-pci",\
> > > +            .property = "x-aer",\
> > > +            .value    = "off",\
> > > +        }
> > > +
> > >  #define HW_COMPAT_2_1 \
> > > +        HW_COMPAT_2_2, \
> > >          {\
> > >              .driver   = "intel-hda",\
> > >              .property = "old_msi_addr",\
> > > -- 
> > > 1.9.3
> > > 
> 
>
Alex Williamson March 18, 2015, 2:15 p.m. UTC | #7
On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > typo in subject: vfio, not vifo.
> > > 
> > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > than 2.3.
> > > > 
> > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > 
> > > Well vfio is never migrated ATM.
> > > So why is compat code needed at all?
> > 
> > It's not for migration, it's to maintain current behavior on existing
> > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > today, the VM stops.  With this change, AER would be exposed to the
> > guest and the guest could handle it.  The compat change therefore
> > maintains the stop VM behavior on existing q35 machine types.
> 
> If stop VM behaviour is useful, expose it to users.
> If not, then don't.
> I don't see why does it have to be tied to machine types.

Because q35-2.2 machine type will currently do a stop VM on uncorrected
AER error.  If we don't tie that to a machine option then q35-2.2 would
suddenly start exposing the error to the guest.  That's a fairly
significant change in behavior for a static machine type.

> >  As I
> > commented here, the 440fx part of this patch is unnecessary since AER
> > cannot be exposed to the guest on a conventional PCI chipset.  Thanks,
> > 
> > Alex
> > 
> > > > ---
> > > >  hw/i386/pc_piix.c   |  9 +++++++++
> > > >  hw/i386/pc_q35.c    |  4 ++++
> > > >  include/hw/compat.h | 10 ++++++++++
> > > >  3 files changed, 23 insertions(+)
> > > > 
> > > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> > > > index 8eab4ba..ff9d312 100644
> > > > --- a/hw/i386/pc_piix.c
> > > > +++ b/hw/i386/pc_piix.c
> > > > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine,
> > > >  
> > > >  static void pc_init_pci(MachineState *machine)
> > > >  {
> > > > +    static GlobalProperty pc_compat_props[] = {
> > > > +        PC_I440FX_COMPAT,
> > > > +        { /* end of list */ }
> > > > +    };
> > > > +    qdev_prop_register_global_list(pc_compat_props);
> > > >      pc_init1(machine, 1, 1);
> > > >  }
> > > >  
> > > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = {
> > > >      PC_I440FX_2_2_MACHINE_OPTIONS,
> > > >      .name = "pc-i440fx-2.2",
> > > >      .init = pc_init_pci_2_2,
> > > > +    .compat_props = (GlobalProperty[]) {
> > > > +        HW_COMPAT_2_2,
> > > > +        { /* end of list */ }
> > > > +    },
> > > >  };
> > > >  
> > > >  #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
> > > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > > > index c0f21fe..97afb7d 100644
> > > > --- a/hw/i386/pc_q35.c
> > > > +++ b/hw/i386/pc_q35.c
> > > > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = {
> > > >      PC_Q35_2_2_MACHINE_OPTIONS,
> > > >      .name = "pc-q35-2.2",
> > > >      .init = pc_q35_init_2_2,
> > > > +    .compat_props = (GlobalProperty[]) {
> > > > +        HW_COMPAT_2_2,
> > > > +        { /* end of list */ }
> > > > +    },
> > > >  };
> > > >  
> > > >  #define PC_Q35_2_1_MACHINE_OPTIONS                      \
> > > > diff --git a/include/hw/compat.h b/include/hw/compat.h
> > > > index 313682a..40c974a 100644
> > > > --- a/include/hw/compat.h
> > > > +++ b/include/hw/compat.h
> > > > @@ -1,7 +1,17 @@
> > > >  #ifndef HW_COMPAT_H
> > > >  #define HW_COMPAT_H
> > > >  
> > > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT
> > > > +
> > > > +#define PC_I440FX_COMPAT \
> > > > +        {\
> > > > +            .driver   = "vfio-pci",\
> > > > +            .property = "x-aer",\
> > > > +            .value    = "off",\
> > > > +        }
> > > > +
> > > >  #define HW_COMPAT_2_1 \
> > > > +        HW_COMPAT_2_2, \
> > > >          {\
> > > >              .driver   = "intel-hda",\
> > > >              .property = "old_msi_addr",\
> > > > -- 
> > > > 1.9.3
> > > > 
> > 
> >
Michael S. Tsirkin March 18, 2015, 2:36 p.m. UTC | #8
On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > typo in subject: vfio, not vifo.
> > > > 
> > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > than 2.3.
> > > > > 
> > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > 
> > > > Well vfio is never migrated ATM.
> > > > So why is compat code needed at all?
> > > 
> > > It's not for migration, it's to maintain current behavior on existing
> > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > today, the VM stops.  With this change, AER would be exposed to the
> > > guest and the guest could handle it.  The compat change therefore
> > > maintains the stop VM behavior on existing q35 machine types.
> > 
> > If stop VM behaviour is useful, expose it to users.
> > If not, then don't.
> > I don't see why does it have to be tied to machine types.
> 
> Because q35-2.2 machine type will currently do a stop VM on uncorrected
> AER error.  If we don't tie that to a machine option then q35-2.2 would
> suddenly start exposing the error to the guest.  That's a fairly
> significant change in behavior for a static machine type.

I don't think you can classify it as a behaviour change. VM stop is not
guest visible behaviour.
Are you worrying about guests misbehaving when they see these errors?
Then you want this as user-controlled, supported option.

In other words: we only tie things to machine types when we
have to. This code gets almost no testing, and is a lot of
work to test. This one sounds like "just in case" is not a good
motivation.
Alex Williamson March 18, 2015, 2:50 p.m. UTC | #9
On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > typo in subject: vfio, not vifo.
> > > > > 
> > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > than 2.3.
> > > > > > 
> > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > 
> > > > > Well vfio is never migrated ATM.
> > > > > So why is compat code needed at all?
> > > > 
> > > > It's not for migration, it's to maintain current behavior on existing
> > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > guest and the guest could handle it.  The compat change therefore
> > > > maintains the stop VM behavior on existing q35 machine types.
> > > 
> > > If stop VM behaviour is useful, expose it to users.
> > > If not, then don't.
> > > I don't see why does it have to be tied to machine types.
> > 
> > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > suddenly start exposing the error to the guest.  That's a fairly
> > significant change in behavior for a static machine type.
> 
> I don't think you can classify it as a behaviour change. VM stop is not
> guest visible behaviour.

In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
In the other case, the guest is notified and may attempt corrective
action... or maybe the guest doesn't understand AER and the user is
depending on the previous behavior.  That is absolutely a behavior
change.

> Are you worrying about guests misbehaving when they see these errors?
> Then you want this as user-controlled, supported option.

Whether the option is user visible is tangential to whether the behavior
of existing machine types should be maintained.  Existing machine types
can impose a different default than current machine types.

> In other words: we only tie things to machine types when we
> have to. This code gets almost no testing, and is a lot of
> work to test. This one sounds like "just in case" is not a good
> motivation.

It seems like an obvious use case for using machine types to maintain
compatibility with previous behavior, which is exactly why we have
machine types.  If we're not going to use it, why do we have it?
Michael S. Tsirkin March 18, 2015, 3:02 p.m. UTC | #10
On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > typo in subject: vfio, not vifo.
> > > > > > 
> > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > than 2.3.
> > > > > > > 
> > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > 
> > > > > > Well vfio is never migrated ATM.
> > > > > > So why is compat code needed at all?
> > > > > 
> > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > guest and the guest could handle it.  The compat change therefore
> > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > 
> > > > If stop VM behaviour is useful, expose it to users.
> > > > If not, then don't.
> > > > I don't see why does it have to be tied to machine types.
> > > 
> > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > suddenly start exposing the error to the guest.  That's a fairly
> > > significant change in behavior for a static machine type.
> > 
> > I don't think you can classify it as a behaviour change. VM stop is not
> > guest visible behaviour.
> 
> In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> In the other case, the guest is notified and may attempt corrective
> action... or maybe the guest doesn't understand AER and the user is
> depending on the previous behavior.  That is absolutely a behavior
> change.
> 
> > Are you worrying about guests misbehaving when they see these errors?
> > Then you want this as user-controlled, supported option.
> 
> Whether the option is user visible is tangential to whether the behavior
> of existing machine types should be maintained.  Existing machine types
> can impose a different default than current machine types.
>
> > In other words: we only tie things to machine types when we
> > have to. This code gets almost no testing, and is a lot of
> > work to test. This one sounds like "just in case" is not a good
> > motivation.
> 
> It seems like an obvious use case for using machine types to maintain
> compatibility with previous behavior, which is exactly why we have
> machine types.  If we're not going to use it, why do we have it?

We have machine types because of the following issues:
- some silent changes confuse guests. For example guest installed with
  one machine type might not boot if you try to use it after
  changing something, or - in case of windows - throw up warnings.
- some changes break migration

Looks like none of these cases.
If AER is unsafe, turn it off by default for everyone.
Alex Williamson March 18, 2015, 3:45 p.m. UTC | #11
On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > typo in subject: vfio, not vifo.
> > > > > > > 
> > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > than 2.3.
> > > > > > > > 
> > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > 
> > > > > > > Well vfio is never migrated ATM.
> > > > > > > So why is compat code needed at all?
> > > > > > 
> > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > 
> > > > > If stop VM behaviour is useful, expose it to users.
> > > > > If not, then don't.
> > > > > I don't see why does it have to be tied to machine types.
> > > > 
> > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > significant change in behavior for a static machine type.
> > > 
> > > I don't think you can classify it as a behaviour change. VM stop is not
> > > guest visible behaviour.
> > 
> > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > In the other case, the guest is notified and may attempt corrective
> > action... or maybe the guest doesn't understand AER and the user is
> > depending on the previous behavior.  That is absolutely a behavior
> > change.
> > 
> > > Are you worrying about guests misbehaving when they see these errors?
> > > Then you want this as user-controlled, supported option.
> > 
> > Whether the option is user visible is tangential to whether the behavior
> > of existing machine types should be maintained.  Existing machine types
> > can impose a different default than current machine types.
> >
> > > In other words: we only tie things to machine types when we
> > > have to. This code gets almost no testing, and is a lot of
> > > work to test. This one sounds like "just in case" is not a good
> > > motivation.
> > 
> > It seems like an obvious use case for using machine types to maintain
> > compatibility with previous behavior, which is exactly why we have
> > machine types.  If we're not going to use it, why do we have it?
> 
> We have machine types because of the following issues:
> - some silent changes confuse guests. For example guest installed with
>   one machine type might not boot if you try to use it after
>   changing something, or - in case of windows - throw up warnings.
> - some changes break migration
> 
> Looks like none of these cases.
> If AER is unsafe, turn it off by default for everyone.

This is silly, we have the tools, let's use them.  If a user is running
a VM that gets a VM stop on AER error one day and they upgrade QEMU and
restart it, they should get the same behavior, whether a migration is
involved or not.  Maybe the default should be disabled, this patch
series hasn't yet even convinced me that there's a worthwhile general
case where the guest can recover, but using the existing machine
compatibility infrastructure should be at our disposal if we do think
the default going forward should be different than the behavior today.
Michael S. Tsirkin March 18, 2015, 4:44 p.m. UTC | #12
On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote:
> On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > > typo in subject: vfio, not vifo.
> > > > > > > > 
> > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > > than 2.3.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > > 
> > > > > > > > Well vfio is never migrated ATM.
> > > > > > > > So why is compat code needed at all?
> > > > > > > 
> > > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > > 
> > > > > > If stop VM behaviour is useful, expose it to users.
> > > > > > If not, then don't.
> > > > > > I don't see why does it have to be tied to machine types.
> > > > > 
> > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > > significant change in behavior for a static machine type.
> > > > 
> > > > I don't think you can classify it as a behaviour change. VM stop is not
> > > > guest visible behaviour.
> > > 
> > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > > In the other case, the guest is notified and may attempt corrective
> > > action... or maybe the guest doesn't understand AER and the user is
> > > depending on the previous behavior.  That is absolutely a behavior
> > > change.
> > > 
> > > > Are you worrying about guests misbehaving when they see these errors?
> > > > Then you want this as user-controlled, supported option.
> > > 
> > > Whether the option is user visible is tangential to whether the behavior
> > > of existing machine types should be maintained.  Existing machine types
> > > can impose a different default than current machine types.
> > >
> > > > In other words: we only tie things to machine types when we
> > > > have to. This code gets almost no testing, and is a lot of
> > > > work to test. This one sounds like "just in case" is not a good
> > > > motivation.
> > > 
> > > It seems like an obvious use case for using machine types to maintain
> > > compatibility with previous behavior, which is exactly why we have
> > > machine types.  If we're not going to use it, why do we have it?
> > 
> > We have machine types because of the following issues:
> > - some silent changes confuse guests. For example guest installed with
> >   one machine type might not boot if you try to use it after
> >   changing something, or - in case of windows - throw up warnings.
> > - some changes break migration
> > 
> > Looks like none of these cases.
> > If AER is unsafe, turn it off by default for everyone.
> 
> This is silly, we have the tools, let's use them.

It's a very expensive tool, maintainance-wise. We often don't
have the choice but I'm not going to use this tool by choice
unless we know why we are doing this.

> If a user is running
> a VM that gets a VM stop on AER error one day and they upgrade QEMU and
> restart it, they should get the same behavior, whether a migration is
> involved or not.

You keep saying this, but why should it? Answer that question, the rest
will follow.

> Maybe the default should be disabled, this patch
> series hasn't yet even convinced me that there's a worthwhile general
> case where the guest can recover, but using the existing machine
> compatibility infrastructure should be at our disposal if we do think
> the default going forward should be different than the behavior today.

I'm sorry, I don't think it's the right tool for the job.
Alex Williamson March 18, 2015, 5:11 p.m. UTC | #13
On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote:
> On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote:
> > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > typo in subject: vfio, not vifo.
> > > > > > > > > 
> > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > > > than 2.3.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > > > 
> > > > > > > > > Well vfio is never migrated ATM.
> > > > > > > > > So why is compat code needed at all?
> > > > > > > > 
> > > > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > > > 
> > > > > > > If stop VM behaviour is useful, expose it to users.
> > > > > > > If not, then don't.
> > > > > > > I don't see why does it have to be tied to machine types.
> > > > > > 
> > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > > > significant change in behavior for a static machine type.
> > > > > 
> > > > > I don't think you can classify it as a behaviour change. VM stop is not
> > > > > guest visible behaviour.
> > > > 
> > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > > > In the other case, the guest is notified and may attempt corrective
> > > > action... or maybe the guest doesn't understand AER and the user is
> > > > depending on the previous behavior.  That is absolutely a behavior
> > > > change.
> > > > 
> > > > > Are you worrying about guests misbehaving when they see these errors?
> > > > > Then you want this as user-controlled, supported option.
> > > > 
> > > > Whether the option is user visible is tangential to whether the behavior
> > > > of existing machine types should be maintained.  Existing machine types
> > > > can impose a different default than current machine types.
> > > >
> > > > > In other words: we only tie things to machine types when we
> > > > > have to. This code gets almost no testing, and is a lot of
> > > > > work to test. This one sounds like "just in case" is not a good
> > > > > motivation.
> > > > 
> > > > It seems like an obvious use case for using machine types to maintain
> > > > compatibility with previous behavior, which is exactly why we have
> > > > machine types.  If we're not going to use it, why do we have it?
> > > 
> > > We have machine types because of the following issues:
> > > - some silent changes confuse guests. For example guest installed with
> > >   one machine type might not boot if you try to use it after
> > >   changing something, or - in case of windows - throw up warnings.
> > > - some changes break migration
> > > 
> > > Looks like none of these cases.
> > > If AER is unsafe, turn it off by default for everyone.
> > 
> > This is silly, we have the tools, let's use them.
> 
> It's a very expensive tool, maintainance-wise. We often don't
> have the choice but I'm not going to use this tool by choice
> unless we know why we are doing this.
> 
> > If a user is running
> > a VM that gets a VM stop on AER error one day and they upgrade QEMU and
> > restart it, they should get the same behavior, whether a migration is
> > involved or not.
> 
> You keep saying this, but why should it? Answer that question, the rest
> will follow.

My answer is that it's a user visible change in behavior that they may
rely on and would not expect to change within an existing machine type.
Silently modifying the default may expose them to error conditions that
were previously handled by QEMU and adds AER handling requirements to
the guest.

Ignoring the question about whether an AER can be reliably recovered in
the guest for a moment, a typical PC hardware platform will signal the
running OS with AER errors, so it seems that *if* we can achieve parity
in a virtual machine with that signaling, and more importantly the
recovery process, then the default going forward should be to mimic bare
metal behavior, thus changing the default from what we do today.

As it is now, I think we have too many outstanding questions about how
recover occurs to change the default.

So let me return the question, if we were to resolve those questions and
change the default handling from what we do today, creating a user
visible behavior change and imposing new requirements for AER handling
in the guest, why is that not worthy of machine type stability?  I'd
certainly expect it from a distro specific machine type.

> > Maybe the default should be disabled, this patch
> > series hasn't yet even convinced me that there's a worthwhile general
> > case where the guest can recover, but using the existing machine
> > compatibility infrastructure should be at our disposal if we do think
> > the default going forward should be different than the behavior today.
> 
> I'm sorry, I don't think it's the right tool for the job.
>
Michael S. Tsirkin March 18, 2015, 5:45 p.m. UTC | #14
On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote:
> On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote:
> > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote:
> > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > > typo in subject: vfio, not vifo.
> > > > > > > > > > 
> > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > > > > than 2.3.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > > > > 
> > > > > > > > > > Well vfio is never migrated ATM.
> > > > > > > > > > So why is compat code needed at all?
> > > > > > > > > 
> > > > > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > > > > 
> > > > > > > > If stop VM behaviour is useful, expose it to users.
> > > > > > > > If not, then don't.
> > > > > > > > I don't see why does it have to be tied to machine types.
> > > > > > > 
> > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > > > > significant change in behavior for a static machine type.
> > > > > > 
> > > > > > I don't think you can classify it as a behaviour change. VM stop is not
> > > > > > guest visible behaviour.
> > > > > 
> > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > > > > In the other case, the guest is notified and may attempt corrective
> > > > > action... or maybe the guest doesn't understand AER and the user is
> > > > > depending on the previous behavior.  That is absolutely a behavior
> > > > > change.
> > > > > 
> > > > > > Are you worrying about guests misbehaving when they see these errors?
> > > > > > Then you want this as user-controlled, supported option.
> > > > > 
> > > > > Whether the option is user visible is tangential to whether the behavior
> > > > > of existing machine types should be maintained.  Existing machine types
> > > > > can impose a different default than current machine types.
> > > > >
> > > > > > In other words: we only tie things to machine types when we
> > > > > > have to. This code gets almost no testing, and is a lot of
> > > > > > work to test. This one sounds like "just in case" is not a good
> > > > > > motivation.
> > > > > 
> > > > > It seems like an obvious use case for using machine types to maintain
> > > > > compatibility with previous behavior, which is exactly why we have
> > > > > machine types.  If we're not going to use it, why do we have it?
> > > > 
> > > > We have machine types because of the following issues:
> > > > - some silent changes confuse guests. For example guest installed with
> > > >   one machine type might not boot if you try to use it after
> > > >   changing something, or - in case of windows - throw up warnings.
> > > > - some changes break migration
> > > > 
> > > > Looks like none of these cases.
> > > > If AER is unsafe, turn it off by default for everyone.
> > > 
> > > This is silly, we have the tools, let's use them.
> > 
> > It's a very expensive tool, maintainance-wise. We often don't
> > have the choice but I'm not going to use this tool by choice
> > unless we know why we are doing this.
> > 
> > > If a user is running
> > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and
> > > restart it, they should get the same behavior, whether a migration is
> > > involved or not.
> > 
> > You keep saying this, but why should it? Answer that question, the rest
> > will follow.
> 
> My answer is that it's a user visible change in behavior that they may
> rely on and would not expect to change within an existing machine type.
> Silently modifying the default may expose them to error conditions that
> were previously handled by QEMU and adds AER handling requirements to
> the guest.
> 
> Ignoring the question about whether an AER can be reliably recovered in
> the guest for a moment, a typical PC hardware platform will signal the
> running OS with AER errors, so it seems that *if* we can achieve parity
> in a virtual machine with that signaling, and more importantly the
> recovery process, then the default going forward should be to mimic bare
> metal behavior, thus changing the default from what we do today.
> 
> As it is now, I think we have too many outstanding questions about how
> recover occurs to change the default.
> 
> So let me return the question, if we were to resolve those questions and
> change the default handling from what we do today, creating a user
> visible behavior change and imposing new requirements for AER handling
> in the guest, why is that not worthy of machine type stability?  I'd
> certainly expect it from a distro specific machine type.

This really depends on what guests do with this.
We try to avoid guest visible changes during migration
(even that's not 100%).
There aren't such requirements for devices where migration
is disabled, e.g. we did many guest visible changes in q35.

Latest machine types are better tested. Deviating from latest machine
just for a theoretical "this is guest visible" isn't going to give
you better stability, it will give you worse stability.

If you are worried about guest bugs, they are not going
away just because we release new QEMU.So if guests are
buggy with this feature, this needs a solution for all
machine types.

Anyway, if we keep the default, that's even easier.

> > > Maybe the default should be disabled, this patch
> > > series hasn't yet even convinced me that there's a worthwhile general
> > > case where the guest can recover, but using the existing machine
> > > compatibility infrastructure should be at our disposal if we do think
> > > the default going forward should be different than the behavior today.
> > 
> > I'm sorry, I don't think it's the right tool for the job.
> > 
> 
>
Alex Williamson March 18, 2015, 6:08 p.m. UTC | #15
On Wed, 2015-03-18 at 18:45 +0100, Michael S. Tsirkin wrote:
> On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote:
> > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote:
> > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote:
> > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > > > typo in subject: vfio, not vifo.
> > > > > > > > > > > 
> > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > > > > > than 2.3.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > > > > > 
> > > > > > > > > > > Well vfio is never migrated ATM.
> > > > > > > > > > > So why is compat code needed at all?
> > > > > > > > > > 
> > > > > > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > > > > > 
> > > > > > > > > If stop VM behaviour is useful, expose it to users.
> > > > > > > > > If not, then don't.
> > > > > > > > > I don't see why does it have to be tied to machine types.
> > > > > > > > 
> > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > > > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > > > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > > > > > significant change in behavior for a static machine type.
> > > > > > > 
> > > > > > > I don't think you can classify it as a behaviour change. VM stop is not
> > > > > > > guest visible behaviour.
> > > > > > 
> > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > > > > > In the other case, the guest is notified and may attempt corrective
> > > > > > action... or maybe the guest doesn't understand AER and the user is
> > > > > > depending on the previous behavior.  That is absolutely a behavior
> > > > > > change.
> > > > > > 
> > > > > > > Are you worrying about guests misbehaving when they see these errors?
> > > > > > > Then you want this as user-controlled, supported option.
> > > > > > 
> > > > > > Whether the option is user visible is tangential to whether the behavior
> > > > > > of existing machine types should be maintained.  Existing machine types
> > > > > > can impose a different default than current machine types.
> > > > > >
> > > > > > > In other words: we only tie things to machine types when we
> > > > > > > have to. This code gets almost no testing, and is a lot of
> > > > > > > work to test. This one sounds like "just in case" is not a good
> > > > > > > motivation.
> > > > > > 
> > > > > > It seems like an obvious use case for using machine types to maintain
> > > > > > compatibility with previous behavior, which is exactly why we have
> > > > > > machine types.  If we're not going to use it, why do we have it?
> > > > > 
> > > > > We have machine types because of the following issues:
> > > > > - some silent changes confuse guests. For example guest installed with
> > > > >   one machine type might not boot if you try to use it after
> > > > >   changing something, or - in case of windows - throw up warnings.
> > > > > - some changes break migration
> > > > > 
> > > > > Looks like none of these cases.
> > > > > If AER is unsafe, turn it off by default for everyone.
> > > > 
> > > > This is silly, we have the tools, let's use them.
> > > 
> > > It's a very expensive tool, maintainance-wise. We often don't
> > > have the choice but I'm not going to use this tool by choice
> > > unless we know why we are doing this.
> > > 
> > > > If a user is running
> > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and
> > > > restart it, they should get the same behavior, whether a migration is
> > > > involved or not.
> > > 
> > > You keep saying this, but why should it? Answer that question, the rest
> > > will follow.
> > 
> > My answer is that it's a user visible change in behavior that they may
> > rely on and would not expect to change within an existing machine type.
> > Silently modifying the default may expose them to error conditions that
> > were previously handled by QEMU and adds AER handling requirements to
> > the guest.
> > 
> > Ignoring the question about whether an AER can be reliably recovered in
> > the guest for a moment, a typical PC hardware platform will signal the
> > running OS with AER errors, so it seems that *if* we can achieve parity
> > in a virtual machine with that signaling, and more importantly the
> > recovery process, then the default going forward should be to mimic bare
> > metal behavior, thus changing the default from what we do today.
> > 
> > As it is now, I think we have too many outstanding questions about how
> > recover occurs to change the default.
> > 
> > So let me return the question, if we were to resolve those questions and
> > change the default handling from what we do today, creating a user
> > visible behavior change and imposing new requirements for AER handling
> > in the guest, why is that not worthy of machine type stability?  I'd
> > certainly expect it from a distro specific machine type.
> 
> This really depends on what guests do with this.
> We try to avoid guest visible changes during migration
> (even that's not 100%).
> There aren't such requirements for devices where migration
> is disabled, e.g. we did many guest visible changes in q35.
> 
> Latest machine types are better tested. Deviating from latest machine
> just for a theoretical "this is guest visible" isn't going to give
> you better stability, it will give you worse stability.

You keep calling this theoretical.  As it exists today, an uncorrected
AER error results in a VM stop with no guest intervention or support
required.  If we were to change the default, the guest would instead be
notified of the AER and given the opportunity to recover.  That's not
just guest visible, that's a complete change in error handling paradigm.

> If you are worried about guest bugs, they are not going
> away just because we release new QEMU.So if guests are
> buggy with this feature, this needs a solution for all
> machine types.

Guest bugs are not my issue.

> Anyway, if we keep the default, that's even easier.

Agreed, and that may be where we end if we can't come up with a
reasonable expectation that the guest can recover from an AER.  However,
I think machine compatibility should be fair game if we do want to flip
that switch.

> > > > Maybe the default should be disabled, this patch
> > > > series hasn't yet even convinced me that there's a worthwhile general
> > > > case where the guest can recover, but using the existing machine
> > > > compatibility infrastructure should be at our disposal if we do think
> > > > the default going forward should be different than the behavior today.
> > > 
> > > I'm sorry, I don't think it's the right tool for the job.
> > > 
> > 
> >
Michael S. Tsirkin March 18, 2015, 6:56 p.m. UTC | #16
On Wed, Mar 18, 2015 at 12:08:15PM -0600, Alex Williamson wrote:
> On Wed, 2015-03-18 at 18:45 +0100, Michael S. Tsirkin wrote:
> > On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote:
> > > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote:
> > > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote:
> > > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> > > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > > > > typo in subject: vfio, not vifo.
> > > > > > > > > > > > 
> > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > > > > > > than 2.3.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > > > > > > 
> > > > > > > > > > > > Well vfio is never migrated ATM.
> > > > > > > > > > > > So why is compat code needed at all?
> > > > > > > > > > > 
> > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > > > > > > 
> > > > > > > > > > If stop VM behaviour is useful, expose it to users.
> > > > > > > > > > If not, then don't.
> > > > > > > > > > I don't see why does it have to be tied to machine types.
> > > > > > > > > 
> > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > > > > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > > > > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > > > > > > significant change in behavior for a static machine type.
> > > > > > > > 
> > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not
> > > > > > > > guest visible behaviour.
> > > > > > > 
> > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > > > > > > In the other case, the guest is notified and may attempt corrective
> > > > > > > action... or maybe the guest doesn't understand AER and the user is
> > > > > > > depending on the previous behavior.  That is absolutely a behavior
> > > > > > > change.
> > > > > > > 
> > > > > > > > Are you worrying about guests misbehaving when they see these errors?
> > > > > > > > Then you want this as user-controlled, supported option.
> > > > > > > 
> > > > > > > Whether the option is user visible is tangential to whether the behavior
> > > > > > > of existing machine types should be maintained.  Existing machine types
> > > > > > > can impose a different default than current machine types.
> > > > > > >
> > > > > > > > In other words: we only tie things to machine types when we
> > > > > > > > have to. This code gets almost no testing, and is a lot of
> > > > > > > > work to test. This one sounds like "just in case" is not a good
> > > > > > > > motivation.
> > > > > > > 
> > > > > > > It seems like an obvious use case for using machine types to maintain
> > > > > > > compatibility with previous behavior, which is exactly why we have
> > > > > > > machine types.  If we're not going to use it, why do we have it?
> > > > > > 
> > > > > > We have machine types because of the following issues:
> > > > > > - some silent changes confuse guests. For example guest installed with
> > > > > >   one machine type might not boot if you try to use it after
> > > > > >   changing something, or - in case of windows - throw up warnings.
> > > > > > - some changes break migration
> > > > > > 
> > > > > > Looks like none of these cases.
> > > > > > If AER is unsafe, turn it off by default for everyone.
> > > > > 
> > > > > This is silly, we have the tools, let's use them.
> > > > 
> > > > It's a very expensive tool, maintainance-wise. We often don't
> > > > have the choice but I'm not going to use this tool by choice
> > > > unless we know why we are doing this.
> > > > 
> > > > > If a user is running
> > > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and
> > > > > restart it, they should get the same behavior, whether a migration is
> > > > > involved or not.
> > > > 
> > > > You keep saying this, but why should it? Answer that question, the rest
> > > > will follow.
> > > 
> > > My answer is that it's a user visible change in behavior that they may
> > > rely on and would not expect to change within an existing machine type.
> > > Silently modifying the default may expose them to error conditions that
> > > were previously handled by QEMU and adds AER handling requirements to
> > > the guest.
> > > 
> > > Ignoring the question about whether an AER can be reliably recovered in
> > > the guest for a moment, a typical PC hardware platform will signal the
> > > running OS with AER errors, so it seems that *if* we can achieve parity
> > > in a virtual machine with that signaling, and more importantly the
> > > recovery process, then the default going forward should be to mimic bare
> > > metal behavior, thus changing the default from what we do today.
> > > 
> > > As it is now, I think we have too many outstanding questions about how
> > > recover occurs to change the default.
> > > 
> > > So let me return the question, if we were to resolve those questions and
> > > change the default handling from what we do today, creating a user
> > > visible behavior change and imposing new requirements for AER handling
> > > in the guest, why is that not worthy of machine type stability?  I'd
> > > certainly expect it from a distro specific machine type.
> > 
> > This really depends on what guests do with this.
> > We try to avoid guest visible changes during migration
> > (even that's not 100%).
> > There aren't such requirements for devices where migration
> > is disabled, e.g. we did many guest visible changes in q35.
> > 
> > Latest machine types are better tested. Deviating from latest machine
> > just for a theoretical "this is guest visible" isn't going to give
> > you better stability, it will give you worse stability.
> 
> You keep calling this theoretical.  As it exists today, an uncorrected
> AER error results in a VM stop with no guest intervention or support
> required.  If we were to change the default, the guest would instead be
> notified of the AER and given the opportunity to recover.  That's not
> just guest visible, that's a complete change in error handling paradigm.

OK, so maybe it's a feature that users should have control over.
But tying it to machine types makes no sense.

> > If you are worried about guest bugs, they are not going
> > away just because we release new QEMU.So if guests are
> > buggy with this feature, this needs a solution for all
> > machine types.
> 
> Guest bugs are not my issue.
> 
> > Anyway, if we keep the default, that's even easier.
> 
> Agreed, and that may be where we end if we can't come up with a
> reasonable expectation that the guest can recover from an AER.  However,
> I think machine compatibility should be fair game if we do want to flip
> that switch.
> 
> > > > > Maybe the default should be disabled, this patch
> > > > > series hasn't yet even convinced me that there's a worthwhile general
> > > > > case where the guest can recover, but using the existing machine
> > > > > compatibility infrastructure should be at our disposal if we do think
> > > > > the default going forward should be different than the behavior today.
> > > > 
> > > > I'm sorry, I don't think it's the right tool for the job.
> > > > 
> > > 
> > > 
> 
>
Alex Williamson March 18, 2015, 7:05 p.m. UTC | #17
On Wed, 2015-03-18 at 19:56 +0100, Michael S. Tsirkin wrote:
> On Wed, Mar 18, 2015 at 12:08:15PM -0600, Alex Williamson wrote:
> > On Wed, 2015-03-18 at 18:45 +0100, Michael S. Tsirkin wrote:
> > > On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote:
> > > > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote:
> > > > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote:
> > > > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote:
> > > > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote:
> > > > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote:
> > > > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote:
> > > > > > > > > > > > > typo in subject: vfio, not vifo.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote:
> > > > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce
> > > > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap,
> > > > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower
> > > > > > > > > > > > > > than 2.3.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Well vfio is never migrated ATM.
> > > > > > > > > > > > > So why is compat code needed at all?
> > > > > > > > > > > > 
> > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing
> > > > > > > > > > > > platforms.  If someone gets an uncorrected AER error on q35 machine type
> > > > > > > > > > > > today, the VM stops.  With this change, AER would be exposed to the
> > > > > > > > > > > > guest and the guest could handle it.  The compat change therefore
> > > > > > > > > > > > maintains the stop VM behavior on existing q35 machine types.
> > > > > > > > > > > 
> > > > > > > > > > > If stop VM behaviour is useful, expose it to users.
> > > > > > > > > > > If not, then don't.
> > > > > > > > > > > I don't see why does it have to be tied to machine types.
> > > > > > > > > > 
> > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected
> > > > > > > > > > AER error.  If we don't tie that to a machine option then q35-2.2 would
> > > > > > > > > > suddenly start exposing the error to the guest.  That's a fairly
> > > > > > > > > > significant change in behavior for a static machine type.
> > > > > > > > > 
> > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not
> > > > > > > > > guest visible behaviour.
> > > > > > > > 
> > > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU.
> > > > > > > > In the other case, the guest is notified and may attempt corrective
> > > > > > > > action... or maybe the guest doesn't understand AER and the user is
> > > > > > > > depending on the previous behavior.  That is absolutely a behavior
> > > > > > > > change.
> > > > > > > > 
> > > > > > > > > Are you worrying about guests misbehaving when they see these errors?
> > > > > > > > > Then you want this as user-controlled, supported option.
> > > > > > > > 
> > > > > > > > Whether the option is user visible is tangential to whether the behavior
> > > > > > > > of existing machine types should be maintained.  Existing machine types
> > > > > > > > can impose a different default than current machine types.
> > > > > > > >
> > > > > > > > > In other words: we only tie things to machine types when we
> > > > > > > > > have to. This code gets almost no testing, and is a lot of
> > > > > > > > > work to test. This one sounds like "just in case" is not a good
> > > > > > > > > motivation.
> > > > > > > > 
> > > > > > > > It seems like an obvious use case for using machine types to maintain
> > > > > > > > compatibility with previous behavior, which is exactly why we have
> > > > > > > > machine types.  If we're not going to use it, why do we have it?
> > > > > > > 
> > > > > > > We have machine types because of the following issues:
> > > > > > > - some silent changes confuse guests. For example guest installed with
> > > > > > >   one machine type might not boot if you try to use it after
> > > > > > >   changing something, or - in case of windows - throw up warnings.
> > > > > > > - some changes break migration
> > > > > > > 
> > > > > > > Looks like none of these cases.
> > > > > > > If AER is unsafe, turn it off by default for everyone.
> > > > > > 
> > > > > > This is silly, we have the tools, let's use them.
> > > > > 
> > > > > It's a very expensive tool, maintainance-wise. We often don't
> > > > > have the choice but I'm not going to use this tool by choice
> > > > > unless we know why we are doing this.
> > > > > 
> > > > > > If a user is running
> > > > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and
> > > > > > restart it, they should get the same behavior, whether a migration is
> > > > > > involved or not.
> > > > > 
> > > > > You keep saying this, but why should it? Answer that question, the rest
> > > > > will follow.
> > > > 
> > > > My answer is that it's a user visible change in behavior that they may
> > > > rely on and would not expect to change within an existing machine type.
> > > > Silently modifying the default may expose them to error conditions that
> > > > were previously handled by QEMU and adds AER handling requirements to
> > > > the guest.
> > > > 
> > > > Ignoring the question about whether an AER can be reliably recovered in
> > > > the guest for a moment, a typical PC hardware platform will signal the
> > > > running OS with AER errors, so it seems that *if* we can achieve parity
> > > > in a virtual machine with that signaling, and more importantly the
> > > > recovery process, then the default going forward should be to mimic bare
> > > > metal behavior, thus changing the default from what we do today.
> > > > 
> > > > As it is now, I think we have too many outstanding questions about how
> > > > recover occurs to change the default.
> > > > 
> > > > So let me return the question, if we were to resolve those questions and
> > > > change the default handling from what we do today, creating a user
> > > > visible behavior change and imposing new requirements for AER handling
> > > > in the guest, why is that not worthy of machine type stability?  I'd
> > > > certainly expect it from a distro specific machine type.
> > > 
> > > This really depends on what guests do with this.
> > > We try to avoid guest visible changes during migration
> > > (even that's not 100%).
> > > There aren't such requirements for devices where migration
> > > is disabled, e.g. we did many guest visible changes in q35.
> > > 
> > > Latest machine types are better tested. Deviating from latest machine
> > > just for a theoretical "this is guest visible" isn't going to give
> > > you better stability, it will give you worse stability.
> > 
> > You keep calling this theoretical.  As it exists today, an uncorrected
> > AER error results in a VM stop with no guest intervention or support
> > required.  If we were to change the default, the guest would instead be
> > notified of the AER and given the opportunity to recover.  That's not
> > just guest visible, that's a complete change in error handling paradigm.
> 
> OK, so maybe it's a feature that users should have control over.
> But tying it to machine types makes no sense.

If we were to change the default, where else would you tie it?  Machine
types are the only finger hold we have to maintain VM stability afaik.
Paolo Bonzini March 19, 2015, 9:26 p.m. UTC | #18
On 18/03/2015 20:05, Alex Williamson wrote:
> > OK, so maybe it's a feature that users should have control over.
> > But tying it to machine types makes no sense.
>
> If we were to change the default, where else would you tie it?  Machine
> types are the only finger hold we have to maintain VM stability afaik.

I agree.

Machine type are there to avoid exposing "important" changes in the
behavior of the device model to the guest.  These include the number and
size of capabilities and BARs (and other fields in configuration space),
the number of virtqueues and how they are used (virtio features), etc.

Migration is definitely the main reason to introduce a new property and
add compatibility glue for older machine types, but it does not have to
be the only one.

AER seems to me like it is a significant-enough change in how the guest
sees an assigned device.  You certainly don't want it to appear in a
configuration that was previously stable.

So, migration could be a reason why we _have_ to introduce compatibility
glue, but AER to me is a perfect example of why we may sometimes _want_
to have compatibility glue.  It's different, but it makes a lot of sense.

The maintenance cost of the compatibility glue is small.  Hiding the
capability is half a dozen lines of code, and the behavior is no
different than when the guest doesn't want to see errors (which has to
be supported, if only because not all chipsets are PCIe).

Paolo
diff mbox

Patch

diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 8eab4ba..ff9d312 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -307,6 +307,11 @@  static void pc_init1(MachineState *machine,
 
 static void pc_init_pci(MachineState *machine)
 {
+    static GlobalProperty pc_compat_props[] = {
+        PC_I440FX_COMPAT,
+        { /* end of list */ }
+    };
+    qdev_prop_register_global_list(pc_compat_props);
     pc_init1(machine, 1, 1);
 }
 
@@ -534,6 +539,10 @@  static QEMUMachine pc_i440fx_machine_v2_2 = {
     PC_I440FX_2_2_MACHINE_OPTIONS,
     .name = "pc-i440fx-2.2",
     .init = pc_init_pci_2_2,
+    .compat_props = (GlobalProperty[]) {
+        HW_COMPAT_2_2,
+        { /* end of list */ }
+    },
 };
 
 #define PC_I440FX_2_1_MACHINE_OPTIONS                           \
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index c0f21fe..97afb7d 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -431,6 +431,10 @@  static QEMUMachine pc_q35_machine_v2_2 = {
     PC_Q35_2_2_MACHINE_OPTIONS,
     .name = "pc-q35-2.2",
     .init = pc_q35_init_2_2,
+    .compat_props = (GlobalProperty[]) {
+        HW_COMPAT_2_2,
+        { /* end of list */ }
+    },
 };
 
 #define PC_Q35_2_1_MACHINE_OPTIONS                      \
diff --git a/include/hw/compat.h b/include/hw/compat.h
index 313682a..40c974a 100644
--- a/include/hw/compat.h
+++ b/include/hw/compat.h
@@ -1,7 +1,17 @@ 
 #ifndef HW_COMPAT_H
 #define HW_COMPAT_H
 
+#define HW_COMPAT_2_2 PC_I440FX_COMPAT
+
+#define PC_I440FX_COMPAT \
+        {\
+            .driver   = "vfio-pci",\
+            .property = "x-aer",\
+            .value    = "off",\
+        }
+
 #define HW_COMPAT_2_1 \
+        HW_COMPAT_2_2, \
         {\
             .driver   = "intel-hda",\
             .property = "old_msi_addr",\