Message ID | 18492b2679b3194f92b1ceefdbe4355bb840e660.1426155432.git.chen.fan.fnst@cn.fujitsu.com |
---|---|
State | New |
Headers | show |
On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote: > for piix4 chipset, we don't need to expose aer, so introduce > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > and add HW_COMPAT_2_2 to disable aercap for all lower > than 2.3. 440FX is not PCIe, so it doesn't seem like we need to do anything there. Shouldn't this only cover q35 machine types through 2.3? (QEMU 2.3 is already in hard freeze afaik, this won't go in until after 2.4 development opens). Thanks, Alex > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > --- > hw/i386/pc_piix.c | 9 +++++++++ > hw/i386/pc_q35.c | 4 ++++ > include/hw/compat.h | 10 ++++++++++ > 3 files changed, 23 insertions(+) > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c > index 8eab4ba..ff9d312 100644 > --- a/hw/i386/pc_piix.c > +++ b/hw/i386/pc_piix.c > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, > > static void pc_init_pci(MachineState *machine) > { > + static GlobalProperty pc_compat_props[] = { > + PC_I440FX_COMPAT, > + { /* end of list */ } > + }; > + qdev_prop_register_global_list(pc_compat_props); > pc_init1(machine, 1, 1); > } > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { > PC_I440FX_2_2_MACHINE_OPTIONS, > .name = "pc-i440fx-2.2", > .init = pc_init_pci_2_2, > + .compat_props = (GlobalProperty[]) { > + HW_COMPAT_2_2, > + { /* end of list */ } > + }, > }; > > #define PC_I440FX_2_1_MACHINE_OPTIONS \ > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c > index c0f21fe..97afb7d 100644 > --- a/hw/i386/pc_q35.c > +++ b/hw/i386/pc_q35.c > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { > PC_Q35_2_2_MACHINE_OPTIONS, > .name = "pc-q35-2.2", > .init = pc_q35_init_2_2, > + .compat_props = (GlobalProperty[]) { > + HW_COMPAT_2_2, > + { /* end of list */ } > + }, > }; > > #define PC_Q35_2_1_MACHINE_OPTIONS \ > diff --git a/include/hw/compat.h b/include/hw/compat.h > index 313682a..40c974a 100644 > --- a/include/hw/compat.h > +++ b/include/hw/compat.h > @@ -1,7 +1,17 @@ > #ifndef HW_COMPAT_H > #define HW_COMPAT_H > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT > + > +#define PC_I440FX_COMPAT \ > + {\ > + .driver = "vfio-pci",\ > + .property = "x-aer",\ > + .value = "off",\ > + } > + > #define HW_COMPAT_2_1 \ > + HW_COMPAT_2_2, \ > {\ > .driver = "intel-hda",\ > .property = "old_msi_addr",\
On 03/14/2015 06:38 AM, Alex Williamson wrote: > On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote: >> for piix4 chipset, we don't need to expose aer, so introduce >> PC_I440FX_COMPAT for all piix4 machines to disable aercap, >> and add HW_COMPAT_2_2 to disable aercap for all lower >> than 2.3. > 440FX is not PCIe, so it doesn't seem like we need to do anything there. > Shouldn't this only cover q35 machine types through 2.3? (QEMU 2.3 is > already in hard freeze afaik, this won't go in until after 2.4 > development opens). Thanks, Yes, 440Fx is not PCIe, so extended cap won't be parsed. I knew this patches won't be in QEMU 2.3. I want to wait for the 2.4 opens and then rebase this patch. Thanks, Chen > > Alex > >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> >> --- >> hw/i386/pc_piix.c | 9 +++++++++ >> hw/i386/pc_q35.c | 4 ++++ >> include/hw/compat.h | 10 ++++++++++ >> 3 files changed, 23 insertions(+) >> >> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c >> index 8eab4ba..ff9d312 100644 >> --- a/hw/i386/pc_piix.c >> +++ b/hw/i386/pc_piix.c >> @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, >> >> static void pc_init_pci(MachineState *machine) >> { >> + static GlobalProperty pc_compat_props[] = { >> + PC_I440FX_COMPAT, >> + { /* end of list */ } >> + }; >> + qdev_prop_register_global_list(pc_compat_props); >> pc_init1(machine, 1, 1); >> } >> >> @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { >> PC_I440FX_2_2_MACHINE_OPTIONS, >> .name = "pc-i440fx-2.2", >> .init = pc_init_pci_2_2, >> + .compat_props = (GlobalProperty[]) { >> + HW_COMPAT_2_2, >> + { /* end of list */ } >> + }, >> }; >> >> #define PC_I440FX_2_1_MACHINE_OPTIONS \ >> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c >> index c0f21fe..97afb7d 100644 >> --- a/hw/i386/pc_q35.c >> +++ b/hw/i386/pc_q35.c >> @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { >> PC_Q35_2_2_MACHINE_OPTIONS, >> .name = "pc-q35-2.2", >> .init = pc_q35_init_2_2, >> + .compat_props = (GlobalProperty[]) { >> + HW_COMPAT_2_2, >> + { /* end of list */ } >> + }, >> }; >> >> #define PC_Q35_2_1_MACHINE_OPTIONS \ >> diff --git a/include/hw/compat.h b/include/hw/compat.h >> index 313682a..40c974a 100644 >> --- a/include/hw/compat.h >> +++ b/include/hw/compat.h >> @@ -1,7 +1,17 @@ >> #ifndef HW_COMPAT_H >> #define HW_COMPAT_H >> >> +#define HW_COMPAT_2_2 PC_I440FX_COMPAT >> + >> +#define PC_I440FX_COMPAT \ >> + {\ >> + .driver = "vfio-pci",\ >> + .property = "x-aer",\ >> + .value = "off",\ >> + } >> + >> #define HW_COMPAT_2_1 \ >> + HW_COMPAT_2_2, \ >> {\ >> .driver = "intel-hda",\ >> .property = "old_msi_addr",\ > > > . >
On 03/14/2015 06:38 AM, Alex Williamson wrote: > On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote: >> for piix4 chipset, we don't need to expose aer, so introduce >> PC_I440FX_COMPAT for all piix4 machines to disable aercap, >> and add HW_COMPAT_2_2 to disable aercap for all lower >> than 2.3. > 440FX is not PCIe, so it doesn't seem like we need to do anything there. > Shouldn't this only cover q35 machine types through 2.3? (QEMU 2.3 is > already in hard freeze afaik, this won't go in until after 2.4 > development opens). Thanks, Yes, 440Fx is not PCIe, so extended cap won't be parsed. I knew this patches won't be in QEMU 2.3. I want to wait for the 2.4 opens and then rebase this patch separately. Thanks, Chen > > Alex > >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> >> --- >> hw/i386/pc_piix.c | 9 +++++++++ >> hw/i386/pc_q35.c | 4 ++++ >> include/hw/compat.h | 10 ++++++++++ >> 3 files changed, 23 insertions(+) >> >> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c >> index 8eab4ba..ff9d312 100644 >> --- a/hw/i386/pc_piix.c >> +++ b/hw/i386/pc_piix.c >> @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, >> >> static void pc_init_pci(MachineState *machine) >> { >> + static GlobalProperty pc_compat_props[] = { >> + PC_I440FX_COMPAT, >> + { /* end of list */ } >> + }; >> + qdev_prop_register_global_list(pc_compat_props); >> pc_init1(machine, 1, 1); >> } >> >> @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { >> PC_I440FX_2_2_MACHINE_OPTIONS, >> .name = "pc-i440fx-2.2", >> .init = pc_init_pci_2_2, >> + .compat_props = (GlobalProperty[]) { >> + HW_COMPAT_2_2, >> + { /* end of list */ } >> + }, >> }; >> >> #define PC_I440FX_2_1_MACHINE_OPTIONS \ >> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c >> index c0f21fe..97afb7d 100644 >> --- a/hw/i386/pc_q35.c >> +++ b/hw/i386/pc_q35.c >> @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { >> PC_Q35_2_2_MACHINE_OPTIONS, >> .name = "pc-q35-2.2", >> .init = pc_q35_init_2_2, >> + .compat_props = (GlobalProperty[]) { >> + HW_COMPAT_2_2, >> + { /* end of list */ } >> + }, >> }; >> >> #define PC_Q35_2_1_MACHINE_OPTIONS \ >> diff --git a/include/hw/compat.h b/include/hw/compat.h >> index 313682a..40c974a 100644 >> --- a/include/hw/compat.h >> +++ b/include/hw/compat.h >> @@ -1,7 +1,17 @@ >> #ifndef HW_COMPAT_H >> #define HW_COMPAT_H >> >> +#define HW_COMPAT_2_2 PC_I440FX_COMPAT >> + >> +#define PC_I440FX_COMPAT \ >> + {\ >> + .driver = "vfio-pci",\ >> + .property = "x-aer",\ >> + .value = "off",\ >> + } >> + >> #define HW_COMPAT_2_1 \ >> + HW_COMPAT_2_2, \ >> {\ >> .driver = "intel-hda",\ >> .property = "old_msi_addr",\ > > > . >
typo in subject: vfio, not vifo. On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > for piix4 chipset, we don't need to expose aer, so introduce > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > and add HW_COMPAT_2_2 to disable aercap for all lower > than 2.3. > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> Well vfio is never migrated ATM. So why is compat code needed at all? > --- > hw/i386/pc_piix.c | 9 +++++++++ > hw/i386/pc_q35.c | 4 ++++ > include/hw/compat.h | 10 ++++++++++ > 3 files changed, 23 insertions(+) > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c > index 8eab4ba..ff9d312 100644 > --- a/hw/i386/pc_piix.c > +++ b/hw/i386/pc_piix.c > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, > > static void pc_init_pci(MachineState *machine) > { > + static GlobalProperty pc_compat_props[] = { > + PC_I440FX_COMPAT, > + { /* end of list */ } > + }; > + qdev_prop_register_global_list(pc_compat_props); > pc_init1(machine, 1, 1); > } > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { > PC_I440FX_2_2_MACHINE_OPTIONS, > .name = "pc-i440fx-2.2", > .init = pc_init_pci_2_2, > + .compat_props = (GlobalProperty[]) { > + HW_COMPAT_2_2, > + { /* end of list */ } > + }, > }; > > #define PC_I440FX_2_1_MACHINE_OPTIONS \ > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c > index c0f21fe..97afb7d 100644 > --- a/hw/i386/pc_q35.c > +++ b/hw/i386/pc_q35.c > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { > PC_Q35_2_2_MACHINE_OPTIONS, > .name = "pc-q35-2.2", > .init = pc_q35_init_2_2, > + .compat_props = (GlobalProperty[]) { > + HW_COMPAT_2_2, > + { /* end of list */ } > + }, > }; > > #define PC_Q35_2_1_MACHINE_OPTIONS \ > diff --git a/include/hw/compat.h b/include/hw/compat.h > index 313682a..40c974a 100644 > --- a/include/hw/compat.h > +++ b/include/hw/compat.h > @@ -1,7 +1,17 @@ > #ifndef HW_COMPAT_H > #define HW_COMPAT_H > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT > + > +#define PC_I440FX_COMPAT \ > + {\ > + .driver = "vfio-pci",\ > + .property = "x-aer",\ > + .value = "off",\ > + } > + > #define HW_COMPAT_2_1 \ > + HW_COMPAT_2_2, \ > {\ > .driver = "intel-hda",\ > .property = "old_msi_addr",\ > -- > 1.9.3 >
On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > typo in subject: vfio, not vifo. > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > for piix4 chipset, we don't need to expose aer, so introduce > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > and add HW_COMPAT_2_2 to disable aercap for all lower > > than 2.3. > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > Well vfio is never migrated ATM. > So why is compat code needed at all? It's not for migration, it's to maintain current behavior on existing platforms. If someone gets an uncorrected AER error on q35 machine type today, the VM stops. With this change, AER would be exposed to the guest and the guest could handle it. The compat change therefore maintains the stop VM behavior on existing q35 machine types. As I commented here, the 440fx part of this patch is unnecessary since AER cannot be exposed to the guest on a conventional PCI chipset. Thanks, Alex > > --- > > hw/i386/pc_piix.c | 9 +++++++++ > > hw/i386/pc_q35.c | 4 ++++ > > include/hw/compat.h | 10 ++++++++++ > > 3 files changed, 23 insertions(+) > > > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c > > index 8eab4ba..ff9d312 100644 > > --- a/hw/i386/pc_piix.c > > +++ b/hw/i386/pc_piix.c > > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, > > > > static void pc_init_pci(MachineState *machine) > > { > > + static GlobalProperty pc_compat_props[] = { > > + PC_I440FX_COMPAT, > > + { /* end of list */ } > > + }; > > + qdev_prop_register_global_list(pc_compat_props); > > pc_init1(machine, 1, 1); > > } > > > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { > > PC_I440FX_2_2_MACHINE_OPTIONS, > > .name = "pc-i440fx-2.2", > > .init = pc_init_pci_2_2, > > + .compat_props = (GlobalProperty[]) { > > + HW_COMPAT_2_2, > > + { /* end of list */ } > > + }, > > }; > > > > #define PC_I440FX_2_1_MACHINE_OPTIONS \ > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c > > index c0f21fe..97afb7d 100644 > > --- a/hw/i386/pc_q35.c > > +++ b/hw/i386/pc_q35.c > > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { > > PC_Q35_2_2_MACHINE_OPTIONS, > > .name = "pc-q35-2.2", > > .init = pc_q35_init_2_2, > > + .compat_props = (GlobalProperty[]) { > > + HW_COMPAT_2_2, > > + { /* end of list */ } > > + }, > > }; > > > > #define PC_Q35_2_1_MACHINE_OPTIONS \ > > diff --git a/include/hw/compat.h b/include/hw/compat.h > > index 313682a..40c974a 100644 > > --- a/include/hw/compat.h > > +++ b/include/hw/compat.h > > @@ -1,7 +1,17 @@ > > #ifndef HW_COMPAT_H > > #define HW_COMPAT_H > > > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT > > + > > +#define PC_I440FX_COMPAT \ > > + {\ > > + .driver = "vfio-pci",\ > > + .property = "x-aer",\ > > + .value = "off",\ > > + } > > + > > #define HW_COMPAT_2_1 \ > > + HW_COMPAT_2_2, \ > > {\ > > .driver = "intel-hda",\ > > .property = "old_msi_addr",\ > > -- > > 1.9.3 > >
On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > typo in subject: vfio, not vifo. > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > for piix4 chipset, we don't need to expose aer, so introduce > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > than 2.3. > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > Well vfio is never migrated ATM. > > So why is compat code needed at all? > > It's not for migration, it's to maintain current behavior on existing > platforms. If someone gets an uncorrected AER error on q35 machine type > today, the VM stops. With this change, AER would be exposed to the > guest and the guest could handle it. The compat change therefore > maintains the stop VM behavior on existing q35 machine types. If stop VM behaviour is useful, expose it to users. If not, then don't. I don't see why does it have to be tied to machine types. > As I > commented here, the 440fx part of this patch is unnecessary since AER > cannot be exposed to the guest on a conventional PCI chipset. Thanks, > > Alex > > > > --- > > > hw/i386/pc_piix.c | 9 +++++++++ > > > hw/i386/pc_q35.c | 4 ++++ > > > include/hw/compat.h | 10 ++++++++++ > > > 3 files changed, 23 insertions(+) > > > > > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c > > > index 8eab4ba..ff9d312 100644 > > > --- a/hw/i386/pc_piix.c > > > +++ b/hw/i386/pc_piix.c > > > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, > > > > > > static void pc_init_pci(MachineState *machine) > > > { > > > + static GlobalProperty pc_compat_props[] = { > > > + PC_I440FX_COMPAT, > > > + { /* end of list */ } > > > + }; > > > + qdev_prop_register_global_list(pc_compat_props); > > > pc_init1(machine, 1, 1); > > > } > > > > > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { > > > PC_I440FX_2_2_MACHINE_OPTIONS, > > > .name = "pc-i440fx-2.2", > > > .init = pc_init_pci_2_2, > > > + .compat_props = (GlobalProperty[]) { > > > + HW_COMPAT_2_2, > > > + { /* end of list */ } > > > + }, > > > }; > > > > > > #define PC_I440FX_2_1_MACHINE_OPTIONS \ > > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c > > > index c0f21fe..97afb7d 100644 > > > --- a/hw/i386/pc_q35.c > > > +++ b/hw/i386/pc_q35.c > > > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { > > > PC_Q35_2_2_MACHINE_OPTIONS, > > > .name = "pc-q35-2.2", > > > .init = pc_q35_init_2_2, > > > + .compat_props = (GlobalProperty[]) { > > > + HW_COMPAT_2_2, > > > + { /* end of list */ } > > > + }, > > > }; > > > > > > #define PC_Q35_2_1_MACHINE_OPTIONS \ > > > diff --git a/include/hw/compat.h b/include/hw/compat.h > > > index 313682a..40c974a 100644 > > > --- a/include/hw/compat.h > > > +++ b/include/hw/compat.h > > > @@ -1,7 +1,17 @@ > > > #ifndef HW_COMPAT_H > > > #define HW_COMPAT_H > > > > > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT > > > + > > > +#define PC_I440FX_COMPAT \ > > > + {\ > > > + .driver = "vfio-pci",\ > > > + .property = "x-aer",\ > > > + .value = "off",\ > > > + } > > > + > > > #define HW_COMPAT_2_1 \ > > > + HW_COMPAT_2_2, \ > > > {\ > > > .driver = "intel-hda",\ > > > .property = "old_msi_addr",\ > > > -- > > > 1.9.3 > > > > >
On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > typo in subject: vfio, not vifo. > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > than 2.3. > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > Well vfio is never migrated ATM. > > > So why is compat code needed at all? > > > > It's not for migration, it's to maintain current behavior on existing > > platforms. If someone gets an uncorrected AER error on q35 machine type > > today, the VM stops. With this change, AER would be exposed to the > > guest and the guest could handle it. The compat change therefore > > maintains the stop VM behavior on existing q35 machine types. > > If stop VM behaviour is useful, expose it to users. > If not, then don't. > I don't see why does it have to be tied to machine types. Because q35-2.2 machine type will currently do a stop VM on uncorrected AER error. If we don't tie that to a machine option then q35-2.2 would suddenly start exposing the error to the guest. That's a fairly significant change in behavior for a static machine type. > > As I > > commented here, the 440fx part of this patch is unnecessary since AER > > cannot be exposed to the guest on a conventional PCI chipset. Thanks, > > > > Alex > > > > > > --- > > > > hw/i386/pc_piix.c | 9 +++++++++ > > > > hw/i386/pc_q35.c | 4 ++++ > > > > include/hw/compat.h | 10 ++++++++++ > > > > 3 files changed, 23 insertions(+) > > > > > > > > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c > > > > index 8eab4ba..ff9d312 100644 > > > > --- a/hw/i386/pc_piix.c > > > > +++ b/hw/i386/pc_piix.c > > > > @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, > > > > > > > > static void pc_init_pci(MachineState *machine) > > > > { > > > > + static GlobalProperty pc_compat_props[] = { > > > > + PC_I440FX_COMPAT, > > > > + { /* end of list */ } > > > > + }; > > > > + qdev_prop_register_global_list(pc_compat_props); > > > > pc_init1(machine, 1, 1); > > > > } > > > > > > > > @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { > > > > PC_I440FX_2_2_MACHINE_OPTIONS, > > > > .name = "pc-i440fx-2.2", > > > > .init = pc_init_pci_2_2, > > > > + .compat_props = (GlobalProperty[]) { > > > > + HW_COMPAT_2_2, > > > > + { /* end of list */ } > > > > + }, > > > > }; > > > > > > > > #define PC_I440FX_2_1_MACHINE_OPTIONS \ > > > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c > > > > index c0f21fe..97afb7d 100644 > > > > --- a/hw/i386/pc_q35.c > > > > +++ b/hw/i386/pc_q35.c > > > > @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { > > > > PC_Q35_2_2_MACHINE_OPTIONS, > > > > .name = "pc-q35-2.2", > > > > .init = pc_q35_init_2_2, > > > > + .compat_props = (GlobalProperty[]) { > > > > + HW_COMPAT_2_2, > > > > + { /* end of list */ } > > > > + }, > > > > }; > > > > > > > > #define PC_Q35_2_1_MACHINE_OPTIONS \ > > > > diff --git a/include/hw/compat.h b/include/hw/compat.h > > > > index 313682a..40c974a 100644 > > > > --- a/include/hw/compat.h > > > > +++ b/include/hw/compat.h > > > > @@ -1,7 +1,17 @@ > > > > #ifndef HW_COMPAT_H > > > > #define HW_COMPAT_H > > > > > > > > +#define HW_COMPAT_2_2 PC_I440FX_COMPAT > > > > + > > > > +#define PC_I440FX_COMPAT \ > > > > + {\ > > > > + .driver = "vfio-pci",\ > > > > + .property = "x-aer",\ > > > > + .value = "off",\ > > > > + } > > > > + > > > > #define HW_COMPAT_2_1 \ > > > > + HW_COMPAT_2_2, \ > > > > {\ > > > > .driver = "intel-hda",\ > > > > .property = "old_msi_addr",\ > > > > -- > > > > 1.9.3 > > > > > > > >
On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > typo in subject: vfio, not vifo. > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > than 2.3. > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > Well vfio is never migrated ATM. > > > > So why is compat code needed at all? > > > > > > It's not for migration, it's to maintain current behavior on existing > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > today, the VM stops. With this change, AER would be exposed to the > > > guest and the guest could handle it. The compat change therefore > > > maintains the stop VM behavior on existing q35 machine types. > > > > If stop VM behaviour is useful, expose it to users. > > If not, then don't. > > I don't see why does it have to be tied to machine types. > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > AER error. If we don't tie that to a machine option then q35-2.2 would > suddenly start exposing the error to the guest. That's a fairly > significant change in behavior for a static machine type. I don't think you can classify it as a behaviour change. VM stop is not guest visible behaviour. Are you worrying about guests misbehaving when they see these errors? Then you want this as user-controlled, supported option. In other words: we only tie things to machine types when we have to. This code gets almost no testing, and is a lot of work to test. This one sounds like "just in case" is not a good motivation.
On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > than 2.3. > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > Well vfio is never migrated ATM. > > > > > So why is compat code needed at all? > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > today, the VM stops. With this change, AER would be exposed to the > > > > guest and the guest could handle it. The compat change therefore > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > If stop VM behaviour is useful, expose it to users. > > > If not, then don't. > > > I don't see why does it have to be tied to machine types. > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > AER error. If we don't tie that to a machine option then q35-2.2 would > > suddenly start exposing the error to the guest. That's a fairly > > significant change in behavior for a static machine type. > > I don't think you can classify it as a behaviour change. VM stop is not > guest visible behaviour. In one case, an uncorrected AER occurs and the VM is stopped by QEMU. In the other case, the guest is notified and may attempt corrective action... or maybe the guest doesn't understand AER and the user is depending on the previous behavior. That is absolutely a behavior change. > Are you worrying about guests misbehaving when they see these errors? > Then you want this as user-controlled, supported option. Whether the option is user visible is tangential to whether the behavior of existing machine types should be maintained. Existing machine types can impose a different default than current machine types. > In other words: we only tie things to machine types when we > have to. This code gets almost no testing, and is a lot of > work to test. This one sounds like "just in case" is not a good > motivation. It seems like an obvious use case for using machine types to maintain compatibility with previous behavior, which is exactly why we have machine types. If we're not going to use it, why do we have it?
On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > than 2.3. > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > So why is compat code needed at all? > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > guest and the guest could handle it. The compat change therefore > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > If not, then don't. > > > > I don't see why does it have to be tied to machine types. > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > suddenly start exposing the error to the guest. That's a fairly > > > significant change in behavior for a static machine type. > > > > I don't think you can classify it as a behaviour change. VM stop is not > > guest visible behaviour. > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > In the other case, the guest is notified and may attempt corrective > action... or maybe the guest doesn't understand AER and the user is > depending on the previous behavior. That is absolutely a behavior > change. > > > Are you worrying about guests misbehaving when they see these errors? > > Then you want this as user-controlled, supported option. > > Whether the option is user visible is tangential to whether the behavior > of existing machine types should be maintained. Existing machine types > can impose a different default than current machine types. > > > In other words: we only tie things to machine types when we > > have to. This code gets almost no testing, and is a lot of > > work to test. This one sounds like "just in case" is not a good > > motivation. > > It seems like an obvious use case for using machine types to maintain > compatibility with previous behavior, which is exactly why we have > machine types. If we're not going to use it, why do we have it? We have machine types because of the following issues: - some silent changes confuse guests. For example guest installed with one machine type might not boot if you try to use it after changing something, or - in case of windows - throw up warnings. - some changes break migration Looks like none of these cases. If AER is unsafe, turn it off by default for everyone.
On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > If not, then don't. > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > suddenly start exposing the error to the guest. That's a fairly > > > > significant change in behavior for a static machine type. > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > guest visible behaviour. > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > In the other case, the guest is notified and may attempt corrective > > action... or maybe the guest doesn't understand AER and the user is > > depending on the previous behavior. That is absolutely a behavior > > change. > > > > > Are you worrying about guests misbehaving when they see these errors? > > > Then you want this as user-controlled, supported option. > > > > Whether the option is user visible is tangential to whether the behavior > > of existing machine types should be maintained. Existing machine types > > can impose a different default than current machine types. > > > > > In other words: we only tie things to machine types when we > > > have to. This code gets almost no testing, and is a lot of > > > work to test. This one sounds like "just in case" is not a good > > > motivation. > > > > It seems like an obvious use case for using machine types to maintain > > compatibility with previous behavior, which is exactly why we have > > machine types. If we're not going to use it, why do we have it? > > We have machine types because of the following issues: > - some silent changes confuse guests. For example guest installed with > one machine type might not boot if you try to use it after > changing something, or - in case of windows - throw up warnings. > - some changes break migration > > Looks like none of these cases. > If AER is unsafe, turn it off by default for everyone. This is silly, we have the tools, let's use them. If a user is running a VM that gets a VM stop on AER error one day and they upgrade QEMU and restart it, they should get the same behavior, whether a migration is involved or not. Maybe the default should be disabled, this patch series hasn't yet even convinced me that there's a worthwhile general case where the guest can recover, but using the existing machine compatibility infrastructure should be at our disposal if we do think the default going forward should be different than the behavior today.
On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote: > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > > If not, then don't. > > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > > suddenly start exposing the error to the guest. That's a fairly > > > > > significant change in behavior for a static machine type. > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > > guest visible behaviour. > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > > In the other case, the guest is notified and may attempt corrective > > > action... or maybe the guest doesn't understand AER and the user is > > > depending on the previous behavior. That is absolutely a behavior > > > change. > > > > > > > Are you worrying about guests misbehaving when they see these errors? > > > > Then you want this as user-controlled, supported option. > > > > > > Whether the option is user visible is tangential to whether the behavior > > > of existing machine types should be maintained. Existing machine types > > > can impose a different default than current machine types. > > > > > > > In other words: we only tie things to machine types when we > > > > have to. This code gets almost no testing, and is a lot of > > > > work to test. This one sounds like "just in case" is not a good > > > > motivation. > > > > > > It seems like an obvious use case for using machine types to maintain > > > compatibility with previous behavior, which is exactly why we have > > > machine types. If we're not going to use it, why do we have it? > > > > We have machine types because of the following issues: > > - some silent changes confuse guests. For example guest installed with > > one machine type might not boot if you try to use it after > > changing something, or - in case of windows - throw up warnings. > > - some changes break migration > > > > Looks like none of these cases. > > If AER is unsafe, turn it off by default for everyone. > > This is silly, we have the tools, let's use them. It's a very expensive tool, maintainance-wise. We often don't have the choice but I'm not going to use this tool by choice unless we know why we are doing this. > If a user is running > a VM that gets a VM stop on AER error one day and they upgrade QEMU and > restart it, they should get the same behavior, whether a migration is > involved or not. You keep saying this, but why should it? Answer that question, the rest will follow. > Maybe the default should be disabled, this patch > series hasn't yet even convinced me that there's a worthwhile general > case where the guest can recover, but using the existing machine > compatibility infrastructure should be at our disposal if we do think > the default going forward should be different than the behavior today. I'm sorry, I don't think it's the right tool for the job.
On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote: > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote: > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > > > If not, then don't. > > > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > > > suddenly start exposing the error to the guest. That's a fairly > > > > > > significant change in behavior for a static machine type. > > > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > > > guest visible behaviour. > > > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > > > In the other case, the guest is notified and may attempt corrective > > > > action... or maybe the guest doesn't understand AER and the user is > > > > depending on the previous behavior. That is absolutely a behavior > > > > change. > > > > > > > > > Are you worrying about guests misbehaving when they see these errors? > > > > > Then you want this as user-controlled, supported option. > > > > > > > > Whether the option is user visible is tangential to whether the behavior > > > > of existing machine types should be maintained. Existing machine types > > > > can impose a different default than current machine types. > > > > > > > > > In other words: we only tie things to machine types when we > > > > > have to. This code gets almost no testing, and is a lot of > > > > > work to test. This one sounds like "just in case" is not a good > > > > > motivation. > > > > > > > > It seems like an obvious use case for using machine types to maintain > > > > compatibility with previous behavior, which is exactly why we have > > > > machine types. If we're not going to use it, why do we have it? > > > > > > We have machine types because of the following issues: > > > - some silent changes confuse guests. For example guest installed with > > > one machine type might not boot if you try to use it after > > > changing something, or - in case of windows - throw up warnings. > > > - some changes break migration > > > > > > Looks like none of these cases. > > > If AER is unsafe, turn it off by default for everyone. > > > > This is silly, we have the tools, let's use them. > > It's a very expensive tool, maintainance-wise. We often don't > have the choice but I'm not going to use this tool by choice > unless we know why we are doing this. > > > If a user is running > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and > > restart it, they should get the same behavior, whether a migration is > > involved or not. > > You keep saying this, but why should it? Answer that question, the rest > will follow. My answer is that it's a user visible change in behavior that they may rely on and would not expect to change within an existing machine type. Silently modifying the default may expose them to error conditions that were previously handled by QEMU and adds AER handling requirements to the guest. Ignoring the question about whether an AER can be reliably recovered in the guest for a moment, a typical PC hardware platform will signal the running OS with AER errors, so it seems that *if* we can achieve parity in a virtual machine with that signaling, and more importantly the recovery process, then the default going forward should be to mimic bare metal behavior, thus changing the default from what we do today. As it is now, I think we have too many outstanding questions about how recover occurs to change the default. So let me return the question, if we were to resolve those questions and change the default handling from what we do today, creating a user visible behavior change and imposing new requirements for AER handling in the guest, why is that not worthy of machine type stability? I'd certainly expect it from a distro specific machine type. > > Maybe the default should be disabled, this patch > > series hasn't yet even convinced me that there's a worthwhile general > > case where the guest can recover, but using the existing machine > > compatibility infrastructure should be at our disposal if we do think > > the default going forward should be different than the behavior today. > > I'm sorry, I don't think it's the right tool for the job. >
On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote: > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote: > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote: > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > > > > If not, then don't. > > > > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > > > > suddenly start exposing the error to the guest. That's a fairly > > > > > > > significant change in behavior for a static machine type. > > > > > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > > > > guest visible behaviour. > > > > > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > > > > In the other case, the guest is notified and may attempt corrective > > > > > action... or maybe the guest doesn't understand AER and the user is > > > > > depending on the previous behavior. That is absolutely a behavior > > > > > change. > > > > > > > > > > > Are you worrying about guests misbehaving when they see these errors? > > > > > > Then you want this as user-controlled, supported option. > > > > > > > > > > Whether the option is user visible is tangential to whether the behavior > > > > > of existing machine types should be maintained. Existing machine types > > > > > can impose a different default than current machine types. > > > > > > > > > > > In other words: we only tie things to machine types when we > > > > > > have to. This code gets almost no testing, and is a lot of > > > > > > work to test. This one sounds like "just in case" is not a good > > > > > > motivation. > > > > > > > > > > It seems like an obvious use case for using machine types to maintain > > > > > compatibility with previous behavior, which is exactly why we have > > > > > machine types. If we're not going to use it, why do we have it? > > > > > > > > We have machine types because of the following issues: > > > > - some silent changes confuse guests. For example guest installed with > > > > one machine type might not boot if you try to use it after > > > > changing something, or - in case of windows - throw up warnings. > > > > - some changes break migration > > > > > > > > Looks like none of these cases. > > > > If AER is unsafe, turn it off by default for everyone. > > > > > > This is silly, we have the tools, let's use them. > > > > It's a very expensive tool, maintainance-wise. We often don't > > have the choice but I'm not going to use this tool by choice > > unless we know why we are doing this. > > > > > If a user is running > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and > > > restart it, they should get the same behavior, whether a migration is > > > involved or not. > > > > You keep saying this, but why should it? Answer that question, the rest > > will follow. > > My answer is that it's a user visible change in behavior that they may > rely on and would not expect to change within an existing machine type. > Silently modifying the default may expose them to error conditions that > were previously handled by QEMU and adds AER handling requirements to > the guest. > > Ignoring the question about whether an AER can be reliably recovered in > the guest for a moment, a typical PC hardware platform will signal the > running OS with AER errors, so it seems that *if* we can achieve parity > in a virtual machine with that signaling, and more importantly the > recovery process, then the default going forward should be to mimic bare > metal behavior, thus changing the default from what we do today. > > As it is now, I think we have too many outstanding questions about how > recover occurs to change the default. > > So let me return the question, if we were to resolve those questions and > change the default handling from what we do today, creating a user > visible behavior change and imposing new requirements for AER handling > in the guest, why is that not worthy of machine type stability? I'd > certainly expect it from a distro specific machine type. This really depends on what guests do with this. We try to avoid guest visible changes during migration (even that's not 100%). There aren't such requirements for devices where migration is disabled, e.g. we did many guest visible changes in q35. Latest machine types are better tested. Deviating from latest machine just for a theoretical "this is guest visible" isn't going to give you better stability, it will give you worse stability. If you are worried about guest bugs, they are not going away just because we release new QEMU.So if guests are buggy with this feature, this needs a solution for all machine types. Anyway, if we keep the default, that's even easier. > > > Maybe the default should be disabled, this patch > > > series hasn't yet even convinced me that there's a worthwhile general > > > case where the guest can recover, but using the existing machine > > > compatibility infrastructure should be at our disposal if we do think > > > the default going forward should be different than the behavior today. > > > > I'm sorry, I don't think it's the right tool for the job. > > > >
On Wed, 2015-03-18 at 18:45 +0100, Michael S. Tsirkin wrote: > On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote: > > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote: > > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote: > > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > > > > > If not, then don't. > > > > > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > > > > > suddenly start exposing the error to the guest. That's a fairly > > > > > > > > significant change in behavior for a static machine type. > > > > > > > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > > > > > guest visible behaviour. > > > > > > > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > > > > > In the other case, the guest is notified and may attempt corrective > > > > > > action... or maybe the guest doesn't understand AER and the user is > > > > > > depending on the previous behavior. That is absolutely a behavior > > > > > > change. > > > > > > > > > > > > > Are you worrying about guests misbehaving when they see these errors? > > > > > > > Then you want this as user-controlled, supported option. > > > > > > > > > > > > Whether the option is user visible is tangential to whether the behavior > > > > > > of existing machine types should be maintained. Existing machine types > > > > > > can impose a different default than current machine types. > > > > > > > > > > > > > In other words: we only tie things to machine types when we > > > > > > > have to. This code gets almost no testing, and is a lot of > > > > > > > work to test. This one sounds like "just in case" is not a good > > > > > > > motivation. > > > > > > > > > > > > It seems like an obvious use case for using machine types to maintain > > > > > > compatibility with previous behavior, which is exactly why we have > > > > > > machine types. If we're not going to use it, why do we have it? > > > > > > > > > > We have machine types because of the following issues: > > > > > - some silent changes confuse guests. For example guest installed with > > > > > one machine type might not boot if you try to use it after > > > > > changing something, or - in case of windows - throw up warnings. > > > > > - some changes break migration > > > > > > > > > > Looks like none of these cases. > > > > > If AER is unsafe, turn it off by default for everyone. > > > > > > > > This is silly, we have the tools, let's use them. > > > > > > It's a very expensive tool, maintainance-wise. We often don't > > > have the choice but I'm not going to use this tool by choice > > > unless we know why we are doing this. > > > > > > > If a user is running > > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and > > > > restart it, they should get the same behavior, whether a migration is > > > > involved or not. > > > > > > You keep saying this, but why should it? Answer that question, the rest > > > will follow. > > > > My answer is that it's a user visible change in behavior that they may > > rely on and would not expect to change within an existing machine type. > > Silently modifying the default may expose them to error conditions that > > were previously handled by QEMU and adds AER handling requirements to > > the guest. > > > > Ignoring the question about whether an AER can be reliably recovered in > > the guest for a moment, a typical PC hardware platform will signal the > > running OS with AER errors, so it seems that *if* we can achieve parity > > in a virtual machine with that signaling, and more importantly the > > recovery process, then the default going forward should be to mimic bare > > metal behavior, thus changing the default from what we do today. > > > > As it is now, I think we have too many outstanding questions about how > > recover occurs to change the default. > > > > So let me return the question, if we were to resolve those questions and > > change the default handling from what we do today, creating a user > > visible behavior change and imposing new requirements for AER handling > > in the guest, why is that not worthy of machine type stability? I'd > > certainly expect it from a distro specific machine type. > > This really depends on what guests do with this. > We try to avoid guest visible changes during migration > (even that's not 100%). > There aren't such requirements for devices where migration > is disabled, e.g. we did many guest visible changes in q35. > > Latest machine types are better tested. Deviating from latest machine > just for a theoretical "this is guest visible" isn't going to give > you better stability, it will give you worse stability. You keep calling this theoretical. As it exists today, an uncorrected AER error results in a VM stop with no guest intervention or support required. If we were to change the default, the guest would instead be notified of the AER and given the opportunity to recover. That's not just guest visible, that's a complete change in error handling paradigm. > If you are worried about guest bugs, they are not going > away just because we release new QEMU.So if guests are > buggy with this feature, this needs a solution for all > machine types. Guest bugs are not my issue. > Anyway, if we keep the default, that's even easier. Agreed, and that may be where we end if we can't come up with a reasonable expectation that the guest can recover from an AER. However, I think machine compatibility should be fair game if we do want to flip that switch. > > > > Maybe the default should be disabled, this patch > > > > series hasn't yet even convinced me that there's a worthwhile general > > > > case where the guest can recover, but using the existing machine > > > > compatibility infrastructure should be at our disposal if we do think > > > > the default going forward should be different than the behavior today. > > > > > > I'm sorry, I don't think it's the right tool for the job. > > > > > > >
On Wed, Mar 18, 2015 at 12:08:15PM -0600, Alex Williamson wrote: > On Wed, 2015-03-18 at 18:45 +0100, Michael S. Tsirkin wrote: > > On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote: > > > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote: > > > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote: > > > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > > > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > > > > > > If not, then don't. > > > > > > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > > > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > > > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > > > > > > suddenly start exposing the error to the guest. That's a fairly > > > > > > > > > significant change in behavior for a static machine type. > > > > > > > > > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > > > > > > guest visible behaviour. > > > > > > > > > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > > > > > > In the other case, the guest is notified and may attempt corrective > > > > > > > action... or maybe the guest doesn't understand AER and the user is > > > > > > > depending on the previous behavior. That is absolutely a behavior > > > > > > > change. > > > > > > > > > > > > > > > Are you worrying about guests misbehaving when they see these errors? > > > > > > > > Then you want this as user-controlled, supported option. > > > > > > > > > > > > > > Whether the option is user visible is tangential to whether the behavior > > > > > > > of existing machine types should be maintained. Existing machine types > > > > > > > can impose a different default than current machine types. > > > > > > > > > > > > > > > In other words: we only tie things to machine types when we > > > > > > > > have to. This code gets almost no testing, and is a lot of > > > > > > > > work to test. This one sounds like "just in case" is not a good > > > > > > > > motivation. > > > > > > > > > > > > > > It seems like an obvious use case for using machine types to maintain > > > > > > > compatibility with previous behavior, which is exactly why we have > > > > > > > machine types. If we're not going to use it, why do we have it? > > > > > > > > > > > > We have machine types because of the following issues: > > > > > > - some silent changes confuse guests. For example guest installed with > > > > > > one machine type might not boot if you try to use it after > > > > > > changing something, or - in case of windows - throw up warnings. > > > > > > - some changes break migration > > > > > > > > > > > > Looks like none of these cases. > > > > > > If AER is unsafe, turn it off by default for everyone. > > > > > > > > > > This is silly, we have the tools, let's use them. > > > > > > > > It's a very expensive tool, maintainance-wise. We often don't > > > > have the choice but I'm not going to use this tool by choice > > > > unless we know why we are doing this. > > > > > > > > > If a user is running > > > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and > > > > > restart it, they should get the same behavior, whether a migration is > > > > > involved or not. > > > > > > > > You keep saying this, but why should it? Answer that question, the rest > > > > will follow. > > > > > > My answer is that it's a user visible change in behavior that they may > > > rely on and would not expect to change within an existing machine type. > > > Silently modifying the default may expose them to error conditions that > > > were previously handled by QEMU and adds AER handling requirements to > > > the guest. > > > > > > Ignoring the question about whether an AER can be reliably recovered in > > > the guest for a moment, a typical PC hardware platform will signal the > > > running OS with AER errors, so it seems that *if* we can achieve parity > > > in a virtual machine with that signaling, and more importantly the > > > recovery process, then the default going forward should be to mimic bare > > > metal behavior, thus changing the default from what we do today. > > > > > > As it is now, I think we have too many outstanding questions about how > > > recover occurs to change the default. > > > > > > So let me return the question, if we were to resolve those questions and > > > change the default handling from what we do today, creating a user > > > visible behavior change and imposing new requirements for AER handling > > > in the guest, why is that not worthy of machine type stability? I'd > > > certainly expect it from a distro specific machine type. > > > > This really depends on what guests do with this. > > We try to avoid guest visible changes during migration > > (even that's not 100%). > > There aren't such requirements for devices where migration > > is disabled, e.g. we did many guest visible changes in q35. > > > > Latest machine types are better tested. Deviating from latest machine > > just for a theoretical "this is guest visible" isn't going to give > > you better stability, it will give you worse stability. > > You keep calling this theoretical. As it exists today, an uncorrected > AER error results in a VM stop with no guest intervention or support > required. If we were to change the default, the guest would instead be > notified of the AER and given the opportunity to recover. That's not > just guest visible, that's a complete change in error handling paradigm. OK, so maybe it's a feature that users should have control over. But tying it to machine types makes no sense. > > If you are worried about guest bugs, they are not going > > away just because we release new QEMU.So if guests are > > buggy with this feature, this needs a solution for all > > machine types. > > Guest bugs are not my issue. > > > Anyway, if we keep the default, that's even easier. > > Agreed, and that may be where we end if we can't come up with a > reasonable expectation that the guest can recover from an AER. However, > I think machine compatibility should be fair game if we do want to flip > that switch. > > > > > > Maybe the default should be disabled, this patch > > > > > series hasn't yet even convinced me that there's a worthwhile general > > > > > case where the guest can recover, but using the existing machine > > > > > compatibility infrastructure should be at our disposal if we do think > > > > > the default going forward should be different than the behavior today. > > > > > > > > I'm sorry, I don't think it's the right tool for the job. > > > > > > > > > > > >
On Wed, 2015-03-18 at 19:56 +0100, Michael S. Tsirkin wrote: > On Wed, Mar 18, 2015 at 12:08:15PM -0600, Alex Williamson wrote: > > On Wed, 2015-03-18 at 18:45 +0100, Michael S. Tsirkin wrote: > > > On Wed, Mar 18, 2015 at 11:11:28AM -0600, Alex Williamson wrote: > > > > On Wed, 2015-03-18 at 17:44 +0100, Michael S. Tsirkin wrote: > > > > > On Wed, Mar 18, 2015 at 09:45:29AM -0600, Alex Williamson wrote: > > > > > > On Wed, 2015-03-18 at 16:02 +0100, Michael S. Tsirkin wrote: > > > > > > > On Wed, Mar 18, 2015 at 08:50:54AM -0600, Alex Williamson wrote: > > > > > > > > On Wed, 2015-03-18 at 15:36 +0100, Michael S. Tsirkin wrote: > > > > > > > > > On Wed, Mar 18, 2015 at 08:15:01AM -0600, Alex Williamson wrote: > > > > > > > > > > On Wed, 2015-03-18 at 15:05 +0100, Michael S. Tsirkin wrote: > > > > > > > > > > > On Wed, Mar 18, 2015 at 08:02:26AM -0600, Alex Williamson wrote: > > > > > > > > > > > > On Wed, 2015-03-18 at 14:23 +0100, Michael S. Tsirkin wrote: > > > > > > > > > > > > > typo in subject: vfio, not vifo. > > > > > > > > > > > > > > > > > > > > > > > > > > On Thu, Mar 12, 2015 at 06:23:59PM +0800, Chen Fan wrote: > > > > > > > > > > > > > > for piix4 chipset, we don't need to expose aer, so introduce > > > > > > > > > > > > > > PC_I440FX_COMPAT for all piix4 machines to disable aercap, > > > > > > > > > > > > > > and add HW_COMPAT_2_2 to disable aercap for all lower > > > > > > > > > > > > > > than 2.3. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > > > > > > > > > > > > > > > > > > > > > > > > > > Well vfio is never migrated ATM. > > > > > > > > > > > > > So why is compat code needed at all? > > > > > > > > > > > > > > > > > > > > > > > > It's not for migration, it's to maintain current behavior on existing > > > > > > > > > > > > platforms. If someone gets an uncorrected AER error on q35 machine type > > > > > > > > > > > > today, the VM stops. With this change, AER would be exposed to the > > > > > > > > > > > > guest and the guest could handle it. The compat change therefore > > > > > > > > > > > > maintains the stop VM behavior on existing q35 machine types. > > > > > > > > > > > > > > > > > > > > > > If stop VM behaviour is useful, expose it to users. > > > > > > > > > > > If not, then don't. > > > > > > > > > > > I don't see why does it have to be tied to machine types. > > > > > > > > > > > > > > > > > > > > Because q35-2.2 machine type will currently do a stop VM on uncorrected > > > > > > > > > > AER error. If we don't tie that to a machine option then q35-2.2 would > > > > > > > > > > suddenly start exposing the error to the guest. That's a fairly > > > > > > > > > > significant change in behavior for a static machine type. > > > > > > > > > > > > > > > > > > I don't think you can classify it as a behaviour change. VM stop is not > > > > > > > > > guest visible behaviour. > > > > > > > > > > > > > > > > In one case, an uncorrected AER occurs and the VM is stopped by QEMU. > > > > > > > > In the other case, the guest is notified and may attempt corrective > > > > > > > > action... or maybe the guest doesn't understand AER and the user is > > > > > > > > depending on the previous behavior. That is absolutely a behavior > > > > > > > > change. > > > > > > > > > > > > > > > > > Are you worrying about guests misbehaving when they see these errors? > > > > > > > > > Then you want this as user-controlled, supported option. > > > > > > > > > > > > > > > > Whether the option is user visible is tangential to whether the behavior > > > > > > > > of existing machine types should be maintained. Existing machine types > > > > > > > > can impose a different default than current machine types. > > > > > > > > > > > > > > > > > In other words: we only tie things to machine types when we > > > > > > > > > have to. This code gets almost no testing, and is a lot of > > > > > > > > > work to test. This one sounds like "just in case" is not a good > > > > > > > > > motivation. > > > > > > > > > > > > > > > > It seems like an obvious use case for using machine types to maintain > > > > > > > > compatibility with previous behavior, which is exactly why we have > > > > > > > > machine types. If we're not going to use it, why do we have it? > > > > > > > > > > > > > > We have machine types because of the following issues: > > > > > > > - some silent changes confuse guests. For example guest installed with > > > > > > > one machine type might not boot if you try to use it after > > > > > > > changing something, or - in case of windows - throw up warnings. > > > > > > > - some changes break migration > > > > > > > > > > > > > > Looks like none of these cases. > > > > > > > If AER is unsafe, turn it off by default for everyone. > > > > > > > > > > > > This is silly, we have the tools, let's use them. > > > > > > > > > > It's a very expensive tool, maintainance-wise. We often don't > > > > > have the choice but I'm not going to use this tool by choice > > > > > unless we know why we are doing this. > > > > > > > > > > > If a user is running > > > > > > a VM that gets a VM stop on AER error one day and they upgrade QEMU and > > > > > > restart it, they should get the same behavior, whether a migration is > > > > > > involved or not. > > > > > > > > > > You keep saying this, but why should it? Answer that question, the rest > > > > > will follow. > > > > > > > > My answer is that it's a user visible change in behavior that they may > > > > rely on and would not expect to change within an existing machine type. > > > > Silently modifying the default may expose them to error conditions that > > > > were previously handled by QEMU and adds AER handling requirements to > > > > the guest. > > > > > > > > Ignoring the question about whether an AER can be reliably recovered in > > > > the guest for a moment, a typical PC hardware platform will signal the > > > > running OS with AER errors, so it seems that *if* we can achieve parity > > > > in a virtual machine with that signaling, and more importantly the > > > > recovery process, then the default going forward should be to mimic bare > > > > metal behavior, thus changing the default from what we do today. > > > > > > > > As it is now, I think we have too many outstanding questions about how > > > > recover occurs to change the default. > > > > > > > > So let me return the question, if we were to resolve those questions and > > > > change the default handling from what we do today, creating a user > > > > visible behavior change and imposing new requirements for AER handling > > > > in the guest, why is that not worthy of machine type stability? I'd > > > > certainly expect it from a distro specific machine type. > > > > > > This really depends on what guests do with this. > > > We try to avoid guest visible changes during migration > > > (even that's not 100%). > > > There aren't such requirements for devices where migration > > > is disabled, e.g. we did many guest visible changes in q35. > > > > > > Latest machine types are better tested. Deviating from latest machine > > > just for a theoretical "this is guest visible" isn't going to give > > > you better stability, it will give you worse stability. > > > > You keep calling this theoretical. As it exists today, an uncorrected > > AER error results in a VM stop with no guest intervention or support > > required. If we were to change the default, the guest would instead be > > notified of the AER and given the opportunity to recover. That's not > > just guest visible, that's a complete change in error handling paradigm. > > OK, so maybe it's a feature that users should have control over. > But tying it to machine types makes no sense. If we were to change the default, where else would you tie it? Machine types are the only finger hold we have to maintain VM stability afaik.
On 18/03/2015 20:05, Alex Williamson wrote: > > OK, so maybe it's a feature that users should have control over. > > But tying it to machine types makes no sense. > > If we were to change the default, where else would you tie it? Machine > types are the only finger hold we have to maintain VM stability afaik. I agree. Machine type are there to avoid exposing "important" changes in the behavior of the device model to the guest. These include the number and size of capabilities and BARs (and other fields in configuration space), the number of virtqueues and how they are used (virtio features), etc. Migration is definitely the main reason to introduce a new property and add compatibility glue for older machine types, but it does not have to be the only one. AER seems to me like it is a significant-enough change in how the guest sees an assigned device. You certainly don't want it to appear in a configuration that was previously stable. So, migration could be a reason why we _have_ to introduce compatibility glue, but AER to me is a perfect example of why we may sometimes _want_ to have compatibility glue. It's different, but it makes a lot of sense. The maintenance cost of the compatibility glue is small. Hiding the capability is half a dozen lines of code, and the behavior is no different than when the guest doesn't want to see errors (which has to be supported, if only because not all chipsets are PCIe). Paolo
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 8eab4ba..ff9d312 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -307,6 +307,11 @@ static void pc_init1(MachineState *machine, static void pc_init_pci(MachineState *machine) { + static GlobalProperty pc_compat_props[] = { + PC_I440FX_COMPAT, + { /* end of list */ } + }; + qdev_prop_register_global_list(pc_compat_props); pc_init1(machine, 1, 1); } @@ -534,6 +539,10 @@ static QEMUMachine pc_i440fx_machine_v2_2 = { PC_I440FX_2_2_MACHINE_OPTIONS, .name = "pc-i440fx-2.2", .init = pc_init_pci_2_2, + .compat_props = (GlobalProperty[]) { + HW_COMPAT_2_2, + { /* end of list */ } + }, }; #define PC_I440FX_2_1_MACHINE_OPTIONS \ diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index c0f21fe..97afb7d 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -431,6 +431,10 @@ static QEMUMachine pc_q35_machine_v2_2 = { PC_Q35_2_2_MACHINE_OPTIONS, .name = "pc-q35-2.2", .init = pc_q35_init_2_2, + .compat_props = (GlobalProperty[]) { + HW_COMPAT_2_2, + { /* end of list */ } + }, }; #define PC_Q35_2_1_MACHINE_OPTIONS \ diff --git a/include/hw/compat.h b/include/hw/compat.h index 313682a..40c974a 100644 --- a/include/hw/compat.h +++ b/include/hw/compat.h @@ -1,7 +1,17 @@ #ifndef HW_COMPAT_H #define HW_COMPAT_H +#define HW_COMPAT_2_2 PC_I440FX_COMPAT + +#define PC_I440FX_COMPAT \ + {\ + .driver = "vfio-pci",\ + .property = "x-aer",\ + .value = "off",\ + } + #define HW_COMPAT_2_1 \ + HW_COMPAT_2_2, \ {\ .driver = "intel-hda",\ .property = "old_msi_addr",\
for piix4 chipset, we don't need to expose aer, so introduce PC_I440FX_COMPAT for all piix4 machines to disable aercap, and add HW_COMPAT_2_2 to disable aercap for all lower than 2.3. Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> --- hw/i386/pc_piix.c | 9 +++++++++ hw/i386/pc_q35.c | 4 ++++ include/hw/compat.h | 10 ++++++++++ 3 files changed, 23 insertions(+)