Message ID | 1523340161-8928-1-git-send-email-yulei.zhang@intel.com |
---|---|
State | New |
Headers | show |
Series | vfio: Introduce live migation capability to | expand |
On 4/10/2018 11:32 AM, Yulei Zhang wrote: > New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added > to fetch and restore the status of mdev device vGPU during the > live migration. > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> > --- > hw/vfio/pci.c | 25 ++++++++++++++++++++++++- > hw/vfio/pci.h | 2 ++ > linux-headers/linux/vfio.h | 9 ++++++--- > 3 files changed, 32 insertions(+), 4 deletions(-) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index c977ee3..f98a9dd 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -32,6 +32,7 @@ > #include "pci.h" > #include "trace.h" > #include "qapi/error.h" > +#include "migration/blocker.h" > > #define MSIX_CAP_LENGTH 12 > > @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > vfio_vga_quirk_setup(vdev); > } > > + struct vfio_region_info *device_state; > + /* device state region setup */ > + if (!vfio_get_dev_region_info(&vdev->vbasedev, > + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, > + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { > + memcpy(&vdev->device_state, device_state, > + sizeof(struct vfio_region_info)); > + g_free(device_state); > + } else { > + error_setg(&vdev->migration_blocker, > + "Migration disabled: cannot support device state region"); > + migrate_add_blocker(vdev->migration_blocker, &err); > + if (err) { > + error_propagate(errp, err); > + error_free(vdev->migration_blocker); > + goto error; > + } > + } > + I think there should be a _PROBE ioctl before trying to setup region for migration. If vfio device driver or vendor driver for mdev device supports migration capability, _PROBE ioctl should return success along with region's information <index, size> which can be used to setup the region. > for (i = 0; i < PCI_ROM_SLOT; i++) { > vfio_bar_quirk_setup(vdev, i); > } > @@ -2884,6 +2904,10 @@ out_teardown: > vfio_teardown_msi(vdev); > vfio_bars_exit(vdev); > error: > + if (vdev->migration_blocker) { > + migrate_del_blocker(vdev->migration_blocker); > + error_free(vdev->migration_blocker); > + } > error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); > } > > @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { > > static const VMStateDescription vfio_pci_vmstate = { > .name = "vfio-pci", > - .unmigratable = 1, Based on the result of above _PROBE ioctl, 'unmigratable' should be set if vendor driver doesn't support migration capability. Thanks, Kirti > }; > > static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 502a575..0ee1724 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > void *igd_opregion; > + struct vfio_region_info device_state; > + Error *migration_blocker; > PCIHostDeviceAddress host; > EventNotifier err_notifier; > EventNotifier req_notifier; > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > index 4312e96..e3380ad 100644 > --- a/linux-headers/linux/vfio.h > +++ b/linux-headers/linux/vfio.h > @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { > #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) > > /* 8086 Vendor sub-types */ > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > + > +/* Mdev sub-type for device state save and restore */ > +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) > > /** > * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, >
On Mon, 16 Apr 2018 20:14:03 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 4/10/2018 11:32 AM, Yulei Zhang wrote: > > New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added > > to fetch and restore the status of mdev device vGPU during the > > live migration. > > > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> > > --- > > hw/vfio/pci.c | 25 ++++++++++++++++++++++++- > > hw/vfio/pci.h | 2 ++ > > linux-headers/linux/vfio.h | 9 ++++++--- > > 3 files changed, 32 insertions(+), 4 deletions(-) > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > index c977ee3..f98a9dd 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -32,6 +32,7 @@ > > #include "pci.h" > > #include "trace.h" > > #include "qapi/error.h" > > +#include "migration/blocker.h" > > > > #define MSIX_CAP_LENGTH 12 > > > > @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > > vfio_vga_quirk_setup(vdev); > > } > > > > + struct vfio_region_info *device_state; > > + /* device state region setup */ > > + if (!vfio_get_dev_region_info(&vdev->vbasedev, > > + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, This is not how VENDOR_TYPE is meant to be used. We have a 32-bit type and 32-bit sub-type. When bit 31 of type is set (ie. VENDOR_TYPE), then the low 16-bits (VENDOR_MASK) defines a vendor specific set of sub-types. Using it as above, you're stomping on vendor 0x0000's vendor sub-types. I would expect non-vendor specific types to leave bit 31 of the type field clear. We could then define a universal type for device state with a sub-type specifying the API such that we could revise the API by providing an updated sub-type. Or perhaps there are devices that want separate save vs restore regions, we could do that with the sub-type. > > + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { > > + memcpy(&vdev->device_state, device_state, > > + sizeof(struct vfio_region_info)); > > + g_free(device_state); > > + } else { > > + error_setg(&vdev->migration_blocker, > > + "Migration disabled: cannot support device state region"); > > + migrate_add_blocker(vdev->migration_blocker, &err); This appears as if it's going to be rather verbose and generate errors for anything not supporting migration, which is currently everything. Maybe there should be an OnOffAuto vfio-pci device option for the user to specify migration such that if migration=on is specified the device will fail if it's not available. Otherwise the default would be auto. > > + if (err) { > > + error_propagate(errp, err); > > + error_free(vdev->migration_blocker); > > + goto error; > > + } > > + } > > + > > I think there should be a _PROBE ioctl before trying to setup region for > migration. If vfio device driver or vendor driver for mdev device > supports migration capability, _PROBE ioctl should return success along > with region's information <index, size> which can be used to setup the > region. How is _PROBE different from _REGION_INFO which already exists to tell us information about the region? > > for (i = 0; i < PCI_ROM_SLOT; i++) { > > vfio_bar_quirk_setup(vdev, i); > > } > > @@ -2884,6 +2904,10 @@ out_teardown: > > vfio_teardown_msi(vdev); > > vfio_bars_exit(vdev); > > error: > > + if (vdev->migration_blocker) { > > + migrate_del_blocker(vdev->migration_blocker); > > + error_free(vdev->migration_blocker); > > + } > > error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); > > } > > > > @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { > > > > static const VMStateDescription vfio_pci_vmstate = { > > .name = "vfio-pci", > > - .unmigratable = 1, > > Based on the result of above _PROBE ioctl, 'unmigratable' should be set > if vendor driver doesn't support migration capability. .unmigratable isn't really dynamically settable aiui, thus the existence of the migration blocker. > > }; > > > > static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > > index 502a575..0ee1724 100644 > > --- a/hw/vfio/pci.h > > +++ b/hw/vfio/pci.h > > @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { > > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > > void *igd_opregion; > > + struct vfio_region_info device_state; > > + Error *migration_blocker; > > PCIHostDeviceAddress host; > > EventNotifier err_notifier; > > EventNotifier req_notifier; > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > > index 4312e96..e3380ad 100644 > > --- a/linux-headers/linux/vfio.h > > +++ b/linux-headers/linux/vfio.h > > @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { > > #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) > > > > /* 8086 Vendor sub-types */ > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > + > > +/* Mdev sub-type for device state save and restore */ > > +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) This continues on with the 0x8086 vendor sub-types, which is entirely unnecessary as each vendor has an entire sub-type address space and we shouldn't be using a vendor type anyway. Additionally, we need significantly more documentation on the API for this region. Thanks, Alex > > > > /** > > * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, > >
> -----Original Message----- > From: Alex Williamson [mailto:alex.williamson@redhat.com] > Sent: Tuesday, April 17, 2018 4:15 AM > To: Kirti Wankhede <kwankhede@nvidia.com> > Cc: Zhang, Yulei <yulei.zhang@intel.com>; qemu-devel@nongnu.org; Tian, > Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com; > zhenyuw@linux.intel.com; Wang, Zhi A <zhi.a.wang@intel.com>; > dgilbert@redhat.com; quintela@redhat.com > Subject: Re: [RFC PATCH V4 1/4] vfio: introduce a new VFIO subregion for > mdev device migration support > > On Mon, 16 Apr 2018 20:14:03 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > On 4/10/2018 11:32 AM, Yulei Zhang wrote: > > > New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added > > > to fetch and restore the status of mdev device vGPU during the > > > live migration. > > > > > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> > > > --- > > > hw/vfio/pci.c | 25 ++++++++++++++++++++++++- > > > hw/vfio/pci.h | 2 ++ > > > linux-headers/linux/vfio.h | 9 ++++++--- > > > 3 files changed, 32 insertions(+), 4 deletions(-) > > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > index c977ee3..f98a9dd 100644 > > > --- a/hw/vfio/pci.c > > > +++ b/hw/vfio/pci.c > > > @@ -32,6 +32,7 @@ > > > #include "pci.h" > > > #include "trace.h" > > > #include "qapi/error.h" > > > +#include "migration/blocker.h" > > > > > > #define MSIX_CAP_LENGTH 12 > > > > > > @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error > **errp) > > > vfio_vga_quirk_setup(vdev); > > > } > > > > > > + struct vfio_region_info *device_state; > > > + /* device state region setup */ > > > + if (!vfio_get_dev_region_info(&vdev->vbasedev, > > > + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, > > This is not how VENDOR_TYPE is meant to be used. We have a 32-bit type > and 32-bit sub-type. When bit 31 of type is set (ie. VENDOR_TYPE), > then the low 16-bits (VENDOR_MASK) defines a vendor specific set of > sub-types. Using it as above, you're stomping on vendor 0x0000's > vendor sub-types. I would expect non-vendor specific types to leave > bit 31 of the type field clear. We could then define a universal type > for device state with a sub-type specifying the API such that we could > revise the API by providing an updated sub-type. Or perhaps there are > devices that want separate save vs restore regions, we could do that > with the sub-type. Got it. For the subtype, can we use it to specify how to use the region, to indicate it can be mmaped or not. > > > > + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { > > > + memcpy(&vdev->device_state, device_state, > > > + sizeof(struct vfio_region_info)); > > > + g_free(device_state); > > > + } else { > > > + error_setg(&vdev->migration_blocker, > > > + "Migration disabled: cannot support device state region"); > > > + migrate_add_blocker(vdev->migration_blocker, &err); > > This appears as if it's going to be rather verbose and generate errors > for anything not supporting migration, which is currently everything. > Maybe there should be an OnOffAuto vfio-pci device option for the user > to specify migration such that if migration=on is specified the device > will fail if it's not available. Otherwise the default would be auto. > We only get this error message when we try to migrate the guest with vfio-pci device which doesn't contain the device_state region, just like currently we set unmigratable=1. > > > + if (err) { > > > + error_propagate(errp, err); > > > + error_free(vdev->migration_blocker); > > > + goto error; > > > + } > > > + } > > > + > > > > I think there should be a _PROBE ioctl before trying to setup region for > > migration. If vfio device driver or vendor driver for mdev device > > supports migration capability, _PROBE ioctl should return success along > > with region's information <index, size> which can be used to setup the > > region. > > How is _PROBE different from _REGION_INFO which already exists to tell > us information about the region? > > > > for (i = 0; i < PCI_ROM_SLOT; i++) { > > > vfio_bar_quirk_setup(vdev, i); > > > } > > > @@ -2884,6 +2904,10 @@ out_teardown: > > > vfio_teardown_msi(vdev); > > > vfio_bars_exit(vdev); > > > error: > > > + if (vdev->migration_blocker) { > > > + migrate_del_blocker(vdev->migration_blocker); > > > + error_free(vdev->migration_blocker); > > > + } > > > error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); > > > } > > > > > > @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { > > > > > > static const VMStateDescription vfio_pci_vmstate = { > > > .name = "vfio-pci", > > > - .unmigratable = 1, > > > > Based on the result of above _PROBE ioctl, 'unmigratable' should be set > > if vendor driver doesn't support migration capability. > > .unmigratable isn't really dynamically settable aiui, thus the > existence of the migration blocker. > > > > }; > > > > > > static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > > > index 502a575..0ee1724 100644 > > > --- a/hw/vfio/pci.h > > > +++ b/hw/vfio/pci.h > > > @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { > > > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > > > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > > > void *igd_opregion; > > > + struct vfio_region_info device_state; > > > + Error *migration_blocker; > > > PCIHostDeviceAddress host; > > > EventNotifier err_notifier; > > > EventNotifier req_notifier; > > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > > > index 4312e96..e3380ad 100644 > > > --- a/linux-headers/linux/vfio.h > > > +++ b/linux-headers/linux/vfio.h > > > @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { > > > #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) > > > > > > /* 8086 Vendor sub-types */ > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > > + > > > +/* Mdev sub-type for device state save and restore */ > > > +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) > > This continues on with the 0x8086 vendor sub-types, which is entirely > unnecessary as each vendor has an entire sub-type address space and we > shouldn't be using a vendor type anyway. Additionally, we need > significantly more documentation on the API for this region. Thanks, > > Alex > > > > > > > /** > > > * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, > > >
On Tue, 17 Apr 2018 10:36:53 +0000 "Zhang, Yulei" <yulei.zhang@intel.com> wrote: > > -----Original Message----- > > From: Alex Williamson [mailto:alex.williamson@redhat.com] > > Sent: Tuesday, April 17, 2018 4:15 AM > > To: Kirti Wankhede <kwankhede@nvidia.com> > > Cc: Zhang, Yulei <yulei.zhang@intel.com>; qemu-devel@nongnu.org; Tian, > > Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com; > > zhenyuw@linux.intel.com; Wang, Zhi A <zhi.a.wang@intel.com>; > > dgilbert@redhat.com; quintela@redhat.com > > Subject: Re: [RFC PATCH V4 1/4] vfio: introduce a new VFIO subregion for > > mdev device migration support > > > > On Mon, 16 Apr 2018 20:14:03 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > > > On 4/10/2018 11:32 AM, Yulei Zhang wrote: > > > > New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added > > > > to fetch and restore the status of mdev device vGPU during the > > > > live migration. > > > > > > > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> > > > > --- > > > > hw/vfio/pci.c | 25 ++++++++++++++++++++++++- > > > > hw/vfio/pci.h | 2 ++ > > > > linux-headers/linux/vfio.h | 9 ++++++--- > > > > 3 files changed, 32 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > > index c977ee3..f98a9dd 100644 > > > > --- a/hw/vfio/pci.c > > > > +++ b/hw/vfio/pci.c > > > > @@ -32,6 +32,7 @@ > > > > #include "pci.h" > > > > #include "trace.h" > > > > #include "qapi/error.h" > > > > +#include "migration/blocker.h" > > > > > > > > #define MSIX_CAP_LENGTH 12 > > > > > > > > @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error > > **errp) > > > > vfio_vga_quirk_setup(vdev); > > > > } > > > > > > > > + struct vfio_region_info *device_state; > > > > + /* device state region setup */ > > > > + if (!vfio_get_dev_region_info(&vdev->vbasedev, > > > > + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, > > > > This is not how VENDOR_TYPE is meant to be used. We have a 32-bit type > > and 32-bit sub-type. When bit 31 of type is set (ie. VENDOR_TYPE), > > then the low 16-bits (VENDOR_MASK) defines a vendor specific set of > > sub-types. Using it as above, you're stomping on vendor 0x0000's > > vendor sub-types. I would expect non-vendor specific types to leave > > bit 31 of the type field clear. We could then define a universal type > > for device state with a sub-type specifying the API such that we could > > revise the API by providing an updated sub-type. Or perhaps there are > > devices that want separate save vs restore regions, we could do that > > with the sub-type. > > Got it. For the subtype, can we use it to specify how to use the region, > to indicate it can be mmaped or not. No, REGION_INFO tells us about mmap'ability and device state regions can certainly support the sparse mmap capability, we already have full ability to describe the mmap features of a region without dedicating a sub-type to it. A sub-type must have a defined API, but use the existing mechanisms for mmap. > > > > + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { > > > > + memcpy(&vdev->device_state, device_state, > > > > + sizeof(struct vfio_region_info)); > > > > + g_free(device_state); > > > > + } else { > > > > + error_setg(&vdev->migration_blocker, > > > > + "Migration disabled: cannot support device state region"); > > > > + migrate_add_blocker(vdev->migration_blocker, &err); > > > > This appears as if it's going to be rather verbose and generate errors > > for anything not supporting migration, which is currently everything. > > Maybe there should be an OnOffAuto vfio-pci device option for the user > > to specify migration such that if migration=on is specified the device > > will fail if it's not available. Otherwise the default would be auto. > > > We only get this error message when we try to migrate the guest with > vfio-pci device which doesn't contain the device_state region, just like > currently we set unmigratable=1. Ok, I still wonder though if management tools might want the ability to fail the device if it doesn't support migration via an OnOffAuto option. I can imagine a customer that doesn't want to compromise the migrate'ability of their VM and doesn't want to wait until a migration is attempted to find out it's blocked. Thanks, Alex > > > > + if (err) { > > > > + error_propagate(errp, err); > > > > + error_free(vdev->migration_blocker); > > > > + goto error; > > > > + } > > > > + } > > > > + > > > > > > I think there should be a _PROBE ioctl before trying to setup region for > > > migration. If vfio device driver or vendor driver for mdev device > > > supports migration capability, _PROBE ioctl should return success along > > > with region's information <index, size> which can be used to setup the > > > region. > > > > How is _PROBE different from _REGION_INFO which already exists to tell > > us information about the region? > > > > > > for (i = 0; i < PCI_ROM_SLOT; i++) { > > > > vfio_bar_quirk_setup(vdev, i); > > > > } > > > > @@ -2884,6 +2904,10 @@ out_teardown: > > > > vfio_teardown_msi(vdev); > > > > vfio_bars_exit(vdev); > > > > error: > > > > + if (vdev->migration_blocker) { > > > > + migrate_del_blocker(vdev->migration_blocker); > > > > + error_free(vdev->migration_blocker); > > > > + } > > > > error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); > > > > } > > > > > > > > @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { > > > > > > > > static const VMStateDescription vfio_pci_vmstate = { > > > > .name = "vfio-pci", > > > > - .unmigratable = 1, > > > > > > Based on the result of above _PROBE ioctl, 'unmigratable' should be set > > > if vendor driver doesn't support migration capability. > > > > .unmigratable isn't really dynamically settable aiui, thus the > > existence of the migration blocker. > > > > > > }; > > > > > > > > static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > > > > index 502a575..0ee1724 100644 > > > > --- a/hw/vfio/pci.h > > > > +++ b/hw/vfio/pci.h > > > > @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { > > > > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > > > > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > > > > void *igd_opregion; > > > > + struct vfio_region_info device_state; > > > > + Error *migration_blocker; > > > > PCIHostDeviceAddress host; > > > > EventNotifier err_notifier; > > > > EventNotifier req_notifier; > > > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > > > > index 4312e96..e3380ad 100644 > > > > --- a/linux-headers/linux/vfio.h > > > > +++ b/linux-headers/linux/vfio.h > > > > @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { > > > > #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) > > > > > > > > /* 8086 Vendor sub-types */ > > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > > > + > > > > +/* Mdev sub-type for device state save and restore */ > > > > +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) > > > > This continues on with the 0x8086 vendor sub-types, which is entirely > > unnecessary as each vendor has an entire sub-type address space and we > > shouldn't be using a vendor type anyway. Additionally, we need > > significantly more documentation on the API for this region. Thanks, > > > > Alex > > > > > > > > > > /** > > > > * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, > > > > >
* Alex Williamson (alex.williamson@redhat.com) wrote: > On Tue, 17 Apr 2018 10:36:53 +0000 > "Zhang, Yulei" <yulei.zhang@intel.com> wrote: > > > > -----Original Message----- > > > From: Alex Williamson [mailto:alex.williamson@redhat.com] > > > Sent: Tuesday, April 17, 2018 4:15 AM > > > To: Kirti Wankhede <kwankhede@nvidia.com> > > > Cc: Zhang, Yulei <yulei.zhang@intel.com>; qemu-devel@nongnu.org; Tian, > > > Kevin <kevin.tian@intel.com>; joonas.lahtinen@linux.intel.com; > > > zhenyuw@linux.intel.com; Wang, Zhi A <zhi.a.wang@intel.com>; > > > dgilbert@redhat.com; quintela@redhat.com > > > Subject: Re: [RFC PATCH V4 1/4] vfio: introduce a new VFIO subregion for > > > mdev device migration support > > > > > > On Mon, 16 Apr 2018 20:14:03 +0530 > > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > > > > > On 4/10/2018 11:32 AM, Yulei Zhang wrote: > > > > > New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added > > > > > to fetch and restore the status of mdev device vGPU during the > > > > > live migration. > > > > > > > > > > Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> > > > > > --- > > > > > hw/vfio/pci.c | 25 ++++++++++++++++++++++++- > > > > > hw/vfio/pci.h | 2 ++ > > > > > linux-headers/linux/vfio.h | 9 ++++++--- > > > > > 3 files changed, 32 insertions(+), 4 deletions(-) > > > > > > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > > > index c977ee3..f98a9dd 100644 > > > > > --- a/hw/vfio/pci.c > > > > > +++ b/hw/vfio/pci.c > > > > > @@ -32,6 +32,7 @@ > > > > > #include "pci.h" > > > > > #include "trace.h" > > > > > #include "qapi/error.h" > > > > > +#include "migration/blocker.h" > > > > > > > > > > #define MSIX_CAP_LENGTH 12 > > > > > > > > > > @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error > > > **errp) > > > > > vfio_vga_quirk_setup(vdev); > > > > > } > > > > > > > > > > + struct vfio_region_info *device_state; > > > > > + /* device state region setup */ > > > > > + if (!vfio_get_dev_region_info(&vdev->vbasedev, > > > > > + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, > > > > > > This is not how VENDOR_TYPE is meant to be used. We have a 32-bit type > > > and 32-bit sub-type. When bit 31 of type is set (ie. VENDOR_TYPE), > > > then the low 16-bits (VENDOR_MASK) defines a vendor specific set of > > > sub-types. Using it as above, you're stomping on vendor 0x0000's > > > vendor sub-types. I would expect non-vendor specific types to leave > > > bit 31 of the type field clear. We could then define a universal type > > > for device state with a sub-type specifying the API such that we could > > > revise the API by providing an updated sub-type. Or perhaps there are > > > devices that want separate save vs restore regions, we could do that > > > with the sub-type. > > > > Got it. For the subtype, can we use it to specify how to use the region, > > to indicate it can be mmaped or not. > > No, REGION_INFO tells us about mmap'ability and device state regions > can certainly support the sparse mmap capability, we already have full > ability to describe the mmap features of a region without dedicating a > sub-type to it. A sub-type must have a defined API, but use the > existing mechanisms for mmap. > > > > > > + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { > > > > > + memcpy(&vdev->device_state, device_state, > > > > > + sizeof(struct vfio_region_info)); > > > > > + g_free(device_state); > > > > > + } else { > > > > > + error_setg(&vdev->migration_blocker, > > > > > + "Migration disabled: cannot support device state region"); > > > > > + migrate_add_blocker(vdev->migration_blocker, &err); > > > > > > This appears as if it's going to be rather verbose and generate errors > > > for anything not supporting migration, which is currently everything. > > > Maybe there should be an OnOffAuto vfio-pci device option for the user > > > to specify migration such that if migration=on is specified the device > > > will fail if it's not available. Otherwise the default would be auto. > > > > > We only get this error message when we try to migrate the guest with > > vfio-pci device which doesn't contain the device_state region, just like > > currently we set unmigratable=1. > > Ok, I still wonder though if management tools might want the ability to > fail the device if it doesn't support migration via an OnOffAuto > option. I can imagine a customer that doesn't want to compromise the > migrate'ability of their VM and doesn't want to wait until a migration > is attempted to find out it's blocked. Thanks, The fix here is to check the return value from migrate_add_blocker and bail if it's not happy. If QEMU is started with --only-migratable it'll fail with the error 'disallowing migration blocker (--only migratable) for:' and then your message. Dave > Alex > > > > > > + if (err) { > > > > > + error_propagate(errp, err); > > > > > + error_free(vdev->migration_blocker); > > > > > + goto error; > > > > > + } > > > > > + } > > > > > + > > > > > > > > I think there should be a _PROBE ioctl before trying to setup region for > > > > migration. If vfio device driver or vendor driver for mdev device > > > > supports migration capability, _PROBE ioctl should return success along > > > > with region's information <index, size> which can be used to setup the > > > > region. > > > > > > How is _PROBE different from _REGION_INFO which already exists to tell > > > us information about the region? > > > > > > > > for (i = 0; i < PCI_ROM_SLOT; i++) { > > > > > vfio_bar_quirk_setup(vdev, i); > > > > > } > > > > > @@ -2884,6 +2904,10 @@ out_teardown: > > > > > vfio_teardown_msi(vdev); > > > > > vfio_bars_exit(vdev); > > > > > error: > > > > > + if (vdev->migration_blocker) { > > > > > + migrate_del_blocker(vdev->migration_blocker); > > > > > + error_free(vdev->migration_blocker); > > > > > + } > > > > > error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); > > > > > } > > > > > > > > > > @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { > > > > > > > > > > static const VMStateDescription vfio_pci_vmstate = { > > > > > .name = "vfio-pci", > > > > > - .unmigratable = 1, > > > > > > > > Based on the result of above _PROBE ioctl, 'unmigratable' should be set > > > > if vendor driver doesn't support migration capability. > > > > > > .unmigratable isn't really dynamically settable aiui, thus the > > > existence of the migration blocker. > > > > > > > > }; > > > > > > > > > > static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) > > > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > > > > > index 502a575..0ee1724 100644 > > > > > --- a/hw/vfio/pci.h > > > > > +++ b/hw/vfio/pci.h > > > > > @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { > > > > > VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ > > > > > VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ > > > > > void *igd_opregion; > > > > > + struct vfio_region_info device_state; > > > > > + Error *migration_blocker; > > > > > PCIHostDeviceAddress host; > > > > > EventNotifier err_notifier; > > > > > EventNotifier req_notifier; > > > > > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > > > > > index 4312e96..e3380ad 100644 > > > > > --- a/linux-headers/linux/vfio.h > > > > > +++ b/linux-headers/linux/vfio.h > > > > > @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { > > > > > #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) > > > > > > > > > > /* 8086 Vendor sub-types */ > > > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > > > > -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) > > > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) > > > > > +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) > > > > > + > > > > > +/* Mdev sub-type for device state save and restore */ > > > > > +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) > > > > > > This continues on with the 0x8086 vendor sub-types, which is entirely > > > unnecessary as each vendor has an entire sub-type address space and we > > > shouldn't be using a vendor type anyway. Additionally, we need > > > significantly more documentation on the API for this region. Thanks, > > > > > > Alex > > > > > > > > > > > > > /** > > > > > * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, > > > > > > > > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On 4/17/2018 1:45 AM, Alex Williamson wrote: > On Mon, 16 Apr 2018 20:14:03 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> On 4/10/2018 11:32 AM, Yulei Zhang wrote: >>> New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added >>> to fetch and restore the status of mdev device vGPU during the >>> live migration. >>> >>> Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> >>> --- >>> hw/vfio/pci.c | 25 ++++++++++++++++++++++++- >>> hw/vfio/pci.h | 2 ++ >>> linux-headers/linux/vfio.h | 9 ++++++--- >>> 3 files changed, 32 insertions(+), 4 deletions(-) >>> >>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >>> index c977ee3..f98a9dd 100644 >>> --- a/hw/vfio/pci.c >>> +++ b/hw/vfio/pci.c >>> @@ -32,6 +32,7 @@ >>> #include "pci.h" >>> #include "trace.h" >>> #include "qapi/error.h" >>> +#include "migration/blocker.h" >>> >>> #define MSIX_CAP_LENGTH 12 >>> >>> @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) >>> vfio_vga_quirk_setup(vdev); >>> } >>> >>> + struct vfio_region_info *device_state; >>> + /* device state region setup */ >>> + if (!vfio_get_dev_region_info(&vdev->vbasedev, >>> + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, > > This is not how VENDOR_TYPE is meant to be used. We have a 32-bit type > and 32-bit sub-type. When bit 31 of type is set (ie. VENDOR_TYPE), > then the low 16-bits (VENDOR_MASK) defines a vendor specific set of > sub-types. Using it as above, you're stomping on vendor 0x0000's > vendor sub-types. I would expect non-vendor specific types to leave > bit 31 of the type field clear. We could then define a universal type > for device state with a sub-type specifying the API such that we could > revise the API by providing an updated sub-type. Or perhaps there are > devices that want separate save vs restore regions, we could do that > with the sub-type. > >>> + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { >>> + memcpy(&vdev->device_state, device_state, >>> + sizeof(struct vfio_region_info)); >>> + g_free(device_state); >>> + } else { >>> + error_setg(&vdev->migration_blocker, >>> + "Migration disabled: cannot support device state region"); >>> + migrate_add_blocker(vdev->migration_blocker, &err); > > This appears as if it's going to be rather verbose and generate errors > for anything not supporting migration, which is currently everything. > Maybe there should be an OnOffAuto vfio-pci device option for the user > to specify migration such that if migration=on is specified the device > will fail if it's not available. Otherwise the default would be auto. > >>> + if (err) { >>> + error_propagate(errp, err); >>> + error_free(vdev->migration_blocker); >>> + goto error; >>> + } >>> + } >>> + >> >> I think there should be a _PROBE ioctl before trying to setup region for >> migration. If vfio device driver or vendor driver for mdev device >> supports migration capability, _PROBE ioctl should return success along >> with region's information <index, size> which can be used to setup the >> region. > > How is _PROBE different from _REGION_INFO which already exists to tell > us information about the region? > Fine with _REGION_INFO if SUBTYPE is non-vendor specific as you mentioned above. Thanks, Kirti >>> for (i = 0; i < PCI_ROM_SLOT; i++) { >>> vfio_bar_quirk_setup(vdev, i); >>> } >>> @@ -2884,6 +2904,10 @@ out_teardown: >>> vfio_teardown_msi(vdev); >>> vfio_bars_exit(vdev); >>> error: >>> + if (vdev->migration_blocker) { >>> + migrate_del_blocker(vdev->migration_blocker); >>> + error_free(vdev->migration_blocker); >>> + } >>> error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); >>> } >>> >>> @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { >>> >>> static const VMStateDescription vfio_pci_vmstate = { >>> .name = "vfio-pci", >>> - .unmigratable = 1, >> >> Based on the result of above _PROBE ioctl, 'unmigratable' should be set >> if vendor driver doesn't support migration capability. > > .unmigratable isn't really dynamically settable aiui, thus the > existence of the migration blocker. > >>> }; >>> >>> static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) >>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h >>> index 502a575..0ee1724 100644 >>> --- a/hw/vfio/pci.h >>> +++ b/hw/vfio/pci.h >>> @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { >>> VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ >>> VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ >>> void *igd_opregion; >>> + struct vfio_region_info device_state; >>> + Error *migration_blocker; >>> PCIHostDeviceAddress host; >>> EventNotifier err_notifier; >>> EventNotifier req_notifier; >>> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h >>> index 4312e96..e3380ad 100644 >>> --- a/linux-headers/linux/vfio.h >>> +++ b/linux-headers/linux/vfio.h >>> @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { >>> #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) >>> >>> /* 8086 Vendor sub-types */ >>> -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) >>> -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) >>> -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) >>> +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) >>> +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) >>> +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) >>> + >>> +/* Mdev sub-type for device state save and restore */ >>> +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) > > This continues on with the 0x8086 vendor sub-types, which is entirely > unnecessary as each vendor has an entire sub-type address space and we > shouldn't be using a vendor type anyway. Additionally, we need > significantly more documentation on the API for this region. Thanks, > > Alex > >>> >>> /** >>> * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, >>> >
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c977ee3..f98a9dd 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -32,6 +32,7 @@ #include "pci.h" #include "trace.h" #include "qapi/error.h" +#include "migration/blocker.h" #define MSIX_CAP_LENGTH 12 @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_vga_quirk_setup(vdev); } + struct vfio_region_info *device_state; + /* device state region setup */ + if (!vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_DEVICE_STATE, &device_state)) { + memcpy(&vdev->device_state, device_state, + sizeof(struct vfio_region_info)); + g_free(device_state); + } else { + error_setg(&vdev->migration_blocker, + "Migration disabled: cannot support device state region"); + migrate_add_blocker(vdev->migration_blocker, &err); + if (err) { + error_propagate(errp, err); + error_free(vdev->migration_blocker); + goto error; + } + } + for (i = 0; i < PCI_ROM_SLOT; i++) { vfio_bar_quirk_setup(vdev, i); } @@ -2884,6 +2904,10 @@ out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); error: + if (vdev->migration_blocker) { + migrate_del_blocker(vdev->migration_blocker); + error_free(vdev->migration_blocker); + } error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); } @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { static const VMStateDescription vfio_pci_vmstate = { .name = "vfio-pci", - .unmigratable = 1, }; static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 502a575..0ee1724 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ void *igd_opregion; + struct vfio_region_info device_state; + Error *migration_blocker; PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 4312e96..e3380ad 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) /* 8086 Vendor sub-types */ -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) + +/* Mdev sub-type for device state save and restore */ +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added to fetch and restore the status of mdev device vGPU during the live migration. Signed-off-by: Yulei Zhang <yulei.zhang@intel.com> --- hw/vfio/pci.c | 25 ++++++++++++++++++++++++- hw/vfio/pci.h | 2 ++ linux-headers/linux/vfio.h | 9 ++++++--- 3 files changed, 32 insertions(+), 4 deletions(-)