diff mbox

[v9,10/11] vfio: Add waiting for host aer error progress

Message ID 1468913909-21811-11-git-send-email-zhoujie2011@cn.fujitsu.com
State New
Headers show

Commit Message

Zhou Jie July 19, 2016, 7:38 a.m. UTC
From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we poll the vfio_device_info to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for aer error progress
     If host vfio driver does not support for aer error progress,
     directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Wait for host aer error progress
     Poll the vfio_device_info, If it is still in aer error progress after
     some timeout, we would abort the guest directed bus reset
     altogether and unplug of the device to prevent it from further
     interacting with the VM.
  4. Reset bus.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
---
 hw/vfio/pci.c              | 51 +++++++++++++++++++++++++++++++++++++++++++++-
 hw/vfio/pci.h              |  1 +
 linux-headers/linux/vfio.h |  4 ++++
 3 files changed, 55 insertions(+), 1 deletion(-)

Comments

Alex Williamson Aug. 31, 2016, 8:13 p.m. UTC | #1
On Tue, 19 Jul 2016 15:38:28 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> For supporting aer recovery, host and guest would run the same aer
> recovery code, that would do the secondary bus reset if the error
> is fatal, the aer recovery process:
>   1. error_detected
>   2. reset_link (if fatal)
>   3. slot_reset/mmio_enabled
>   4. resume
> 
> It indicates that host will do secondary bus reset to reset
> the physical devices under bus in step 2, that would cause
> devices in D3 status in a short time. But in qemu, we register
> an error detected handler, that would be invoked as host broadcasts
> the error-detected event in step 1, in order to avoid guest do
> reset_link when host do reset_link simultaneously. it may cause
> fatal error. we poll the vfio_device_info to assure host reset
> completely.
> In qemu, the aer recovery process:
>   1. Detect support for aer error progress
>      If host vfio driver does not support for aer error progress,
>      directly fail to boot up VM as with aer enabled.
>   2. Immediately notify the VM on error detected.
>   3. Wait for host aer error progress
>      Poll the vfio_device_info, If it is still in aer error progress after
>      some timeout, we would abort the guest directed bus reset
>      altogether and unplug of the device to prevent it from further
>      interacting with the VM.
>   4. Reset bus.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c              | 51 +++++++++++++++++++++++++++++++++++++++++++++-
>  hw/vfio/pci.h              |  1 +
>  linux-headers/linux/vfio.h |  4 ++++
>  3 files changed, 55 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 0e42786..777245c 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -35,6 +35,12 @@
>  
>  #define MSIX_CAP_LENGTH 12
>  
> +/*
> + * Timeout for waiting host aer error process, it is 3 seconds.
> + * For hardware bus reset 3 seconds will be enough.
> + */
> +#define PCI_AER_PROCESS_TIMEOUT 3000000

Why is 3 seconds "enough"?  What considerations went into determining
this that would need to be re-evaluated if we ever want to change it?
24 hours is enough, but why was 3 seconds chosen over 24 hours?  Why
would 2 seconds be a worse choice?  1?

> +
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>  
> @@ -1913,6 +1919,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
>      VFIOGroup *group;
>      int ret, i, devfn, range_limit;
>  
> +    if (!(vdev->vbasedev.flags & VFIO_DEVICE_FLAGS_AERPROCESS)) {
> +        error_setg(errp, "vfio: Cannot enable AER for device %s,"
> +                   " host vfio driver does not support for"
> +                   " aer error progress",
> +                   vdev->vbasedev.name);
> +        return;
> +    }
> +
>      ret = vfio_get_hot_reset_info(vdev, &info);
>      if (ret) {
>          error_setg(errp, "vfio: Cannot enable AER for device %s,"
> @@ -2679,6 +2693,11 @@ static void vfio_err_notifier_handler(void *opaque)
>          msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>                                   PCI_ERR_ROOT_CMD_NONFATAL_EN;
>  
> +        if (isfatal) {
> +            PCIDevice *dev_0 = pci_get_function_0(dev);
> +            VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +            vdev_0->pci_aer_error_signaled = true;
> +        }
>          pcie_aer_msg(dev, &msg);
>          return;
>      }
> @@ -3163,6 +3182,19 @@ static void vfio_exitfn(PCIDevice *pdev)
>      vfio_bars_exit(vdev);
>  }
>  
> +static int vfio_aer_error_is_in_process(VFIOPCIDevice *vdev)
> +{
> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> +    int ret;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_INFO, &dev_info);
> +    if (ret) {
> +        error_report("vfio: error getting device info: %m");
> +        return ret;
> +    }
> +    return dev_info.flags & VFIO_DEVICE_FLAGS_INAERPROCESS ? 1 : 0;
> +}
> +
>  static void vfio_pci_reset(DeviceState *dev)
>  {
>      PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> @@ -3176,7 +3208,24 @@ static void vfio_pci_reset(DeviceState *dev)
>          if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
>               PCI_BRIDGE_CTL_BUS_RESET)) {
>              if (pci_get_function_0(pdev) == pdev) {
> -                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                if (!vdev->pci_aer_error_signaled) {
> +                    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                } else {
> +                    int i;
> +                    for (i = 0; i < 1000; i++) {
> +                        if (!vfio_aer_error_is_in_process(vdev)) {
> +                            break;
> +                        }
> +                        g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000);
> +                    }
> +
> +                    if (i == 1000) {
> +                        qdev_unplug(&vdev->pdev.qdev, NULL);

This looks a bit precarious to me, but I guess in the end we're only
really sending an attention button press on the hotplug slot.  Have you
forced this code path in testing and does the right thing happen for
both Windows and Linux guests?

> +                    } else {
> +                        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                    }
> +                    vdev->pci_aer_error_signaled = false;
> +                }
>              }
>              return;
>          }
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index ed14322..c9e0202 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -148,6 +148,7 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_msi;
>      bool no_kvm_msix;
>      bool single_depend_dev;
> +    bool pci_aer_error_signaled;
>  } VFIOPCIDevice;
>  
>  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 759b850..9295fca 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -198,6 +198,10 @@ struct vfio_device_info {
>  #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
> +/* support aer error progress */
> +#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4)
> +/* status in aer error progress */
> +#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)
>  	__u32	num_regions;	/* Max region index + 1 */
>  	__u32	num_irqs;	/* Max IRQ index + 1 */
>  };
Michael S. Tsirkin Aug. 31, 2016, 8:34 p.m. UTC | #2
On Wed, Aug 31, 2016 at 02:13:09PM -0600, Alex Williamson wrote:
> On Tue, 19 Jul 2016 15:38:28 +0800
> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
> 
> > From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > 
> > For supporting aer recovery, host and guest would run the same aer
> > recovery code, that would do the secondary bus reset if the error
> > is fatal, the aer recovery process:
> >   1. error_detected
> >   2. reset_link (if fatal)
> >   3. slot_reset/mmio_enabled
> >   4. resume
> > 
> > It indicates that host will do secondary bus reset to reset
> > the physical devices under bus in step 2, that would cause
> > devices in D3 status in a short time. But in qemu, we register
> > an error detected handler, that would be invoked as host broadcasts
> > the error-detected event in step 1, in order to avoid guest do
> > reset_link when host do reset_link simultaneously. it may cause
> > fatal error. we poll the vfio_device_info to assure host reset
> > completely.
> > In qemu, the aer recovery process:
> >   1. Detect support for aer error progress
> >      If host vfio driver does not support for aer error progress,
> >      directly fail to boot up VM as with aer enabled.
> >   2. Immediately notify the VM on error detected.
> >   3. Wait for host aer error progress
> >      Poll the vfio_device_info, If it is still in aer error progress after
> >      some timeout, we would abort the guest directed bus reset
> >      altogether and unplug of the device to prevent it from further
> >      interacting with the VM.
> >   4. Reset bus.
> > 
> > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
> > ---
> >  hw/vfio/pci.c              | 51 +++++++++++++++++++++++++++++++++++++++++++++-
> >  hw/vfio/pci.h              |  1 +
> >  linux-headers/linux/vfio.h |  4 ++++
> >  3 files changed, 55 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 0e42786..777245c 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -35,6 +35,12 @@
> >  
> >  #define MSIX_CAP_LENGTH 12
> >  
> > +/*
> > + * Timeout for waiting host aer error process, it is 3 seconds.
> > + * For hardware bus reset 3 seconds will be enough.
> > + */
> > +#define PCI_AER_PROCESS_TIMEOUT 3000000
> 
> Why is 3 seconds "enough"?  What considerations went into determining
> this that would need to be re-evaluated if we ever want to change it?
> 24 hours is enough, but why was 3 seconds chosen over 24 hours?  Why
> would 2 seconds be a worse choice?  1?

And just to clarify, the answer belongs in a code comment
and possibly commit log, not just in an email response.

> > +
> >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> >  
> > @@ -1913,6 +1919,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
> >      VFIOGroup *group;
> >      int ret, i, devfn, range_limit;
> >  
> > +    if (!(vdev->vbasedev.flags & VFIO_DEVICE_FLAGS_AERPROCESS)) {
> > +        error_setg(errp, "vfio: Cannot enable AER for device %s,"
> > +                   " host vfio driver does not support for"
> > +                   " aer error progress",
> > +                   vdev->vbasedev.name);
> > +        return;
> > +    }
> > +
> >      ret = vfio_get_hot_reset_info(vdev, &info);
> >      if (ret) {
> >          error_setg(errp, "vfio: Cannot enable AER for device %s,"
> > @@ -2679,6 +2693,11 @@ static void vfio_err_notifier_handler(void *opaque)
> >          msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> >                                   PCI_ERR_ROOT_CMD_NONFATAL_EN;
> >  
> > +        if (isfatal) {
> > +            PCIDevice *dev_0 = pci_get_function_0(dev);
> > +            VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> > +            vdev_0->pci_aer_error_signaled = true;
> > +        }
> >          pcie_aer_msg(dev, &msg);
> >          return;
> >      }
> > @@ -3163,6 +3182,19 @@ static void vfio_exitfn(PCIDevice *pdev)
> >      vfio_bars_exit(vdev);
> >  }
> >  
> > +static int vfio_aer_error_is_in_process(VFIOPCIDevice *vdev)
> > +{
> > +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> > +    int ret;
> > +
> > +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_INFO, &dev_info);
> > +    if (ret) {
> > +        error_report("vfio: error getting device info: %m");
> > +        return ret;
> > +    }
> > +    return dev_info.flags & VFIO_DEVICE_FLAGS_INAERPROCESS ? 1 : 0;
> > +}
> > +
> >  static void vfio_pci_reset(DeviceState *dev)
> >  {
> >      PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> > @@ -3176,7 +3208,24 @@ static void vfio_pci_reset(DeviceState *dev)
> >          if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
> >               PCI_BRIDGE_CTL_BUS_RESET)) {
> >              if (pci_get_function_0(pdev) == pdev) {
> > -                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> > +                if (!vdev->pci_aer_error_signaled) {
> > +                    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> > +                } else {
> > +                    int i;
> > +                    for (i = 0; i < 1000; i++) {
> > +                        if (!vfio_aer_error_is_in_process(vdev)) {
> > +                            break;
> > +                        }
> > +                        g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000);
> > +                    }
> > +
> > +                    if (i == 1000) {
> > +                        qdev_unplug(&vdev->pdev.qdev, NULL);
> 
> This looks a bit precarious to me, but I guess in the end we're only
> really sending an attention button press on the hotplug slot.  Have you
> forced this code path in testing and does the right thing happen for
> both Windows and Linux guests?

If all else fails, require management to specify the timeout.


> > +                    } else {
> > +                        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> > +                    }
> > +                    vdev->pci_aer_error_signaled = false;
> > +                }
> >              }
> >              return;
> >          }
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > index ed14322..c9e0202 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -148,6 +148,7 @@ typedef struct VFIOPCIDevice {
> >      bool no_kvm_msi;
> >      bool no_kvm_msix;
> >      bool single_depend_dev;
> > +    bool pci_aer_error_signaled;
> >  } VFIOPCIDevice;
> >  
> >  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> > index 759b850..9295fca 100644
> > --- a/linux-headers/linux/vfio.h
> > +++ b/linux-headers/linux/vfio.h
> > @@ -198,6 +198,10 @@ struct vfio_device_info {
> >  #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
> >  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
> >  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
> > +/* support aer error progress */
> > +#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4)
> > +/* status in aer error progress */
> > +#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)
> >  	__u32	num_regions;	/* Max region index + 1 */
> >  	__u32	num_irqs;	/* Max IRQ index + 1 */
> >  };
Cao jin Oct. 24, 2016, 7:56 a.m. UTC | #3
Hi

On 07/19/2016 03:38 PM, Zhou Jie wrote:
> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>
> For supporting aer recovery, host and guest would run the same aer
> recovery code, that would do the secondary bus reset if the error
> is fatal, the aer recovery process:
>    1. error_detected
>    2. reset_link (if fatal)
>    3. slot_reset/mmio_enabled
>    4. resume
>
> It indicates that host will do secondary bus reset to reset
> the physical devices under bus in step 2, that would cause
> devices in D3 status in a short time. But in qemu, we register
> an error detected handler, that would be invoked as host broadcasts
> the error-detected event in step 1, in order to avoid guest do
> reset_link when host do reset_link simultaneously. it may cause
> fatal error. we poll the vfio_device_info to assure host reset
> completely.
> In qemu, the aer recovery process:
>    1. Detect support for aer error progress
>       If host vfio driver does not support for aer error progress,
>       directly fail to boot up VM as with aer enabled.
>    2. Immediately notify the VM on error detected.
>    3. Wait for host aer error progress
>       Poll the vfio_device_info, If it is still in aer error progress after
>       some timeout, we would abort the guest directed bus reset
>       altogether and unplug of the device to prevent it from further
>       interacting with the VM.
>    4. Reset bus.
>

I have question about step 4(Reset bus). When guest reset link, guest 
set 'secondary bus reset' bit, then devices under the bus would do reset 
themselves separately. For vfio-pci, the emulated device, I think the 
previous logic[*] is fine. But now process looks like following:

1. One affected device: we do(or wait & do) bus 
reset(vfio_pci_hot_reset) directly
2. N affected devices: function 0 will do the same as 1.
    other affected devices will do nothing, just return.

these logic seems weird to me, are these what we want?

I don't see why we don't use the previous 'reset priority' for each 
vfio-pci emulated devices.

[*]reset priority
      1. If has "device specific reset function", then do it
      2. If has FLR, then do it.
      3. If it can do bus reset(only 1 affected device), then do it
      4. If has pm_reset, then do it

This is what I think:

static void vfio_pci_reset(DeviceState *dev)
{
     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
     int i;

     for (i = 0; i < 1000; i++) {
         if (!vfio_aer_error_is_in_process(vdev)) {
             break;
         }
         g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000);
     }

     if (i == 1000) {
         qdev_unplug(&vdev->pdev.qdev, NULL);
     } else {
         trace_vfio_pci_reset(vdev->vbasedev.name);

         vfio_pci_pre_reset(vdev);

         if (vdev->resetfn && !vdev->resetfn(vdev)) {
             goto post_reset;
         }

         if (vdev->vbasedev.reset_works &&
             (vdev->has_flr || !vdev->has_pm_reset) &&
             !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
             trace_vfio_pci_reset_flr(vdev->vbasedev.name);
             goto post_reset;
         }

         /* See if we can do our own bus reset */
         if (!vfio_pci_hot_reset_one(vdev)) {
             goto post_reset;
         }

         /* If nothing else works and the device supports PM reset, use 
it */
         if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
             !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
             trace_vfio_pci_reset_pm(vdev->vbasedev.name);
             goto post_reset;
         }
     }

post_reset:
     vfio_pci_post_reset(vdev);
}

Cao jin

> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Zhou Jie <zhoujie2011@cn.fujitsu.com>
> ---
>   hw/vfio/pci.c              | 51 +++++++++++++++++++++++++++++++++++++++++++++-
>   hw/vfio/pci.h              |  1 +
>   linux-headers/linux/vfio.h |  4 ++++
>   3 files changed, 55 insertions(+), 1 deletion(-)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 0e42786..777245c 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -35,6 +35,12 @@
>
>   #define MSIX_CAP_LENGTH 12
>
> +/*
> + * Timeout for waiting host aer error process, it is 3 seconds.
> + * For hardware bus reset 3 seconds will be enough.
> + */
> +#define PCI_AER_PROCESS_TIMEOUT 3000000
> +
>   static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>
> @@ -1913,6 +1919,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
>       VFIOGroup *group;
>       int ret, i, devfn, range_limit;
>
> +    if (!(vdev->vbasedev.flags & VFIO_DEVICE_FLAGS_AERPROCESS)) {
> +        error_setg(errp, "vfio: Cannot enable AER for device %s,"
> +                   " host vfio driver does not support for"
> +                   " aer error progress",
> +                   vdev->vbasedev.name);
> +        return;
> +    }
> +
>       ret = vfio_get_hot_reset_info(vdev, &info);
>       if (ret) {
>           error_setg(errp, "vfio: Cannot enable AER for device %s,"
> @@ -2679,6 +2693,11 @@ static void vfio_err_notifier_handler(void *opaque)
>           msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>                                    PCI_ERR_ROOT_CMD_NONFATAL_EN;
>
> +        if (isfatal) {
> +            PCIDevice *dev_0 = pci_get_function_0(dev);
> +            VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
> +            vdev_0->pci_aer_error_signaled = true;
> +        }
>           pcie_aer_msg(dev, &msg);
>           return;
>       }
> @@ -3163,6 +3182,19 @@ static void vfio_exitfn(PCIDevice *pdev)
>       vfio_bars_exit(vdev);
>   }
>
> +static int vfio_aer_error_is_in_process(VFIOPCIDevice *vdev)
> +{
> +    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
> +    int ret;
> +
> +    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_INFO, &dev_info);
> +    if (ret) {
> +        error_report("vfio: error getting device info: %m");
> +        return ret;
> +    }
> +    return dev_info.flags & VFIO_DEVICE_FLAGS_INAERPROCESS ? 1 : 0;
> +}
> +
>   static void vfio_pci_reset(DeviceState *dev)
>   {
>       PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> @@ -3176,7 +3208,24 @@ static void vfio_pci_reset(DeviceState *dev)
>           if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
>                PCI_BRIDGE_CTL_BUS_RESET)) {
>               if (pci_get_function_0(pdev) == pdev) {
> -                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                if (!vdev->pci_aer_error_signaled) {
> +                    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                } else {
> +                    int i;
> +                    for (i = 0; i < 1000; i++) {
> +                        if (!vfio_aer_error_is_in_process(vdev)) {
> +                            break;
> +                        }
> +                        g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000);
> +                    }
> +
> +                    if (i == 1000) {
> +                        qdev_unplug(&vdev->pdev.qdev, NULL);
> +                    } else {
> +                        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +                    }
> +                    vdev->pci_aer_error_signaled = false;
> +                }
>               }
>               return;
>           }
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index ed14322..c9e0202 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -148,6 +148,7 @@ typedef struct VFIOPCIDevice {
>       bool no_kvm_msi;
>       bool no_kvm_msix;
>       bool single_depend_dev;
> +    bool pci_aer_error_signaled;
>   } VFIOPCIDevice;
>
>   uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 759b850..9295fca 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -198,6 +198,10 @@ struct vfio_device_info {
>   #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
>   #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
>   #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
> +/* support aer error progress */
> +#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4)
> +/* status in aer error progress */
> +#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)
>   	__u32	num_regions;	/* Max region index + 1 */
>   	__u32	num_irqs;	/* Max IRQ index + 1 */
>   };
>
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0e42786..777245c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,12 @@ 
 
 #define MSIX_CAP_LENGTH 12
 
+/*
+ * Timeout for waiting host aer error process, it is 3 seconds.
+ * For hardware bus reset 3 seconds will be enough.
+ */
+#define PCI_AER_PROCESS_TIMEOUT 3000000
+
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 
@@ -1913,6 +1919,14 @@  static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
     VFIOGroup *group;
     int ret, i, devfn, range_limit;
 
+    if (!(vdev->vbasedev.flags & VFIO_DEVICE_FLAGS_AERPROCESS)) {
+        error_setg(errp, "vfio: Cannot enable AER for device %s,"
+                   " host vfio driver does not support for"
+                   " aer error progress",
+                   vdev->vbasedev.name);
+        return;
+    }
+
     ret = vfio_get_hot_reset_info(vdev, &info);
     if (ret) {
         error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2679,6 +2693,11 @@  static void vfio_err_notifier_handler(void *opaque)
         msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
                                  PCI_ERR_ROOT_CMD_NONFATAL_EN;
 
+        if (isfatal) {
+            PCIDevice *dev_0 = pci_get_function_0(dev);
+            VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+            vdev_0->pci_aer_error_signaled = true;
+        }
         pcie_aer_msg(dev, &msg);
         return;
     }
@@ -3163,6 +3182,19 @@  static void vfio_exitfn(PCIDevice *pdev)
     vfio_bars_exit(vdev);
 }
 
+static int vfio_aer_error_is_in_process(VFIOPCIDevice *vdev)
+{
+    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+    int ret;
+
+    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_INFO, &dev_info);
+    if (ret) {
+        error_report("vfio: error getting device info: %m");
+        return ret;
+    }
+    return dev_info.flags & VFIO_DEVICE_FLAGS_INAERPROCESS ? 1 : 0;
+}
+
 static void vfio_pci_reset(DeviceState *dev)
 {
     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
@@ -3176,7 +3208,24 @@  static void vfio_pci_reset(DeviceState *dev)
         if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
              PCI_BRIDGE_CTL_BUS_RESET)) {
             if (pci_get_function_0(pdev) == pdev) {
-                vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+                if (!vdev->pci_aer_error_signaled) {
+                    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+                } else {
+                    int i;
+                    for (i = 0; i < 1000; i++) {
+                        if (!vfio_aer_error_is_in_process(vdev)) {
+                            break;
+                        }
+                        g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000);
+                    }
+
+                    if (i == 1000) {
+                        qdev_unplug(&vdev->pdev.qdev, NULL);
+                    } else {
+                        vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+                    }
+                    vdev->pci_aer_error_signaled = false;
+                }
             }
             return;
         }
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index ed14322..c9e0202 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -148,6 +148,7 @@  typedef struct VFIOPCIDevice {
     bool no_kvm_msi;
     bool no_kvm_msix;
     bool single_depend_dev;
+    bool pci_aer_error_signaled;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 759b850..9295fca 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -198,6 +198,10 @@  struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)	/* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)	/* vfio-amba device */
+/* support aer error progress */
+#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4)
+/* status in aer error progress */
+#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
 };