diff mbox

[v5,5/7] vfio-pci: pass the aer error to guest

Message ID 3c81eaae84d6b1fa6e229e765a534fdf180e1ce4.1426155432.git.chen.fan.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

chenfan March 12, 2015, 10:23 a.m. UTC
When the vfio device encounters an uncorrectable error on the host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, which results in the qemu eventfd handler getting
invoked.

This patch passes the error to the guest and has the guest driver
recover from the error.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

Comments

Alex Williamson March 13, 2015, 10:34 p.m. UTC | #1
On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> when the vfio device encounters an uncorrectable error in host,
> the vfio_pci driver will signal the eventfd registered by this
> vfio device, the results in the qemu eventfd handler getting
> invoked.
> 
> this patch is to pass the error to guest and have the guest driver
> recover from the error.

What is going to be the typical recovery mechanism for the guest?  I'm
concerned that the topology of the device in the guest doesn't
necessarily match the topology of the device in the host, so if the
guest were to attempt a bus reset to recover a device, for instance,
what happens?

> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c | 34 ++++++++++++++++++++++++++++------
>  1 file changed, 28 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 0a515b6..8966c49 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3240,18 +3240,40 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>  static void vfio_err_notifier_handler(void *opaque)
>  {
>      VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *dev = &vdev->pdev;
> +    PCIEAERMsg msg = {
> +        .severity = 0,
> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> +    };
>  
>      if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>          return;
>      }
>  
> +    /* we should read the error details from the real hardware
> +     * configuration spaces, here we only need to do is signaling
> +     * to guest an uncorrectable error has occurred.
> +     */

Inconsistent comment style

> +     if(dev->exp.aer_cap) {
         ^ space

> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> +        uint32_t uncor_status;
> +        bool isfatal;
> +
> +        uncor_status = vfio_pci_read_config(dev,
> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> +
> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> +
> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
> +
> +        pcie_aer_msg(dev, &msg);
> +        return;
> +    }
> +
>      /*
> -     * TBD. Retrieve the error details and decide what action
> -     * needs to be taken. One of the actions could be to pass
> -     * the error to the guest and have the guest driver recover
> -     * from the error. This requires that PCIe capabilities be
> -     * exposed to the guest. For now, we just terminate the
> -     * guest to contain the error.
> +     * If the aer capability is not exposed to the guest. we just
> +     * terminate the guest to contain the error.
>       */
>  
>      error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
chenfan March 16, 2015, 3:05 a.m. UTC | #2
On 03/14/2015 06:34 AM, Alex Williamson wrote:
> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>> when the vfio device encounters an uncorrectable error in host,
>> the vfio_pci driver will signal the eventfd registered by this
>> vfio device, the results in the qemu eventfd handler getting
>> invoked.
>>
>> this patch is to pass the error to guest and have the guest driver
>> recover from the error.
> What is going to be the typical recovery mechanism for the guest?  I'm
> concerned that the topology of the device in the guest doesn't
> necessarily match the topology of the device in the host, so if the
> guest were to attempt a bus reset to recover a device, for instance,
> what happens?
The recovery mechanism is that when the guest gets an AER error from a device,
the guest will clear the corresponding status bits in the device registers. And for
devices that need a reset, the guest AER driver will reset all devices under the bus.

Thanks,
Chen


>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 34 ++++++++++++++++++++++++++++------
>>   1 file changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 0a515b6..8966c49 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3240,18 +3240,40 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>   static void vfio_err_notifier_handler(void *opaque)
>>   {
>>       VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *dev = &vdev->pdev;
>> +    PCIEAERMsg msg = {
>> +        .severity = 0,
>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>> +    };
>>   
>>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>           return;
>>       }
>>   
>> +    /* we should read the error details from the real hardware
>> +     * configuration spaces, here we only need to do is signaling
>> +     * to guest an uncorrectable error has occurred.
>> +     */
> Inconsistent comment style
>
>> +     if(dev->exp.aer_cap) {
>           ^ space
>
>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>> +        uint32_t uncor_status;
>> +        bool isfatal;
>> +
>> +        uncor_status = vfio_pci_read_config(dev,
>> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>> +
>> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
>> +
>> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
>> +
>> +        pcie_aer_msg(dev, &msg);
>> +        return;
>> +    }
>> +
>>       /*
>> -     * TBD. Retrieve the error details and decide what action
>> -     * needs to be taken. One of the actions could be to pass
>> -     * the error to the guest and have the guest driver recover
>> -     * from the error. This requires that PCIe capabilities be
>> -     * exposed to the guest. For now, we just terminate the
>> -     * guest to contain the error.
>> +     * If the aer capability is not exposed to the guest. we just
>> +     * terminate the guest to contain the error.
>>        */
>>   
>>       error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
>
>
> .
>
Alex Williamson March 16, 2015, 3:52 a.m. UTC | #3
On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> > On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >> when the vfio device encounters an uncorrectable error in host,
> >> the vfio_pci driver will signal the eventfd registered by this
> >> vfio device, the results in the qemu eventfd handler getting
> >> invoked.
> >>
> >> this patch is to pass the error to guest and have the guest driver
> >> recover from the error.
> > What is going to be the typical recovery mechanism for the guest?  I'm
> > concerned that the topology of the device in the guest doesn't
> > necessarily match the topology of the device in the host, so if the
> > guest were to attempt a bus reset to recover a device, for instance,
> > what happens?
> the recovery mechanism is that when guest got an aer error from a device,
> guest will clean the corresponding status bit in device register. and for
> need reset device, the guest aer driver would reset all devices under bus.

Sorry, I'm still confused, how does the guest aer driver reset all
devices under a bus?  Are we talking about function-level, device
specific reset mechanisms or secondary bus resets?  If the guest is
performing secondary bus resets, what guarantee do they have that it
will translate to a physical secondary bus reset?  vfio may only do an
FLR when the bus is reset or it may not be able to do anything depending
on the available function-level resets and physical and virtual topology
of the device.  Thanks,

Alex

> >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >> ---
> >>   hw/vfio/pci.c | 34 ++++++++++++++++++++++++++++------
> >>   1 file changed, 28 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 0a515b6..8966c49 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -3240,18 +3240,40 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
> >>   static void vfio_err_notifier_handler(void *opaque)
> >>   {
> >>       VFIOPCIDevice *vdev = opaque;
> >> +    PCIDevice *dev = &vdev->pdev;
> >> +    PCIEAERMsg msg = {
> >> +        .severity = 0,
> >> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
> >> +    };
> >>   
> >>       if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
> >>           return;
> >>       }
> >>   
> >> +    /* we should read the error details from the real hardware
> >> +     * configuration spaces, here we only need to do is signaling
> >> +     * to guest an uncorrectable error has occurred.
> >> +     */
> > Inconsistent comment style
> >
> >> +     if(dev->exp.aer_cap) {
> >           ^ space
> >
> >> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
> >> +        uint32_t uncor_status;
> >> +        bool isfatal;
> >> +
> >> +        uncor_status = vfio_pci_read_config(dev,
> >> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
> >> +
> >> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
> >> +
> >> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
> >> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
> >> +
> >> +        pcie_aer_msg(dev, &msg);
> >> +        return;
> >> +    }
> >> +
> >>       /*
> >> -     * TBD. Retrieve the error details and decide what action
> >> -     * needs to be taken. One of the actions could be to pass
> >> -     * the error to the guest and have the guest driver recover
> >> -     * from the error. This requires that PCIe capabilities be
> >> -     * exposed to the guest. For now, we just terminate the
> >> -     * guest to contain the error.
> >> +     * If the aer capability is not exposed to the guest. we just
> >> +     * terminate the guest to contain the error.
> >>        */
> >>   
> >>       error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
> >
> >
> > .
> >
>
chenfan March 16, 2015, 7:35 a.m. UTC | #4
On 03/16/2015 11:52 AM, Alex Williamson wrote:
> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>> when the vfio device encounters an uncorrectable error in host,
>>>> the vfio_pci driver will signal the eventfd registered by this
>>>> vfio device, the results in the qemu eventfd handler getting
>>>> invoked.
>>>>
>>>> this patch is to pass the error to guest and have the guest driver
>>>> recover from the error.
>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>> concerned that the topology of the device in the guest doesn't
>>> necessarily match the topology of the device in the host, so if the
>>> guest were to attempt a bus reset to recover a device, for instance,
>>> what happens?
>> the recovery mechanism is that when guest got an aer error from a device,
>> guest will clean the corresponding status bit in device register. and for
>> need reset device, the guest aer driver would reset all devices under bus.
> Sorry, I'm still confused, how does the guest aer driver reset all
> devices under a bus?  Are we talking about function-level, device
> specific reset mechanisms or secondary bus resets?  If the guest is
> performing secondary bus resets, what guarantee do they have that it
> will translate to a physical secondary bus reset?  vfio may only do an
> FLR when the bus is reset or it may not be able to do anything depending
> on the available function-level resets and physical and virtual topology
> of the device.  Thanks,
In general, recovery depends on the corresponding device driver's behavior,
e.g. whether it implements the error_detected and slot_reset callbacks.
And for a link reset, it usually does a secondary bus reset.

And must we require a physical secondary bus reset for a vfio device
when the guest performs a bus reset?

Thanks,
Chen

>
> Alex
>
>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>> ---
>>>>    hw/vfio/pci.c | 34 ++++++++++++++++++++++++++++------
>>>>    1 file changed, 28 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index 0a515b6..8966c49 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -3240,18 +3240,40 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
>>>>    static void vfio_err_notifier_handler(void *opaque)
>>>>    {
>>>>        VFIOPCIDevice *vdev = opaque;
>>>> +    PCIDevice *dev = &vdev->pdev;
>>>> +    PCIEAERMsg msg = {
>>>> +        .severity = 0,
>>>> +        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
>>>> +    };
>>>>    
>>>>        if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>>            return;
>>>>        }
>>>>    
>>>> +    /* we should read the error details from the real hardware
>>>> +     * configuration spaces, here we only need to do is signaling
>>>> +     * to guest an uncorrectable error has occurred.
>>>> +     */
>>> Inconsistent comment style
>>>
>>>> +     if(dev->exp.aer_cap) {
>>>            ^ space
>>>
>>>> +        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
>>>> +        uint32_t uncor_status;
>>>> +        bool isfatal;
>>>> +
>>>> +        uncor_status = vfio_pci_read_config(dev,
>>>> +                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
>>>> +
>>>> +        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
>>>> +
>>>> +        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
>>>> +                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
>>>> +
>>>> +        pcie_aer_msg(dev, &msg);
>>>> +        return;
>>>> +    }
>>>> +
>>>>        /*
>>>> -     * TBD. Retrieve the error details and decide what action
>>>> -     * needs to be taken. One of the actions could be to pass
>>>> -     * the error to the guest and have the guest driver recover
>>>> -     * from the error. This requires that PCIe capabilities be
>>>> -     * exposed to the guest. For now, we just terminate the
>>>> -     * guest to contain the error.
>>>> +     * If the aer capability is not exposed to the guest. we just
>>>> +     * terminate the guest to contain the error.
>>>>         */
>>>>    
>>>>        error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
>>>
>>> .
>>>
>
>
> .
>
Alex Williamson March 16, 2015, 2:09 p.m. UTC | #5
On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
> On 03/16/2015 11:52 AM, Alex Williamson wrote:
> > On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> >> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> >>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >>>> when the vfio device encounters an uncorrectable error in host,
> >>>> the vfio_pci driver will signal the eventfd registered by this
> >>>> vfio device, the results in the qemu eventfd handler getting
> >>>> invoked.
> >>>>
> >>>> this patch is to pass the error to guest and have the guest driver
> >>>> recover from the error.
> >>> What is going to be the typical recovery mechanism for the guest?  I'm
> >>> concerned that the topology of the device in the guest doesn't
> >>> necessarily match the topology of the device in the host, so if the
> >>> guest were to attempt a bus reset to recover a device, for instance,
> >>> what happens?
> >> the recovery mechanism is that when guest got an aer error from a device,
> >> guest will clean the corresponding status bit in device register. and for
> >> need reset device, the guest aer driver would reset all devices under bus.
> > Sorry, I'm still confused, how does the guest aer driver reset all
> > devices under a bus?  Are we talking about function-level, device
> > specific reset mechanisms or secondary bus resets?  If the guest is
> > performing secondary bus resets, what guarantee do they have that it
> > will translate to a physical secondary bus reset?  vfio may only do an
> > FLR when the bus is reset or it may not be able to do anything depending
> > on the available function-level resets and physical and virtual topology
> > of the device.  Thanks,
> in general, functions depends on the corresponding device driver behaviors
> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
> and for link reset, it usually do secondary bus reset.
> 
> and do we must require to the physical secondary bus reset for vfio device
> as bus reset?

That depends on how the guest driver attempts recovery, doesn't it?
There are only a very limited number of cases where a secondary bus
reset initiated by the guest will translate to a secondary bus reset of
the physical device (iirc, single function device without FLR).  In most
cases, it will at best be translated to an FLR.  VFIO really only does
bus resets on VM reset because that's the only time we know that it's ok
to reset multiple devices.  If the guest driver is depending on a
secondary bus reset to put the device into a recoverable state and we're
not able to provide that, then we're actually reducing containment of
the error by exposing AER to the guest and allowing it to attempt
recovery.  So in practice, I'm afraid we're risking the integrity of the
VM by exposing AER to the guest and making it think that it can perform
recovery operations that are not effective.  Thanks,

Alex
chenfan March 25, 2015, 1:33 a.m. UTC | #6
On 03/16/2015 10:09 PM, Alex Williamson wrote:
> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>>>> when the vfio device encounters an uncorrectable error in host,
>>>>>> the vfio_pci driver will signal the eventfd registered by this
>>>>>> vfio device, the results in the qemu eventfd handler getting
>>>>>> invoked.
>>>>>>
>>>>>> this patch is to pass the error to guest and have the guest driver
>>>>>> recover from the error.
>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>>>> concerned that the topology of the device in the guest doesn't
>>>>> necessarily match the topology of the device in the host, so if the
>>>>> guest were to attempt a bus reset to recover a device, for instance,
>>>>> what happens?
>>>> the recovery mechanism is that when guest got an aer error from a device,
>>>> guest will clean the corresponding status bit in device register. and for
>>>> need reset device, the guest aer driver would reset all devices under bus.
>>> Sorry, I'm still confused, how does the guest aer driver reset all
>>> devices under a bus?  Are we talking about function-level, device
>>> specific reset mechanisms or secondary bus resets?  If the guest is
>>> performing secondary bus resets, what guarantee do they have that it
>>> will translate to a physical secondary bus reset?  vfio may only do an
>>> FLR when the bus is reset or it may not be able to do anything depending
>>> on the available function-level resets and physical and virtual topology
>>> of the device.  Thanks,
>> in general, functions depends on the corresponding device driver behaviors
>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
>> and for link reset, it usually do secondary bus reset.
>>
>> and do we must require to the physical secondary bus reset for vfio device
>> as bus reset?
> That depends on how the guest driver attempts recovery, doesn't it?
> There are only a very limited number of cases where a secondary bus
> reset initiated by the guest will translate to a secondary bus reset of
> the physical device (iirc, single function device without FLR).  In most
> cases, it will at best be translated to an FLR.  VFIO really only does
> bus resets on VM reset because that's the only time we know that it's ok
> to reset multiple devices.  If the guest driver is depending on a
> secondary bus reset to put the device into a recoverable state and we're
> not able to provide that, then we're actually reducing containment of
> the error by exposing AER to the guest and allowing it to attempt
> recovery.  So in practice, I'm afraid we're risking the integrity of the
> VM by exposing AER to the guest and making it think that it can perform
> recovery operations that are not effective.  Thanks,
Hi Alex,

     If the guest driver needs to reset a vfio device via a secondary bus reset when
an AER error occurs, how about keeping the current behavior of stopping the VM and
outputting a fatal error message to the user?

Thanks,
Chen

>
> Alex
>
> .
>
chenfan March 25, 2015, 1:53 a.m. UTC | #7
On 03/16/2015 10:09 PM, Alex Williamson wrote:
> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>>>> when the vfio device encounters an uncorrectable error in host,
>>>>>> the vfio_pci driver will signal the eventfd registered by this
>>>>>> vfio device, the results in the qemu eventfd handler getting
>>>>>> invoked.
>>>>>>
>>>>>> this patch is to pass the error to guest and have the guest driver
>>>>>> recover from the error.
>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>>>> concerned that the topology of the device in the guest doesn't
>>>>> necessarily match the topology of the device in the host, so if the
>>>>> guest were to attempt a bus reset to recover a device, for instance,
>>>>> what happens?
>>>> the recovery mechanism is that when guest got an aer error from a device,
>>>> guest will clean the corresponding status bit in device register. and for
>>>> need reset device, the guest aer driver would reset all devices under bus.
>>> Sorry, I'm still confused, how does the guest aer driver reset all
>>> devices under a bus?  Are we talking about function-level, device
>>> specific reset mechanisms or secondary bus resets?  If the guest is
>>> performing secondary bus resets, what guarantee do they have that it
>>> will translate to a physical secondary bus reset?  vfio may only do an
>>> FLR when the bus is reset or it may not be able to do anything depending
>>> on the available function-level resets and physical and virtual topology
>>> of the device.  Thanks,
>> in general, functions depends on the corresponding device driver behaviors
>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
>> and for link reset, it usually do secondary bus reset.
>>
>> and do we must require to the physical secondary bus reset for vfio device
>> as bus reset?
> That depends on how the guest driver attempts recovery, doesn't it?
> There are only a very limited number of cases where a secondary bus
> reset initiated by the guest will translate to a secondary bus reset of
> the physical device (iirc, single function device without FLR).  In most
> cases, it will at best be translated to an FLR.  VFIO really only does
> bus resets on VM reset because that's the only time we know that it's ok
> to reset multiple devices.  If the guest driver is depending on a
> secondary bus reset to put the device into a recoverable state and we're
> not able to provide that, then we're actually reducing containment of
> the error by exposing AER to the guest and allowing it to attempt
> recovery.  So in practice, I'm afraid we're risking the integrity of the
> VM by exposing AER to the guest and making it think that it can perform
> recovery operations that are not effective.  Thanks,
I have also seen that if a device is without FLR, it seems it can do a hot reset
via the ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
in vfio_pci_reset. Does that address the recovery issues you mentioned?

Thanks,
Chen



>
> Alex
>
> .
>
Alex Williamson March 25, 2015, 2:31 a.m. UTC | #8
On Wed, 2015-03-25 at 09:33 +0800, Chen Fan wrote:
> On 03/16/2015 10:09 PM, Alex Williamson wrote:
> > On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
> >> On 03/16/2015 11:52 AM, Alex Williamson wrote:
> >>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> >>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> >>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >>>>>> when the vfio device encounters an uncorrectable error in host,
> >>>>>> the vfio_pci driver will signal the eventfd registered by this
> >>>>>> vfio device, the results in the qemu eventfd handler getting
> >>>>>> invoked.
> >>>>>>
> >>>>>> this patch is to pass the error to guest and have the guest driver
> >>>>>> recover from the error.
> >>>>> What is going to be the typical recovery mechanism for the guest?  I'm
> >>>>> concerned that the topology of the device in the guest doesn't
> >>>>> necessarily match the topology of the device in the host, so if the
> >>>>> guest were to attempt a bus reset to recover a device, for instance,
> >>>>> what happens?
> >>>> the recovery mechanism is that when guest got an aer error from a device,
> >>>> guest will clean the corresponding status bit in device register. and for
> >>>> need reset device, the guest aer driver would reset all devices under bus.
> >>> Sorry, I'm still confused, how does the guest aer driver reset all
> >>> devices under a bus?  Are we talking about function-level, device
> >>> specific reset mechanisms or secondary bus resets?  If the guest is
> >>> performing secondary bus resets, what guarantee do they have that it
> >>> will translate to a physical secondary bus reset?  vfio may only do an
> >>> FLR when the bus is reset or it may not be able to do anything depending
> >>> on the available function-level resets and physical and virtual topology
> >>> of the device.  Thanks,
> >> in general, functions depends on the corresponding device driver behaviors
> >> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
> >> and for link reset, it usually do secondary bus reset.
> >>
> >> and do we must require to the physical secondary bus reset for vfio device
> >> as bus reset?
> > That depends on how the guest driver attempts recovery, doesn't it?
> > There are only a very limited number of cases where a secondary bus
> > reset initiated by the guest will translate to a secondary bus reset of
> > the physical device (iirc, single function device without FLR).  In most
> > cases, it will at best be translated to an FLR.  VFIO really only does
> > bus resets on VM reset because that's the only time we know that it's ok
> > to reset multiple devices.  If the guest driver is depending on a
> > secondary bus reset to put the device into a recoverable state and we're
> > not able to provide that, then we're actually reducing containment of
> > the error by exposing AER to the guest and allowing it to attempt
> > recovery.  So in practice, I'm afraid we're risking the integrity of the
> > VM by exposing AER to the guest and making it think that it can perform
> > recovery operations that are not effective.  Thanks,
> Hi Alex,
> 
>      if guest driver need reset a vfio device by secondary bus reset when
> an aer occured. how about keeping the behavior by stopping VM and
> output an fatal error information to user.

That sounds like a very fragile heuristic to try to associate the reason
for a secondary bus reset based on the timing of an AER notification.
How can we be sure there's an association?  Is it still worthwhile to
allow the guest to participate in recovery or will most of the cases
just stall the VM stop until a bus reset is attempted?  Thanks,

Alex
Alex Williamson March 25, 2015, 2:41 a.m. UTC | #9
On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
> On 03/16/2015 10:09 PM, Alex Williamson wrote:
> > On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
> >> On 03/16/2015 11:52 AM, Alex Williamson wrote:
> >>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> >>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> >>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >>>>>> when the vfio device encounters an uncorrectable error in host,
> >>>>>> the vfio_pci driver will signal the eventfd registered by this
> >>>>>> vfio device, the results in the qemu eventfd handler getting
> >>>>>> invoked.
> >>>>>>
> >>>>>> this patch is to pass the error to guest and have the guest driver
> >>>>>> recover from the error.
> >>>>> What is going to be the typical recovery mechanism for the guest?  I'm
> >>>>> concerned that the topology of the device in the guest doesn't
> >>>>> necessarily match the topology of the device in the host, so if the
> >>>>> guest were to attempt a bus reset to recover a device, for instance,
> >>>>> what happens?
> >>>> the recovery mechanism is that when guest got an aer error from a device,
> >>>> guest will clean the corresponding status bit in device register. and for
> >>>> need reset device, the guest aer driver would reset all devices under bus.
> >>> Sorry, I'm still confused, how does the guest aer driver reset all
> >>> devices under a bus?  Are we talking about function-level, device
> >>> specific reset mechanisms or secondary bus resets?  If the guest is
> >>> performing secondary bus resets, what guarantee do they have that it
> >>> will translate to a physical secondary bus reset?  vfio may only do an
> >>> FLR when the bus is reset or it may not be able to do anything depending
> >>> on the available function-level resets and physical and virtual topology
> >>> of the device.  Thanks,
> >> in general, functions depends on the corresponding device driver behaviors
> >> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
> >> and for link reset, it usually do secondary bus reset.
> >>
> >> and do we must require to the physical secondary bus reset for vfio device
> >> as bus reset?
> > That depends on how the guest driver attempts recovery, doesn't it?
> > There are only a very limited number of cases where a secondary bus
> > reset initiated by the guest will translate to a secondary bus reset of
> > the physical device (iirc, single function device without FLR).  In most
> > cases, it will at best be translated to an FLR.  VFIO really only does
> > bus resets on VM reset because that's the only time we know that it's ok
> > to reset multiple devices.  If the guest driver is depending on a
> > secondary bus reset to put the device into a recoverable state and we're
> > not able to provide that, then we're actually reducing containment of
> > the error by exposing AER to the guest and allowing it to attempt
> > recovery.  So in practice, I'm afraid we're risking the integrity of the
> > VM by exposing AER to the guest and making it think that it can perform
> > recovery operations that are not effective.  Thanks,
> I also have seen that if device without FLR, it seems can do hot reset
> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
> in vfio_pci_reset. does it satisfy the recovery issues that you said?

The hot reset interface can only be used when a) the user (QEMU) owns
all of the devices on the bus and b) we know we're resetting all of the
devices.  That mostly limits its use to VM reset.  I think that on a
secondary bus reset, we don't know the scope of the reset at the QEMU
vfio driver, so we only make use of reset methods with a function-level
scope.  That would only result in a secondary bus reset if that's the
reset mechanism used by the host kernel's PCI code (pci_reset_function),
which is limited to single function devices on a secondary bus, with no
other reset mechanisms.  The host reset is also only available in some
configurations, for instance if we have a dual-port NIC where each
function is a separate IOMMU group, then we clearly cannot do a hot
reset unless both functions are assigned to the same VM _and_ appear to
the guest on the same virtual bus.  So even if we could know the scope
of the reset in the QEMU vfio driver, we can only make use of it under
very strict guest configurations.  Thanks,

Alex
chenfan March 25, 2015, 3:07 a.m. UTC | #10
On 03/25/2015 10:41 AM, Alex Williamson wrote:
> On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
>> On 03/16/2015 10:09 PM, Alex Williamson wrote:
>>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
>>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
>>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>>>>>> when the vfio device encounters an uncorrectable error in host,
>>>>>>>> the vfio_pci driver will signal the eventfd registered by this
>>>>>>>> vfio device, the results in the qemu eventfd handler getting
>>>>>>>> invoked.
>>>>>>>>
>>>>>>>> this patch is to pass the error to guest and have the guest driver
>>>>>>>> recover from the error.
>>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>>>>>> concerned that the topology of the device in the guest doesn't
>>>>>>> necessarily match the topology of the device in the host, so if the
>>>>>>> guest were to attempt a bus reset to recover a device, for instance,
>>>>>>> what happens?
>>>>>> the recovery mechanism is that when guest got an aer error from a device,
>>>>>> guest will clean the corresponding status bit in device register. and for
>>>>>> need reset device, the guest aer driver would reset all devices under bus.
>>>>> Sorry, I'm still confused, how does the guest aer driver reset all
>>>>> devices under a bus?  Are we talking about function-level, device
>>>>> specific reset mechanisms or secondary bus resets?  If the guest is
>>>>> performing secondary bus resets, what guarantee do they have that it
>>>>> will translate to a physical secondary bus reset?  vfio may only do an
>>>>> FLR when the bus is reset or it may not be able to do anything depending
>>>>> on the available function-level resets and physical and virtual topology
>>>>> of the device.  Thanks,
>>>> in general, functions depends on the corresponding device driver behaviors
>>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
>>>> and for link reset, it usually do secondary bus reset.
>>>>
>>>> and do we must require to the physical secondary bus reset for vfio device
>>>> as bus reset?
>>> That depends on how the guest driver attempts recovery, doesn't it?
>>> There are only a very limited number of cases where a secondary bus
>>> reset initiated by the guest will translate to a secondary bus reset of
>>> the physical device (iirc, single function device without FLR).  In most
>>> cases, it will at best be translated to an FLR.  VFIO really only does
>>> bus resets on VM reset because that's the only time we know that it's ok
>>> to reset multiple devices.  If the guest driver is depending on a
>>> secondary bus reset to put the device into a recoverable state and we're
>>> not able to provide that, then we're actually reducing containment of
>>> the error by exposing AER to the guest and allowing it to attempt
>>> recovery.  So in practice, I'm afraid we're risking the integrity of the
>>> VM by exposing AER to the guest and making it think that it can perform
>>> recovery operations that are not effective.  Thanks,
>> I also have seen that if device without FLR, it seems can do hot reset
>> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
>> in vfio_pci_reset. does it satisfy the recovery issues that you said?
> The hot reset interface can only be used when a) the user (QEMU) owns
> all of the devices on the bus and b) we know we're resetting all of the
> devices.  That mostly limits its use to VM reset.  I think that on a
> secondary bus reset, we don't know the scope of the reset at the QEMU
> vfio driver, so we only make use of reset methods with a function-level
> scope.  That would only result in a secondary bus reset if that's the
> reset mechanism used by the host kernel's PCI code (pci_reset_function),
> which is limited to single function devices on a secondary bus, with no
> other reset mechanisms.  The host reset is also only available in some
> configurations, for instance if we have a dual-port NIC where each
> function is a separate IOMMU group, then we clearly cannot do a hot
> reset unless both functions are assigned to the same VM _and_ appear to
> the guest on the same virtual bus.  So even if we could know the scope
> of the reset in the QEMU vfio driver, we can only make use of it under
> very strict guest configurations.  Thanks,

it seems difficult to allow the guest to participate in recovery,
but I think that we might be able to capture the vfio_pci_reset
result: if the vfio device reset fails, then we stop the VM.

Thanks,
Chen

>
> Alex
>
> .
>
chenfan April 1, 2015, 4:12 a.m. UTC | #11
On 03/25/2015 10:41 AM, Alex Williamson wrote:
> On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
>> On 03/16/2015 10:09 PM, Alex Williamson wrote:
>>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
>>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
>>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>>>>>> when the vfio device encounters an uncorrectable error in host,
>>>>>>>> the vfio_pci driver will signal the eventfd registered by this
>>>>>>>> vfio device, the results in the qemu eventfd handler getting
>>>>>>>> invoked.
>>>>>>>>
>>>>>>>> this patch is to pass the error to guest and have the guest driver
>>>>>>>> recover from the error.
>>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>>>>>> concerned that the topology of the device in the guest doesn't
>>>>>>> necessarily match the topology of the device in the host, so if the
>>>>>>> guest were to attempt a bus reset to recover a device, for instance,
>>>>>>> what happens?
>>>>>> the recovery mechanism is that when guest got an aer error from a device,
>>>>>> guest will clean the corresponding status bit in device register. and for
>>>>>> need reset device, the guest aer driver would reset all devices under bus.
>>>>> Sorry, I'm still confused, how does the guest aer driver reset all
>>>>> devices under a bus?  Are we talking about function-level, device
>>>>> specific reset mechanisms or secondary bus resets?  If the guest is
>>>>> performing secondary bus resets, what guarantee do they have that it
>>>>> will translate to a physical secondary bus reset?  vfio may only do an
>>>>> FLR when the bus is reset or it may not be able to do anything depending
>>>>> on the available function-level resets and physical and virtual topology
>>>>> of the device.  Thanks,
>>>> in general, functions depends on the corresponding device driver behaviors
>>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
>>>> and for link reset, it usually do secondary bus reset.
>>>>
>>>> and do we must require to the physical secondary bus reset for vfio device
>>>> as bus reset?
>>> That depends on how the guest driver attempts recovery, doesn't it?
>>> There are only a very limited number of cases where a secondary bus
>>> reset initiated by the guest will translate to a secondary bus reset of
>>> the physical device (iirc, single function device without FLR).  In most
>>> cases, it will at best be translated to an FLR.  VFIO really only does
>>> bus resets on VM reset because that's the only time we know that it's ok
>>> to reset multiple devices.  If the guest driver is depending on a
>>> secondary bus reset to put the device into a recoverable state and we're
>>> not able to provide that, then we're actually reducing containment of
>>> the error by exposing AER to the guest and allowing it to attempt
>>> recovery.  So in practice, I'm afraid we're risking the integrity of the
>>> VM by exposing AER to the guest and making it think that it can perform
>>> recovery operations that are not effective.  Thanks,
>> I also have seen that if device without FLR, it seems can do hot reset
>> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
>> in vfio_pci_reset. does it satisfy the recovery issues that you said?
> The hot reset interface can only be used when a) the user (QEMU) owns
> all of the devices on the bus and b) we know we're resetting all of the
> devices.  That mostly limits its use to VM reset.  I think that on a
> secondary bus reset, we don't know the scope of the reset at the QEMU
> vfio driver, so we only make use of reset methods with a function-level
> scope.  That would only result in a secondary bus reset if that's the
> reset mechanism used by the host kernel's PCI code (pci_reset_function),
> which is limited to single function devices on a secondary bus, with no
> other reset mechanisms.  The host reset is also only available in some
> configurations, for instance if we have a dual-port NIC where each
> function is a separate IOMMU group, then we clearly cannot do a hot
> reset unless both functions are assigned to the same VM _and_ appear to
> the guest on the same virtual bus.  So even if we could know the scope
> of the reset in the QEMU vfio driver, we can only make use of it under
> very strict guest configurations.  Thanks,
Hi Alex,

    do you have any ideas or suggestions for how to fix or work around this issue?

Thanks,
Chen


>
> Alex
>
> .
>
Alex Williamson April 1, 2015, 3:46 p.m. UTC | #12
On Wed, 2015-04-01 at 12:12 +0800, Chen Fan wrote:
> On 03/25/2015 10:41 AM, Alex Williamson wrote:
> > On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
> >> On 03/16/2015 10:09 PM, Alex Williamson wrote:
> >>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
> >>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
> >>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> >>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> >>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >>>>>>>> when the vfio device encounters an uncorrectable error in host,
> >>>>>>>> the vfio_pci driver will signal the eventfd registered by this
> >>>>>>>> vfio device, the results in the qemu eventfd handler getting
> >>>>>>>> invoked.
> >>>>>>>>
> >>>>>>>> this patch is to pass the error to guest and have the guest driver
> >>>>>>>> recover from the error.
> >>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
> >>>>>>> concerned that the topology of the device in the guest doesn't
> >>>>>>> necessarily match the topology of the device in the host, so if the
> >>>>>>> guest were to attempt a bus reset to recover a device, for instance,
> >>>>>>> what happens?
> >>>>>> the recovery mechanism is that when guest got an aer error from a device,
> >>>>>> guest will clean the corresponding status bit in device register. and for
> >>>>>> need reset device, the guest aer driver would reset all devices under bus.
> >>>>> Sorry, I'm still confused, how does the guest aer driver reset all
> >>>>> devices under a bus?  Are we talking about function-level, device
> >>>>> specific reset mechanisms or secondary bus resets?  If the guest is
> >>>>> performing secondary bus resets, what guarantee do they have that it
> >>>>> will translate to a physical secondary bus reset?  vfio may only do an
> >>>>> FLR when the bus is reset or it may not be able to do anything depending
> >>>>> on the available function-level resets and physical and virtual topology
> >>>>> of the device.  Thanks,
> >>>> in general, functions depends on the corresponding device driver behaviors
> >>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
> >>>> and for link reset, it usually do secondary bus reset.
> >>>>
> >>>> and do we must require to the physical secondary bus reset for vfio device
> >>>> as bus reset?
> >>> That depends on how the guest driver attempts recovery, doesn't it?
> >>> There are only a very limited number of cases where a secondary bus
> >>> reset initiated by the guest will translate to a secondary bus reset of
> >>> the physical device (iirc, single function device without FLR).  In most
> >>> cases, it will at best be translated to an FLR.  VFIO really only does
> >>> bus resets on VM reset because that's the only time we know that it's ok
> >>> to reset multiple devices.  If the guest driver is depending on a
> >>> secondary bus reset to put the device into a recoverable state and we're
> >>> not able to provide that, then we're actually reducing containment of
> >>> the error by exposing AER to the guest and allowing it to attempt
> >>> recovery.  So in practice, I'm afraid we're risking the integrity of the
> >>> VM by exposing AER to the guest and making it think that it can perform
> >>> recovery operations that are not effective.  Thanks,
> >> I also have seen that if device without FLR, it seems can do hot reset
> >> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
> >> in vfio_pci_reset. does it satisfy the recovery issues that you said?
> > The hot reset interface can only be used when a) the user (QEMU) owns
> > all of the devices on the bus and b) we know we're resetting all of the
> > devices.  That mostly limits its use to VM reset.  I think that on a
> > secondary bus reset, we don't know the scope of the reset at the QEMU
> > vfio driver, so we only make use of reset methods with a function-level
> > scope.  That would only result in a secondary bus reset if that's the
> > reset mechanism used by the host kernel's PCI code (pci_reset_function),
> > which is limited to single function devices on a secondary bus, with no
> > other reset mechanisms.  The host reset is also only available in some
> > configurations, for instance if we have a dual-port NIC where each
> > function is a separate IOMMU group, then we clearly cannot do a hot
> > reset unless both functions are assigned to the same VM _and_ appear to
> > the guest on the same virtual bus.  So even if we could know the scope
> > of the reset in the QEMU vfio driver, we can only make use of it under
> > very strict guest configurations.  Thanks,
> Hi Alex,
> 
>     have you some idea or scenario to fix/escape this issue?

Hi Chen,

I expect there are two major components to this.  The first is that
QEMU/vfio-pci needs to enforce that a bus reset is possible for the host
and guest topology when guest AER handling is specified for a device.
That means that everything affected by the bus reset needs to be exposed
to the guest in a compatible way.  For instance, if a bus reset affects
devices from multiple groups, the guest needs to not only own all of
those groups, but they also need to be exposed to the guest such that
the virtual bus layout reflects the extent of the reset for the physical
bus.  This also implies that guest AER handling cannot be the default
since it will impose significant configuration restrictions on device
assignment.

This seems like a difficult configuration enforcement to make, but maybe
there are simplifying assumptions that can help.  For instance the
devices need to be exposed as PCIe therefore we won't have multiple
slots in use on a bus and I think we can therefore mostly ignore hotplug
since we can only hotplug at a slot granularity.  That may also imply
that we should simply enforce a 1:1 mapping of physical functions to
virtual functions.  At least one function from each group affected by a
reset must be exposed to the guest.

The second issue is that individual QEMU PCI devices have no callback
for a bus reset.  QEMU/vfio-pci currently has the DeviceClass.reset
callback, which we assume to be a function-level reset.  We also
register with qemu_register_reset() for a VM reset, which is the only
point currently that we know we can do a reset affecting multiple
devices.  Infrastructure will need to be added to QEMU/PCI to expose the
link down/RST signal to devices on a bus to trigger a multi-device reset
in vfio-pci.

Hopefully I'm not missing something, but I think both of those changes
are going to be required before we can have anything remotely
supportable for guest-based AER error handling.  This is pretty complicated
for the user and also for libvirt to figure out.  At a minimum libvirt
would need to support a new guest-based AER handling flag for devices.
We probably need to determine whether this is unique to vfio-pci or a
generic PCIDevice option.  Thanks,

Alex
chenfan April 8, 2015, 8:59 a.m. UTC | #13
On 04/01/2015 11:46 PM, Alex Williamson wrote:
> On Wed, 2015-04-01 at 12:12 +0800, Chen Fan wrote:
>> On 03/25/2015 10:41 AM, Alex Williamson wrote:
>>> On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
>>>> On 03/16/2015 10:09 PM, Alex Williamson wrote:
>>>>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
>>>>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
>>>>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>>>>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>>>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>>>>>>>> when the vfio device encounters an uncorrectable error in host,
>>>>>>>>>> the vfio_pci driver will signal the eventfd registered by this
>>>>>>>>>> vfio device, the results in the qemu eventfd handler getting
>>>>>>>>>> invoked.
>>>>>>>>>>
>>>>>>>>>> this patch is to pass the error to guest and have the guest driver
>>>>>>>>>> recover from the error.
>>>>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>>>>>>>> concerned that the topology of the device in the guest doesn't
>>>>>>>>> necessarily match the topology of the device in the host, so if the
>>>>>>>>> guest were to attempt a bus reset to recover a device, for instance,
>>>>>>>>> what happens?
>>>>>>>> the recovery mechanism is that when guest got an aer error from a device,
>>>>>>>> guest will clean the corresponding status bit in device register. and for
>>>>>>>> need reset device, the guest aer driver would reset all devices under bus.
>>>>>>> Sorry, I'm still confused, how does the guest aer driver reset all
>>>>>>> devices under a bus?  Are we talking about function-level, device
>>>>>>> specific reset mechanisms or secondary bus resets?  If the guest is
>>>>>>> performing secondary bus resets, what guarantee do they have that it
>>>>>>> will translate to a physical secondary bus reset?  vfio may only do an
>>>>>>> FLR when the bus is reset or it may not be able to do anything depending
>>>>>>> on the available function-level resets and physical and virtual topology
>>>>>>> of the device.  Thanks,
>>>>>> in general, functions depends on the corresponding device driver behaviors
>>>>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
>>>>>> and for link reset, it usually do secondary bus reset.
>>>>>>
>>>>>> and do we must require to the physical secondary bus reset for vfio device
>>>>>> as bus reset?
>>>>> That depends on how the guest driver attempts recovery, doesn't it?
>>>>> There are only a very limited number of cases where a secondary bus
>>>>> reset initiated by the guest will translate to a secondary bus reset of
>>>>> the physical device (iirc, single function device without FLR).  In most
>>>>> cases, it will at best be translated to an FLR.  VFIO really only does
>>>>> bus resets on VM reset because that's the only time we know that it's ok
>>>>> to reset multiple devices.  If the guest driver is depending on a
>>>>> secondary bus reset to put the device into a recoverable state and we're
>>>>> not able to provide that, then we're actually reducing containment of
>>>>> the error by exposing AER to the guest and allowing it to attempt
>>>>> recovery.  So in practice, I'm afraid we're risking the integrity of the
>>>>> VM by exposing AER to the guest and making it think that it can perform
>>>>> recovery operations that are not effective.  Thanks,
>>>> I also have seen that if device without FLR, it seems can do hot reset
>>>> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
>>>> in vfio_pci_reset. does it satisfy the recovery issues that you said?
>>> The hot reset interface can only be used when a) the user (QEMU) owns
>>> all of the devices on the bus and b) we know we're resetting all of the
>>> devices.  That mostly limits its use to VM reset.  I think that on a
>>> secondary bus reset, we don't know the scope of the reset at the QEMU
>>> vfio driver, so we only make use of reset methods with a function-level
>>> scope.  That would only result in a secondary bus reset if that's the
>>> reset mechanism used by the host kernel's PCI code (pci_reset_function),
>>> which is limited to single function devices on a secondary bus, with no
>>> other reset mechanisms.  The host reset is also only available in some
>>> configurations, for instance if we have a dual-port NIC where each
>>> function is a separate IOMMU group, then we clearly cannot do a hot
>>> reset unless both functions are assigned to the same VM _and_ appear to
>>> the guest on the same virtual bus.  So even if we could know the scope
>>> of the reset in the QEMU vfio driver, we can only make use of it under
>>> very strict guest configurations.  Thanks,
>> Hi Alex,
>>
>>      have you some idea or scenario to fix/escape this issue?
> Hi Chen,
>
> I expect there are two major components to this.  The first is that
> QEMU/vfio-pci needs to enforce that a bus reset is possible for the host
> and guest topology when guest AER handling is specified for a device.
> That means that everything affected by the bus reset needs to be exposed
> to the guest in a compatible way.  For instance, if a bus reset affects
> devices from multiple groups, the guest needs to not only own all of
> those groups, but they also need to be exposed to the guest such that
> the virtual bus layout reflects the extent of the reset for the physical
> bus.  This also implies that guest AER handling cannot be the default
> since it will impose significant configuration restrictions on device
> assignment.
>
> This seems like a difficult configuration enforcement to make, but maybe
> there are simplifying assumptions that can help.  For instance the
> devices need to be exposed as PCIe therefore we won't have multiple
> slots in use on a bus and I think we can therefore mostly ignore hotplug
> since we can only hotplug at a slot granularity.  That may also imply
> that we should simply enforce a 1:1 mapping of physical functions to
> virtual functions.  At least one function from each group affected by a
> reset must be exposed to the guest.
>
> The second issue is that individual QEMU PCI devices have no callback
> for a bus reset.  QEMU/vfio-pci currently has the DeviceClass.reset
> callback, which we assume to be a function-level reset.  We also
> register with qemu_register_reset() for a VM reset, which is the only
> point currently that we know we can do a reset affecting multiple
> devices.  Infrastructure will need to be added to QEMU/PCI to expose the
> link down/RST signal to devices on a bus to trigger a multi-device reset
> in vfio-pci.
>
> Hopefully I'm not missing something, but I think both of those changes
> are going to be required before we can have anything remotely
> supportable for guest-based AER error handle.  This pretty complicated
> for the user and also for libvirt to figure out.  At a minimum libvirt
> would need to support a new guest-based AER handling flag for devices.
> We probably need to determine whether this is unique to vfio-pci or a
> generic PCIDevice option.  Thanks,

Hi Alex,
   Solving the two issues seems like a big workload. Do we have a simpler
   way to support QEMU AER?

Thanks,
Chen

>
> Alex
>
>
> .
>
Alex Williamson April 8, 2015, 3:36 p.m. UTC | #14
On Wed, 2015-04-08 at 16:59 +0800, Chen Fan wrote:
> On 04/01/2015 11:46 PM, Alex Williamson wrote:
> > On Wed, 2015-04-01 at 12:12 +0800, Chen Fan wrote:
> >> On 03/25/2015 10:41 AM, Alex Williamson wrote:
> >>> On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
> >>>> On 03/16/2015 10:09 PM, Alex Williamson wrote:
> >>>>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
> >>>>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
> >>>>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> >>>>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> >>>>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >>>>>>>>>> when the vfio device encounters an uncorrectable error in host,
> >>>>>>>>>> the vfio_pci driver will signal the eventfd registered by this
> >>>>>>>>>> vfio device, the results in the qemu eventfd handler getting
> >>>>>>>>>> invoked.
> >>>>>>>>>>
> >>>>>>>>>> this patch is to pass the error to guest and have the guest driver
> >>>>>>>>>> recover from the error.
> >>>>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
> >>>>>>>>> concerned that the topology of the device in the guest doesn't
> >>>>>>>>> necessarily match the topology of the device in the host, so if the
> >>>>>>>>> guest were to attempt a bus reset to recover a device, for instance,
> >>>>>>>>> what happens?
> >>>>>>>> the recovery mechanism is that when guest got an aer error from a device,
> >>>>>>>> guest will clean the corresponding status bit in device register. and for
> >>>>>>>> need reset device, the guest aer driver would reset all devices under bus.
> >>>>>>> Sorry, I'm still confused, how does the guest aer driver reset all
> >>>>>>> devices under a bus?  Are we talking about function-level, device
> >>>>>>> specific reset mechanisms or secondary bus resets?  If the guest is
> >>>>>>> performing secondary bus resets, what guarantee do they have that it
> >>>>>>> will translate to a physical secondary bus reset?  vfio may only do an
> >>>>>>> FLR when the bus is reset or it may not be able to do anything depending
> >>>>>>> on the available function-level resets and physical and virtual topology
> >>>>>>> of the device.  Thanks,
> >>>>>> in general, functions depends on the corresponding device driver behaviors
> >>>>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
> >>>>>> and for link reset, it usually do secondary bus reset.
> >>>>>>
> >>>>>> and do we must require to the physical secondary bus reset for vfio device
> >>>>>> as bus reset?
> >>>>> That depends on how the guest driver attempts recovery, doesn't it?
> >>>>> There are only a very limited number of cases where a secondary bus
> >>>>> reset initiated by the guest will translate to a secondary bus reset of
> >>>>> the physical device (iirc, single function device without FLR).  In most
> >>>>> cases, it will at best be translated to an FLR.  VFIO really only does
> >>>>> bus resets on VM reset because that's the only time we know that it's ok
> >>>>> to reset multiple devices.  If the guest driver is depending on a
> >>>>> secondary bus reset to put the device into a recoverable state and we're
> >>>>> not able to provide that, then we're actually reducing containment of
> >>>>> the error by exposing AER to the guest and allowing it to attempt
> >>>>> recovery.  So in practice, I'm afraid we're risking the integrity of the
> >>>>> VM by exposing AER to the guest and making it think that it can perform
> >>>>> recovery operations that are not effective.  Thanks,
> >>>> I also have seen that if device without FLR, it seems can do hot reset
> >>>> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
> >>>> in vfio_pci_reset. does it satisfy the recovery issues that you said?
> >>> The hot reset interface can only be used when a) the user (QEMU) owns
> >>> all of the devices on the bus and b) we know we're resetting all of the
> >>> devices.  That mostly limits its use to VM reset.  I think that on a
> >>> secondary bus reset, we don't know the scope of the reset at the QEMU
> >>> vfio driver, so we only make use of reset methods with a function-level
> >>> scope.  That would only result in a secondary bus reset if that's the
> >>> reset mechanism used by the host kernel's PCI code (pci_reset_function),
> >>> which is limited to single function devices on a secondary bus, with no
> >>> other reset mechanisms.  The host reset is also only available in some
> >>> configurations, for instance if we have a dual-port NIC where each
> >>> function is a separate IOMMU group, then we clearly cannot do a hot
> >>> reset unless both functions are assigned to the same VM _and_ appear to
> >>> the guest on the same virtual bus.  So even if we could know the scope
> >>> of the reset in the QEMU vfio driver, we can only make use of it under
> >>> very strict guest configurations.  Thanks,
> >> Hi Alex,
> >>
> >>      have you some idea or scenario to fix/escape this issue?
> > Hi Chen,
> >
> > I expect there are two major components to this.  The first is that
> > QEMU/vfio-pci needs to enforce that a bus reset is possible for the host
> > and guest topology when guest AER handling is specified for a device.
> > That means that everything affected by the bus reset needs to be exposed
> > to the guest in a compatible way.  For instance, if a bus reset affects
> > devices from multiple groups, the guest needs to not only own all of
> > those groups, but they also need to be exposed to the guest such that
> > the virtual bus layout reflects the extent of the reset for the physical
> > bus.  This also implies that guest AER handling cannot be the default
> > since it will impose significant configuration restrictions on device
> > assignment.
> >
> > This seems like a difficult configuration enforcement to make, but maybe
> > there are simplifying assumptions that can help.  For instance the
> > devices need to be exposed as PCIe therefore we won't have multiple
> > slots in use on a bus and I think we can therefore mostly ignore hotplug
> > since we can only hotplug at a slot granularity.  That may also imply
> > that we should simply enforce a 1:1 mapping of physical functions to
> > virtual functions.  At least one function from each group affected by a
> > reset must be exposed to the guest.
> >
> > The second issue is that individual QEMU PCI devices have no callback
> > for a bus reset.  QEMU/vfio-pci currently has the DeviceClass.reset
> > callback, which we assume to be a function-level reset.  We also
> > register with qemu_register_reset() for a VM reset, which is the only
> > point currently that we know we can do a reset affecting multiple
> > devices.  Infrastructure will need to be added to QEMU/PCI to expose the
> > link down/RST signal to devices on a bus to trigger a multi-device reset
> > in vfio-pci.
> >
> > Hopefully I'm not missing something, but I think both of those changes
> > are going to be required before we can have anything remotely
> > supportable for guest-based AER error handle.  This pretty complicated
> > for the user and also for libvirt to figure out.  At a minimum libvirt
> > would need to support a new guest-based AER handling flag for devices.
> > We probably need to determine whether this is unique to vfio-pci or a
> > generic PCIDevice option.  Thanks,
> 
> Hi Alex,
>    Solving the two issues seem like a big workload. do we have a simple
>    way to support qemu AER ?

Hi Chen,

The simpler way is the existing, containment-only solution where QEMU
stops the guest on an uncorrected error.  Do you have any other
suggestions?  I don't see how we can rely on guest involvement in
recovery unless the guest has the same abilities to reset the device as
it would on bare metal.  Thanks,

Alex
chenfan April 15, 2015, 10:30 a.m. UTC | #15
On 04/08/2015 11:36 PM, Alex Williamson wrote:
> On Wed, 2015-04-08 at 16:59 +0800, Chen Fan wrote:
>> On 04/01/2015 11:46 PM, Alex Williamson wrote:
>>> On Wed, 2015-04-01 at 12:12 +0800, Chen Fan wrote:
>>>> On 03/25/2015 10:41 AM, Alex Williamson wrote:
>>>>> On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
>>>>>> On 03/16/2015 10:09 PM, Alex Williamson wrote:
>>>>>>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
>>>>>>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
>>>>>>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
>>>>>>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
>>>>>>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
>>>>>>>>>>>> when the vfio device encounters an uncorrectable error in host,
>>>>>>>>>>>> the vfio_pci driver will signal the eventfd registered by this
>>>>>>>>>>>> vfio device, the results in the qemu eventfd handler getting
>>>>>>>>>>>> invoked.
>>>>>>>>>>>>
>>>>>>>>>>>> this patch is to pass the error to guest and have the guest driver
>>>>>>>>>>>> recover from the error.
>>>>>>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
>>>>>>>>>>> concerned that the topology of the device in the guest doesn't
>>>>>>>>>>> necessarily match the topology of the device in the host, so if the
>>>>>>>>>>> guest were to attempt a bus reset to recover a device, for instance,
>>>>>>>>>>> what happens?
>>>>>>>>>> the recovery mechanism is that when guest got an aer error from a device,
>>>>>>>>>> guest will clean the corresponding status bit in device register. and for
>>>>>>>>>> need reset device, the guest aer driver would reset all devices under bus.
>>>>>>>>> Sorry, I'm still confused, how does the guest aer driver reset all
>>>>>>>>> devices under a bus?  Are we talking about function-level, device
>>>>>>>>> specific reset mechanisms or secondary bus resets?  If the guest is
>>>>>>>>> performing secondary bus resets, what guarantee do they have that it
>>>>>>>>> will translate to a physical secondary bus reset?  vfio may only do an
>>>>>>>>> FLR when the bus is reset or it may not be able to do anything depending
>>>>>>>>> on the available function-level resets and physical and virtual topology
>>>>>>>>> of the device.  Thanks,
>>>>>>>> in general, functions depends on the corresponding device driver behaviors
>>>>>>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
>>>>>>>> and for link reset, it usually do secondary bus reset.
>>>>>>>>
>>>>>>>> and do we must require to the physical secondary bus reset for vfio device
>>>>>>>> as bus reset?
>>>>>>> That depends on how the guest driver attempts recovery, doesn't it?
>>>>>>> There are only a very limited number of cases where a secondary bus
>>>>>>> reset initiated by the guest will translate to a secondary bus reset of
>>>>>>> the physical device (iirc, single function device without FLR).  In most
>>>>>>> cases, it will at best be translated to an FLR.  VFIO really only does
>>>>>>> bus resets on VM reset because that's the only time we know that it's ok
>>>>>>> to reset multiple devices.  If the guest driver is depending on a
>>>>>>> secondary bus reset to put the device into a recoverable state and we're
>>>>>>> not able to provide that, then we're actually reducing containment of
>>>>>>> the error by exposing AER to the guest and allowing it to attempt
>>>>>>> recovery.  So in practice, I'm afraid we're risking the integrity of the
>>>>>>> VM by exposing AER to the guest and making it think that it can perform
>>>>>>> recovery operations that are not effective.  Thanks,
>>>>>> I also have seen that if device without FLR, it seems can do hot reset
>>>>>> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
>>>>>> in vfio_pci_reset. does it satisfy the recovery issues that you said?
>>>>> The hot reset interface can only be used when a) the user (QEMU) owns
>>>>> all of the devices on the bus and b) we know we're resetting all of the
>>>>> devices.  That mostly limits its use to VM reset.  I think that on a
>>>>> secondary bus reset, we don't know the scope of the reset at the QEMU
>>>>> vfio driver, so we only make use of reset methods with a function-level
>>>>> scope.  That would only result in a secondary bus reset if that's the
>>>>> reset mechanism used by the host kernel's PCI code (pci_reset_function),
>>>>> which is limited to single function devices on a secondary bus, with no
>>>>> other reset mechanisms.  The host reset is also only available in some
>>>>> configurations, for instance if we have a dual-port NIC where each
>>>>> function is a separate IOMMU group, then we clearly cannot do a hot
>>>>> reset unless both functions are assigned to the same VM _and_ appear to
>>>>> the guest on the same virtual bus.  So even if we could know the scope
>>>>> of the reset in the QEMU vfio driver, we can only make use of it under
>>>>> very strict guest configurations.  Thanks,
>>>> Hi Alex,
>>>>
>>>>       have you some idea or scenario to fix/escape this issue?
>>> Hi Chen,
>>>
>>> I expect there are two major components to this.  The first is that
>>> QEMU/vfio-pci needs to enforce that a bus reset is possible for the host
>>> and guest topology when guest AER handling is specified for a device.
>>> That means that everything affected by the bus reset needs to be exposed
>>> to the guest in a compatible way.  For instance, if a bus reset affects
>>> devices from multiple groups, the guest needs to not only own all of
>>> those groups, but they also need to be exposed to the guest such that
>>> the virtual bus layout reflects the extent of the reset for the physical
>>> bus.  This also implies that guest AER handling cannot be the default
>>> since it will impose significant configuration restrictions on device
>>> assignment.
>>>
>>> This seems like a difficult configuration enforcement to make, but maybe
>>> there are simplifying assumptions that can help.  For instance the
>>> devices need to be exposed as PCIe therefore we won't have multiple
>>> slots in use on a bus and I think we can therefore mostly ignore hotplug
>>> since we can only hotplug at a slot granularity.  That may also imply
>>> that we should simply enforce a 1:1 mapping of physical functions to
>>> virtual functions.  At least one function from each group affected by a
>>> reset must be exposed to the guest.
>>>
>>> The second issue is that individual QEMU PCI devices have no callback
>>> for a bus reset.  QEMU/vfio-pci currently has the DeviceClass.reset
>>> callback, which we assume to be a function-level reset.  We also
>>> register with qemu_register_reset() for a VM reset, which is the only
>>> point currently that we know we can do a reset affecting multiple
>>> devices.  Infrastructure will need to be added to QEMU/PCI to expose the
>>> link down/RST signal to devices on a bus to trigger a multi-device reset
>>> in vfio-pci.
>>>
>>> Hopefully I'm not missing something, but I think both of those changes
>>> are going to be required before we can have anything remotely
>>> supportable for guest-based AER error handle.  This pretty complicated
>>> for the user and also for libvirt to figure out.  At a minimum libvirt
>>> would need to support a new guest-based AER handling flag for devices.
>>> We probably need to determine whether this is unique to vfio-pci or a
>>> generic PCIDevice option.  Thanks,
>> Hi Alex,
>>     Solving the two issues seem like a big workload. do we have a simple
>>     way to support qemu AER ?
> Hi Chen,
>
> The simpler way is the existing, containment-only solution where QEMU
> stops the guest on an uncorrected error.  Do you have any other
> suggestions?  I don't see how we can rely on guest involvement in
> recovery unless the guest has the same abilities to reset the device as
> it would on bare metal.  Thanks,
Hi Alex,

for the first issue, I think requiring that all the functions affected 
by a bus reset
be assigned to the guest is too restrictive.
I suppose that if we enable support for the AER feature, all we need to do
is check whether the pass-through device's host bus has any other endpoint;
if there is no other PCI device on it, we can support the host bus reset in qemu vfio-pci.

Thanks,
Chen


>
> Alex
>
> .
>
Alex Williamson April 15, 2015, 2:18 p.m. UTC | #16
On Wed, 2015-04-15 at 18:30 +0800, Chen Fan wrote:
> On 04/08/2015 11:36 PM, Alex Williamson wrote:
> > On Wed, 2015-04-08 at 16:59 +0800, Chen Fan wrote:
> >> On 04/01/2015 11:46 PM, Alex Williamson wrote:
> >>> On Wed, 2015-04-01 at 12:12 +0800, Chen Fan wrote:
> >>>> On 03/25/2015 10:41 AM, Alex Williamson wrote:
> >>>>> On Wed, 2015-03-25 at 09:53 +0800, Chen Fan wrote:
> >>>>>> On 03/16/2015 10:09 PM, Alex Williamson wrote:
> >>>>>>> On Mon, 2015-03-16 at 15:35 +0800, Chen Fan wrote:
> >>>>>>>> On 03/16/2015 11:52 AM, Alex Williamson wrote:
> >>>>>>>>> On Mon, 2015-03-16 at 11:05 +0800, Chen Fan wrote:
> >>>>>>>>>> On 03/14/2015 06:34 AM, Alex Williamson wrote:
> >>>>>>>>>>> On Thu, 2015-03-12 at 18:23 +0800, Chen Fan wrote:
> >>>>>>>>>>>> when the vfio device encounters an uncorrectable error in host,
> >>>>>>>>>>>> the vfio_pci driver will signal the eventfd registered by this
> >>>>>>>>>>>> vfio device, the results in the qemu eventfd handler getting
> >>>>>>>>>>>> invoked.
> >>>>>>>>>>>>
> >>>>>>>>>>>> this patch is to pass the error to guest and have the guest driver
> >>>>>>>>>>>> recover from the error.
> >>>>>>>>>>> What is going to be the typical recovery mechanism for the guest?  I'm
> >>>>>>>>>>> concerned that the topology of the device in the guest doesn't
> >>>>>>>>>>> necessarily match the topology of the device in the host, so if the
> >>>>>>>>>>> guest were to attempt a bus reset to recover a device, for instance,
> >>>>>>>>>>> what happens?
> >>>>>>>>>> the recovery mechanism is that when guest got an aer error from a device,
> >>>>>>>>>> guest will clean the corresponding status bit in device register. and for
> >>>>>>>>>> need reset device, the guest aer driver would reset all devices under bus.
> >>>>>>>>> Sorry, I'm still confused, how does the guest aer driver reset all
> >>>>>>>>> devices under a bus?  Are we talking about function-level, device
> >>>>>>>>> specific reset mechanisms or secondary bus resets?  If the guest is
> >>>>>>>>> performing secondary bus resets, what guarantee do they have that it
> >>>>>>>>> will translate to a physical secondary bus reset?  vfio may only do an
> >>>>>>>>> FLR when the bus is reset or it may not be able to do anything depending
> >>>>>>>>> on the available function-level resets and physical and virtual topology
> >>>>>>>>> of the device.  Thanks,
> >>>>>>>> in general, functions depends on the corresponding device driver behaviors
> >>>>>>>> to do the recovery. e.g: implemented the error_detect, slot_reset callbacks.
> >>>>>>>> and for link reset, it usually do secondary bus reset.
> >>>>>>>>
> >>>>>>>> and do we must require to the physical secondary bus reset for vfio device
> >>>>>>>> as bus reset?
> >>>>>>> That depends on how the guest driver attempts recovery, doesn't it?
> >>>>>>> There are only a very limited number of cases where a secondary bus
> >>>>>>> reset initiated by the guest will translate to a secondary bus reset of
> >>>>>>> the physical device (iirc, single function device without FLR).  In most
> >>>>>>> cases, it will at best be translated to an FLR.  VFIO really only does
> >>>>>>> bus resets on VM reset because that's the only time we know that it's ok
> >>>>>>> to reset multiple devices.  If the guest driver is depending on a
> >>>>>>> secondary bus reset to put the device into a recoverable state and we're
> >>>>>>> not able to provide that, then we're actually reducing containment of
> >>>>>>> the error by exposing AER to the guest and allowing it to attempt
> >>>>>>> recovery.  So in practice, I'm afraid we're risking the integrity of the
> >>>>>>> VM by exposing AER to the guest and making it think that it can perform
> >>>>>>> recovery operations that are not effective.  Thanks,
> >>>>>> I also have seen that if device without FLR, it seems can do hot reset
> >>>>>> by ioctl VFIO_DEVICE_PCI_HOT_RESET to reset the physical slot or bus
> >>>>>> in vfio_pci_reset. does it satisfy the recovery issues that you said?
> >>>>> The hot reset interface can only be used when a) the user (QEMU) owns
> >>>>> all of the devices on the bus and b) we know we're resetting all of the
> >>>>> devices.  That mostly limits its use to VM reset.  I think that on a
> >>>>> secondary bus reset, we don't know the scope of the reset at the QEMU
> >>>>> vfio driver, so we only make use of reset methods with a function-level
> >>>>> scope.  That would only result in a secondary bus reset if that's the
> >>>>> reset mechanism used by the host kernel's PCI code (pci_reset_function),
> >>>>> which is limited to single function devices on a secondary bus, with no
> >>>>> other reset mechanisms.  The host reset is also only available in some
> >>>>> configurations, for instance if we have a dual-port NIC where each
> >>>>> function is a separate IOMMU group, then we clearly cannot do a hot
> >>>>> reset unless both functions are assigned to the same VM _and_ appear to
> >>>>> the guest on the same virtual bus.  So even if we could know the scope
> >>>>> of the reset in the QEMU vfio driver, we can only make use of it under
> >>>>> very strict guest configurations.  Thanks,
> >>>> Hi Alex,
> >>>>
> >>>>       have you some idea or scenario to fix/escape this issue?
> >>> Hi Chen,
> >>>
> >>> I expect there are two major components to this.  The first is that
> >>> QEMU/vfio-pci needs to enforce that a bus reset is possible for the host
> >>> and guest topology when guest AER handling is specified for a device.
> >>> That means that everything affected by the bus reset needs to be exposed
> >>> to the guest in a compatible way.  For instance, if a bus reset affects
> >>> devices from multiple groups, the guest needs to not only own all of
> >>> those groups, but they also need to be exposed to the guest such that
> >>> the virtual bus layout reflects the extent of the reset for the physical
> >>> bus.  This also implies that guest AER handling cannot be the default
> >>> since it will impose significant configuration restrictions on device
> >>> assignment.
> >>>
> >>> This seems like a difficult configuration enforcement to make, but maybe
> >>> there are simplifying assumptions that can help.  For instance the
> >>> devices need to be exposed as PCIe therefore we won't have multiple
> >>> slots in use on a bus and I think we can therefore mostly ignore hotplug
> >>> since we can only hotplug at a slot granularity.  That may also imply
> >>> that we should simply enforce a 1:1 mapping of physical functions to
> >>> virtual functions.  At least one function from each group affected by a
> >>> reset must be exposed to the guest.
> >>>
> >>> The second issue is that individual QEMU PCI devices have no callback
> >>> for a bus reset.  QEMU/vfio-pci currently has the DeviceClass.reset
> >>> callback, which we assume to be a function-level reset.  We also
> >>> register with qemu_register_reset() for a VM reset, which is the only
> >>> point currently that we know we can do a reset affecting multiple
> >>> devices.  Infrastructure will need to be added to QEMU/PCI to expose the
> >>> link down/RST signal to devices on a bus to trigger a multi-device reset
> >>> in vfio-pci.
> >>>
> >>> Hopefully I'm not missing something, but I think both of those changes
> >>> are going to be required before we can have anything remotely
> >>> supportable for guest-based AER error handle.  This pretty complicated
> >>> for the user and also for libvirt to figure out.  At a minimum libvirt
> >>> would need to support a new guest-based AER handling flag for devices.
> >>> We probably need to determine whether this is unique to vfio-pci or a
> >>> generic PCIDevice option.  Thanks,
> >> Hi Alex,
> >>     Solving the two issues seem like a big workload. do we have a simple
> >>     way to support qemu AER ?
> > Hi Chen,
> >
> > The simpler way is the existing, containment-only solution where QEMU
> > stops the guest on an uncorrected error.  Do you have any other
> > suggestions?  I don't see how we can rely on guest involvement in
> > recovery unless the guest has the same abilities to reset the device as
> > it would on bare metal.  Thanks,
> Hi Alex,
> 
> for the first issue, I think the functions affected by a bus reset need 
> to assign to
> guest are too restricted.

Why?  If the guest thinks that it's doing a bus reset to recover the
device, I don't think we can ignore that or do a lesser function-level
reset.  If the guest thought it could recover using a function-level
reset, it probably would have used that instead.

> I suppose if we enable support the aer feature, only need to do
> is check the pass through device's host bus whether have other endpoint,
> if no other pci device, we can support the host bus reset in qemu vfio-pci.

I don't think that restricting the problem to single-function endpoints
changes the requirements at all.  vfio-pci in QEMU would still need to
restrict that AER forwarding to the guest can only be enabled in
supported configurations and the QEMU PCI-core code would need to
differentiate a PCI bus reset from a regular single device scope reset.
In fact, restricting the configuration to single function endpoints
appears to be the same amount of work, only reducing the usefulness.
Thanks,

Alex
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0a515b6..8966c49 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3240,18 +3240,40 @@  static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
     VFIOPCIDevice *vdev = opaque;
+    PCIDevice *dev = &vdev->pdev;
+    PCIEAERMsg msg = {
+        .severity = 0,
+        .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+    };
 
     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
         return;
     }
 
+    /* we should read the error details from the real hardware
+     * configuration spaces, here we only need to do is signaling
+     * to guest an uncorrectable error has occurred.
+     */
+     if(dev->exp.aer_cap) {
+        uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+        uint32_t uncor_status;
+        bool isfatal;
+
+        uncor_status = vfio_pci_read_config(dev,
+                           dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+        isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+        msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+                                 PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+        pcie_aer_msg(dev, &msg);
+        return;
+    }
+
     /*
-     * TBD. Retrieve the error details and decide what action
-     * needs to be taken. One of the actions could be to pass
-     * the error to the guest and have the guest driver recover
-     * from the error. This requires that PCIe capabilities be
-     * exposed to the guest. For now, we just terminate the
-     * guest to contain the error.
+     * If the aer capability is not exposed to the guest. we just
+     * terminate the guest to contain the error.
      */
 
     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "