diff mbox

[RFC,v8.1,06/13] vfio: add check host bus reset is support or not

Message ID c90da5507901c24863e9bf4b36c718810d61b55d.1432694455.git.chen.fan.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

chenfan May 27, 2015, 2:46 a.m. UTC
we introduce a has_bus_reset capability to sign the vfio
devices if support host bus reset.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

Comments

Alex Williamson May 27, 2015, 9:32 p.m. UTC | #1
On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
> we introduce a has_bus_reset capability to sign the vfio
> devices if support host bus reset.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 123 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index f4e7855..5934fd7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -33,6 +33,7 @@
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pci.h"
>  #include "hw/pci/pci_bridge.h"
> +#include "hw/pci/pci_bus.h"
>  #include "qemu-common.h"
>  #include "qemu/error-report.h"
>  #include "qemu/event_notifier.h"
> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
>      bool req_enabled;
>      bool has_flr;
>      bool has_pm_reset;
> +    bool has_bus_reset;

I still think that caching this is a bad idea, there's no point at which
we can blindly assume the capability is still present.

>      bool rom_read_failed;
>  } VFIOPCIDevice;
>  
> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>  static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>                                    uint32_t val, int len);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
>  
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>          dev_iter = pci_bridge_get_device(dev_iter->bus);
>      }
>  
> +    /*
> +     * Don't check bus reset capability when device is enabled during
> +     * qemu machine creation, which is done by machine init function.
> +     */
> +    if (DEVICE(vdev)->hotplugged) {
> +        vfio_check_host_bus_reset(vdev);
> +        if (!vdev->has_bus_reset) {
> +            error_report("vfio: Cannot enable AER for device %s, "
> +                         "which is not support host bus reset.",

"which does not support host bus reset."

> +                         vdev->vbasedev.name);
> +            goto error;
> +        }
> +    }
> +
>      errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
>      /*
>       * The ability to record multiple headers is depending on
> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
>      }
>  }
>  
> +struct VfioDeviceFind {

We use VFIOFooBar for all other camel case definitions, much like PCIBus
and PCIDevice below.

> +    PCIBus *pbus;
> +    PCIDevice *pdev;
> +    bool found;
> +};
> +
> +static void find_devices(PCIBus *bus, void *opaque)
> +{
> +    struct VfioDeviceFind *find = opaque;
> +    int i;
> +
> +    if (find->found == true) {

if (find->found) {...

> +        return;
> +    }
> +
> +    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
> +        if (!bus->devices[i]) {
> +            continue;
> +        }
> +
> +        if (bus->devices[i] == find->pdev) {
> +            find->pbus = bus;
> +            find->found = true;
> +            break;
> +        }
> +    }
> +}
> +
> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> +{
> +    PCIBus *bus = vdev->pdev.bus;
> +    struct vfio_pci_hot_reset_info *info = NULL;
> +    struct vfio_pci_dependent_device *devices;
> +    VFIOGroup *group;
> +    int ret, i;
> +    bool has_bus_reset = false;
> +
> +    ret = vfio_get_hot_reset_info(vdev, &info);
> +    if (ret < 0) {

if (ret) {...

> +        goto out;
> +    }
> +
> +    /* List all affected devices by bus reset */
> +    devices = &info->devices[0];
> +
> +    /* Verify that we have all the groups required */
> +    for (i = 0; i < info->count; i++) {
> +        PCIHostDeviceAddress host;
> +        VFIOPCIDevice *tmp;
> +        VFIODevice *vbasedev_iter;
> +
> +        host.domain = devices[i].segment;
> +        host.bus = devices[i].bus;
> +        host.slot = PCI_SLOT(devices[i].devfn);
> +        host.function = PCI_FUNC(devices[i].devfn);
> +
> +        /* Skip the current device */
> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> +            continue;
> +        }
> +
> +        /* Ensure we own the group of the affected device */
> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> +            if (group->groupid == devices[i].group_id) {
> +                break;
> +            }
> +        }
> +
> +        if (!group) {
> +            goto out;
> +        }
> +
> +        /* Ensure affected devices for reset under the same bus */
> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
> +            if (vfio_pci_host_match(&host, &tmp->host)) {
> +                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
> +
> +                pci_for_each_bus(bus, find_devices, &find);
> +                if (!find.found) {
> +                    goto out;
> +                }
> +                /*
> +                 * When the check device is hotplugged to a higher bus again,
> +                 * which would influence the affected device which enable aer
> +                 * below the bus.
> +                 */
> +                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
> +                    find.pbus != bus) {
> +                    goto out;
> +                }

I think what you're trying to do here is assume that if a reset of A
affects B, then a reset of B affects A, and if B is on a subordinate bus
from A, then our configuration is broken, right?  Can we really
guarantee that assumption?  If we had a physical topology that mirrored
this virtual topology, that wouldn't necessarily be true.  For instance,
if A was a function of a multi-function device where another function
was a PCIe upstream switch, B could be subordinate to that switch, so a
reset of A affects B, but a reset of B doesn't affect A.

> +                break;
> +            }
> +        }
> +    }
> +
> +    has_bus_reset = true;
> +
> +out:
> +    vdev->has_bus_reset = has_bus_reset;

I don't see any value is storing this, it can't be trusted at any point
in the future.  I think that any time we add a device or think about
forwarding an AER message to the guest, we need to do a validation pass,
testing the entire topology.

To do that we'd iterate through every device in every group for PCI
devices that support AER.  For each one, get the hot reset info for the
affected device list.  For each affected device, if it's attached to the
VM, it needs to be on or below the bus of the target device.
Additionally, we should test all of the non-AER supporting vfio-pci
devices on or below the target bus to verify they have a reset
mechanism.  Run that function for each vfio-pci device as it's added,
regardless of whether it's hotplug.  If the test fails, fail the initfn
for the current device.  Also run the test prior to AER injection, if it
fails demote the AER injection to a machine halt as we have now.

> +    g_free(info);
> +}
> +
>  static int vfio_initfn(PCIDevice *pdev)
>  {
>      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
chenfan June 2, 2015, 7:54 a.m. UTC | #2
On 05/28/2015 05:32 AM, Alex Williamson wrote:
> On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
>> we introduce a has_bus_reset capability to sign the vfio
>> devices if support host bus reset.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 123 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index f4e7855..5934fd7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -33,6 +33,7 @@
>>   #include "hw/pci/msix.h"
>>   #include "hw/pci/pci.h"
>>   #include "hw/pci/pci_bridge.h"
>> +#include "hw/pci/pci_bus.h"
>>   #include "qemu-common.h"
>>   #include "qemu/error-report.h"
>>   #include "qemu/event_notifier.h"
>> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
>>       bool req_enabled;
>>       bool has_flr;
>>       bool has_pm_reset;
>> +    bool has_bus_reset;
> I still think that caching this is a bad idea, there's no point at which
> we can blindly assume the capability is still present.
>
>>       bool rom_read_failed;
>>   } VFIOPCIDevice;
>>   
>> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>>   static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>>                                     uint32_t val, int len);
>>   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
>>   
>>   /*
>>    * Disabling BAR mmaping can be slow, but toggling it around INTx can
>> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>>           dev_iter = pci_bridge_get_device(dev_iter->bus);
>>       }
>>   
>> +    /*
>> +     * Don't check bus reset capability when device is enabled during
>> +     * qemu machine creation, which is done by machine init function.
>> +     */
>> +    if (DEVICE(vdev)->hotplugged) {
>> +        vfio_check_host_bus_reset(vdev);
>> +        if (!vdev->has_bus_reset) {
>> +            error_report("vfio: Cannot enable AER for device %s, "
>> +                         "which is not support host bus reset.",
> "which does not support host bus reset."
>
>> +                         vdev->vbasedev.name);
>> +            goto error;
>> +        }
>> +    }
>> +
>>       errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
>>       /*
>>        * The ability to record multiple headers is depending on
>> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
>>       }
>>   }
>>   
>> +struct VfioDeviceFind {
> We use VFIOFooBar for all other camel case definitions, much like PCIBus
> and PCIDevice below.
>
>> +    PCIBus *pbus;
>> +    PCIDevice *pdev;
>> +    bool found;
>> +};
>> +
>> +static void find_devices(PCIBus *bus, void *opaque)
>> +{
>> +    struct VfioDeviceFind *find = opaque;
>> +    int i;
>> +
>> +    if (find->found == true) {
> if (find->found) {...
>
>> +        return;
>> +    }
>> +
>> +    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
>> +        if (!bus->devices[i]) {
>> +            continue;
>> +        }
>> +
>> +        if (bus->devices[i] == find->pdev) {
>> +            find->pbus = bus;
>> +            find->found = true;
>> +            break;
>> +        }
>> +    }
>> +}
>> +
>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>> +{
>> +    PCIBus *bus = vdev->pdev.bus;
>> +    struct vfio_pci_hot_reset_info *info = NULL;
>> +    struct vfio_pci_dependent_device *devices;
>> +    VFIOGroup *group;
>> +    int ret, i;
>> +    bool has_bus_reset = false;
>> +
>> +    ret = vfio_get_hot_reset_info(vdev, &info);
>> +    if (ret < 0) {
> if (ret) {...
>
>> +        goto out;
>> +    }
>> +
>> +    /* List all affected devices by bus reset */
>> +    devices = &info->devices[0];
>> +
>> +    /* Verify that we have all the groups required */
>> +    for (i = 0; i < info->count; i++) {
>> +        PCIHostDeviceAddress host;
>> +        VFIOPCIDevice *tmp;
>> +        VFIODevice *vbasedev_iter;
>> +
>> +        host.domain = devices[i].segment;
>> +        host.bus = devices[i].bus;
>> +        host.slot = PCI_SLOT(devices[i].devfn);
>> +        host.function = PCI_FUNC(devices[i].devfn);
>> +
>> +        /* Skip the current device */
>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
>> +            continue;
>> +        }
>> +
>> +        /* Ensure we own the group of the affected device */
>> +        QLIST_FOREACH(group, &vfio_group_list, next) {
>> +            if (group->groupid == devices[i].group_id) {
>> +                break;
>> +            }
>> +        }
>> +
>> +        if (!group) {
>> +            goto out;
>> +        }
>> +
>> +        /* Ensure affected devices for reset under the same bus */
>> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
>> +                continue;
>> +            }
>> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
>> +            if (vfio_pci_host_match(&host, &tmp->host)) {
>> +                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
>> +
>> +                pci_for_each_bus(bus, find_devices, &find);
>> +                if (!find.found) {
>> +                    goto out;
>> +                }
>> +                /*
>> +                 * When the check device is hotplugged to a higher bus again,
>> +                 * which would influence the affected device which enable aer
>> +                 * below the bus.
>> +                 */
>> +                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
>> +                    find.pbus != bus) {
>> +                    goto out;
>> +                }
> I think what you're trying to do here is assume that if a reset of A
> affects B, then a reset of B affects A, and if B is on a subordinate bus
> from A, then our configuration is broken, right?  Can we really
> guarantee that assumption?  If we had a physical topology that mirrored
> this virtual topology, that wouldn't necessarily be true.  For instance,
> if A was a function of a multi-function device where another function
> was a PCIe upstream switch, B could be subordinate to that switch, so a
> reset of A affects B, but a reset of B doesn't affect A.
>
>> +                break;
>> +            }
>> +        }
>> +    }
>> +
>> +    has_bus_reset = true;
>> +
>> +out:
>> +    vdev->has_bus_reset = has_bus_reset;
> I don't see any value is storing this, it can't be trusted at any point
> in the future.  I think that any time we add a device or think about
> forwarding an AER message to the guest, we need to do a validation pass,
> testing the entire topology.
>
> To do that we'd iterate through every device in every group for PCI
> devices that support AER.  For each one, get the hot reset info for the
> affected device list.  For each affected device, if it's attached to the
> VM, it needs to be on or below the bus of the target device.
> Additionally, we should test all of the non-AER supporting vfio-pci
> devices on or below the target bus to verify they have a reset
> mechanism.  Run that function for each vfio-pci device as it's added,
> regardless of whether it's hotplug.  If the test fails, fail the initfn
> for the current device.  Also run the test prior to AER injection, if it
> fails demote the AER injection to a machine halt as we have now.
I'm worry about is the case that the affected devices belonged to
another groups but when initialize this device the another group
has not been added. it would cause fail the initfn, but the group
maybe added later. so I used the machine done event notifier to
check this case. if we don't do like that. how can we check the
case when all vfio-pci devices initfn ?

Thanks,
Chen

>
>> +    g_free(info);
>> +}
>> +
>>   static int vfio_initfn(PCIDevice *pdev)
>>   {
>>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>
>
> .
>
Alex Williamson June 2, 2015, 4:47 p.m. UTC | #3
On Tue, 2015-06-02 at 15:54 +0800, Chen Fan wrote:
> On 05/28/2015 05:32 AM, Alex Williamson wrote:
> > On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
> >> we introduce a has_bus_reset capability to sign the vfio
> >> devices if support host bus reset.
> >>
> >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >> ---
> >>   hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 123 insertions(+)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index f4e7855..5934fd7 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -33,6 +33,7 @@
> >>   #include "hw/pci/msix.h"
> >>   #include "hw/pci/pci.h"
> >>   #include "hw/pci/pci_bridge.h"
> >> +#include "hw/pci/pci_bus.h"
> >>   #include "qemu-common.h"
> >>   #include "qemu/error-report.h"
> >>   #include "qemu/event_notifier.h"
> >> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
> >>       bool req_enabled;
> >>       bool has_flr;
> >>       bool has_pm_reset;
> >> +    bool has_bus_reset;
> > I still think that caching this is a bad idea, there's no point at which
> > we can blindly assume the capability is still present.
> >
> >>       bool rom_read_failed;
> >>   } VFIOPCIDevice;
> >>   
> >> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> >>   static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
> >>                                     uint32_t val, int len);
> >>   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> >> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
> >>   
> >>   /*
> >>    * Disabling BAR mmaping can be slow, but toggling it around INTx can
> >> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>           dev_iter = pci_bridge_get_device(dev_iter->bus);
> >>       }
> >>   
> >> +    /*
> >> +     * Don't check bus reset capability when device is enabled during
> >> +     * qemu machine creation, which is done by machine init function.
> >> +     */
> >> +    if (DEVICE(vdev)->hotplugged) {
> >> +        vfio_check_host_bus_reset(vdev);
> >> +        if (!vdev->has_bus_reset) {
> >> +            error_report("vfio: Cannot enable AER for device %s, "
> >> +                         "which is not support host bus reset.",
> > "which does not support host bus reset."
> >
> >> +                         vdev->vbasedev.name);
> >> +            goto error;
> >> +        }
> >> +    }
> >> +
> >>       errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
> >>       /*
> >>        * The ability to record multiple headers is depending on
> >> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
> >>       }
> >>   }
> >>   
> >> +struct VfioDeviceFind {
> > We use VFIOFooBar for all other camel case definitions, much like PCIBus
> > and PCIDevice below.
> >
> >> +    PCIBus *pbus;
> >> +    PCIDevice *pdev;
> >> +    bool found;
> >> +};
> >> +
> >> +static void find_devices(PCIBus *bus, void *opaque)
> >> +{
> >> +    struct VfioDeviceFind *find = opaque;
> >> +    int i;
> >> +
> >> +    if (find->found == true) {
> > if (find->found) {...
> >
> >> +        return;
> >> +    }
> >> +
> >> +    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
> >> +        if (!bus->devices[i]) {
> >> +            continue;
> >> +        }
> >> +
> >> +        if (bus->devices[i] == find->pdev) {
> >> +            find->pbus = bus;
> >> +            find->found = true;
> >> +            break;
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> >> +{
> >> +    PCIBus *bus = vdev->pdev.bus;
> >> +    struct vfio_pci_hot_reset_info *info = NULL;
> >> +    struct vfio_pci_dependent_device *devices;
> >> +    VFIOGroup *group;
> >> +    int ret, i;
> >> +    bool has_bus_reset = false;
> >> +
> >> +    ret = vfio_get_hot_reset_info(vdev, &info);
> >> +    if (ret < 0) {
> > if (ret) {...
> >
> >> +        goto out;
> >> +    }
> >> +
> >> +    /* List all affected devices by bus reset */
> >> +    devices = &info->devices[0];
> >> +
> >> +    /* Verify that we have all the groups required */
> >> +    for (i = 0; i < info->count; i++) {
> >> +        PCIHostDeviceAddress host;
> >> +        VFIOPCIDevice *tmp;
> >> +        VFIODevice *vbasedev_iter;
> >> +
> >> +        host.domain = devices[i].segment;
> >> +        host.bus = devices[i].bus;
> >> +        host.slot = PCI_SLOT(devices[i].devfn);
> >> +        host.function = PCI_FUNC(devices[i].devfn);
> >> +
> >> +        /* Skip the current device */
> >> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        /* Ensure we own the group of the affected device */
> >> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> >> +            if (group->groupid == devices[i].group_id) {
> >> +                break;
> >> +            }
> >> +        }
> >> +
> >> +        if (!group) {
> >> +            goto out;
> >> +        }
> >> +
> >> +        /* Ensure affected devices for reset under the same bus */
> >> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> >> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
> >> +                continue;
> >> +            }
> >> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
> >> +            if (vfio_pci_host_match(&host, &tmp->host)) {
> >> +                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
> >> +
> >> +                pci_for_each_bus(bus, find_devices, &find);
> >> +                if (!find.found) {
> >> +                    goto out;
> >> +                }
> >> +                /*
> >> +                 * When the check device is hotplugged to a higher bus again,
> >> +                 * which would influence the affected device which enable aer
> >> +                 * below the bus.
> >> +                 */
> >> +                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
> >> +                    find.pbus != bus) {
> >> +                    goto out;
> >> +                }
> > I think what you're trying to do here is assume that if a reset of A
> > affects B, then a reset of B affects A, and if B is on a subordinate bus
> > from A, then our configuration is broken, right?  Can we really
> > guarantee that assumption?  If we had a physical topology that mirrored
> > this virtual topology, that wouldn't necessarily be true.  For instance,
> > if A was a function of a multi-function device where another function
> > was a PCIe upstream switch, B could be subordinate to that switch, so a
> > reset of A affects B, but a reset of B doesn't affect A.
> >
> >> +                break;
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    has_bus_reset = true;
> >> +
> >> +out:
> >> +    vdev->has_bus_reset = has_bus_reset;
> > I don't see any value is storing this, it can't be trusted at any point
> > in the future.  I think that any time we add a device or think about
> > forwarding an AER message to the guest, we need to do a validation pass,
> > testing the entire topology.
> >
> > To do that we'd iterate through every device in every group for PCI
> > devices that support AER.  For each one, get the hot reset info for the
> > affected device list.  For each affected device, if it's attached to the
> > VM, it needs to be on or below the bus of the target device.
> > Additionally, we should test all of the non-AER supporting vfio-pci
> > devices on or below the target bus to verify they have a reset
> > mechanism.  Run that function for each vfio-pci device as it's added,
> > regardless of whether it's hotplug.  If the test fails, fail the initfn
> > for the current device.  Also run the test prior to AER injection, if it
> > fails demote the AER injection to a machine halt as we have now.
> I'm worry about is the case that the affected devices belonged to
> another groups but when initialize this device the another group
> has not been added. it would cause fail the initfn, but the group
> maybe added later. so I used the machine done event notifier to
> check this case. if we don't do like that. how can we check the
> case when all vfio-pci devices initfn ?

That's why the initfn test needs to test the entire topology, not just
the device being added.  If we have the case you describe where we have
two devices in separate groups, we add the first device, test the
topology, see that the second device is affected but not yet added and
allow AER to be enabled.  When the second device is added, we again test
the topology, we see the potential conflict and fail the initfn for the
second device if the bus requirements are not met.

I don't see the advantage of using a machine init done notifier.  You
can perform fewer topology verification passes using that notifier, but
I think you can provide a more useful error message by testing on each
device addition.  We also use the exact same strategy for cold-plug and
hot-plug, which makes maintenance easier.  Thanks,

Alex
chenfan June 3, 2015, 12:52 a.m. UTC | #4
On 06/03/2015 12:47 AM, Alex Williamson wrote:
> On Tue, 2015-06-02 at 15:54 +0800, Chen Fan wrote:
>> On 05/28/2015 05:32 AM, Alex Williamson wrote:
>>> On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
>>>> we introduce a has_bus_reset capability to sign the vfio
>>>> devices if support host bus reset.
>>>>
>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>> ---
>>>>    hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 123 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index f4e7855..5934fd7 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -33,6 +33,7 @@
>>>>    #include "hw/pci/msix.h"
>>>>    #include "hw/pci/pci.h"
>>>>    #include "hw/pci/pci_bridge.h"
>>>> +#include "hw/pci/pci_bus.h"
>>>>    #include "qemu-common.h"
>>>>    #include "qemu/error-report.h"
>>>>    #include "qemu/event_notifier.h"
>>>> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
>>>>        bool req_enabled;
>>>>        bool has_flr;
>>>>        bool has_pm_reset;
>>>> +    bool has_bus_reset;
>>> I still think that caching this is a bad idea, there's no point at which
>>> we can blindly assume the capability is still present.
>>>
>>>>        bool rom_read_failed;
>>>>    } VFIOPCIDevice;
>>>>    
>>>> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>>>>    static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>>>>                                      uint32_t val, int len);
>>>>    static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
>>>>    
>>>>    /*
>>>>     * Disabling BAR mmaping can be slow, but toggling it around INTx can
>>>> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>>>>            dev_iter = pci_bridge_get_device(dev_iter->bus);
>>>>        }
>>>>    
>>>> +    /*
>>>> +     * Don't check bus reset capability when device is enabled during
>>>> +     * qemu machine creation, which is done by machine init function.
>>>> +     */
>>>> +    if (DEVICE(vdev)->hotplugged) {
>>>> +        vfio_check_host_bus_reset(vdev);
>>>> +        if (!vdev->has_bus_reset) {
>>>> +            error_report("vfio: Cannot enable AER for device %s, "
>>>> +                         "which is not support host bus reset.",
>>> "which does not support host bus reset."
>>>
>>>> +                         vdev->vbasedev.name);
>>>> +            goto error;
>>>> +        }
>>>> +    }
>>>> +
>>>>        errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
>>>>        /*
>>>>         * The ability to record multiple headers is depending on
>>>> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
>>>>        }
>>>>    }
>>>>    
>>>> +struct VfioDeviceFind {
>>> We use VFIOFooBar for all other camel case definitions, much like PCIBus
>>> and PCIDevice below.
>>>
>>>> +    PCIBus *pbus;
>>>> +    PCIDevice *pdev;
>>>> +    bool found;
>>>> +};
>>>> +
>>>> +static void find_devices(PCIBus *bus, void *opaque)
>>>> +{
>>>> +    struct VfioDeviceFind *find = opaque;
>>>> +    int i;
>>>> +
>>>> +    if (find->found == true) {
>>> if (find->found) {...
>>>
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
>>>> +        if (!bus->devices[i]) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        if (bus->devices[i] == find->pdev) {
>>>> +            find->pbus = bus;
>>>> +            find->found = true;
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>>>> +{
>>>> +    PCIBus *bus = vdev->pdev.bus;
>>>> +    struct vfio_pci_hot_reset_info *info = NULL;
>>>> +    struct vfio_pci_dependent_device *devices;
>>>> +    VFIOGroup *group;
>>>> +    int ret, i;
>>>> +    bool has_bus_reset = false;
>>>> +
>>>> +    ret = vfio_get_hot_reset_info(vdev, &info);
>>>> +    if (ret < 0) {
>>> if (ret) {...
>>>
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* List all affected devices by bus reset */
>>>> +    devices = &info->devices[0];
>>>> +
>>>> +    /* Verify that we have all the groups required */
>>>> +    for (i = 0; i < info->count; i++) {
>>>> +        PCIHostDeviceAddress host;
>>>> +        VFIOPCIDevice *tmp;
>>>> +        VFIODevice *vbasedev_iter;
>>>> +
>>>> +        host.domain = devices[i].segment;
>>>> +        host.bus = devices[i].bus;
>>>> +        host.slot = PCI_SLOT(devices[i].devfn);
>>>> +        host.function = PCI_FUNC(devices[i].devfn);
>>>> +
>>>> +        /* Skip the current device */
>>>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        /* Ensure we own the group of the affected device */
>>>> +        QLIST_FOREACH(group, &vfio_group_list, next) {
>>>> +            if (group->groupid == devices[i].group_id) {
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +
>>>> +        if (!group) {
>>>> +            goto out;
>>>> +        }
>>>> +
>>>> +        /* Ensure affected devices for reset under the same bus */
>>>> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>>>> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
>>>> +                continue;
>>>> +            }
>>>> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
>>>> +            if (vfio_pci_host_match(&host, &tmp->host)) {
>>>> +                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
>>>> +
>>>> +                pci_for_each_bus(bus, find_devices, &find);
>>>> +                if (!find.found) {
>>>> +                    goto out;
>>>> +                }
>>>> +                /*
>>>> +                 * When the check device is hotplugged to a higher bus again,
>>>> +                 * which would influence the affected device which enable aer
>>>> +                 * below the bus.
>>>> +                 */
>>>> +                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
>>>> +                    find.pbus != bus) {
>>>> +                    goto out;
>>>> +                }
>>> I think what you're trying to do here is assume that if a reset of A
>>> affects B, then a reset of B affects A, and if B is on a subordinate bus
>>> from A, then our configuration is broken, right?  Can we really
>>> guarantee that assumption?  If we had a physical topology that mirrored
>>> this virtual topology, that wouldn't necessarily be true.  For instance,
>>> if A was a function of a multi-function device where another function
>>> was a PCIe upstream switch, B could be subordinate to that switch, so a
>>> reset of A affects B, but a reset of B doesn't affect A.
>>>
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    has_bus_reset = true;
>>>> +
>>>> +out:
>>>> +    vdev->has_bus_reset = has_bus_reset;
>>> I don't see any value is storing this, it can't be trusted at any point
>>> in the future.  I think that any time we add a device or think about
>>> forwarding an AER message to the guest, we need to do a validation pass,
>>> testing the entire topology.
>>>
>>> To do that we'd iterate through every device in every group for PCI
>>> devices that support AER.  For each one, get the hot reset info for the
>>> affected device list.  For each affected device, if it's attached to the
>>> VM, it needs to be on or below the bus of the target device.
>>> Additionally, we should test all of the non-AER supporting vfio-pci
>>> devices on or below the target bus to verify they have a reset
>>> mechanism.  Run that function for each vfio-pci device as it's added,
>>> regardless of whether it's hotplug.  If the test fails, fail the initfn
>>> for the current device.  Also run the test prior to AER injection, if it
>>> fails demote the AER injection to a machine halt as we have now.
>> I'm worry about is the case that the affected devices belonged to
>> another groups but when initialize this device the another group
>> has not been added. it would cause fail the initfn, but the group
>> maybe added later. so I used the machine done event notifier to
>> check this case. if we don't do like that. how can we check the
>> case when all vfio-pci devices initfn ?
> That's why the initfn test needs to test the entire topology, not just
> the device being added.  If we have the case you describe where we have
> two devices in separate groups, we add the first device, test the
> topology, see that the second device is affected but not yet added and
> allow AER to be enabled.  When the second device is added, we again test
> the topology, we see the potential conflict and fail the initfn for the
> second device if the bus requirements are not met.
In case that the second device may not be added. in this case the
first device enable the aer, and pass the validate. how do we know
the second device if be added ?

Thanks,
Chen



>
> I don't see the advantage of using a machine init done notifier.  You
> can perform fewer topology verification passes using that notifier, but
> I think you can provide a more useful error message by testing on each
> device addition.  We also use the exact same strategy for cold-plug and
> hot-plug, which makes maintenance easier.  Thanks,
>
> Alex
>
> .
>
Alex Williamson June 4, 2015, 3:59 p.m. UTC | #5
On Wed, 2015-06-03 at 08:52 +0800, Chen Fan wrote:
> On 06/03/2015 12:47 AM, Alex Williamson wrote:
> > On Tue, 2015-06-02 at 15:54 +0800, Chen Fan wrote:
> >> On 05/28/2015 05:32 AM, Alex Williamson wrote:
> >>> On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
> >>>> we introduce a has_bus_reset capability to sign the vfio
> >>>> devices if support host bus reset.
> >>>>
> >>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>> ---
> >>>>    hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>    1 file changed, 123 insertions(+)
> >>>>
> >>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>>> index f4e7855..5934fd7 100644
> >>>> --- a/hw/vfio/pci.c
> >>>> +++ b/hw/vfio/pci.c
> >>>> @@ -33,6 +33,7 @@
> >>>>    #include "hw/pci/msix.h"
> >>>>    #include "hw/pci/pci.h"
> >>>>    #include "hw/pci/pci_bridge.h"
> >>>> +#include "hw/pci/pci_bus.h"
> >>>>    #include "qemu-common.h"
> >>>>    #include "qemu/error-report.h"
> >>>>    #include "qemu/event_notifier.h"
> >>>> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
> >>>>        bool req_enabled;
> >>>>        bool has_flr;
> >>>>        bool has_pm_reset;
> >>>> +    bool has_bus_reset;
> >>> I still think that caching this is a bad idea, there's no point at which
> >>> we can blindly assume the capability is still present.
> >>>
> >>>>        bool rom_read_failed;
> >>>>    } VFIOPCIDevice;
> >>>>    
> >>>> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> >>>>    static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
> >>>>                                      uint32_t val, int len);
> >>>>    static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> >>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
> >>>>    
> >>>>    /*
> >>>>     * Disabling BAR mmaping can be slow, but toggling it around INTx can
> >>>> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>>>            dev_iter = pci_bridge_get_device(dev_iter->bus);
> >>>>        }
> >>>>    
> >>>> +    /*
> >>>> +     * Don't check bus reset capability when device is enabled during
> >>>> +     * qemu machine creation, which is done by machine init function.
> >>>> +     */
> >>>> +    if (DEVICE(vdev)->hotplugged) {
> >>>> +        vfio_check_host_bus_reset(vdev);
> >>>> +        if (!vdev->has_bus_reset) {
> >>>> +            error_report("vfio: Cannot enable AER for device %s, "
> >>>> +                         "which is not support host bus reset.",
> >>> "which does not support host bus reset."
> >>>
> >>>> +                         vdev->vbasedev.name);
> >>>> +            goto error;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>>        errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
> >>>>        /*
> >>>>         * The ability to record multiple headers is depending on
> >>>> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
> >>>>        }
> >>>>    }
> >>>>    
> >>>> +struct VfioDeviceFind {
> >>> We use VFIOFooBar for all other camel case definitions, much like PCIBus
> >>> and PCIDevice below.
> >>>
> >>>> +    PCIBus *pbus;
> >>>> +    PCIDevice *pdev;
> >>>> +    bool found;
> >>>> +};
> >>>> +
> >>>> +static void find_devices(PCIBus *bus, void *opaque)
> >>>> +{
> >>>> +    struct VfioDeviceFind *find = opaque;
> >>>> +    int i;
> >>>> +
> >>>> +    if (find->found == true) {
> >>> if (find->found) {...
> >>>
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
> >>>> +        if (!bus->devices[i]) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        if (bus->devices[i] == find->pdev) {
> >>>> +            find->pbus = bus;
> >>>> +            find->found = true;
> >>>> +            break;
> >>>> +        }
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> >>>> +{
> >>>> +    PCIBus *bus = vdev->pdev.bus;
> >>>> +    struct vfio_pci_hot_reset_info *info = NULL;
> >>>> +    struct vfio_pci_dependent_device *devices;
> >>>> +    VFIOGroup *group;
> >>>> +    int ret, i;
> >>>> +    bool has_bus_reset = false;
> >>>> +
> >>>> +    ret = vfio_get_hot_reset_info(vdev, &info);
> >>>> +    if (ret < 0) {
> >>> if (ret) {...
> >>>
> >>>> +        goto out;
> >>>> +    }
> >>>> +
> >>>> +    /* List all affected devices by bus reset */
> >>>> +    devices = &info->devices[0];
> >>>> +
> >>>> +    /* Verify that we have all the groups required */
> >>>> +    for (i = 0; i < info->count; i++) {
> >>>> +        PCIHostDeviceAddress host;
> >>>> +        VFIOPCIDevice *tmp;
> >>>> +        VFIODevice *vbasedev_iter;
> >>>> +
> >>>> +        host.domain = devices[i].segment;
> >>>> +        host.bus = devices[i].bus;
> >>>> +        host.slot = PCI_SLOT(devices[i].devfn);
> >>>> +        host.function = PCI_FUNC(devices[i].devfn);
> >>>> +
> >>>> +        /* Skip the current device */
> >>>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        /* Ensure we own the group of the affected device */
> >>>> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> >>>> +            if (group->groupid == devices[i].group_id) {
> >>>> +                break;
> >>>> +            }
> >>>> +        }
> >>>> +
> >>>> +        if (!group) {
> >>>> +            goto out;
> >>>> +        }
> >>>> +
> >>>> +        /* Ensure affected devices for reset under the same bus */
> >>>> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> >>>> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
> >>>> +                continue;
> >>>> +            }
> >>>> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
> >>>> +            if (vfio_pci_host_match(&host, &tmp->host)) {
> >>>> +                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
> >>>> +
> >>>> +                pci_for_each_bus(bus, find_devices, &find);
> >>>> +                if (!find.found) {
> >>>> +                    goto out;
> >>>> +                }
> >>>> +                /*
> >>>> +                 * When the check device is hotplugged to a higher bus again,
> >>>> +                 * which would influence the affected device which enable aer
> >>>> +                 * below the bus.
> >>>> +                 */
> >>>> +                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
> >>>> +                    find.pbus != bus) {
> >>>> +                    goto out;
> >>>> +                }
> >>> I think what you're trying to do here is assume that if a reset of A
> >>> affects B, then a reset of B affects A, and if B is on a subordinate bus
> >>> from A, then our configuration is broken, right?  Can we really
> >>> guarantee that assumption?  If we had a physical topology that mirrored
> >>> this virtual topology, that wouldn't necessarily be true.  For instance,
> >>> if A was a function of a multi-function device where another function
> >>> was a PCIe upstream switch, B could be subordinate to that switch, so a
> >>> reset of A affects B, but a reset of B doesn't affect A.
> >>>
> >>>> +                break;
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    has_bus_reset = true;
> >>>> +
> >>>> +out:
> >>>> +    vdev->has_bus_reset = has_bus_reset;
> >>> I don't see any value is storing this, it can't be trusted at any point
> >>> in the future.  I think that any time we add a device or think about
> >>> forwarding an AER message to the guest, we need to do a validation pass,
> >>> testing the entire topology.
> >>>
> >>> To do that we'd iterate through every device in every group for PCI
> >>> devices that support AER.  For each one, get the hot reset info for the
> >>> affected device list.  For each affected device, if it's attached to the
> >>> VM, it needs to be on or below the bus of the target device.
> >>> Additionally, we should test all of the non-AER supporting vfio-pci
> >>> devices on or below the target bus to verify they have a reset
> >>> mechanism.  Run that function for each vfio-pci device as it's added,
> >>> regardless of whether it's hotplug.  If the test fails, fail the initfn
> >>> for the current device.  Also run the test prior to AER injection, if it
> >>> fails demote the AER injection to a machine halt as we have now.
> >> I'm worry about is the case that the affected devices belonged to
> >> another groups but when initialize this device the another group
> >> has not been added. it would cause fail the initfn, but the group
> >> maybe added later. so I used the machine done event notifier to
> >> check this case. if we don't do like that. how can we check the
> >> case when all vfio-pci devices initfn ?
> > That's why the initfn test needs to test the entire topology, not just
> > the device being added.  If we have the case you describe where we have
> > two devices in separate groups, we add the first device, test the
> > topology, see that the second device is affected but not yet added and
> > allow AER to be enabled.  When the second device is added, we again test
> > the topology, we see the potential conflict and fail the initfn for the
> > second device if the bus requirements are not met.
> In case that the second device may not be added. in this case the
> first device enable the aer, and pass the validate. how do we know
> the second device if be added ?

Yeah, I see what you're getting at here.  If we have a dual port NIC
with isolation between functions such that each is a separate IOMMU
group, when we add the first device with AER enabled it will fail
because a bus reset affects both groups and we don't yet own the second
group.

My proposal wouldn't provide a way to ever enable AER for these devices.
However, the proposal of using a machine-init-done hook only allows
cold-plug of such devices with AER, hotplug would get the same issue.  I
don't think that sort of asymmetry is supportable either.

We almost need some sort of vfio-group stub driver in qemu that we can
claim ownership of a group without adding any of the devices in the
group to the VM.  Another option might be to make AER a "soft"
requirement, demote it to a VM stop unless the topology allows it.  That
also creates a confusing user scenario that probably looks nearly random
from an outside perspective.  The only other idea I can see would be to
allow some manipulation of iommu groups at the kernel level, perhaps
creating a meta-group composed of one or more iommu groups.  Or maybe a
kernel option that could ignore isolation for specified devices to
broaden the group.  Are we really sure exposing AER to guests is a good
idea?  Requiring guest bus resets to map to host bus rests is really a
mess.  Thanks,

Alex
chenfan June 9, 2015, 3:43 a.m. UTC | #6
On 06/04/2015 11:59 PM, Alex Williamson wrote:
> On Wed, 2015-06-03 at 08:52 +0800, Chen Fan wrote:
>> On 06/03/2015 12:47 AM, Alex Williamson wrote:
>>> On Tue, 2015-06-02 at 15:54 +0800, Chen Fan wrote:
>>>> On 05/28/2015 05:32 AM, Alex Williamson wrote:
>>>>> On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
>>>>>> we introduce a has_bus_reset capability to sign the vfio
>>>>>> devices if support host bus reset.
>>>>>>
>>>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>> ---
>>>>>>     hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>     1 file changed, 123 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>> index f4e7855..5934fd7 100644
>>>>>> --- a/hw/vfio/pci.c
>>>>>> +++ b/hw/vfio/pci.c
>>>>>> @@ -33,6 +33,7 @@
>>>>>>     #include "hw/pci/msix.h"
>>>>>>     #include "hw/pci/pci.h"
>>>>>>     #include "hw/pci/pci_bridge.h"
>>>>>> +#include "hw/pci/pci_bus.h"
>>>>>>     #include "qemu-common.h"
>>>>>>     #include "qemu/error-report.h"
>>>>>>     #include "qemu/event_notifier.h"
>>>>>> @@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
>>>>>>         bool req_enabled;
>>>>>>         bool has_flr;
>>>>>>         bool has_pm_reset;
>>>>>> +    bool has_bus_reset;
>>>>> I still think that caching this is a bad idea, there's no point at which
>>>>> we can blindly assume the capability is still present.
>>>>>
>>>>>>         bool rom_read_failed;
>>>>>>     } VFIOPCIDevice;
>>>>>>     
>>>>>> @@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>>>>>>     static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
>>>>>>                                       uint32_t val, int len);
>>>>>>     static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>>>>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
>>>>>>     
>>>>>>     /*
>>>>>>      * Disabling BAR mmaping can be slow, but toggling it around INTx can
>>>>>> @@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>>>>>>             dev_iter = pci_bridge_get_device(dev_iter->bus);
>>>>>>         }
>>>>>>     
>>>>>> +    /*
>>>>>> +     * Don't check bus reset capability when device is enabled during
>>>>>> +     * qemu machine creation, which is done by machine init function.
>>>>>> +     */
>>>>>> +    if (DEVICE(vdev)->hotplugged) {
>>>>>> +        vfio_check_host_bus_reset(vdev);
>>>>>> +        if (!vdev->has_bus_reset) {
>>>>>> +            error_report("vfio: Cannot enable AER for device %s, "
>>>>>> +                         "which is not support host bus reset.",
>>>>> "which does not support host bus reset."
>>>>>
>>>>>> +                         vdev->vbasedev.name);
>>>>>> +            goto error;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>>         errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
>>>>>>         /*
>>>>>>          * The ability to record multiple headers is depending on
>>>>>> @@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
>>>>>>         }
>>>>>>     }
>>>>>>     
>>>>>> +struct VfioDeviceFind {
>>>>> We use VFIOFooBar for all other camel case definitions, much like PCIBus
>>>>> and PCIDevice below.
>>>>>
>>>>>> +    PCIBus *pbus;
>>>>>> +    PCIDevice *pdev;
>>>>>> +    bool found;
>>>>>> +};
>>>>>> +
>>>>>> +static void find_devices(PCIBus *bus, void *opaque)
>>>>>> +{
>>>>>> +    struct VfioDeviceFind *find = opaque;
>>>>>> +    int i;
>>>>>> +
>>>>>> +    if (find->found == true) {
>>>>> if (find->found) {...
>>>>>
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
>>>>>> +        if (!bus->devices[i]) {
>>>>>> +            continue;
>>>>>> +        }
>>>>>> +
>>>>>> +        if (bus->devices[i] == find->pdev) {
>>>>>> +            find->pbus = bus;
>>>>>> +            find->found = true;
>>>>>> +            break;
>>>>>> +        }
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>>>>>> +{
>>>>>> +    PCIBus *bus = vdev->pdev.bus;
>>>>>> +    struct vfio_pci_hot_reset_info *info = NULL;
>>>>>> +    struct vfio_pci_dependent_device *devices;
>>>>>> +    VFIOGroup *group;
>>>>>> +    int ret, i;
>>>>>> +    bool has_bus_reset = false;
>>>>>> +
>>>>>> +    ret = vfio_get_hot_reset_info(vdev, &info);
>>>>>> +    if (ret < 0) {
>>>>> if (ret) {...
>>>>>
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* List all affected devices by bus reset */
>>>>>> +    devices = &info->devices[0];
>>>>>> +
>>>>>> +    /* Verify that we have all the groups required */
>>>>>> +    for (i = 0; i < info->count; i++) {
>>>>>> +        PCIHostDeviceAddress host;
>>>>>> +        VFIOPCIDevice *tmp;
>>>>>> +        VFIODevice *vbasedev_iter;
>>>>>> +
>>>>>> +        host.domain = devices[i].segment;
>>>>>> +        host.bus = devices[i].bus;
>>>>>> +        host.slot = PCI_SLOT(devices[i].devfn);
>>>>>> +        host.function = PCI_FUNC(devices[i].devfn);
>>>>>> +
>>>>>> +        /* Skip the current device */
>>>>>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
>>>>>> +            continue;
>>>>>> +        }
>>>>>> +
>>>>>> +        /* Ensure we own the group of the affected device */
>>>>>> +        QLIST_FOREACH(group, &vfio_group_list, next) {
>>>>>> +            if (group->groupid == devices[i].group_id) {
>>>>>> +                break;
>>>>>> +            }
>>>>>> +        }
>>>>>> +
>>>>>> +        if (!group) {
>>>>>> +            goto out;
>>>>>> +        }
>>>>>> +
>>>>>> +        /* Ensure affected devices for reset under the same bus */
>>>>>> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
>>>>>> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
>>>>>> +                continue;
>>>>>> +            }
>>>>>> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
>>>>>> +            if (vfio_pci_host_match(&host, &tmp->host)) {
>>>>>> +                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
>>>>>> +
>>>>>> +                pci_for_each_bus(bus, find_devices, &find);
>>>>>> +                if (!find.found) {
>>>>>> +                    goto out;
>>>>>> +                }
>>>>>> +                /*
>>>>>> +                 * When the check device is hotplugged to a higher bus again,
>>>>>> +                 * which would influence the affected device which enable aer
>>>>>> +                 * below the bus.
>>>>>> +                 */
>>>>>> +                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
>>>>>> +                    find.pbus != bus) {
>>>>>> +                    goto out;
>>>>>> +                }
>>>>> I think what you're trying to do here is assume that if a reset of A
>>>>> affects B, then a reset of B affects A, and if B is on a subordinate bus
>>>>> from A, then our configuration is broken, right?  Can we really
>>>>> guarantee that assumption?  If we had a physical topology that mirrored
>>>>> this virtual topology, that wouldn't necessarily be true.  For instance,
>>>>> if A was a function of a multi-function device where another function
>>>>> was a PCIe upstream switch, B could be subordinate to that switch, so a
>>>>> reset of A affects B, but a reset of B doesn't affect A.
>>>>>
>>>>>> +                break;
>>>>>> +            }
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +    has_bus_reset = true;
>>>>>> +
>>>>>> +out:
>>>>>> +    vdev->has_bus_reset = has_bus_reset;
>>>>> I don't see any value is storing this, it can't be trusted at any point
>>>>> in the future.  I think that any time we add a device or think about
>>>>> forwarding an AER message to the guest, we need to do a validation pass,
>>>>> testing the entire topology.
>>>>>
>>>>> To do that we'd iterate through every device in every group for PCI
>>>>> devices that support AER.  For each one, get the hot reset info for the
>>>>> affected device list.  For each affected device, if it's attached to the
>>>>> VM, it needs to be on or below the bus of the target device.
>>>>> Additionally, we should test all of the non-AER supporting vfio-pci
>>>>> devices on or below the target bus to verify they have a reset
>>>>> mechanism.  Run that function for each vfio-pci device as it's added,
>>>>> regardless of whether it's hotplug.  If the test fails, fail the initfn
>>>>> for the current device.  Also run the test prior to AER injection, if it
>>>>> fails demote the AER injection to a machine halt as we have now.
>>>> I'm worry about is the case that the affected devices belonged to
>>>> another groups but when initialize this device the another group
>>>> has not been added. it would cause fail the initfn, but the group
>>>> maybe added later. so I used the machine done event notifier to
>>>> check this case. if we don't do like that. how can we check the
>>>> case when all vfio-pci devices initfn ?
>>> That's why the initfn test needs to test the entire topology, not just
>>> the device being added.  If we have the case you describe where we have
>>> two devices in separate groups, we add the first device, test the
>>> topology, see that the second device is affected but not yet added and
>>> allow AER to be enabled.  When the second device is added, we again test
>>> the topology, we see the potential conflict and fail the initfn for the
>>> second device if the bus requirements are not met.
>> In case that the second device may not be added. in this case the
>> first device enable the aer, and pass the validate. how do we know
>> the second device if be added ?
> Yeah, I see what you're getting at here.  If we have a dual port NIC
> with isolation between functions such that each is a separate IOMMU
> group, when we add the first device with AER enabled it will fail
> because a bus reset affects both groups and we don't yet own the second
> group.
>
> My proposal wouldn't provide a way to ever enable AER for these devices.
> However, the proposal of using a machine-init-done hook only allows
> cold-plug of such devices with AER, hotplug would get the same issue.  I
> don't think that sort of asymmetry is supportable either.
>
> We almost need some sort of vfio-group stub driver in qemu that we can
> claim ownership of a group without adding any of the devices in the
> group to the VM.  Another option might be to make AER a "soft"
> requirement, demote it to a VM stop unless the topology allows it.  That
> also creates a confusing user scenario that probably looks nearly random
> from an outside perspective.  The only other idea I can see would be to
> allow some manipulation of iommu groups at the kernel level, perhaps
> creating a meta-group composed of one or more iommu groups.  Or maybe a
> kernel option that could ignore isolation for specified devices to
> broaden the group.  Are we really sure exposing AER to guests is a good
> idea?  Requiring guest bus resets to map to host bus rests is really a
> mess.  Thanks,
>
> Alex
Usually the user and the administer are not the same one, when device
eject aer error, the user isn't aware of, so I think this feature should
have. I think own the affected group in VM for aer devices is good idea.
and I sent out the new version 9 to fix the init issues. pls help to 
review it.

Thanks,
Chen

>
> .
>
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f4e7855..5934fd7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -33,6 +33,7 @@ 
 #include "hw/pci/msix.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_bus.h"
 #include "qemu-common.h"
 #include "qemu/error-report.h"
 #include "qemu/event_notifier.h"
@@ -170,6 +171,7 @@  typedef struct VFIOPCIDevice {
     bool req_enabled;
     bool has_flr;
     bool has_pm_reset;
+    bool has_bus_reset;
     bool rom_read_failed;
 } VFIOPCIDevice;
 
@@ -203,6 +205,7 @@  static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                   uint32_t val, int len);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2853,6 +2856,20 @@  static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
         dev_iter = pci_bridge_get_device(dev_iter->bus);
     }
 
+    /*
+     * Don't check bus reset capability when device is enabled during
+     * qemu machine creation, which is done by machine init function.
+     */
+    if (DEVICE(vdev)->hotplugged) {
+        vfio_check_host_bus_reset(vdev);
+        if (!vdev->has_bus_reset) {
+            error_report("vfio: Cannot enable AER for device %s, "
+                         "which is not support host bus reset.",
+                         vdev->vbasedev.name);
+            goto error;
+        }
+    }
+
     errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
     /*
      * The ability to record multiple headers is depending on
@@ -3678,6 +3695,112 @@  static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
     }
 }
 
+struct VfioDeviceFind {
+    PCIBus *pbus;
+    PCIDevice *pdev;
+    bool found;
+};
+
+static void find_devices(PCIBus *bus, void *opaque)
+{
+    struct VfioDeviceFind *find = opaque;
+    int i;
+
+    if (find->found == true) {
+        return;
+    }
+
+    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
+        if (!bus->devices[i]) {
+            continue;
+        }
+
+        if (bus->devices[i] == find->pdev) {
+            find->pbus = bus;
+            find->found = true;
+            break;
+        }
+    }
+}
+
+static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
+{
+    PCIBus *bus = vdev->pdev.bus;
+    struct vfio_pci_hot_reset_info *info = NULL;
+    struct vfio_pci_dependent_device *devices;
+    VFIOGroup *group;
+    int ret, i;
+    bool has_bus_reset = false;
+
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* List all affected devices by bus reset */
+    devices = &info->devices[0];
+
+    /* Verify that we have all the groups required */
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+        VFIOPCIDevice *tmp;
+        VFIODevice *vbasedev_iter;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        /* Skip the current device */
+        if (vfio_pci_host_match(&host, &vdev->host)) {
+            continue;
+        }
+
+        /* Ensure we own the group of the affected device */
+        QLIST_FOREACH(group, &vfio_group_list, next) {
+            if (group->groupid == devices[i].group_id) {
+                break;
+            }
+        }
+
+        if (!group) {
+            goto out;
+        }
+
+        /* Ensure affected devices for reset under the same bus */
+        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+            if (vfio_pci_host_match(&host, &tmp->host)) {
+                struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found = false };
+
+                pci_for_each_bus(bus, find_devices, &find);
+                if (!find.found) {
+                    goto out;
+                }
+                /*
+                 * When the check device is hotplugged to a higher bus again,
+                 * which would influence the affected device which enable aer
+                 * below the bus.
+                 */
+                if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
+                    find.pbus != bus) {
+                    goto out;
+                }
+                break;
+            }
+        }
+    }
+
+    has_bus_reset = true;
+
+out:
+    vdev->has_bus_reset = has_bus_reset;
+    g_free(info);
+}
+
 static int vfio_initfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);