diff mbox

[RFC,v6,11/11] vfio: add bus reset notifier for host bus reset

Message ID b95cd227a99fdc382e0a9a64b1e9ded18c82ad3d.1430297161.git.chen.fan.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

chenfan April 29, 2015, 8:48 a.m. UTC
add host secondary bus reset for vfio when AER occurs, if reset failed,
we should stop vm.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 138 insertions(+), 13 deletions(-)

Comments

Alex Williamson April 29, 2015, 5:32 p.m. UTC | #1
On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote:
> add host secondary bus reset for vfio when AER occurs, if reset failed,
> we should stop vm.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 138 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 060fb47..619daed 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice {
>      PCIHostDeviceAddress host;
>      EventNotifier err_notifier;
>      EventNotifier req_notifier;
> +
> +    Notifier sec_bus_reset_notifier;
>      uint32_t features;
>  #define VFIO_FEATURE_ENABLE_VGA_BIT 0
>  #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
> @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
>      return pos;
>  }
>  
> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> +                                PCIHostDeviceAddress *host2)
> +{
> +    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> +            host1->slot == host2->slot && host1->function == host2->function);
> +}
> +
>  static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
>  {
>      uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
> @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>      return 0;
>  }
>  
> +static int vfio_aer_validate_devices(DeviceState *dev,
> +                                     void *opaque)
> +{
> +    VFIOPCIDevice *vdev;
> +    int i;
> +    bool found = false;
> +    struct vfio_pci_hot_reset_info *info = opaque;
> +    struct vfio_pci_dependent_device *devices = &info->devices[0];
> +
> +    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> +        return 0;
> +    }
> +
> +    vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev));
> +    for (i = 0; i < info->count; i++) {
> +        PCIHostDeviceAddress host;
> +
> +        host.domain = devices[i].segment;
> +        host.bus = devices[i].bus;
> +        host.slot = PCI_SLOT(devices[i].devfn);
> +        host.function = PCI_FUNC(devices[i].devfn);
> +
> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> +            found = true;
> +            break;
> +        }
> +    }
> +
> +    if (!found) {
> +        error_report("vfio: Cannot reset parent bus with AER supported,"
> +                     "depends on device %s which is not contained.",
> +                      vdev->vbasedev.name);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev)
> +{
> +    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
> +                 "Please collect any data possible and then kill the guest",
> +                 __func__, vdev->host.domain, vdev->host.bus,
> +                 vdev->host.slot, vdev->host.function);
> +
> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> +}
> +
> +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque)
> +{
> +    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier);
> +    PCIDevice *pdev = &vdev->pdev;
> +    int ret, i;
> +    struct vfio_pci_hot_reset_info *info;
> +    struct vfio_pci_dependent_device *devices;
> +    VFIOGroup *group;
> +
> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
> +        return;
> +    }
> +
> +    /*
> +     * Check the affected devices by virtual bus reset are contained in
> +     * the set of groups.
> +     */
> +    ret = vfio_get_hot_reset_info(vdev, &info);
> +    if (ret < 0) {
> +        goto stop_vm;
> +    }
> +
> +    devices = &info->devices[0];
> +
> +    /* Verify that we have all the groups required */
> +    for (i = 0; i < info->count; i++) {
> +        PCIHostDeviceAddress host;
> +
> +        host.domain = devices[i].segment;
> +        host.bus = devices[i].bus;
> +        host.slot = PCI_SLOT(devices[i].devfn);
> +        host.function = PCI_FUNC(devices[i].devfn);
> +
> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> +            continue;
> +        }
> +
> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> +            if (group->groupid == devices[i].group_id) {
> +                break;
> +            }
> +        }
> +
> +        if (!group) {
> +            if (!vdev->has_pm_reset) {

I'm not sure how has_pm_reset has anything to do with what we're testing
here.  Copy-paste error?

> +                error_report("vfio: Cannot reset device %s with AER supported,"
> +                             "depends on group %d which is not owned.",
> +                             vdev->vbasedev.name, devices[i].group_id);
> +            }
> +            ret = -EPERM;
> +            goto stop_vm;
> +        }
> +    }


The above verifies that all of the devices affected by the bus reset are
owned by the VM.

> +
> +    /* Verify that we have all the affected devices under the bus */
> +    ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL,
> +                             vfio_aer_validate_devices,
> +                             NULL, info);

And here we make sure that any vfio-pci devices affected by the bus
reset are contained in the list of affected devices.  What we're still
missing is whether there are any affected devices that are exposed to
the VM that are not covered in this bus walk.  For instance, if I assign
a multi-function device and place function 0 and 1 under separate, peer
root ports, I believe the above tests will confirm that the VM owns the
necessary groups and that the only vfio-pci device affected by the bus
reset is in the list of affected devices, but it will not verify that
the other function is not affected by the guest bus reset, but is
affected by the host bus reset.

> +    if (ret < 0) {
> +        goto stop_vm;
> +    }
> +
> +
> +    /* bus reset! */
> +    ret = vfio_pci_do_hot_reset(vdev, info);
> +    if (ret < 0) {
> +        goto stop_vm;
> +    }
> +
> +    g_free(info);
> +    return;
> +
> +stop_vm:
> +    g_free(info);
> +    vfio_pci_vm_stop(vdev);
> +}
> +
>  static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>  {
>      PCIDevice *pdev = &vdev->pdev;
> @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>                     pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4);
>      pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity);
>  
> +    vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset;
> +    pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier);
> +

Additionally, we're going to do a bus reset once for every vfio-pci
device subordinate to the bus being reset.  That's not very efficient.

There are still a number of problems here.  We're allowing users to
configure their VMs however they wish and enable AER, then only when
they do a bus reset (which we have no way to infer is related to an AER
event) do we check and potentially halt the VM if guest mapping of the
host topology prevents a bus reset.  That seems sort of like making
backups of your data, but never testing that you can recover from the
backup until we need the data.  I can't imagine that many users are
going to be able to get this correct and they won't know it's incorrect
until an AER event occurs.

Also, Do we need to worry about guest root complex devices?  The guest
won't be able to perform a bus reset on the guest bus in that case.  Is
AER even valid for RC devices since they don't have a parent downstream
port to signal the AER?  This seems like another case where it's more
likely that a user will create an invalid configuration than it is they
will create a valid one, but we won't know until an AER occurs with the
code structure here.

Therefore I think that if the user specifies AER, we need to verify and
enforce at that point, ie. the initfn, that a host bus reset is
possible, that a guest reset is possible, and that a guest bus reset
fully contains the VM visible host devices affected by the reset.

I'd also like to see the patches ordered such that we provide all the
infrastructure to validate and enforce the configuration restrictions to
support AER before we enable it to the guest.  The order here creates
bisect points where AER is exposed to the guest with unacceptable QEMU
handling.  Thanks,

Alex

>      return 0;
>  }
>  
> @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
>      vfio_enable_intx(vdev);
>  }
>  
> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> -                                PCIHostDeviceAddress *host2)
> -{
> -    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> -            host1->slot == host2->slot && host1->function == host2->function);
> -}
> -
>  static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>  {
>      VFIOGroup *group;
> @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque)
>       * terminate the guest to contain the error.
>       */
>  
> -    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
> -                 "Please collect any data possible and then kill the guest",
> -                 __func__, vdev->host.domain, vdev->host.bus,
> -                 vdev->host.slot, vdev->host.function);
> -
> -    vm_stop(RUN_STATE_INTERNAL_ERROR);
> +    vfio_pci_vm_stop(vdev);
>  }
>  
>  /*
chenfan April 30, 2015, 3:07 a.m. UTC | #2
On 04/30/2015 01:32 AM, Alex Williamson wrote:
> On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote:
>> add host secondary bus reset for vfio when AER occurs, if reset failed,
>> we should stop vm.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> ---
>>   hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 138 insertions(+), 13 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 060fb47..619daed 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice {
>>       PCIHostDeviceAddress host;
>>       EventNotifier err_notifier;
>>       EventNotifier req_notifier;
>> +
>> +    Notifier sec_bus_reset_notifier;
>>       uint32_t features;
>>   #define VFIO_FEATURE_ENABLE_VGA_BIT 0
>>   #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
>> @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
>>       return pos;
>>   }
>>   
>> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
>> +                                PCIHostDeviceAddress *host2)
>> +{
>> +    return (host1->domain == host2->domain && host1->bus == host2->bus &&
>> +            host1->slot == host2->slot && host1->function == host2->function);
>> +}
>> +
>>   static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
>>   {
>>       uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
>> @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>>       return 0;
>>   }
>>   
>> +static int vfio_aer_validate_devices(DeviceState *dev,
>> +                                     void *opaque)
>> +{
>> +    VFIOPCIDevice *vdev;
>> +    int i;
>> +    bool found = false;
>> +    struct vfio_pci_hot_reset_info *info = opaque;
>> +    struct vfio_pci_dependent_device *devices = &info->devices[0];
>> +
>> +    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
>> +        return 0;
>> +    }
>> +
>> +    vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev));
>> +    for (i = 0; i < info->count; i++) {
>> +        PCIHostDeviceAddress host;
>> +
>> +        host.domain = devices[i].segment;
>> +        host.bus = devices[i].bus;
>> +        host.slot = PCI_SLOT(devices[i].devfn);
>> +        host.function = PCI_FUNC(devices[i].devfn);
>> +
>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
>> +            found = true;
>> +            break;
>> +        }
>> +    }
>> +
>> +    if (!found) {
>> +        error_report("vfio: Cannot reset parent bus with AER supported,"
>> +                     "depends on device %s which is not contained.",
>> +                      vdev->vbasedev.name);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev)
>> +{
>> +    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
>> +                 "Please collect any data possible and then kill the guest",
>> +                 __func__, vdev->host.domain, vdev->host.bus,
>> +                 vdev->host.slot, vdev->host.function);
>> +
>> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
>> +}
>> +
>> +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    int ret, i;
>> +    struct vfio_pci_hot_reset_info *info;
>> +    struct vfio_pci_dependent_device *devices;
>> +    VFIOGroup *group;
>> +
>> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * Check the affected devices by virtual bus reset are contained in
>> +     * the set of groups.
>> +     */
>> +    ret = vfio_get_hot_reset_info(vdev, &info);
>> +    if (ret < 0) {
>> +        goto stop_vm;
>> +    }
>> +
>> +    devices = &info->devices[0];
>> +
>> +    /* Verify that we have all the groups required */
>> +    for (i = 0; i < info->count; i++) {
>> +        PCIHostDeviceAddress host;
>> +
>> +        host.domain = devices[i].segment;
>> +        host.bus = devices[i].bus;
>> +        host.slot = PCI_SLOT(devices[i].devfn);
>> +        host.function = PCI_FUNC(devices[i].devfn);
>> +
>> +        if (vfio_pci_host_match(&host, &vdev->host)) {
>> +            continue;
>> +        }
>> +
>> +        QLIST_FOREACH(group, &vfio_group_list, next) {
>> +            if (group->groupid == devices[i].group_id) {
>> +                break;
>> +            }
>> +        }
>> +
>> +        if (!group) {
>> +            if (!vdev->has_pm_reset) {
> I'm not sure how has_pm_reset has anything to do with what we're testing
> here.  Copy-paste error?
>
>> +                error_report("vfio: Cannot reset device %s with AER supported,"
>> +                             "depends on group %d which is not owned.",
>> +                             vdev->vbasedev.name, devices[i].group_id);
>> +            }
>> +            ret = -EPERM;
>> +            goto stop_vm;
>> +        }
>> +    }
>
> The above verifies that all of the devices affected by the bus reset are
> owned by the VM.
>
>> +
>> +    /* Verify that we have all the affected devices under the bus */
>> +    ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL,
>> +                             vfio_aer_validate_devices,
>> +                             NULL, info);
> And here we make sure that any vfio-pci devices affected by the bus
> reset are contained in the list of affected devices.  What we're still
> missing is whether there are any affected devices that are exposed to
> the VM that are not covered in this bus walk.  For instance, if I assign
> a multi-function device and place function 0 and 1 under separate, peer
> root ports, I believe the above tests will confirm that the VM owns the
> necessary groups and that the only vfio-pci device affected by the bus
> reset is in the list of affected devices, but it will not verify that
> the other function is not affected by the guest bus reset, but is
> affected by the host bus reset.
you are right.

>
>> +    if (ret < 0) {
>> +        goto stop_vm;
>> +    }
>> +
>> +
>> +    /* bus reset! */
>> +    ret = vfio_pci_do_hot_reset(vdev, info);
>> +    if (ret < 0) {
>> +        goto stop_vm;
>> +    }
>> +
>> +    g_free(info);
>> +    return;
>> +
>> +stop_vm:
>> +    g_free(info);
>> +    vfio_pci_vm_stop(vdev);
>> +}
>> +
>>   static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>>   {
>>       PCIDevice *pdev = &vdev->pdev;
>> @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
>>                      pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4);
>>       pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity);
>>   
>> +    vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset;
>> +    pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier);
>> +
> Additionally, we're going to do a bus reset once for every vfio-pci
> device subordinate to the bus being reset.  That's not very efficient.
>
> There are still a number of problems here.  We're allowing users to
> configure their VMs however they wish and enable AER, then only when
> they do a bus reset (which we have no way to infer is related to an AER
> event) do we check and potentially halt the VM if guest mapping of the
> host topology prevents a bus reset.  That seems sort of like making
> backups of your data, but never testing that you can recover from the
> backup until we need the data.  I can't imagine that many users are
> going to be able to get this correct and they won't know it's incorrect
> until an AER event occurs.
>
> Also, Do we need to worry about guest root complex devices?  The guest
> won't be able to perform a bus reset on the guest bus in that case.  Is
> AER even valid for RC devices since they don't have a parent downstream
> port to signal the AER?  This seems like another case where it's more
> likely that a user will create an invalid configuration than it is they
> will create a valid one, but we won't know until an AER occurs with the
> code structure here.
do you refer to the root complex device is the root port of root complex?
the root port can notify system itself, the PCI Express aer driver
reset the upstream link between root port and upstream port of
subordinate switch. so I think we don't need to care that.

>
> Therefore I think that if the user specifies AER, we need to verify and
> enforce at that point, ie. the initfn, that a host bus reset is
> possible, that a guest reset is possible, and that a guest bus reset
> fully contains the VM visible host devices affected by the reset.
>
> I'd also like to see the patches ordered such that we provide all the
> infrastructure to validate and enforce the configuration restrictions to
> support AER before we enable it to the guest.  The order here creates
> bisect points where AER is exposed to the guest with unacceptable QEMU
> handling.  Thanks,
I'm sorry for that, due to this is RFC version I missed that,
and  I will arrange the patches after this series.

Thanks,
Chen


>
> Alex
>
>>       return 0;
>>   }
>>   
>> @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
>>       vfio_enable_intx(vdev);
>>   }
>>   
>> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
>> -                                PCIHostDeviceAddress *host2)
>> -{
>> -    return (host1->domain == host2->domain && host1->bus == host2->bus &&
>> -            host1->slot == host2->slot && host1->function == host2->function);
>> -}
>> -
>>   static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>>   {
>>       VFIOGroup *group;
>> @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque)
>>        * terminate the guest to contain the error.
>>        */
>>   
>> -    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
>> -                 "Please collect any data possible and then kill the guest",
>> -                 __func__, vdev->host.domain, vdev->host.bus,
>> -                 vdev->host.slot, vdev->host.function);
>> -
>> -    vm_stop(RUN_STATE_INTERNAL_ERROR);
>> +    vfio_pci_vm_stop(vdev);
>>   }
>>   
>>   /*
>
>
> .
>
Alex Williamson April 30, 2015, 3:21 a.m. UTC | #3
On Thu, 2015-04-30 at 11:07 +0800, Chen Fan wrote:
> On 04/30/2015 01:32 AM, Alex Williamson wrote:
> > On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote:
> >> add host secondary bus reset for vfio when AER occurs, if reset failed,
> >> we should stop vm.
> >>
> >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >> ---
> >>   hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
> >>   1 file changed, 138 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 060fb47..619daed 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice {
> >>       PCIHostDeviceAddress host;
> >>       EventNotifier err_notifier;
> >>       EventNotifier req_notifier;
> >> +
> >> +    Notifier sec_bus_reset_notifier;
> >>       uint32_t features;
> >>   #define VFIO_FEATURE_ENABLE_VGA_BIT 0
> >>   #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
> >> @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
> >>       return pos;
> >>   }
> >>   
> >> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> >> +                                PCIHostDeviceAddress *host2)
> >> +{
> >> +    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> >> +            host1->slot == host2->slot && host1->function == host2->function);
> >> +}
> >> +
> >>   static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
> >>   {
> >>       uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
> >> @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
> >>       return 0;
> >>   }
> >>   
> >> +static int vfio_aer_validate_devices(DeviceState *dev,
> >> +                                     void *opaque)
> >> +{
> >> +    VFIOPCIDevice *vdev;
> >> +    int i;
> >> +    bool found = false;
> >> +    struct vfio_pci_hot_reset_info *info = opaque;
> >> +    struct vfio_pci_dependent_device *devices = &info->devices[0];
> >> +
> >> +    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev));
> >> +    for (i = 0; i < info->count; i++) {
> >> +        PCIHostDeviceAddress host;
> >> +
> >> +        host.domain = devices[i].segment;
> >> +        host.bus = devices[i].bus;
> >> +        host.slot = PCI_SLOT(devices[i].devfn);
> >> +        host.function = PCI_FUNC(devices[i].devfn);
> >> +
> >> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> >> +            found = true;
> >> +            break;
> >> +        }
> >> +    }
> >> +
> >> +    if (!found) {
> >> +        error_report("vfio: Cannot reset parent bus with AER supported,"
> >> +                     "depends on device %s which is not contained.",
> >> +                      vdev->vbasedev.name);
> >> +        return -1;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev)
> >> +{
> >> +    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
> >> +                 "Please collect any data possible and then kill the guest",
> >> +                 __func__, vdev->host.domain, vdev->host.bus,
> >> +                 vdev->host.slot, vdev->host.function);
> >> +
> >> +    vm_stop(RUN_STATE_INTERNAL_ERROR);
> >> +}
> >> +
> >> +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque)
> >> +{
> >> +    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier);
> >> +    PCIDevice *pdev = &vdev->pdev;
> >> +    int ret, i;
> >> +    struct vfio_pci_hot_reset_info *info;
> >> +    struct vfio_pci_dependent_device *devices;
> >> +    VFIOGroup *group;
> >> +
> >> +    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
> >> +        return;
> >> +    }
> >> +
> >> +    /*
> >> +     * Check the affected devices by virtual bus reset are contained in
> >> +     * the set of groups.
> >> +     */
> >> +    ret = vfio_get_hot_reset_info(vdev, &info);
> >> +    if (ret < 0) {
> >> +        goto stop_vm;
> >> +    }
> >> +
> >> +    devices = &info->devices[0];
> >> +
> >> +    /* Verify that we have all the groups required */
> >> +    for (i = 0; i < info->count; i++) {
> >> +        PCIHostDeviceAddress host;
> >> +
> >> +        host.domain = devices[i].segment;
> >> +        host.bus = devices[i].bus;
> >> +        host.slot = PCI_SLOT(devices[i].devfn);
> >> +        host.function = PCI_FUNC(devices[i].devfn);
> >> +
> >> +        if (vfio_pci_host_match(&host, &vdev->host)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> >> +            if (group->groupid == devices[i].group_id) {
> >> +                break;
> >> +            }
> >> +        }
> >> +
> >> +        if (!group) {
> >> +            if (!vdev->has_pm_reset) {
> > I'm not sure how has_pm_reset has anything to do with what we're testing
> > here.  Copy-paste error?
> >
> >> +                error_report("vfio: Cannot reset device %s with AER supported,"
> >> +                             "depends on group %d which is not owned.",
> >> +                             vdev->vbasedev.name, devices[i].group_id);
> >> +            }
> >> +            ret = -EPERM;
> >> +            goto stop_vm;
> >> +        }
> >> +    }
> >
> > The above verifies that all of the devices affected by the bus reset are
> > owned by the VM.
> >
> >> +
> >> +    /* Verify that we have all the affected devices under the bus */
> >> +    ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL,
> >> +                             vfio_aer_validate_devices,
> >> +                             NULL, info);
> > And here we make sure that any vfio-pci devices affected by the bus
> > reset are contained in the list of affected devices.  What we're still
> > missing is whether there are any affected devices that are exposed to
> > the VM that are not covered in this bus walk.  For instance, if I assign
> > a multi-function device and place function 0 and 1 under separate, peer
> > root ports, I believe the above tests will confirm that the VM owns the
> > necessary groups and that the only vfio-pci device affected by the bus
> > reset is in the list of affected devices, but it will not verify that
> > the other function is not affected by the guest bus reset, but is
> > affected by the host bus reset.
> you are right.
> 
> >
> >> +    if (ret < 0) {
> >> +        goto stop_vm;
> >> +    }
> >> +
> >> +
> >> +    /* bus reset! */
> >> +    ret = vfio_pci_do_hot_reset(vdev, info);
> >> +    if (ret < 0) {
> >> +        goto stop_vm;
> >> +    }
> >> +
> >> +    g_free(info);
> >> +    return;
> >> +
> >> +stop_vm:
> >> +    g_free(info);
> >> +    vfio_pci_vm_stop(vdev);
> >> +}
> >> +
> >>   static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>   {
> >>       PCIDevice *pdev = &vdev->pdev;
> >> @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
> >>                      pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4);
> >>       pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity);
> >>   
> >> +    vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset;
> >> +    pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier);
> >> +
> > Additionally, we're going to do a bus reset once for every vfio-pci
> > device subordinate to the bus being reset.  That's not very efficient.
> >
> > There are still a number of problems here.  We're allowing users to
> > configure their VMs however they wish and enable AER, then only when
> > they do a bus reset (which we have no way to infer is related to an AER
> > event) do we check and potentially halt the VM if guest mapping of the
> > host topology prevents a bus reset.  That seems sort of like making
> > backups of your data, but never testing that you can recover from the
> > backup until we need the data.  I can't imagine that many users are
> > going to be able to get this correct and they won't know it's incorrect
> > until an AER event occurs.
> >
> > Also, Do we need to worry about guest root complex devices?  The guest
> > won't be able to perform a bus reset on the guest bus in that case.  Is
> > AER even valid for RC devices since they don't have a parent downstream
> > port to signal the AER?  This seems like another case where it's more
> > likely that a user will create an invalid configuration than it is they
> > will create a valid one, but we won't know until an AER occurs with the
> > code structure here.
> do you refer to the root complex device is the root port of root complex?
> the root port can notify system itself, the PCI Express aer driver
> reset the upstream link between root port and upstream port of
> subordinate switch. so I think we don't need to care that.


No, I'm saying that we can place an assigned device directly on pcie.0,
the host bus, without using a root port.  I don't know if we have any
way to signal an AER event for root complex devices, but the guest
certainly doesn't have a way to perform a bus reset on the host bus.
Thanks,

Alex


> > Therefore I think that if the user specifies AER, we need to verify and
> > enforce at that point, ie. the initfn, that a host bus reset is
> > possible, that a guest reset is possible, and that a guest bus reset
> > fully contains the VM visible host devices affected by the reset.
> >
> > I'd also like to see the patches ordered such that we provide all the
> > infrastructure to validate and enforce the configuration restrictions to
> > support AER before we enable it to the guest.  The order here creates
> > bisect points where AER is exposed to the guest with unacceptable QEMU
> > handling.  Thanks,
> I'm sorry for that, due to this is RFC version I missed that,
> and  I will arrange the patches after this series.
> 
> Thanks,
> Chen
> 
> 
> >
> > Alex
> >
> >>       return 0;
> >>   }
> >>   
> >> @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
> >>       vfio_enable_intx(vdev);
> >>   }
> >>   
> >> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
> >> -                                PCIHostDeviceAddress *host2)
> >> -{
> >> -    return (host1->domain == host2->domain && host1->bus == host2->bus &&
> >> -            host1->slot == host2->slot && host1->function == host2->function);
> >> -}
> >> -
> >>   static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
> >>   {
> >>       VFIOGroup *group;
> >> @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque)
> >>        * terminate the guest to contain the error.
> >>        */
> >>   
> >> -    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
> >> -                 "Please collect any data possible and then kill the guest",
> >> -                 __func__, vdev->host.domain, vdev->host.bus,
> >> -                 vdev->host.slot, vdev->host.function);
> >> -
> >> -    vm_stop(RUN_STATE_INTERNAL_ERROR);
> >> +    vfio_pci_vm_stop(vdev);
> >>   }
> >>   
> >>   /*
> >
> >
> > .
> >
>
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 060fb47..619daed 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -154,6 +154,8 @@  typedef struct VFIOPCIDevice {
     PCIHostDeviceAddress host;
     EventNotifier err_notifier;
     EventNotifier req_notifier;
+
+    Notifier sec_bus_reset_notifier;
     uint32_t features;
 #define VFIO_FEATURE_ENABLE_VGA_BIT 0
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
@@ -2627,6 +2629,13 @@  static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
     return pos;
 }
 
+static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
+                                PCIHostDeviceAddress *host2)
+{
+    return (host1->domain == host2->domain && host1->bus == host2->bus &&
+            host1->slot == host2->slot && host1->function == host2->function);
+}
+
 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
 {
     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
@@ -2819,6 +2828,131 @@  static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_aer_validate_devices(DeviceState *dev,
+                                     void *opaque)
+{
+    VFIOPCIDevice *vdev;
+    int i;
+    bool found = false;
+    struct vfio_pci_hot_reset_info *info = opaque;
+    struct vfio_pci_dependent_device *devices = &info->devices[0];
+
+    if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+        return 0;
+    }
+
+    vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev));
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        if (vfio_pci_host_match(&host, &vdev->host)) {
+            found = true;
+            break;
+        }
+    }
+
+    if (!found) {
+        error_report("vfio: Cannot reset parent bus with AER supported,"
+                     "depends on device %s which is not contained.",
+                      vdev->vbasedev.name);
+        return -1;
+    }
+
+    return 0;
+}
+
+static void vfio_pci_vm_stop(VFIOPCIDevice *vdev)
+{
+    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
+                 "Please collect any data possible and then kill the guest",
+                 __func__, vdev->host.domain, vdev->host.bus,
+                 vdev->host.slot, vdev->host.function);
+
+    vm_stop(RUN_STATE_INTERNAL_ERROR);
+}
+
+static void vfio_pci_host_bus_reset(Notifier *n, void *opaque)
+{
+    VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier);
+    PCIDevice *pdev = &vdev->pdev;
+    int ret, i;
+    struct vfio_pci_hot_reset_info *info;
+    struct vfio_pci_dependent_device *devices;
+    VFIOGroup *group;
+
+    if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+        return;
+    }
+
+    /*
+     * Check the affected devices by virtual bus reset are contained in
+     * the set of groups.
+     */
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret < 0) {
+        goto stop_vm;
+    }
+
+    devices = &info->devices[0];
+
+    /* Verify that we have all the groups required */
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        if (vfio_pci_host_match(&host, &vdev->host)) {
+            continue;
+        }
+
+        QLIST_FOREACH(group, &vfio_group_list, next) {
+            if (group->groupid == devices[i].group_id) {
+                break;
+            }
+        }
+
+        if (!group) {
+            if (!vdev->has_pm_reset) {
+                error_report("vfio: Cannot reset device %s with AER supported,"
+                             "depends on group %d which is not owned.",
+                             vdev->vbasedev.name, devices[i].group_id);
+            }
+            ret = -EPERM;
+            goto stop_vm;
+        }
+    }
+
+    /* Verify that we have all the affected devices under the bus */
+    ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL,
+                             vfio_aer_validate_devices,
+                             NULL, info);
+    if (ret < 0) {
+        goto stop_vm;
+    }
+
+
+    /* bus reset! */
+    ret = vfio_pci_do_hot_reset(vdev, info);
+    if (ret < 0) {
+        goto stop_vm;
+    }
+
+    g_free(info);
+    return;
+
+stop_vm:
+    g_free(info);
+    vfio_pci_vm_stop(vdev);
+}
+
 static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -2852,6 +2986,9 @@  static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size)
                    pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4);
     pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity);
 
+    vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset;
+    pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier);
+
     return 0;
 }
 
@@ -2978,13 +3115,6 @@  static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_enable_intx(vdev);
 }
 
-static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
-                                PCIHostDeviceAddress *host2)
-{
-    return (host1->domain == host2->domain && host1->bus == host2->bus &&
-            host1->slot == host2->slot && host1->function == host2->function);
-}
-
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
@@ -3328,12 +3458,7 @@  static void vfio_err_notifier_handler(void *opaque)
      * terminate the guest to contain the error.
      */
 
-    error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
-                 "Please collect any data possible and then kill the guest",
-                 __func__, vdev->host.domain, vdev->host.bus,
-                 vdev->host.slot, vdev->host.function);
-
-    vm_stop(RUN_STATE_INTERNAL_ERROR);
+    vfio_pci_vm_stop(vdev);
 }
 
 /*