Message ID | b95cd227a99fdc382e0a9a64b1e9ded18c82ad3d.1430297161.git.chen.fan.fnst@cn.fujitsu.com |
---|---|
State | New |
Headers | show |
On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote: > add host secondary bus reset for vfio when AER occurs, if reset failed, > we should stop vm. > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > --- > hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 138 insertions(+), 13 deletions(-) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 060fb47..619daed 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice { > PCIHostDeviceAddress host; > EventNotifier err_notifier; > EventNotifier req_notifier; > + > + Notifier sec_bus_reset_notifier; > uint32_t features; > #define VFIO_FEATURE_ENABLE_VGA_BIT 0 > #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) > @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size) > return pos; > } > > +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, > + PCIHostDeviceAddress *host2) > +{ > + return (host1->domain == host2->domain && host1->bus == host2->bus && > + host1->slot == host2->slot && host1->function == host2->function); > +} > + > static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) > { > uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); > @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) > return 0; > } > > +static int vfio_aer_validate_devices(DeviceState *dev, > + void *opaque) > +{ > + VFIOPCIDevice *vdev; > + int i; > + bool found = false; > + struct vfio_pci_hot_reset_info *info = opaque; > + struct vfio_pci_dependent_device *devices = &info->devices[0]; > + > + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { > + return 0; > + } > + > + vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev)); > + for (i = 0; i < info->count; i++) { > + PCIHostDeviceAddress host; > + > + host.domain = devices[i].segment; > + host.bus = devices[i].bus; > + host.slot = PCI_SLOT(devices[i].devfn); > + host.function = PCI_FUNC(devices[i].devfn); > + > + if (vfio_pci_host_match(&host, &vdev->host)) { > + found = true; > + break; > + } > + } > + > + if (!found) { > + error_report("vfio: Cannot reset parent bus with AER supported," > + "depends on device %s which is not contained.", > + vdev->vbasedev.name); > + return -1; > + } > + > + return 0; > +} > + > +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev) > +{ > + error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " > + "Please collect any data possible and then kill the guest", > + __func__, vdev->host.domain, vdev->host.bus, > + vdev->host.slot, vdev->host.function); > + > + vm_stop(RUN_STATE_INTERNAL_ERROR); > +} > + > +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque) > +{ > + VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier); > + PCIDevice *pdev = &vdev->pdev; > + int ret, i; > + struct vfio_pci_hot_reset_info *info; > + struct vfio_pci_dependent_device *devices; > + VFIOGroup *group; > + > + if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { > + return; > + } > + > + /* > + * Check the affected devices by virtual bus reset are contained in > + * the set of groups. > + */ > + ret = vfio_get_hot_reset_info(vdev, &info); > + if (ret < 0) { > + goto stop_vm; > + } > + > + devices = &info->devices[0]; > + > + /* Verify that we have all the groups required */ > + for (i = 0; i < info->count; i++) { > + PCIHostDeviceAddress host; > + > + host.domain = devices[i].segment; > + host.bus = devices[i].bus; > + host.slot = PCI_SLOT(devices[i].devfn); > + host.function = PCI_FUNC(devices[i].devfn); > + > + if (vfio_pci_host_match(&host, &vdev->host)) { > + continue; > + } > + > + QLIST_FOREACH(group, &vfio_group_list, next) { > + if (group->groupid == devices[i].group_id) { > + break; > + } > + } > + > + if (!group) { > + if (!vdev->has_pm_reset) { I'm not sure how has_pm_reset has anything to do with what we're testing here. Copy-paste error? > + error_report("vfio: Cannot reset device %s with AER supported," > + "depends on group %d which is not owned.", > + vdev->vbasedev.name, devices[i].group_id); > + } > + ret = -EPERM; > + goto stop_vm; > + } > + } The above verifies that all of the devices affected by the bus reset are owned by the VM. > + > + /* Verify that we have all the affected devices under the bus */ > + ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL, > + vfio_aer_validate_devices, > + NULL, info); And here we make sure that any vfio-pci devices affected by the bus reset are contained in the list of affected devices. What we're still missing is whether there are any affected devices that are exposed to the VM that are not covered in this bus walk. For instance, if I assign a multi-function device and place function 0 and 1 under separate, peer root ports, I believe the above tests will confirm that the VM owns the necessary groups and that the only vfio-pci device affected by the bus reset is in the list of affected devices, but it will not verify that the other function is not affected by the guest bus reset, but is affected by the host bus reset. > + if (ret < 0) { > + goto stop_vm; > + } > + > + > + /* bus reset! */ > + ret = vfio_pci_do_hot_reset(vdev, info); > + if (ret < 0) { > + goto stop_vm; > + } > + > + g_free(info); > + return; > + > +stop_vm: > + g_free(info); > + vfio_pci_vm_stop(vdev); > +} > + > static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) > { > PCIDevice *pdev = &vdev->pdev; > @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) > pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4); > pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity); > > + vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset; > + pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier); > + Additionally, we're going to do a bus reset once for every vfio-pci device subordinate to the bus being reset. That's not very efficient. There are still a number of problems here. We're allowing users to configure their VMs however they wish and enable AER, then only when they do a bus reset (which we have no way to infer is related to an AER event) do we check and potentially halt the VM if guest mapping of the host topology prevents a bus reset. That seems sort of like making backups of your data, but never testing that you can recover from the backup until we need the data. I can't imagine that many users are going to be able to get this correct and they won't know it's incorrect until an AER event occurs. Also, Do we need to worry about guest root complex devices? The guest won't be able to perform a bus reset on the guest bus in that case. Is AER even valid for RC devices since they don't have a parent downstream port to signal the AER? This seems like another case where it's more likely that a user will create an invalid configuration than it is they will create a valid one, but we won't know until an AER occurs with the code structure here. Therefore I think that if the user specifies AER, we need to verify and enforce at that point, ie. the initfn, that a host bus reset is possible, that a guest reset is possible, and that a guest bus reset fully contains the VM visible host devices affected by the reset. I'd also like to see the patches ordered such that we provide all the infrastructure to validate and enforce the configuration restrictions to support AER before we enable it to the guest. The order here creates bisect points where AER is exposed to the guest with unacceptable QEMU handling. Thanks, Alex > return 0; > } > > @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) > vfio_enable_intx(vdev); > } > > -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, > - PCIHostDeviceAddress *host2) > -{ > - return (host1->domain == host2->domain && host1->bus == host2->bus && > - host1->slot == host2->slot && host1->function == host2->function); > -} > - > static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) > { > VFIOGroup *group; > @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque) > * terminate the guest to contain the error. > */ > > - error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " > - "Please collect any data possible and then kill the guest", > - __func__, vdev->host.domain, vdev->host.bus, > - vdev->host.slot, vdev->host.function); > - > - vm_stop(RUN_STATE_INTERNAL_ERROR); > + vfio_pci_vm_stop(vdev); > } > > /*
On 04/30/2015 01:32 AM, Alex Williamson wrote: > On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote: >> add host secondary bus reset for vfio when AER occurs, if reset failed, >> we should stop vm. >> >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> >> --- >> hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- >> 1 file changed, 138 insertions(+), 13 deletions(-) >> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index 060fb47..619daed 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice { >> PCIHostDeviceAddress host; >> EventNotifier err_notifier; >> EventNotifier req_notifier; >> + >> + Notifier sec_bus_reset_notifier; >> uint32_t features; >> #define VFIO_FEATURE_ENABLE_VGA_BIT 0 >> #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) >> @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size) >> return pos; >> } >> >> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, >> + PCIHostDeviceAddress *host2) >> +{ >> + return (host1->domain == host2->domain && host1->bus == host2->bus && >> + host1->slot == host2->slot && host1->function == host2->function); >> +} >> + >> static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) >> { >> uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); >> @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) >> return 0; >> } >> >> +static int vfio_aer_validate_devices(DeviceState *dev, >> + void *opaque) >> +{ >> + VFIOPCIDevice *vdev; >> + int i; >> + bool found = false; >> + struct vfio_pci_hot_reset_info *info = opaque; >> + struct vfio_pci_dependent_device *devices = &info->devices[0]; >> + >> + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { >> + return 0; >> + } >> + >> + vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev)); >> + for (i = 0; i < info->count; i++) { >> + PCIHostDeviceAddress host; >> + >> + host.domain = devices[i].segment; >> + host.bus = devices[i].bus; >> + host.slot = PCI_SLOT(devices[i].devfn); >> + host.function = PCI_FUNC(devices[i].devfn); >> + >> + if (vfio_pci_host_match(&host, &vdev->host)) { >> + found = true; >> + break; >> + } >> + } >> + >> + if (!found) { >> + error_report("vfio: Cannot reset parent bus with AER supported," >> + "depends on device %s which is not contained.", >> + vdev->vbasedev.name); >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev) >> +{ >> + error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " >> + "Please collect any data possible and then kill the guest", >> + __func__, vdev->host.domain, vdev->host.bus, >> + vdev->host.slot, vdev->host.function); >> + >> + vm_stop(RUN_STATE_INTERNAL_ERROR); >> +} >> + >> +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque) >> +{ >> + VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier); >> + PCIDevice *pdev = &vdev->pdev; >> + int ret, i; >> + struct vfio_pci_hot_reset_info *info; >> + struct vfio_pci_dependent_device *devices; >> + VFIOGroup *group; >> + >> + if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { >> + return; >> + } >> + >> + /* >> + * Check the affected devices by virtual bus reset are contained in >> + * the set of groups. >> + */ >> + ret = vfio_get_hot_reset_info(vdev, &info); >> + if (ret < 0) { >> + goto stop_vm; >> + } >> + >> + devices = &info->devices[0]; >> + >> + /* Verify that we have all the groups required */ >> + for (i = 0; i < info->count; i++) { >> + PCIHostDeviceAddress host; >> + >> + host.domain = devices[i].segment; >> + host.bus = devices[i].bus; >> + host.slot = PCI_SLOT(devices[i].devfn); >> + host.function = PCI_FUNC(devices[i].devfn); >> + >> + if (vfio_pci_host_match(&host, &vdev->host)) { >> + continue; >> + } >> + >> + QLIST_FOREACH(group, &vfio_group_list, next) { >> + if (group->groupid == devices[i].group_id) { >> + break; >> + } >> + } >> + >> + if (!group) { >> + if (!vdev->has_pm_reset) { > I'm not sure how has_pm_reset has anything to do with what we're testing > here. Copy-paste error? > >> + error_report("vfio: Cannot reset device %s with AER supported," >> + "depends on group %d which is not owned.", >> + vdev->vbasedev.name, devices[i].group_id); >> + } >> + ret = -EPERM; >> + goto stop_vm; >> + } >> + } > > The above verifies that all of the devices affected by the bus reset are > owned by the VM. > >> + >> + /* Verify that we have all the affected devices under the bus */ >> + ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL, >> + vfio_aer_validate_devices, >> + NULL, info); > And here we make sure that any vfio-pci devices affected by the bus > reset are contained in the list of affected devices. What we're still > missing is whether there are any affected devices that are exposed to > the VM that are not covered in this bus walk. For instance, if I assign > a multi-function device and place function 0 and 1 under separate, peer > root ports, I believe the above tests will confirm that the VM owns the > necessary groups and that the only vfio-pci device affected by the bus > reset is in the list of affected devices, but it will not verify that > the other function is not affected by the guest bus reset, but is > affected by the host bus reset. you are right. > >> + if (ret < 0) { >> + goto stop_vm; >> + } >> + >> + >> + /* bus reset! */ >> + ret = vfio_pci_do_hot_reset(vdev, info); >> + if (ret < 0) { >> + goto stop_vm; >> + } >> + >> + g_free(info); >> + return; >> + >> +stop_vm: >> + g_free(info); >> + vfio_pci_vm_stop(vdev); >> +} >> + >> static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) >> { >> PCIDevice *pdev = &vdev->pdev; >> @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) >> pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4); >> pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity); >> >> + vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset; >> + pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier); >> + > Additionally, we're going to do a bus reset once for every vfio-pci > device subordinate to the bus being reset. That's not very efficient. > > There are still a number of problems here. We're allowing users to > configure their VMs however they wish and enable AER, then only when > they do a bus reset (which we have no way to infer is related to an AER > event) do we check and potentially halt the VM if guest mapping of the > host topology prevents a bus reset. That seems sort of like making > backups of your data, but never testing that you can recover from the > backup until we need the data. I can't imagine that many users are > going to be able to get this correct and they won't know it's incorrect > until an AER event occurs. > > Also, Do we need to worry about guest root complex devices? The guest > won't be able to perform a bus reset on the guest bus in that case. Is > AER even valid for RC devices since they don't have a parent downstream > port to signal the AER? This seems like another case where it's more > likely that a user will create an invalid configuration than it is they > will create a valid one, but we won't know until an AER occurs with the > code structure here. do you refer to the root complex device is the root port of root complex? the root port can notify system itself, the PCI Express aer driver reset the upstream link between root port and upstream port of subordinate switch. so I think we don't need to care that. > > Therefore I think that if the user specifies AER, we need to verify and > enforce at that point, ie. the initfn, that a host bus reset is > possible, that a guest reset is possible, and that a guest bus reset > fully contains the VM visible host devices affected by the reset. > > I'd also like to see the patches ordered such that we provide all the > infrastructure to validate and enforce the configuration restrictions to > support AER before we enable it to the guest. The order here creates > bisect points where AER is exposed to the guest with unacceptable QEMU > handling. Thanks, I'm sorry for that, due to this is RFC version I missed that, and I will arrange the patches after this series. Thanks, Chen > > Alex > >> return 0; >> } >> >> @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) >> vfio_enable_intx(vdev); >> } >> >> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, >> - PCIHostDeviceAddress *host2) >> -{ >> - return (host1->domain == host2->domain && host1->bus == host2->bus && >> - host1->slot == host2->slot && host1->function == host2->function); >> -} >> - >> static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) >> { >> VFIOGroup *group; >> @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque) >> * terminate the guest to contain the error. >> */ >> >> - error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " >> - "Please collect any data possible and then kill the guest", >> - __func__, vdev->host.domain, vdev->host.bus, >> - vdev->host.slot, vdev->host.function); >> - >> - vm_stop(RUN_STATE_INTERNAL_ERROR); >> + vfio_pci_vm_stop(vdev); >> } >> >> /* > > > . >
On Thu, 2015-04-30 at 11:07 +0800, Chen Fan wrote: > On 04/30/2015 01:32 AM, Alex Williamson wrote: > > On Wed, 2015-04-29 at 16:48 +0800, Chen Fan wrote: > >> add host secondary bus reset for vfio when AER occurs, if reset failed, > >> we should stop vm. > >> > >> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> > >> --- > >> hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- > >> 1 file changed, 138 insertions(+), 13 deletions(-) > >> > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > >> index 060fb47..619daed 100644 > >> --- a/hw/vfio/pci.c > >> +++ b/hw/vfio/pci.c > >> @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice { > >> PCIHostDeviceAddress host; > >> EventNotifier err_notifier; > >> EventNotifier req_notifier; > >> + > >> + Notifier sec_bus_reset_notifier; > >> uint32_t features; > >> #define VFIO_FEATURE_ENABLE_VGA_BIT 0 > >> #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) > >> @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size) > >> return pos; > >> } > >> > >> +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, > >> + PCIHostDeviceAddress *host2) > >> +{ > >> + return (host1->domain == host2->domain && host1->bus == host2->bus && > >> + host1->slot == host2->slot && host1->function == host2->function); > >> +} > >> + > >> static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) > >> { > >> uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); > >> @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) > >> return 0; > >> } > >> > >> +static int vfio_aer_validate_devices(DeviceState *dev, > >> + void *opaque) > >> +{ > >> + VFIOPCIDevice *vdev; > >> + int i; > >> + bool found = false; > >> + struct vfio_pci_hot_reset_info *info = opaque; > >> + struct vfio_pci_dependent_device *devices = &info->devices[0]; > >> + > >> + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { > >> + return 0; > >> + } > >> + > >> + vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev)); > >> + for (i = 0; i < info->count; i++) { > >> + PCIHostDeviceAddress host; > >> + > >> + host.domain = devices[i].segment; > >> + host.bus = devices[i].bus; > >> + host.slot = PCI_SLOT(devices[i].devfn); > >> + host.function = PCI_FUNC(devices[i].devfn); > >> + > >> + if (vfio_pci_host_match(&host, &vdev->host)) { > >> + found = true; > >> + break; > >> + } > >> + } > >> + > >> + if (!found) { > >> + error_report("vfio: Cannot reset parent bus with AER supported," > >> + "depends on device %s which is not contained.", > >> + vdev->vbasedev.name); > >> + return -1; > >> + } > >> + > >> + return 0; > >> +} > >> + > >> +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev) > >> +{ > >> + error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " > >> + "Please collect any data possible and then kill the guest", > >> + __func__, vdev->host.domain, vdev->host.bus, > >> + vdev->host.slot, vdev->host.function); > >> + > >> + vm_stop(RUN_STATE_INTERNAL_ERROR); > >> +} > >> + > >> +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque) > >> +{ > >> + VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier); > >> + PCIDevice *pdev = &vdev->pdev; > >> + int ret, i; > >> + struct vfio_pci_hot_reset_info *info; > >> + struct vfio_pci_dependent_device *devices; > >> + VFIOGroup *group; > >> + > >> + if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { > >> + return; > >> + } > >> + > >> + /* > >> + * Check the affected devices by virtual bus reset are contained in > >> + * the set of groups. > >> + */ > >> + ret = vfio_get_hot_reset_info(vdev, &info); > >> + if (ret < 0) { > >> + goto stop_vm; > >> + } > >> + > >> + devices = &info->devices[0]; > >> + > >> + /* Verify that we have all the groups required */ > >> + for (i = 0; i < info->count; i++) { > >> + PCIHostDeviceAddress host; > >> + > >> + host.domain = devices[i].segment; > >> + host.bus = devices[i].bus; > >> + host.slot = PCI_SLOT(devices[i].devfn); > >> + host.function = PCI_FUNC(devices[i].devfn); > >> + > >> + if (vfio_pci_host_match(&host, &vdev->host)) { > >> + continue; > >> + } > >> + > >> + QLIST_FOREACH(group, &vfio_group_list, next) { > >> + if (group->groupid == devices[i].group_id) { > >> + break; > >> + } > >> + } > >> + > >> + if (!group) { > >> + if (!vdev->has_pm_reset) { > > I'm not sure how has_pm_reset has anything to do with what we're testing > > here. Copy-paste error? > > > >> + error_report("vfio: Cannot reset device %s with AER supported," > >> + "depends on group %d which is not owned.", > >> + vdev->vbasedev.name, devices[i].group_id); > >> + } > >> + ret = -EPERM; > >> + goto stop_vm; > >> + } > >> + } > > > > The above verifies that all of the devices affected by the bus reset are > > owned by the VM. > > > >> + > >> + /* Verify that we have all the affected devices under the bus */ > >> + ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL, > >> + vfio_aer_validate_devices, > >> + NULL, info); > > And here we make sure that any vfio-pci devices affected by the bus > > reset are contained in the list of affected devices. What we're still > > missing is whether there are any affected devices that are exposed to > > the VM that are not covered in this bus walk. For instance, if I assign > > a multi-function device and place function 0 and 1 under separate, peer > > root ports, I believe the above tests will confirm that the VM owns the > > necessary groups and that the only vfio-pci device affected by the bus > > reset is in the list of affected devices, but it will not verify that > > the other function is not affected by the guest bus reset, but is > > affected by the host bus reset. > you are right. > > > > >> + if (ret < 0) { > >> + goto stop_vm; > >> + } > >> + > >> + > >> + /* bus reset! */ > >> + ret = vfio_pci_do_hot_reset(vdev, info); > >> + if (ret < 0) { > >> + goto stop_vm; > >> + } > >> + > >> + g_free(info); > >> + return; > >> + > >> +stop_vm: > >> + g_free(info); > >> + vfio_pci_vm_stop(vdev); > >> +} > >> + > >> static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) > >> { > >> PCIDevice *pdev = &vdev->pdev; > >> @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) > >> pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4); > >> pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity); > >> > >> + vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset; > >> + pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier); > >> + > > Additionally, we're going to do a bus reset once for every vfio-pci > > device subordinate to the bus being reset. That's not very efficient. > > > > There are still a number of problems here. We're allowing users to > > configure their VMs however they wish and enable AER, then only when > > they do a bus reset (which we have no way to infer is related to an AER > > event) do we check and potentially halt the VM if guest mapping of the > > host topology prevents a bus reset. That seems sort of like making > > backups of your data, but never testing that you can recover from the > > backup until we need the data. I can't imagine that many users are > > going to be able to get this correct and they won't know it's incorrect > > until an AER event occurs. > > > > Also, Do we need to worry about guest root complex devices? The guest > > won't be able to perform a bus reset on the guest bus in that case. Is > > AER even valid for RC devices since they don't have a parent downstream > > port to signal the AER? This seems like another case where it's more > > likely that a user will create an invalid configuration than it is they > > will create a valid one, but we won't know until an AER occurs with the > > code structure here. > do you refer to the root complex device is the root port of root complex? > the root port can notify system itself, the PCI Express aer driver > reset the upstream link between root port and upstream port of > subordinate switch. so I think we don't need to care that. No, I'm saying that we can place an assigned device directly on pcie.0, the host bus, without using a root port. I don't know if we have any way to signal an AER event for root complex devices, but the guest certainly doesn't have a way to perform a bus reset on the host bus. Thanks, Alex > > Therefore I think that if the user specifies AER, we need to verify and > > enforce at that point, ie. the initfn, that a host bus reset is > > possible, that a guest reset is possible, and that a guest bus reset > > fully contains the VM visible host devices affected by the reset. > > > > I'd also like to see the patches ordered such that we provide all the > > infrastructure to validate and enforce the configuration restrictions to > > support AER before we enable it to the guest. The order here creates > > bisect points where AER is exposed to the guest with unacceptable QEMU > > handling. Thanks, > I'm sorry for that, due to this is RFC version I missed that, > and I will arrange the patches after this series. > > Thanks, > Chen > > > > > > Alex > > > >> return 0; > >> } > >> > >> @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) > >> vfio_enable_intx(vdev); > >> } > >> > >> -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, > >> - PCIHostDeviceAddress *host2) > >> -{ > >> - return (host1->domain == host2->domain && host1->bus == host2->bus && > >> - host1->slot == host2->slot && host1->function == host2->function); > >> -} > >> - > >> static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) > >> { > >> VFIOGroup *group; > >> @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque) > >> * terminate the guest to contain the error. > >> */ > >> > >> - error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " > >> - "Please collect any data possible and then kill the guest", > >> - __func__, vdev->host.domain, vdev->host.bus, > >> - vdev->host.slot, vdev->host.function); > >> - > >> - vm_stop(RUN_STATE_INTERNAL_ERROR); > >> + vfio_pci_vm_stop(vdev); > >> } > >> > >> /* > > > > > > . > > >
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 060fb47..619daed 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -154,6 +154,8 @@ typedef struct VFIOPCIDevice { PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; + + Notifier sec_bus_reset_notifier; uint32_t features; #define VFIO_FEATURE_ENABLE_VGA_BIT 0 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) @@ -2627,6 +2629,13 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size) return pos; } +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, + PCIHostDeviceAddress *host2) +{ + return (host1->domain == host2->domain && host1->bus == host2->bus && + host1->slot == host2->slot && host1->function == host2->function); +} + static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos) { uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); @@ -2819,6 +2828,131 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_aer_validate_devices(DeviceState *dev, + void *opaque) +{ + VFIOPCIDevice *vdev; + int i; + bool found = false; + struct vfio_pci_hot_reset_info *info = opaque; + struct vfio_pci_dependent_device *devices = &info->devices[0]; + + if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) { + return 0; + } + + vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(dev)); + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, &vdev->host)) { + found = true; + break; + } + } + + if (!found) { + error_report("vfio: Cannot reset parent bus with AER supported," + "depends on device %s which is not contained.", + vdev->vbasedev.name); + return -1; + } + + return 0; +} + +static void vfio_pci_vm_stop(VFIOPCIDevice *vdev) +{ + error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " + "Please collect any data possible and then kill the guest", + __func__, vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function); + + vm_stop(RUN_STATE_INTERNAL_ERROR); +} + +static void vfio_pci_host_bus_reset(Notifier *n, void *opaque) +{ + VFIOPCIDevice *vdev = container_of(n, VFIOPCIDevice, sec_bus_reset_notifier); + PCIDevice *pdev = &vdev->pdev; + int ret, i; + struct vfio_pci_hot_reset_info *info; + struct vfio_pci_dependent_device *devices; + VFIOGroup *group; + + if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { + return; + } + + /* + * Check the affected devices by virtual bus reset are contained in + * the set of groups. + */ + ret = vfio_get_hot_reset_info(vdev, &info); + if (ret < 0) { + goto stop_vm; + } + + devices = &info->devices[0]; + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, &vdev->host)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s with AER supported," + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto stop_vm; + } + } + + /* Verify that we have all the affected devices under the bus */ + ret = qbus_walk_children(BUS(pdev->bus), NULL, NULL, + vfio_aer_validate_devices, + NULL, info); + if (ret < 0) { + goto stop_vm; + } + + + /* bus reset! */ + ret = vfio_pci_do_hot_reset(vdev, info); + if (ret < 0) { + goto stop_vm; + } + + g_free(info); + return; + +stop_vm: + g_free(info); + vfio_pci_vm_stop(vdev); +} + static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) { PCIDevice *pdev = &vdev->pdev; @@ -2852,6 +2986,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos, uint16_t size) pdev->exp.aer_cap + PCI_ERR_UNCOR_SEVER, 4); pci_long_test_and_clear_mask(exp_cap + PCI_ERR_UNCOR_SEVER, ~severity); + vdev->sec_bus_reset_notifier.notify = vfio_pci_host_bus_reset; + pci_bus_add_reset_notifier(pdev->bus, &vdev->sec_bus_reset_notifier); + return 0; } @@ -2978,13 +3115,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_enable_intx(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, - PCIHostDeviceAddress *host2) -{ - return (host1->domain == host2->domain && host1->bus == host2->bus && - host1->slot == host2->slot && host1->function == host2->function); -} - static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { VFIOGroup *group; @@ -3328,12 +3458,7 @@ static void vfio_err_notifier_handler(void *opaque) * terminate the guest to contain the error. */ - error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " - "Please collect any data possible and then kill the guest", - __func__, vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); - - vm_stop(RUN_STATE_INTERNAL_ERROR); + vfio_pci_vm_stop(vdev); } /*
add host secondary bus reset for vfio when AER occurs, if reset failed, we should stop vm. Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com> --- hw/vfio/pci.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 138 insertions(+), 13 deletions(-)