diff mbox

[v9,05/11] vfio: add check host bus reset is support or not

Message ID 1468913909-21811-6-git-send-email-zhoujie2011@cn.fujitsu.com
State New
Headers show

Commit Message

Zhou Jie July 19, 2016, 7:38 a.m. UTC
From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>

When assigning a vfio device with AER enabled, we must check whether
the device supports a host bus reset (ie. hot reset) as this may be
used by the guest OS in order to recover the device from an AER
error.  QEMU must therefore have the ability to perform a physical
host bus reset using the existing vfio APIs in response to a virtual
bus reset in the VM.  A physical bus reset affects all of the devices
on the host bus, therefore we place a few simplifying configuration
restriction on the VM:

 - All physical devices affected by a bus reset must be assigned to
   the VM with AER enabled on each and be configured on the same
   virtual bus in the VM.

 - No devices unaffected by the bus reset, be they physical, emulated,
   or paravirtual may be configured on the same virtual bus as a
   device supporting AER signaling through vfio.

In other words users wishing to enable AER on a multifunction device
need to assign all functions of the device to the same virtual bus
and enable AER support for each device.  The easiest way to
accomplish this is to identity map the physical functions to virtual
functions with multifunction enabled on the virtual device.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
---
 hw/vfio/pci.c | 278 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/vfio/pci.h |   1 +
 2 files changed, 256 insertions(+), 23 deletions(-)

Comments

Alex Williamson Aug. 31, 2016, 7:56 p.m. UTC | #1
On Tue, 19 Jul 2016 15:38:23 +0800
Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:

> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> 
> When assigning a vfio device with AER enabled, we must check whether
> the device supports a host bus reset (ie. hot reset) as this may be
> used by the guest OS in order to recover the device from an AER
> error.  QEMU must therefore have the ability to perform a physical
> host bus reset using the existing vfio APIs in response to a virtual
> bus reset in the VM.  A physical bus reset affects all of the devices
> on the host bus, therefore we place a few simplifying configuration
> restriction on the VM:
> 
>  - All physical devices affected by a bus reset must be assigned to
>    the VM with AER enabled on each and be configured on the same
>    virtual bus in the VM.
> 
>  - No devices unaffected by the bus reset, be they physical, emulated,
>    or paravirtual may be configured on the same virtual bus as a
>    device supporting AER signaling through vfio.
> 
> In other words users wishing to enable AER on a multifunction device
> need to assign all functions of the device to the same virtual bus
> and enable AER support for each device.  The easiest way to
> accomplish this is to identity map the physical functions to virtual
> functions with multifunction enabled on the virtual device.

Why am I able to start the following VM with aer=on for the vfio-pci
devices?

# lspci -tv
-[0000:00]-+-00.0  Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
           +-01.0  Device 1234:1111
           +-1c.0-[01]--
           +-1d.0-[02]--+-01.0  Intel Corporation 82576 Gigabit Network Connection
           |            \-01.1  Intel Corporation 82576 Gigabit Network Connection
           ...

# lspci -vvv -s 1d.0
00:1d.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge (prog-if 00 [Normal decode])

The devices are behind a PCIe-to-PCI bridge, so shouldn't specifying
aer=on for the vfio-pci devices cause a configuration error?

commandline:

/home/alwillia/local/bin/qemu-system-x86_64 -name guest=rhel7-q35,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-11-rhel7-q35/master-key.aes -machine pc-q35-2.7,accel=kvm,usb=off,vmport=off -cpu IvyBridge -m 8192 -realtime mlock=off -smp 6,sockets=1,cores=6,threads=1 -uuid b20b28b4-9304-4e11-9ffa-0367aeb44afb -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-11-rhel7-q35/monitor.sock,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=discard -no-hpet -no-shutdown -global ICH9-LPC.disable_s3=1 -global ICH9-LPC.disable_s4=1 -boot strict=on -device i82801b11-bridge,id=pci.1,bus=pcie.0,addr=0x1e -device pci-bridge,chassis_nr=2,id=pci.2,bus=pci.1,addr=0x1 -device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d -device ioh3420,port=0xe0,chassis=4,id=pci.4,bus=pcie.0,addr=0x1c -device ich9-usb-ehci1,id=usb,bus=pci.2!
 ,addr=0x3.0x7 -device ich9-usb-uhci1,masterbus=usb.0,firstport=0,bus=pci.2,multifunction=on,addr=0x3 -device ich9-usb-uhci2,masterbus=usb.0,firstport=2,bus=pci.2,addr=0x3.0x1 -device ich9-usb-uhci3,masterbus=usb.0,firstport=4,bus=pci.2,addr=0x3.0x2 -drive file=/dev/rhel/rhel7-q35,format=raw,if=none,id=drive-virtio-disk0,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.2,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -netdev tap,fd=26,id=hostnet0,vhost=on,vhostfd=28 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:50:ec:0d,bus=pci.2,addr=0x1 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device VGA,id=video0,vgamem_mb=16,bus=pcie.0,addr=0x1 -device intel-hda,id=sound0,bus=pci.2,addr=0x2 -device hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0 -device vfio-pci,aer=on,host=07:00.0,id=hostdev0,bus=pci.3,multifunction=on,addr=0x1 -device vfio-pci,ae!
 r=on,host=07:00.1,id=hostdev1,bus=pci.3,addr=0x1.0x1 -msg timestamp=on

Thanks,
Alex

> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> ---
>  hw/vfio/pci.c | 278 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  hw/vfio/pci.h |   1 +
>  2 files changed, 256 insertions(+), 23 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 21fd801..242c1e4 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1693,6 +1693,42 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
>      }
>  }
>  
> +static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
> +{
> +    if (strlen(name) != 12 ||
> +        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
> +               &addr->bus, &addr->slot, &addr->function) != 4) {
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
> +{
> +    PCIHostDeviceAddress tmp;
> +
> +    if (vfio_pci_name_to_addr(name, &tmp)) {
> +        return false;
> +    }
> +
> +    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
> +            tmp.slot == addr->slot && tmp.function == addr->function);
> +}
> +
> +static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr,
> +                                     const char *name)
> +{
> +    PCIHostDeviceAddress tmp;
> +
> +    if (vfio_pci_name_to_addr(name, &tmp)) {
> +        return false;
> +    }
> +
> +    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
> +            tmp.slot == addr->slot);
> +}
> +
>  /*
>   * return negative with errno, return 0 on success.
>   * if success, the point of ret_info fill with the affected device reset info.
> @@ -1854,6 +1890,200 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
>      return 0;
>  }
>  
> +static int vfio_device_range_limit(PCIBus *bus)
> +{
> +    PCIDevice *br;
> +
> +    br = pci_bridge_get_device(bus);
> +    if (!br ||
> +        !pci_is_express(br) ||
> +        !(br->exp.exp_cap) ||
> +        pcie_cap_is_arifwd_enabled(br)) {
> +        return 255;
> +    }
> +
> +    return 8;
> +}
> +
> +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
> +{
> +    PCIBus *bus = vdev->pdev.bus;
> +    struct vfio_pci_hot_reset_info *info = NULL;
> +    struct vfio_pci_dependent_device *devices;
> +    VFIOGroup *group;
> +    int ret, i, devfn, range_limit;
> +
> +    ret = vfio_get_hot_reset_info(vdev, &info);
> +    if (ret) {
> +        error_setg(errp, "vfio: Cannot enable AER for device %s,"
> +                   " device does not support hot reset.",
> +                   vdev->vbasedev.name);
> +        return;
> +    }
> +
> +    /* List all affected devices by bus reset */
> +    devices = &info->devices[0];
> +
> +    /* Verify that we have all the groups required */
> +    for (i = 0; i < info->count; i++) {
> +        PCIHostDeviceAddress host;
> +        VFIOPCIDevice *tmp;
> +        VFIODevice *vbasedev_iter;
> +        bool found = false;
> +
> +        host.domain = devices[i].segment;
> +        host.bus = devices[i].bus;
> +        host.slot = PCI_SLOT(devices[i].devfn);
> +        host.function = PCI_FUNC(devices[i].devfn);
> +
> +        /* Skip the current device */
> +        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
> +            continue;
> +        }
> +
> +        /* Ensure we own the group of the affected device */
> +        QLIST_FOREACH(group, &vfio_group_list, next) {
> +            if (group->groupid == devices[i].group_id) {
> +                break;
> +            }
> +        }
> +
> +        if (!group) {
> +            error_setg(errp, "vfio: Cannot enable AER for device %s, "
> +                       "depends on group %d which is not owned.",
> +                       vdev->vbasedev.name, devices[i].group_id);
> +            goto out;
> +        }
> +
> +        /* Ensure affected devices for reset on the same bus */
> +        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
> +            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
> +            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
> +                /*
> +                 * AER errors may be broadcast to all functions of a multi-
> +                 * function endpoint.  If any of those sibling functions are
> +                 * also assigned, they need to have AER enabled or else an
> +                 * error may continue to cause a vm_stop condition.  IOW,
> +                 * AER setup of this function would be pointless.
> +                 */
> +                if (vfio_pci_host_match_slot(&host, vdev->vbasedev.name) &&
> +                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
> +                    error_setg(errp, "vfio: Cannot enable AER for device %s,"
> +                               " on same slot the dependent device %s which"
> +                               " does not enable AER.",
> +                               vdev->vbasedev.name, tmp->vbasedev.name);
> +                    goto out;
> +                }
> +
> +                if (tmp->pdev.bus != bus) {
> +                    error_setg(errp, "vfio: Cannot enable AER for device %s, "
> +                               "the dependent device %s is not on the same bus",
> +                               vdev->vbasedev.name, tmp->vbasedev.name);
> +                    goto out;
> +                }
> +                found = true;
> +                break;
> +            }
> +        }
> +
> +        /* Ensure all affected devices assigned to VM */
> +        if (!found) {
> +            error_setg(errp, "vfio: Cannot enable AER for device %s, "
> +                       "the dependent device %04x:%02x:%02x.%x "
> +                       "is not assigned to VM.",
> +                       vdev->vbasedev.name, host.domain, host.bus,
> +                       host.slot, host.function);
> +            goto out;
> +        }
> +    }
> +
> +    /*
> +     * The above code verified that all devices affected by a bus reset
> +     * exist on the same bus in the VM.  To further simplify, we also
> +     * require that there are no additional devices beyond those existing on
> +     * the VM bus.
> +     */
> +    range_limit = vfio_device_range_limit(bus);
> +    for (devfn = 0; devfn < range_limit; devfn++) {
> +        VFIOPCIDevice *tmp;
> +        PCIDevice *dev;
> +        bool found = false;
> +
> +        dev = pci_find_device(bus, pci_bus_num(bus),
> +                  PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn));
> +
> +        if (!dev) {
> +            continue;
> +        }
> +
> +        if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> +            error_setg(errp, "vfio: Cannot enable AER for device %s, device"
> +                             " %s: slot %d function%d cannot be configured"
> +                             " on the same virtual bus",
> +                             vdev->vbasedev.name, dev->name,
> +                             PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
> +            goto out;
> +        }
> +
> +        tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev);
> +        for (i = 0; i < info->count; i++) {
> +            PCIHostDeviceAddress host;
> +
> +            host.domain = devices[i].segment;
> +            host.bus = devices[i].bus;
> +            host.slot = PCI_SLOT(devices[i].devfn);
> +            host.function = PCI_FUNC(devices[i].devfn);
> +
> +            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
> +                found = true;
> +                break;
> +            }
> +        }
> +
> +        if (!found) {
> +            error_setg(errp, "vfio: Cannot enable AER for device %s, affected"
> +                             " device %s does not be configured on the same"
> +                             " virtual bus",
> +                       vdev->vbasedev.name, tmp->vbasedev.name);
> +            goto out;
> +        }
> +    }
> +
> +out:
> +    g_free(info);
> +    return;
> +}
> +
> +static void vfio_aer_check_host_bus_reset(Error **errp)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +    VFIOPCIDevice *vdev;
> +    Error *local_err = NULL;
> +
> +    /* Check All vfio-pci devices if have bus reset capability */
> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
> +                continue;
> +            }
> +            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +            if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
> +                vfio_check_hot_bus_reset(vdev, &local_err);
> +                if (local_err) {
> +                    error_propagate(errp, local_err);
> +                    return;
> +                }
> +            }
> +        }
> +    }
> +
> +    return;
> +}
> +
>  static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>                            int pos, uint16_t size)
>  {
> @@ -2060,29 +2290,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
>      vfio_intx_enable(vdev);
>  }
>  
> -static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
> -{
> -    if (strlen(name) != 12 ||
> -        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
> -               &addr->bus, &addr->slot, &addr->function) != 4) {
> -        return -EINVAL;
> -    }
> -
> -    return 0;
> -}
> -
> -static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
> -{
> -    PCIHostDeviceAddress tmp;
> -
> -    if (vfio_pci_name_to_addr(name, &tmp)) {
> -        return false;
> -    }
> -
> -    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
> -            tmp.slot == addr->slot && tmp.function == addr->function);
> -}
> -
>  static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>  {
>      VFIOGroup *group;
> @@ -2608,6 +2815,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
>  
> +static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
> +{
> +    Error *local_err = NULL;
> +
> +    vfio_aer_check_host_bus_reset(&local_err);
> +    if (local_err) {
> +        fprintf(stderr, "%s\n", error_get_pretty(local_err));
> +        error_free(local_err);
> +        exit(1);
> +    }
> +}
> +
> +static Notifier machine_notifier = {
> +    .notify = vfio_pci_machine_done_notify,
> +};
> +
>  static int vfio_initfn(PCIDevice *pdev)
>  {
>      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> @@ -3000,6 +3223,15 @@ static const TypeInfo vfio_pci_dev_info = {
>  static void register_vfio_pci_dev_type(void)
>  {
>      type_register_static(&vfio_pci_dev_info);
> +
> +    /*
> +     * The AER configuration may depend on multiple devices, so we cannot
> +     * validate consistency after each device is initialized.  We can only
> +     * depend on function initialization order (function 0 last) for hotplug
> +     * devices, therefore a machine-init-done notifier is used to validate
> +     * the configuration after all cold-plug devices are processed.
> +     */
> +     qemu_add_machine_init_done_notifier(&machine_notifier);
>  }
>  
>  type_init(register_vfio_pci_dev_type)
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 5483044..e2faffb 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -15,6 +15,7 @@
>  #include "qemu-common.h"
>  #include "exec/memory.h"
>  #include "hw/pci/pci.h"
> +#include "hw/pci/pci_bus.h"
>  #include "hw/pci/pci_bridge.h"
>  #include "hw/vfio/vfio-common.h"
>  #include "qemu/event_notifier.h"
Alex Williamson Sept. 1, 2016, 2:12 a.m. UTC | #2
On Wed, 31 Aug 2016 13:56:20 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Tue, 19 Jul 2016 15:38:23 +0800
> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
> 
> > From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > 
> > When assigning a vfio device with AER enabled, we must check whether
> > the device supports a host bus reset (ie. hot reset) as this may be
> > used by the guest OS in order to recover the device from an AER
> > error.  QEMU must therefore have the ability to perform a physical
> > host bus reset using the existing vfio APIs in response to a virtual
> > bus reset in the VM.  A physical bus reset affects all of the devices
> > on the host bus, therefore we place a few simplifying configuration
> > restriction on the VM:
> > 
> >  - All physical devices affected by a bus reset must be assigned to
> >    the VM with AER enabled on each and be configured on the same
> >    virtual bus in the VM.
> > 
> >  - No devices unaffected by the bus reset, be they physical, emulated,
> >    or paravirtual may be configured on the same virtual bus as a
> >    device supporting AER signaling through vfio.
> > 
> > In other words users wishing to enable AER on a multifunction device
> > need to assign all functions of the device to the same virtual bus
> > and enable AER support for each device.  The easiest way to
> > accomplish this is to identity map the physical functions to virtual
> > functions with multifunction enabled on the virtual device.  
> 
> Why am I able to start the following VM with aer=on for the vfio-pci
> devices?
> 
> # lspci -tv
> -[0000:00]-+-00.0  Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
>            +-01.0  Device 1234:1111
>            +-1c.0-[01]--
>            +-1d.0-[02]--+-01.0  Intel Corporation 82576 Gigabit Network Connection
>            |            \-01.1  Intel Corporation 82576 Gigabit Network Connection
>            ...
> 
> # lspci -vvv -s 1d.0
> 00:1d.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge (prog-if 00 [Normal decode])
> 
> The devices are behind a PCIe-to-PCI bridge, so shouldn't specifying
> aer=on for the vfio-pci devices cause a configuration error?
> 
> commandline:
> 
> /home/alwillia/local/bin/qemu-system-x86_64 -name guest=rhel7-q35,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-11-rhel7-q35/master-key.aes -machine pc-q35-2.7,accel=kvm,usb=off,vmport=off -cpu IvyBridge -m 8192 -realtime mlock=off -smp 6,sockets=1,cores=6,threads=1 -uuid b20b28b4-9304-4e11-9ffa-0367aeb44afb -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-11-rhel7-q35/monitor.sock,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=discard -no-hpet -no-shutdown -global ICH9-LPC.disable_s3=1 -global ICH9-LPC.disable_s4=1 -boot strict=on -device i82801b11-bridge,id=pci.1,bus=pcie.0,addr=0x1e -device pci-bridge,chassis_nr=2,id=pci.2,bus=pci.1,addr=0x1 -device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d -device ioh3420,port=0xe0,chassis=4,id=pci.4,bus=pcie.0,addr=0x1c -device ich9-usb-ehci1,id=usb,bus=pci!
 .2,addr=0x3.0x7 -device ich9-usb-uhci1,masterbus=usb.0,firstport=0,bus=pci.2,multifunction=on,addr=0x3 -device ich9-usb-uhci2,masterbus=usb.0,firstport=2,bus=pci.2,addr=0x3.0x1 -device ich9-usb-uhci3,masterbus=usb.0,firstport=4,bus=pci.2,addr=0x3.0x2 -drive file=/dev/rhel/rhel7-q35,format=raw,if=none,id=drive-virtio-disk0,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.2,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -netdev tap,fd=26,id=hostnet0,vhost=on,vhostfd=28 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:50:ec:0d,bus=pci.2,addr=0x1 -chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 127.0.0.1:0 -device VGA,id=video0,vgamem_mb=16,bus=pcie.0,addr=0x1 -device intel-hda,id=sound0,bus=pci.2,addr=0x2 -device hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0 -device vfio-pci,aer=on,host=07:00.0,id=hostdev0,bus=pci.3,multifunction=on,addr=0x1 -device vfio-pci,!
 aer=on,host=07:00.1,id=hostdev1,bus=pci.3,addr=0x1.0x1 -msg timestamp=on
> 

I had to move to a different system where I could actually inject an
aer error and created a config similar to above but with the 82576
ports downstream of the ioh3420 root port.  When I inject a malformed
TLP uncorrectable error, my RHEL7.2 guest does this:

[   35.995645] pcieport 0000:00:1c.0: AER: Multiple Uncorrected (Fatal) error received: id=0200
[   35.998483] igb 0000:02:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Unaccessible, id=0200(Unregistered Agent ID)
[   36.001965] igb 0000:02:00.0 enp2s0f0: PCIe link lost, device now detached
[   36.015092] igb 0000:02:00.1 enp2s0f1: PCIe link lost, device now detached
[   39.133185] igb 0000:02:00.0: enabling device (0000 -> 0002)
[   40.071245] igb 0000:02:00.1: enabling device (0000 -> 0002)
[   41.014451] BUG: unable to handle kernel paging request at 0000000000003818
[   41.015969] IP: [<ffffffffa02b438d>] igb_configure_tx_ring+0x14d/0x280 [igb]
[   41.017507] PGD 367e2067 PUD 7ae56067 PMD 0 
[   41.018497] Oops: 0002 [#1] SMP 
[   41.019242] Modules linked in: ip6t_rpfilter ip6t_REJECT ipt_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter snd_hda_codec_generic snd_hda_intel snd_hda_codec ppdev snd_hda_core snd_hwdep snd_seq snd_seq_device iTCO_wdt iTCO_vendor_support bochs_drm snd_pcm syscopyarea sysfillrect sysimgblt ttm virtio_balloon snd_timer snd igb drm_kms_helper soundcore ptp pps_core i2c_algo_bit i2c_i801 dca drm shpchp lpc_ich mfd_core pcspkr i2c_core parport_pc parport ip_tables xfs libcrc32c virtio_blk virtio_console virtio_net ahci libahci crc32c_intel serio_raw libata virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod
[   41.040590] CPU: 0 PID: 29 Comm: kworker/0:1 Not tainted 3.10.0-327.el7.x86_64 #1
[   41.042180] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014
[   41.044635] Workqueue: events aer_isr
[   41.045478] task: ffff880179435080 ti: ffff880179680000 task.ti: ffff880179680000
[   41.047097] RIP: 0010:[<ffffffffa02b438d>]  [<ffffffffa02b438d>] igb_configure_tx_ring+0x14d/0x280 [igb]
[   41.049151] RSP: 0018:ffff880179683bf8  EFLAGS: 00010246
[   41.050260] RAX: 0000000000003818 RBX: 0000000000000000 RCX: 0000000000003818
[   41.051747] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 00000000002896b3
[   41.053268] RBP: ffff880179683c20 R08: 0000000001010100 R09: 00000000ffffffe7
[   41.054730] R10: ffffea0001eb6100 R11: ffffffffa02afa31 R12: 0000000000000000
[   41.056201] R13: ffff880035dbc8c0 R14: ffff880175d03f80 R15: 000000017716e000
[   41.057673] FS:  0000000000000000(0000) GS:ffff88017fc00000(0000) knlGS:0000000000000000
[   41.059337] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   41.060548] CR2: 0000000000003818 CR3: 0000000178331000 CR4: 00000000000006f0
[   41.062028] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   41.063534] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   41.065025] Stack:
[   41.065473]  ffff880035dbc8c0 ffff880035dbce70 0000000000000001 ffff880035dbc8c8
[   41.067119]  ffff880035dbce70 ffff880179683c80 ffffffffa02b8a77 fefdf27269fb3cd8
[   41.068781]  2009f9ee3386436f eb9e4e66756bbfdd 34002f8114a5d65f 9535990856231c4b
[   41.094179] Call Trace:
[   41.118688]  [<ffffffffa02b8a77>] igb_configure+0x267/0x450 [igb]
[   41.144286]  [<ffffffffa02b94f1>] igb_up+0x21/0x1a0 [igb]
[   41.170606]  [<ffffffffa02b96a7>] igb_io_resume+0x37/0x70 [igb]
[   41.195846]  [<ffffffff813381e0>] ? pci_cleanup_aer_uncorrect_error_status+0x90/0x90
[   41.221767]  [<ffffffff81338228>] report_resume+0x48/0x60
[   41.246455]  [<ffffffff8131e359>] pci_walk_bus+0x79/0xa0
[   41.270722]  [<ffffffff813381e0>] ? pci_cleanup_aer_uncorrect_error_status+0x90/0x90
[   41.296747]  [<ffffffff813382f0>] broadcast_error_message+0xb0/0x100
[   41.321552]  [<ffffffff81338509>] do_recovery+0x1c9/0x280
[   41.345507]  [<ffffffff81338f58>] aer_isr+0x348/0x430
[   41.368851]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
[   41.392157]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
[   41.416852]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
[   41.441577]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
[   41.465029]  [<ffffffff810a5a20>] ? kthread_create_on_node+0x140/0x140
[   41.488341]  [<ffffffff81645858>] ret_from_fork+0x58/0x90
[   41.511247]  [<ffffffff810a5a20>] ? kthread_create_on_node+0x140/0x140
[   41.535442] Code: c1 49 89 4e 30 49 8b 85 b8 05 00 00 48 85 c0 0f 84 39 01 00 00 81 c2 10 38 00 00 48 63 d2 48 01 d0 31 d2 89 10 49 8b 46 30 31 d2 <89> 10 41 8b 95 3c 06 00 00 b8 14 01 10 02 83 fa 05 74 0b 83 fa 
[   41.587718] RIP  [<ffffffffa02b438d>] igb_configure_tx_ring+0x14d/0x280 [igb]
[   41.610872]  RSP <ffff880179683bf8>
[   41.632301] CR2: 0000000000003818

And then it reboots.  So what RAS improvement have we bought ourselves
here?  What endpoints have you tested with this?  Which ones recovered
reliably?  Thanks,

Alex
Dou Liyang Sept. 22, 2016, 8:34 a.m. UTC | #3
Hi Alex,

At 09/01/2016 03:56 AM, Alex Williamson wrote:
> On Tue, 19 Jul 2016 15:38:23 +0800
> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> When assigning a vfio device with AER enabled, we must check whether
>> the device supports a host bus reset (ie. hot reset) as this may be
>> used by the guest OS in order to recover the device from an AER
>> error.  QEMU must therefore have the ability to perform a physical
>> host bus reset using the existing vfio APIs in response to a virtual
>> bus reset in the VM.  A physical bus reset affects all of the devices
>> on the host bus, therefore we place a few simplifying configuration
>> restriction on the VM:
>>
>>  - All physical devices affected by a bus reset must be assigned to
>>    the VM with AER enabled on each and be configured on the same
>>    virtual bus in the VM.
>>
>>  - No devices unaffected by the bus reset, be they physical, emulated,
>>    or paravirtual may be configured on the same virtual bus as a
>>    device supporting AER signaling through vfio.
>>
>> In other words users wishing to enable AER on a multifunction device
>> need to assign all functions of the device to the same virtual bus
>> and enable AER support for each device.  The easiest way to
>> accomplish this is to identity map the physical functions to virtual
>> functions with multifunction enabled on the virtual device.
>
> Why am I able to start the following VM with aer=on for the vfio-pci
> devices?

In my opinion, because, when we check out this situation in vfio_init,
our code just do nothing except going to out_teardown. so the devices 
which are behind a PCIe-to-PCI bridge will be regarded as the PCI devices.

>
> # lspci -tv
> -[0000:00]-+-00.0  Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
>            +-01.0  Device 1234:1111
>            +-1c.0-[01]--
>            +-1d.0-[02]--+-01.0  Intel Corporation 82576 Gigabit Network Connection
>            |            \-01.1  Intel Corporation 82576 Gigabit Network Connection
>            ...
>
> # lspci -vvv -s 1d.0
> 00:1d.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge (prog-if 00 [Normal decode])
>
> The devices are behind a PCIe-to-PCI bridge, so shouldn't specifying
> aer=on for the vfio-pci devices cause a configuration error?

Yes, I think it may makes users confuse.

I added a error_report for it.

Then, I test many passible combinations, such as:

pci-bridge
     |- vfio-pci

-device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d \
-device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.3,addr=1 \

error:
vfio: Unable to enable AER for device 0000:06:00.0, device does not 
support AER signaling

--------------------------------------------------------------------

pci-bridge
     |- vfio-pci
     |- vfio-pci

-device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d \
-device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.3,addr=1 \
-device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=pci.3,addr=1.1 \

error:
vfio: Unable to enable AER for device 0000:06:00.0, device does not 
support AER signaling

--------------------------------------------------------------------
ioh3420
   |- pci-bridge
       |- vfio-pci
       |- vfio-pci

-device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
-device pci-bridge,chassis_nr=3,id=pci.1,bus=bridge1,addr=0x1d \
-device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.1,addr=1 \
-device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=pci.1,addr=1.1 \

error:
vfio: Unable to enable AER for device 0000:06:00.0, device does not 
support AER signaling

--------------------------------------------------------------------
just for test:
pci-bridge
   |- ioh3420
       |- vfio-pci
       |- vfio-pci
-device pci-bridge,chassis_nr=3,id=pci.1,bus=pcie.0,addr=0x1d \
-device ioh3420,bus=pci.1,addr=1c.0,port=1,id=bridge1,chassis=1 \
-device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=bridge1,addr=1 \
-device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=bridge1,addr=1.1 \

error:
vfio: Unable to enable AER for device 0000:06:00.0, parent bus does not 
support AER signaling

--------------------------------------------------------------------
ioh3420
    |- vfio-pci
    |- vfio-pci

-device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
-device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=bridge1,addr=1 \
-device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=bridge1,addr=1.1 \

OK
--------------------------------------------------------------------
...

Thanks,
Dou

>

[...]


>
Alex Williamson Sept. 22, 2016, 2:03 p.m. UTC | #4
On Thu, 22 Sep 2016 16:34:32 +0800
Dou Liyang <douly.fnst@cn.fujitsu.com> wrote:

> Hi Alex,
> 
> At 09/01/2016 03:56 AM, Alex Williamson wrote:
> > On Tue, 19 Jul 2016 15:38:23 +0800
> > Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
> >  
> >> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>
> >> When assigning a vfio device with AER enabled, we must check whether
> >> the device supports a host bus reset (ie. hot reset) as this may be
> >> used by the guest OS in order to recover the device from an AER
> >> error.  QEMU must therefore have the ability to perform a physical
> >> host bus reset using the existing vfio APIs in response to a virtual
> >> bus reset in the VM.  A physical bus reset affects all of the devices
> >> on the host bus, therefore we place a few simplifying configuration
> >> restriction on the VM:
> >>
> >>  - All physical devices affected by a bus reset must be assigned to
> >>    the VM with AER enabled on each and be configured on the same
> >>    virtual bus in the VM.
> >>
> >>  - No devices unaffected by the bus reset, be they physical, emulated,
> >>    or paravirtual may be configured on the same virtual bus as a
> >>    device supporting AER signaling through vfio.
> >>
> >> In other words users wishing to enable AER on a multifunction device
> >> need to assign all functions of the device to the same virtual bus
> >> and enable AER support for each device.  The easiest way to
> >> accomplish this is to identity map the physical functions to virtual
> >> functions with multifunction enabled on the virtual device.  
> >
> > Why am I able to start the following VM with aer=on for the vfio-pci
> > devices?  
> 
> In my opinion, because, when we check out this situation in vfio_init,
> our code just do nothing except going to out_teardown. so the devices 
> which are behind a PCIe-to-PCI bridge will be regarded as the PCI devices.
> 
> >
> > # lspci -tv
> > -[0000:00]-+-00.0  Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
> >            +-01.0  Device 1234:1111
> >            +-1c.0-[01]--
> >            +-1d.0-[02]--+-01.0  Intel Corporation 82576 Gigabit Network Connection
> >            |            \-01.1  Intel Corporation 82576 Gigabit Network Connection
> >            ...
> >
> > # lspci -vvv -s 1d.0
> > 00:1d.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge (prog-if 00 [Normal decode])
> >
> > The devices are behind a PCIe-to-PCI bridge, so shouldn't specifying
> > aer=on for the vfio-pci devices cause a configuration error?  
> 
> Yes, I think it may makes users confuse.
> 
> I added a error_report for it.

I think we need more than an error_report, our strategy to date has
been that if the user specifies aer=on and we cannot support AER on the
device then we fail the initialization of the device.  Eric's patches
to convert vfio to realize and overhaul the error handling might make
this easier to accomplish.  Thanks,

Alex

> 
> Then, I test many passible combinations, such as:
> 
> pci-bridge
>      |- vfio-pci
> 
> -device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d \
> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.3,addr=1 \
> 
> error:
> vfio: Unable to enable AER for device 0000:06:00.0, device does not 
> support AER signaling
> 
> --------------------------------------------------------------------
> 
> pci-bridge
>      |- vfio-pci
>      |- vfio-pci
> 
> -device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d \
> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.3,addr=1 \
> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=pci.3,addr=1.1 \
> 
> error:
> vfio: Unable to enable AER for device 0000:06:00.0, device does not 
> support AER signaling
> 
> --------------------------------------------------------------------
> ioh3420
>    |- pci-bridge
>        |- vfio-pci
>        |- vfio-pci
> 
> -device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
> -device pci-bridge,chassis_nr=3,id=pci.1,bus=bridge1,addr=0x1d \
> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.1,addr=1 \
> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=pci.1,addr=1.1 \
> 
> error:
> vfio: Unable to enable AER for device 0000:06:00.0, device does not 
> support AER signaling
> 
> --------------------------------------------------------------------
> just for test:
> pci-bridge
>    |- ioh3420
>        |- vfio-pci
>        |- vfio-pci
> -device pci-bridge,chassis_nr=3,id=pci.1,bus=pcie.0,addr=0x1d \
> -device ioh3420,bus=pci.1,addr=1c.0,port=1,id=bridge1,chassis=1 \
> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=bridge1,addr=1 \
> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=bridge1,addr=1.1 \
> 
> error:
> vfio: Unable to enable AER for device 0000:06:00.0, parent bus does not 
> support AER signaling
> 
> --------------------------------------------------------------------
> ioh3420
>     |- vfio-pci
>     |- vfio-pci
> 
> -device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=bridge1,addr=1 \
> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=bridge1,addr=1.1 \
> 
> OK
> --------------------------------------------------------------------
> ...
> 
> Thanks,
> Dou
> 
> >  
> 
> [...]
> 
> 
> >  
> 
> 
> 
>
Dou Liyang Sept. 22, 2016, 2:04 p.m. UTC | #5
Hi Alex,

At 09/01/2016 10:12 AM, Alex Williamson wrote:

[...]

>
> I had to move to a different system where I could actually inject an
> aer error and created a config similar to above but with the 82576
> ports downstream of the ioh3420 root port.  When I inject a malformed
> TLP uncorrectable error, my RHEL7.2 guest does this:
>
> [   35.995645] pcieport 0000:00:1c.0: AER: Multiple Uncorrected (Fatal) error received: id=0200
> [   35.998483] igb 0000:02:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Unaccessible, id=0200(Unregistered Agent ID)
> [   36.001965] igb 0000:02:00.0 enp2s0f0: PCIe link lost, device now detached
> [   36.015092] igb 0000:02:00.1 enp2s0f1: PCIe link lost, device now detached
> [   39.133185] igb 0000:02:00.0: enabling device (0000 -> 0002)
> [   40.071245] igb 0000:02:00.1: enabling device (0000 -> 0002)
> [   41.014451] BUG: unable to handle kernel paging request at 0000000000003818
> [   41.015969] IP: [<ffffffffa02b438d>] igb_configure_tx_ring+0x14d/0x280 [igb]
> [   41.017507] PGD 367e2067 PUD 7ae56067 PMD 0
> [   41.018497] Oops: 0002 [#1] SMP
> [   41.019242] Modules linked in: ip6t_rpfilter ip6t_REJECT ipt_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter snd_hda_codec_generic snd_hda_intel snd_hda_codec ppdev snd_hda_core snd_hwdep snd_seq snd_seq_device iTCO_wdt iTCO_vendor_support bochs_drm snd_pcm syscopyarea sysfillrect sysimgblt ttm virtio_balloon snd_timer snd igb drm_kms_helper soundcore ptp pps_core i2c_algo_bit i2c_i801 dca drm shpchp lpc_ich mfd_core pcspkr i2c_core parport_pc parport ip_tables xfs libcrc32c virtio_blk virtio_console virtio_net ahci libahci crc32c_intel serio_raw libata virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod
> [   41.040590] CPU: 0 PID: 29 Comm: kworker/0:1 Not tainted 3.10.0-327.el7.x86_64 #1
> [   41.042180] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014
> [   41.044635] Workqueue: events aer_isr
> [   41.045478] task: ffff880179435080 ti: ffff880179680000 task.ti: ffff880179680000
> [   41.047097] RIP: 0010:[<ffffffffa02b438d>]  [<ffffffffa02b438d>] igb_configure_tx_ring+0x14d/0x280 [igb]
> [   41.049151] RSP: 0018:ffff880179683bf8  EFLAGS: 00010246
> [   41.050260] RAX: 0000000000003818 RBX: 0000000000000000 RCX: 0000000000003818
> [   41.051747] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 00000000002896b3
> [   41.053268] RBP: ffff880179683c20 R08: 0000000001010100 R09: 00000000ffffffe7
> [   41.054730] R10: ffffea0001eb6100 R11: ffffffffa02afa31 R12: 0000000000000000
> [   41.056201] R13: ffff880035dbc8c0 R14: ffff880175d03f80 R15: 000000017716e000
> [   41.057673] FS:  0000000000000000(0000) GS:ffff88017fc00000(0000) knlGS:0000000000000000
> [   41.059337] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [   41.060548] CR2: 0000000000003818 CR3: 0000000178331000 CR4: 00000000000006f0
> [   41.062028] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   41.063534] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [   41.065025] Stack:
> [   41.065473]  ffff880035dbc8c0 ffff880035dbce70 0000000000000001 ffff880035dbc8c8
> [   41.067119]  ffff880035dbce70 ffff880179683c80 ffffffffa02b8a77 fefdf27269fb3cd8
> [   41.068781]  2009f9ee3386436f eb9e4e66756bbfdd 34002f8114a5d65f 9535990856231c4b
> [   41.094179] Call Trace:
> [   41.118688]  [<ffffffffa02b8a77>] igb_configure+0x267/0x450 [igb]
> [   41.144286]  [<ffffffffa02b94f1>] igb_up+0x21/0x1a0 [igb]
> [   41.170606]  [<ffffffffa02b96a7>] igb_io_resume+0x37/0x70 [igb]
> [   41.195846]  [<ffffffff813381e0>] ? pci_cleanup_aer_uncorrect_error_status+0x90/0x90
> [   41.221767]  [<ffffffff81338228>] report_resume+0x48/0x60
> [   41.246455]  [<ffffffff8131e359>] pci_walk_bus+0x79/0xa0
> [   41.270722]  [<ffffffff813381e0>] ? pci_cleanup_aer_uncorrect_error_status+0x90/0x90
> [   41.296747]  [<ffffffff813382f0>] broadcast_error_message+0xb0/0x100
> [   41.321552]  [<ffffffff81338509>] do_recovery+0x1c9/0x280
> [   41.345507]  [<ffffffff81338f58>] aer_isr+0x348/0x430
> [   41.368851]  [<ffffffff8109d5fb>] process_one_work+0x17b/0x470
> [   41.392157]  [<ffffffff8109e3cb>] worker_thread+0x11b/0x400
> [   41.416852]  [<ffffffff8109e2b0>] ? rescuer_thread+0x400/0x400
> [   41.441577]  [<ffffffff810a5aef>] kthread+0xcf/0xe0
> [   41.465029]  [<ffffffff810a5a20>] ? kthread_create_on_node+0x140/0x140
> [   41.488341]  [<ffffffff81645858>] ret_from_fork+0x58/0x90
> [   41.511247]  [<ffffffff810a5a20>] ? kthread_create_on_node+0x140/0x140
> [   41.535442] Code: c1 49 89 4e 30 49 8b 85 b8 05 00 00 48 85 c0 0f 84 39 01 00 00 81 c2 10 38 00 00 48 63 d2 48 01 d0 31 d2 89 10 49 8b 46 30 31 d2 <89> 10 41 8b 95 3c 06 00 00 b8 14 01 10 02 83 fa 05 74 0b 83 fa
> [   41.587718] RIP  [<ffffffffa02b438d>] igb_configure_tx_ring+0x14d/0x280 [igb]
> [   41.610872]  RSP <ffff880179683bf8>
> [   41.632301] CR2: 0000000000003818
>
> And then it reboots.  So what RAS improvement have we bought ourselves
> here?  What endpoints have you tested with this?  Which ones recovered
> reliably?  Thanks,

I am working on it.

The endpoints I used to test areļ¼š

-device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
-device vfio-pci,host=06:00.1,bus=bridge1,addr=00.1,id=net1,aer=true \
-device 
vfio-pci,host=06:00.0,bus=bridge1,addr=00.0,id=net0,aer=true,multifunction=on 
\

When I tested them, sometimes, my guest even be kernel panic.

And I found that the bug was related with the states of the devices.

when the enp1s0f1 was not up, which just like below:

ifconfig -a
...
enp1s0f1: flags=4098<BROADCAST,MULTICAST>  mtu 1500
         ether 00:1b:21:67:3b:bd  txqueuelen 1000  (Ethernet)
         RX packets 0  bytes 0 (0.0 B)
         RX errors 0  dropped 0  overruns 0  frame 0
         TX packets 0  bytes 0 (0.0 B)
         TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
         device memory 0xfe420000-fe43ffff
...

Then, I injected fatal/nonfatal error, it could recovered reliably.

But, when I executed this command to make the network up:

ifconfig enp1s0f1 up

dmesg:
...
[   34.109886] IPv6: ADDRCONF(NETDEV_UP): enp1s0f1: link is not ready
[   36.232118] igb 0000:01:00.1 enp1s0f1: igb: enp1s0f1 NIC Link is Up 
1000 Mbps Full Duplex, Flow Control: RX/TX
[   36.232397] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0f1: link becomes ready
...

ifconfig:
...
enp1s0f1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
...

Then, I did the tests, I got the kernel panic:

        [  910.702002] ---[ end trace 3cd6a977a579a0bd ]---
        [  910.702002] Kernel panic - not syncing: Fatal exception
        [  910.702002] Kernel Offset: 0x0 from 0xffffffff81000000 
(relocation range: 0xf
        fffffff80000000-0xffffffff9fffffff) 

        [  910.702002] ---[ end Kernel panic - not syncing: Fatal exception

----------------------

The bug happened in Guest with "PCIe link lost, device now detached",
but both Host and QEMU can reset the devices well.

At First, I guessed it might be a igb bug[1] and did many jobs on it.
But, it don't work.

Currently, I guess if I should make the devices non-working before QEMU
reset the devices ?
OR
I guess if I need to investigate the bug in igb driver ?

Now, I have no idea about it, Could you give me some advice?

[1]:http://www.gossamer-threads.com/lists/linux/kernel/2274363

Thanks,
Dou
Dou Liyang Sept. 22, 2016, 2:27 p.m. UTC | #6
Hi Alex,

At 09/22/2016 10:03 PM, Alex Williamson wrote:
> On Thu, 22 Sep 2016 16:34:32 +0800
> Dou Liyang <douly.fnst@cn.fujitsu.com> wrote:
>
>> Hi Alex,
>>
>> At 09/01/2016 03:56 AM, Alex Williamson wrote:
>>> On Tue, 19 Jul 2016 15:38:23 +0800
>>> Zhou Jie <zhoujie2011@cn.fujitsu.com> wrote:
>>>
>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>
>>>> When assigning a vfio device with AER enabled, we must check whether
>>>> the device supports a host bus reset (ie. hot reset) as this may be
>>>> used by the guest OS in order to recover the device from an AER
>>>> error.  QEMU must therefore have the ability to perform a physical
>>>> host bus reset using the existing vfio APIs in response to a virtual
>>>> bus reset in the VM.  A physical bus reset affects all of the devices
>>>> on the host bus, therefore we place a few simplifying configuration
>>>> restriction on the VM:
>>>>
>>>>  - All physical devices affected by a bus reset must be assigned to
>>>>    the VM with AER enabled on each and be configured on the same
>>>>    virtual bus in the VM.
>>>>
>>>>  - No devices unaffected by the bus reset, be they physical, emulated,
>>>>    or paravirtual may be configured on the same virtual bus as a
>>>>    device supporting AER signaling through vfio.
>>>>
>>>> In other words users wishing to enable AER on a multifunction device
>>>> need to assign all functions of the device to the same virtual bus
>>>> and enable AER support for each device.  The easiest way to
>>>> accomplish this is to identity map the physical functions to virtual
>>>> functions with multifunction enabled on the virtual device.
>>>
>>> Why am I able to start the following VM with aer=on for the vfio-pci
>>> devices?
>>
>> In my opinion, because, when we check out this situation in vfio_init,
>> our code just do nothing except going to out_teardown. so the devices
>> which are behind a PCIe-to-PCI bridge will be regarded as the PCI devices.
>>
>>>
>>> # lspci -tv
>>> -[0000:00]-+-00.0  Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
>>>            +-01.0  Device 1234:1111
>>>            +-1c.0-[01]--
>>>            +-1d.0-[02]--+-01.0  Intel Corporation 82576 Gigabit Network Connection
>>>            |            \-01.1  Intel Corporation 82576 Gigabit Network Connection
>>>            ...
>>>
>>> # lspci -vvv -s 1d.0
>>> 00:1d.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge (prog-if 00 [Normal decode])
>>>
>>> The devices are behind a PCIe-to-PCI bridge, so shouldn't specifying
>>> aer=on for the vfio-pci devices cause a configuration error?
>>
>> Yes, I think it may makes users confuse.
>>
>> I added a error_report for it.
>
> I think we need more than an error_report, our strategy to date has
> been that if the user specifies aer=on and we cannot support AER on the
> device then we fail the initialization of the device.

Oh, I am sorry, I did not say clearly. I also made the init failure by
"return -EINVAL".

  Eric's patches
> to convert vfio to realize and overhaul the error handling might make
> this easier to accomplish.  Thanks,

Yes, I saw  Eric's patches too.

I will follow Eric's way.

Thanks very much.
Dou


>
> Alex
>
>>
>> Then, I test many passible combinations, such as:
>>
>> pci-bridge
>>      |- vfio-pci
>>
>> -device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d \
>> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.3,addr=1 \
>>
>> error:
>> vfio: Unable to enable AER for device 0000:06:00.0, device does not
>> support AER signaling
>>
>> --------------------------------------------------------------------
>>
>> pci-bridge
>>      |- vfio-pci
>>      |- vfio-pci
>>
>> -device pci-bridge,chassis_nr=3,id=pci.3,bus=pcie.0,addr=0x1d \
>> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.3,addr=1 \
>> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=pci.3,addr=1.1 \
>>
>> error:
>> vfio: Unable to enable AER for device 0000:06:00.0, device does not
>> support AER signaling
>>
>> --------------------------------------------------------------------
>> ioh3420
>>    |- pci-bridge
>>        |- vfio-pci
>>        |- vfio-pci
>>
>> -device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
>> -device pci-bridge,chassis_nr=3,id=pci.1,bus=bridge1,addr=0x1d \
>> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=pci.1,addr=1 \
>> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=pci.1,addr=1.1 \
>>
>> error:
>> vfio: Unable to enable AER for device 0000:06:00.0, device does not
>> support AER signaling
>>
>> --------------------------------------------------------------------
>> just for test:
>> pci-bridge
>>    |- ioh3420
>>        |- vfio-pci
>>        |- vfio-pci
>> -device pci-bridge,chassis_nr=3,id=pci.1,bus=pcie.0,addr=0x1d \
>> -device ioh3420,bus=pci.1,addr=1c.0,port=1,id=bridge1,chassis=1 \
>> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=bridge1,addr=1 \
>> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=bridge1,addr=1.1 \
>>
>> error:
>> vfio: Unable to enable AER for device 0000:06:00.0, parent bus does not
>> support AER signaling
>>
>> --------------------------------------------------------------------
>> ioh3420
>>     |- vfio-pci
>>     |- vfio-pci
>>
>> -device ioh3420,bus=pcie.0,addr=1c.0,port=1,id=bridge1,chassis=1 \
>> -device vfio-pci,aer=on,host=06:00.0,id=hostdev0,bus=bridge1,addr=1 \
>> -device vfio-pci,aer=on,host=06:00.1,id=hostdev1,bus=bridge1,addr=1.1 \
>>
>> OK
>> --------------------------------------------------------------------
>> ...
>>
>> Thanks,
>> Dou
>>
>>>
>>
>> [...]
>>
>>
>>>
>>
>>
>>
>>
>
>
>
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 21fd801..242c1e4 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1693,6 +1693,42 @@  static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
     }
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+    if (strlen(name) != 12 ||
+        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+               &addr->bus, &addr->slot, &addr->function) != 4) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+{
+    PCIHostDeviceAddress tmp;
+
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
+
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot && tmp.function == addr->function);
+}
+
+static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr,
+                                     const char *name)
+{
+    PCIHostDeviceAddress tmp;
+
+    if (vfio_pci_name_to_addr(name, &tmp)) {
+        return false;
+    }
+
+    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+            tmp.slot == addr->slot);
+}
+
 /*
  * return negative with errno, return 0 on success.
  * if success, the point of ret_info fill with the affected device reset info.
@@ -1854,6 +1890,200 @@  static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
     return 0;
 }
 
+static int vfio_device_range_limit(PCIBus *bus)
+{
+    PCIDevice *br;
+
+    br = pci_bridge_get_device(bus);
+    if (!br ||
+        !pci_is_express(br) ||
+        !(br->exp.exp_cap) ||
+        pcie_cap_is_arifwd_enabled(br)) {
+        return 255;
+    }
+
+    return 8;
+}
+
+static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
+{
+    PCIBus *bus = vdev->pdev.bus;
+    struct vfio_pci_hot_reset_info *info = NULL;
+    struct vfio_pci_dependent_device *devices;
+    VFIOGroup *group;
+    int ret, i, devfn, range_limit;
+
+    ret = vfio_get_hot_reset_info(vdev, &info);
+    if (ret) {
+        error_setg(errp, "vfio: Cannot enable AER for device %s,"
+                   " device does not support hot reset.",
+                   vdev->vbasedev.name);
+        return;
+    }
+
+    /* List all affected devices by bus reset */
+    devices = &info->devices[0];
+
+    /* Verify that we have all the groups required */
+    for (i = 0; i < info->count; i++) {
+        PCIHostDeviceAddress host;
+        VFIOPCIDevice *tmp;
+        VFIODevice *vbasedev_iter;
+        bool found = false;
+
+        host.domain = devices[i].segment;
+        host.bus = devices[i].bus;
+        host.slot = PCI_SLOT(devices[i].devfn);
+        host.function = PCI_FUNC(devices[i].devfn);
+
+        /* Skip the current device */
+        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+            continue;
+        }
+
+        /* Ensure we own the group of the affected device */
+        QLIST_FOREACH(group, &vfio_group_list, next) {
+            if (group->groupid == devices[i].group_id) {
+                break;
+            }
+        }
+
+        if (!group) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                       "depends on group %d which is not owned.",
+                       vdev->vbasedev.name, devices[i].group_id);
+            goto out;
+        }
+
+        /* Ensure affected devices for reset on the same bus */
+        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+                /*
+                 * AER errors may be broadcast to all functions of a multi-
+                 * function endpoint.  If any of those sibling functions are
+                 * also assigned, they need to have AER enabled or else an
+                 * error may continue to cause a vm_stop condition.  IOW,
+                 * AER setup of this function would be pointless.
+                 */
+                if (vfio_pci_host_match_slot(&host, vdev->vbasedev.name) &&
+                    !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
+                    error_setg(errp, "vfio: Cannot enable AER for device %s,"
+                               " on same slot the dependent device %s which"
+                               " does not enable AER.",
+                               vdev->vbasedev.name, tmp->vbasedev.name);
+                    goto out;
+                }
+
+                if (tmp->pdev.bus != bus) {
+                    error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                               "the dependent device %s is not on the same bus",
+                               vdev->vbasedev.name, tmp->vbasedev.name);
+                    goto out;
+                }
+                found = true;
+                break;
+            }
+        }
+
+        /* Ensure all affected devices assigned to VM */
+        if (!found) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, "
+                       "the dependent device %04x:%02x:%02x.%x "
+                       "is not assigned to VM.",
+                       vdev->vbasedev.name, host.domain, host.bus,
+                       host.slot, host.function);
+            goto out;
+        }
+    }
+
+    /*
+     * The above code verified that all devices affected by a bus reset
+     * exist on the same bus in the VM.  To further simplify, we also
+     * require that there are no additional devices beyond those existing on
+     * the VM bus.
+     */
+    range_limit = vfio_device_range_limit(bus);
+    for (devfn = 0; devfn < range_limit; devfn++) {
+        VFIOPCIDevice *tmp;
+        PCIDevice *dev;
+        bool found = false;
+
+        dev = pci_find_device(bus, pci_bus_num(bus),
+                  PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn));
+
+        if (!dev) {
+            continue;
+        }
+
+        if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, device"
+                             " %s: slot %d function%d cannot be configured"
+                             " on the same virtual bus",
+                             vdev->vbasedev.name, dev->name,
+                             PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+            goto out;
+        }
+
+        tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+        for (i = 0; i < info->count; i++) {
+            PCIHostDeviceAddress host;
+
+            host.domain = devices[i].segment;
+            host.bus = devices[i].bus;
+            host.slot = PCI_SLOT(devices[i].devfn);
+            host.function = PCI_FUNC(devices[i].devfn);
+
+            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found) {
+            error_setg(errp, "vfio: Cannot enable AER for device %s, affected"
+                             " device %s does not be configured on the same"
+                             " virtual bus",
+                       vdev->vbasedev.name, tmp->vbasedev.name);
+            goto out;
+        }
+    }
+
+out:
+    g_free(info);
+    return;
+}
+
+static void vfio_aer_check_host_bus_reset(Error **errp)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+    VFIOPCIDevice *vdev;
+    Error *local_err = NULL;
+
+    /* Check All vfio-pci devices if have bus reset capability */
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+                vfio_check_hot_bus_reset(vdev, &local_err);
+                if (local_err) {
+                    error_propagate(errp, local_err);
+                    return;
+                }
+            }
+        }
+    }
+
+    return;
+}
+
 static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
                           int pos, uint16_t size)
 {
@@ -2060,29 +2290,6 @@  static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
     vfio_intx_enable(vdev);
 }
 
-static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
-{
-    if (strlen(name) != 12 ||
-        sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
-               &addr->bus, &addr->slot, &addr->function) != 4) {
-        return -EINVAL;
-    }
-
-    return 0;
-}
-
-static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
-{
-    PCIHostDeviceAddress tmp;
-
-    if (vfio_pci_name_to_addr(name, &tmp)) {
-        return false;
-    }
-
-    return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
-            tmp.slot == addr->slot && tmp.function == addr->function);
-}
-
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
     VFIOGroup *group;
@@ -2608,6 +2815,22 @@  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
+{
+    Error *local_err = NULL;
+
+    vfio_aer_check_host_bus_reset(&local_err);
+    if (local_err) {
+        fprintf(stderr, "%s\n", error_get_pretty(local_err));
+        error_free(local_err);
+        exit(1);
+    }
+}
+
+static Notifier machine_notifier = {
+    .notify = vfio_pci_machine_done_notify,
+};
+
 static int vfio_initfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
@@ -3000,6 +3223,15 @@  static const TypeInfo vfio_pci_dev_info = {
 static void register_vfio_pci_dev_type(void)
 {
     type_register_static(&vfio_pci_dev_info);
+
+    /*
+     * The AER configuration may depend on multiple devices, so we cannot
+     * validate consistency after each device is initialized.  We can only
+     * depend on function initialization order (function 0 last) for hotplug
+     * devices, therefore a machine-init-done notifier is used to validate
+     * the configuration after all cold-plug devices are processed.
+     */
+     qemu_add_machine_init_done_notifier(&machine_notifier);
 }
 
 type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 5483044..e2faffb 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@ 
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"