Patchwork vfio-pci: Add KVM INTx acceleration

login
register
mail settings
Submitter Alex Williamson
Date Oct. 15, 2012, 8:28 p.m.
Message ID <20121015202031.23323.72827.stgit@bling.home>
Download mbox | patch
Permalink /patch/191662/
State New
Headers show

Comments

Alex Williamson - Oct. 15, 2012, 8:28 p.m.
This makes use of the new level irqfd support enabling bypass of
qemu userspace both on INTx injection and unmask.  This significantly
boosts the performance of devices making use of legacy interrupts.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

My INTx routing workaround below will probably raise some eyebrows,
but I don't feel it's worth subjecting users to core dumps if they
want to try vfio-pci on new platforms.  INTx routing is part of some
larger plan, but until that plan materializes we have to try to avoid
the API unless we think there's a good chance it might be there.
I'll accept the maintenance of updating a whitelist in the interim.
Thanks,

Alex

 hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
Michael S. Tsirkin - Oct. 16, 2012, 6:39 a.m.
On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> This makes use of the new level irqfd support enabling bypass of
> qemu userspace both on INTx injection and unmask.  This significantly
> boosts the performance of devices making use of legacy interrupts.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> My INTx routing workaround below will probably raise some eyebrows,
> but I don't feel it's worth subjecting users to core dumps if they
> want to try vfio-pci on new platforms.  INTx routing is part of some
> larger plan, but until that plan materializes we have to try to avoid
> the API unless we think there's a good chance it might be there.
> I'll accept the maintenance of updating a whitelist in the interim.
> Thanks,
> 
> Alex
> 
>  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 224 insertions(+)
> 
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 639371e..777a5f8 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
>  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
>  
>  /*
> + * PCI code refuses to make it possible to probe whether the chipset
> + * supports pci_device_route_intx_to_irq() and booby traps the call
> + * to assert if doesn't.  For us, this is just an optimization, so
> + * only enable it when we know it's present.  Unfortunately PCIBus is
> + * private, so we can't just look at the function pointer.
> + */
> +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> +{
> +#ifdef CONFIG_KVM
> +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> +    DeviceState *dev;
> +
> +    if (!kvm_irqchip_in_kernel() ||
> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> +	return false;
> +    }


Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
Also for KVM_IRQFD_FLAG_RESAMPLE.

> +
> +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> +
> +        dev = bus->parent;
> +
> +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> +            return true;
> +        }
> +    }
> +
> +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> +                 "using slow INTx mode\n");

When does this code trigger? It seems irqchip implies piix ATM -
is this just dead code?


> +#endif
> +    return false;
> +}
> +
> +static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
> +{
> +    if (!vfio_pci_bus_has_intx_route(pdev)) {
> +        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
> +    }
> +
> +    return pci_device_route_intx_to_irq(pdev, pin);
> +}
> +
> +static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
> +{
> +    return old->mode != new->mode || old->irq != new->irq;
> +}
> +

Didn't you add an API for this? It's on pci branch but I can drop
it if not needed.

> +/*
>   * Common VFIO interrupt disable
>   */
>  static void vfio_disable_irqindex(VFIODevice *vdev, int index)
> @@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
>      ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
>  }
>  
> +#ifdef CONFIG_KVM
> +static void vfio_mask_intx(VFIODevice *vdev)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> +        .index = VFIO_PCI_INTX_IRQ_INDEX,
> +        .start = 0,
> +        .count = 1,
> +    };
> +
> +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +}
> +#endif
> +
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
>   * also be a huge overhead.  We try to get the best of both worlds by
> @@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev)
>      vfio_unmask_intx(vdev);
>  }
>  
> +static void vfio_enable_intx_kvm(VFIODevice *vdev)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_irqfd irqfd = {
> +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> +        .gsi = vdev->intx.route.irq,
> +        .flags = KVM_IRQFD_FLAG_RESAMPLE,


Should not kvm ioctl handling be localized in kvm-all.c?
E.g. extend kvm_irqchip_add_irqfd_notifier in
some way? Same question for KVM_CAP_IRQFD_RESAMPLE use above ...


> +    };
> +    struct vfio_irq_set *irq_set;
> +    int ret, argsz;
> +    int32_t *pfd;
> +
> +    if (!kvm_irqchip_in_kernel() ||
> +        vdev->intx.route.mode != PCI_INTX_ENABLED ||
> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> +        return;
> +    }
> +
> +    /* Get to a known interrupt state */
> +    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
> +    vfio_mask_intx(vdev);
> +    vdev->intx.pending = false;
> +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> +
> +    /* Get an eventfd for resample/unmask */
> +    if (event_notifier_init(&vdev->intx.unmask, 0)) {
> +        error_report("vfio: Error: event_notifier_init failed eoi\n");
> +        goto fail;
> +    }
> +
> +    /* KVM triggers it, VFIO listens for it */
> +    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
> +
> +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> +        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
> +        goto fail_irqfd;
> +    }
> +
> +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> +
> +    irq_set = g_malloc0(argsz);
> +    irq_set->argsz = argsz;
> +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
> +    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> +    irq_set->start = 0;
> +    irq_set->count = 1;
> +    pfd = (int32_t *)&irq_set->data;
> +
> +    *pfd = irqfd.resamplefd;
> +
> +    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    g_free(irq_set);
> +    if (ret) {
> +        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
> +        goto fail_vfio;
> +    }
> +
> +    /* Let'em rip */
> +    vfio_unmask_intx(vdev);
> +
> +    vdev->intx.kvm_accel = true;
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
> +            __func__, vdev->host.domain, vdev->host.bus,
> +            vdev->host.slot, vdev->host.function);
> +
> +    return;
> +
> +fail_vfio:
> +    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
> +    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
> +fail_irqfd:
> +    event_notifier_cleanup(&vdev->intx.unmask);
> +fail:
> +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> +    vfio_unmask_intx(vdev);
> +#endif
> +}
> +
> +static void vfio_disable_intx_kvm(VFIODevice *vdev)
> +{
> +#ifdef CONFIG_KVM
> +    struct kvm_irqfd irqfd = {
> +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> +        .gsi = vdev->intx.route.irq,
> +        .flags = KVM_IRQFD_FLAG_DEASSIGN,
> +    };
> +
> +    if (!vdev->intx.kvm_accel) {
> +        return;
> +    }
> +
> +    /*
> +     * Get to a known state, hardware masked, QEMU ready to accept new
> +     * interrupts, QEMU IRQ de-asserted.
> +     */
> +    vfio_mask_intx(vdev);
> +    vdev->intx.pending = false;
> +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> +
> +    /* Tell KVM to stop listening for an INTx irqfd */
> +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> +        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
> +    }
> +
> +    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
> +    event_notifier_cleanup(&vdev->intx.unmask);
> +
> +    /* QEMU starts listening for interrupt events. */
> +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> +
> +    vdev->intx.kvm_accel = false;
> +
> +    /* If we've missed an event, let it re-fire through QEMU */
> +    vfio_unmask_intx(vdev);
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
> +            __func__, vdev->host.domain, vdev->host.bus,
> +            vdev->host.slot, vdev->host.function);
> +#endif
> +}
> +
> +static void vfio_update_irq(PCIDevice *pdev)
> +{
> +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> +    PCIINTxRoute route;
> +
> +    if (vdev->interrupt != VFIO_INT_INTx) {
> +        return;
> +    }
> +
> +    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> +
> +    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
> +        return; /* Nothing changed */
> +    }
> +
> +    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
> +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> +            vdev->host.function, vdev->intx.route.irq, route.irq);
> +
> +    vfio_disable_intx_kvm(vdev);
> +
> +    vdev->intx.route = route;
> +
> +    if (route.mode != PCI_INTX_ENABLED) {
> +        return;
> +    }
> +
> +    vfio_enable_intx_kvm(vdev);
> +
> +    /* Re-enable the interrupt in cased we missed an EOI */
> +    vfio_eoi(vdev);
> +}
> +
>  static int vfio_enable_intx(VFIODevice *vdev)
>  {
>      uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> @@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev)
>      vfio_disable_interrupts(vdev);
>  
>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> +    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
> +                                                         vdev->intx.pin);
> +
>      ret = event_notifier_init(&vdev->intx.interrupt, 0);
>      if (ret) {
>          error_report("vfio: Error: event_notifier_init failed\n");
> @@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
>          return -errno;
>      }
>  
> +    vfio_enable_intx_kvm(vdev);
> +
>      vdev->interrupt = VFIO_INT_INTx;
>  
>      DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> @@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
>      int fd;
>  
>      qemu_del_timer(vdev->intx.mmap_timer);
> +    vfio_disable_intx_kvm(vdev);
>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>      vdev->intx.pending = false;
>      qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> @@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev)
>      if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
>          vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
>                                                    vfio_intx_mmap_enable, vdev);
> +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
>          ret = vfio_enable_intx(vdev);
>          if (ret) {
>              goto out_teardown;
Alex Williamson - Oct. 16, 2012, 1:51 p.m.
On Tue, 2012-10-16 at 08:39 +0200, Michael S. Tsirkin wrote:
> On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> > This makes use of the new level irqfd support enabling bypass of
> > qemu userspace both on INTx injection and unmask.  This significantly
> > boosts the performance of devices making use of legacy interrupts.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> > My INTx routing workaround below will probably raise some eyebrows,
> > but I don't feel it's worth subjecting users to core dumps if they
> > want to try vfio-pci on new platforms.  INTx routing is part of some
> > larger plan, but until that plan materializes we have to try to avoid
> > the API unless we think there's a good chance it might be there.
> > I'll accept the maintenance of updating a whitelist in the interim.
> > Thanks,
> > 
> > Alex
> > 
> >  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 224 insertions(+)
> > 
> > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> > index 639371e..777a5f8 100644
> > --- a/hw/vfio_pci.c
> > +++ b/hw/vfio_pci.c
> > @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> >  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
> >  
> >  /*
> > + * PCI code refuses to make it possible to probe whether the chipset
> > + * supports pci_device_route_intx_to_irq() and booby traps the call
> > + * to assert if doesn't.  For us, this is just an optimization, so
> > + * only enable it when we know it's present.  Unfortunately PCIBus is
> > + * private, so we can't just look at the function pointer.
> > + */
> > +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> > +{
> > +#ifdef CONFIG_KVM
> > +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> > +    DeviceState *dev;
> > +
> > +    if (!kvm_irqchip_in_kernel() ||
> > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > +	return false;
> > +    }
> 
> 
> Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
> Also for KVM_IRQFD_FLAG_RESAMPLE.

I posted the patch for that separately yesterday.  I'll only request a
pull once that's in.

> > +
> > +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> > +
> > +        dev = bus->parent;
> > +
> > +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> > +            return true;
> > +        }
> > +    }
> > +
> > +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> > +                 "using slow INTx mode\n");
> 
> When does this code trigger? It seems irqchip implies piix ATM -
> is this just dead code?

Unused, but not unnecessary.  Another chipset is under development,
which means very quickly irqchip will not imply piix.  Likewise irqfd
support is being added to other architectures, so I don't know how long
the kvm specific tests will hold up.  Testing for a specific chipset
could of course be avoided if we were willing to support:

bool pci_device_intx_route_supported(PCIDevice *pdev)

or the NOROUTE option I posted previously.

> > +#endif
> > +    return false;
> > +}
> > +
> > +static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
> > +{
> > +    if (!vfio_pci_bus_has_intx_route(pdev)) {
> > +        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
> > +    }
> > +
> > +    return pci_device_route_intx_to_irq(pdev, pin);
> > +}
> > +
> > +static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
> > +{
> > +    return old->mode != new->mode || old->irq != new->irq;
> > +}
> > +
> 
> Didn't you add an API for this? It's on pci branch but I can drop
> it if not needed.

I did and I'll switch to it when available, but I have no idea when that
will be, so I've hedged my bets by re-implementing it here.  2 week+
turnover for a patch makes it difficult to coordinate dependent changes
on short qemu release cycles.

> > +/*
> >   * Common VFIO interrupt disable
> >   */
> >  static void vfio_disable_irqindex(VFIODevice *vdev, int index)
> > @@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
> >      ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> >  }
> >  
> > +#ifdef CONFIG_KVM
> > +static void vfio_mask_intx(VFIODevice *vdev)
> > +{
> > +    struct vfio_irq_set irq_set = {
> > +        .argsz = sizeof(irq_set),
> > +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> > +        .index = VFIO_PCI_INTX_IRQ_INDEX,
> > +        .start = 0,
> > +        .count = 1,
> > +    };
> > +
> > +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> > +}
> > +#endif
> > +
> >  /*
> >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> >   * also be a huge overhead.  We try to get the best of both worlds by
> > @@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev)
> >      vfio_unmask_intx(vdev);
> >  }
> >  
> > +static void vfio_enable_intx_kvm(VFIODevice *vdev)
> > +{
> > +#ifdef CONFIG_KVM
> > +    struct kvm_irqfd irqfd = {
> > +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> > +        .gsi = vdev->intx.route.irq,
> > +        .flags = KVM_IRQFD_FLAG_RESAMPLE,
> 
> 
> Should not kvm ioctl handling be localized in kvm-all.c?
> E.g. extend kvm_irqchip_add_irqfd_notifier in
> some way? Same question for KVM_CAP_IRQFD_RESAMPLE use above ...

Possibly, I took a survey of existing code and found examples of both.
Therefore I decided to host it myself first and push it out to common
helpers later, which will also help me get rid of the #ifdefs here.
Thanks,

Alex

> > +    };
> > +    struct vfio_irq_set *irq_set;
> > +    int ret, argsz;
> > +    int32_t *pfd;
> > +
> > +    if (!kvm_irqchip_in_kernel() ||
> > +        vdev->intx.route.mode != PCI_INTX_ENABLED ||
> > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > +        return;
> > +    }
> > +
> > +    /* Get to a known interrupt state */
> > +    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
> > +    vfio_mask_intx(vdev);
> > +    vdev->intx.pending = false;
> > +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > +
> > +    /* Get an eventfd for resample/unmask */
> > +    if (event_notifier_init(&vdev->intx.unmask, 0)) {
> > +        error_report("vfio: Error: event_notifier_init failed eoi\n");
> > +        goto fail;
> > +    }
> > +
> > +    /* KVM triggers it, VFIO listens for it */
> > +    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
> > +
> > +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> > +        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
> > +        goto fail_irqfd;
> > +    }
> > +
> > +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> > +
> > +    irq_set = g_malloc0(argsz);
> > +    irq_set->argsz = argsz;
> > +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
> > +    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> > +    irq_set->start = 0;
> > +    irq_set->count = 1;
> > +    pfd = (int32_t *)&irq_set->data;
> > +
> > +    *pfd = irqfd.resamplefd;
> > +
> > +    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
> > +    g_free(irq_set);
> > +    if (ret) {
> > +        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
> > +        goto fail_vfio;
> > +    }
> > +
> > +    /* Let'em rip */
> > +    vfio_unmask_intx(vdev);
> > +
> > +    vdev->intx.kvm_accel = true;
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
> > +            __func__, vdev->host.domain, vdev->host.bus,
> > +            vdev->host.slot, vdev->host.function);
> > +
> > +    return;
> > +
> > +fail_vfio:
> > +    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
> > +    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
> > +fail_irqfd:
> > +    event_notifier_cleanup(&vdev->intx.unmask);
> > +fail:
> > +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> > +    vfio_unmask_intx(vdev);
> > +#endif
> > +}
> > +
> > +static void vfio_disable_intx_kvm(VFIODevice *vdev)
> > +{
> > +#ifdef CONFIG_KVM
> > +    struct kvm_irqfd irqfd = {
> > +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> > +        .gsi = vdev->intx.route.irq,
> > +        .flags = KVM_IRQFD_FLAG_DEASSIGN,
> > +    };
> > +
> > +    if (!vdev->intx.kvm_accel) {
> > +        return;
> > +    }
> > +
> > +    /*
> > +     * Get to a known state, hardware masked, QEMU ready to accept new
> > +     * interrupts, QEMU IRQ de-asserted.
> > +     */
> > +    vfio_mask_intx(vdev);
> > +    vdev->intx.pending = false;
> > +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > +
> > +    /* Tell KVM to stop listening for an INTx irqfd */
> > +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> > +        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
> > +    }
> > +
> > +    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
> > +    event_notifier_cleanup(&vdev->intx.unmask);
> > +
> > +    /* QEMU starts listening for interrupt events. */
> > +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> > +
> > +    vdev->intx.kvm_accel = false;
> > +
> > +    /* If we've missed an event, let it re-fire through QEMU */
> > +    vfio_unmask_intx(vdev);
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
> > +            __func__, vdev->host.domain, vdev->host.bus,
> > +            vdev->host.slot, vdev->host.function);
> > +#endif
> > +}
> > +
> > +static void vfio_update_irq(PCIDevice *pdev)
> > +{
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    PCIINTxRoute route;
> > +
> > +    if (vdev->interrupt != VFIO_INT_INTx) {
> > +        return;
> > +    }
> > +
> > +    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> > +
> > +    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
> > +        return; /* Nothing changed */
> > +    }
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function, vdev->intx.route.irq, route.irq);
> > +
> > +    vfio_disable_intx_kvm(vdev);
> > +
> > +    vdev->intx.route = route;
> > +
> > +    if (route.mode != PCI_INTX_ENABLED) {
> > +        return;
> > +    }
> > +
> > +    vfio_enable_intx_kvm(vdev);
> > +
> > +    /* Re-enable the interrupt in cased we missed an EOI */
> > +    vfio_eoi(vdev);
> > +}
> > +
> >  static int vfio_enable_intx(VFIODevice *vdev)
> >  {
> >      uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> > @@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev)
> >      vfio_disable_interrupts(vdev);
> >  
> >      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> > +    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
> > +                                                         vdev->intx.pin);
> > +
> >      ret = event_notifier_init(&vdev->intx.interrupt, 0);
> >      if (ret) {
> >          error_report("vfio: Error: event_notifier_init failed\n");
> > @@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
> >          return -errno;
> >      }
> >  
> > +    vfio_enable_intx_kvm(vdev);
> > +
> >      vdev->interrupt = VFIO_INT_INTx;
> >  
> >      DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> > @@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
> >      int fd;
> >  
> >      qemu_del_timer(vdev->intx.mmap_timer);
> > +    vfio_disable_intx_kvm(vdev);
> >      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
> >      vdev->intx.pending = false;
> >      qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > @@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev)
> >      if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> >          vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
> >                                                    vfio_intx_mmap_enable, vdev);
> > +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> >          ret = vfio_enable_intx(vdev);
> >          if (ret) {
> >              goto out_teardown;
Michael S. Tsirkin - Oct. 16, 2012, 2:14 p.m.
On Tue, Oct 16, 2012 at 07:51:43AM -0600, Alex Williamson wrote:
> On Tue, 2012-10-16 at 08:39 +0200, Michael S. Tsirkin wrote:
> > On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> > > This makes use of the new level irqfd support enabling bypass of
> > > qemu userspace both on INTx injection and unmask.  This significantly
> > > boosts the performance of devices making use of legacy interrupts.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > > My INTx routing workaround below will probably raise some eyebrows,
> > > but I don't feel it's worth subjecting users to core dumps if they
> > > want to try vfio-pci on new platforms.  INTx routing is part of some
> > > larger plan, but until that plan materializes we have to try to avoid
> > > the API unless we think there's a good chance it might be there.
> > > I'll accept the maintenance of updating a whitelist in the interim.
> > > Thanks,
> > > 
> > > Alex
> > > 
> > >  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 224 insertions(+)
> > > 
> > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> > > index 639371e..777a5f8 100644
> > > --- a/hw/vfio_pci.c
> > > +++ b/hw/vfio_pci.c
> > > @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > >  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
> > >  
> > >  /*
> > > + * PCI code refuses to make it possible to probe whether the chipset
> > > + * supports pci_device_route_intx_to_irq() and booby traps the call
> > > + * to assert if doesn't.  For us, this is just an optimization, so
> > > + * only enable it when we know it's present.  Unfortunately PCIBus is
> > > + * private, so we can't just look at the function pointer.
> > > + */
> > > +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> > > +{
> > > +#ifdef CONFIG_KVM
> > > +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> > > +    DeviceState *dev;
> > > +
> > > +    if (!kvm_irqchip_in_kernel() ||
> > > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > > +	return false;
> > > +    }
> > 
> > 
> > Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
> > Also for KVM_IRQFD_FLAG_RESAMPLE.
> 
> I posted the patch for that separately yesterday.  I'll only request a
> pull once that's in.

OK missed that. In the future, might be a good idea to note dependencies
in the patchset or repost them as part of patchset with appropriate
tagging.

> > > +
> > > +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> > > +
> > > +        dev = bus->parent;
> > > +
> > > +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> > > +            return true;
> > > +        }
> > > +    }
> > > +
> > > +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> > > +                 "using slow INTx mode\n");
> > 
> > When does this code trigger? It seems irqchip implies piix ATM -
> > is this just dead code?
> 
> Unused, but not unnecessary.  Another chipset is under development,
> which means very quickly irqchip will not imply piix.

So this is for purposes of an out of tree stuff, let's
keep these hacks out of tree too. My guess is
q35 can just be added with pci_device_route_intx straight away.

>  Likewise irqfd
> support is being added to other architectures, so I don't know how long
> the kvm specific tests will hold up.  Testing for a specific chipset
> could of course be avoided if we were willing to support:
> 
> bool pci_device_intx_route_supported(PCIDevice *pdev)
> 
> or the NOROUTE option I posted previously.

This is just moving the pain to users who will
get bad performance and will have to debug it. Injecting
through userspace is slow, new architectures should
simply add irqfd straight away instead of adding
work arounds in userspace.

This is exactly why we have the assert in pci core.

> > > +#endif
> > > +    return false;
> > > +}
> > > +
> > > +static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
> > > +{
> > > +    if (!vfio_pci_bus_has_intx_route(pdev)) {
> > > +        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
> > > +    }
> > > +
> > > +    return pci_device_route_intx_to_irq(pdev, pin);
> > > +}
> > > +
> > > +static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
> > > +{
> > > +    return old->mode != new->mode || old->irq != new->irq;
> > > +}
> > > +
> > 
> > Didn't you add an API for this? It's on pci branch but I can drop
> > it if not needed.
> 
> I did and I'll switch to it when available, but I have no idea when that
> will be, so I've hedged my bets by re-implementing it here.  2 week+
> turnover for a patch makes it difficult to coordinate dependent changes
> on short qemu release cycles.

It's available on pci branch, please base on that instead of master.
Yes I merge at about 2 week intervals but the point is using
subtrees. You do not need to block waiting for stuff to land
in master.

> > > +/*
> > >   * Common VFIO interrupt disable
> > >   */
> > >  static void vfio_disable_irqindex(VFIODevice *vdev, int index)
> > > @@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
> > >      ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> > >  }
> > >  
> > > +#ifdef CONFIG_KVM
> > > +static void vfio_mask_intx(VFIODevice *vdev)
> > > +{
> > > +    struct vfio_irq_set irq_set = {
> > > +        .argsz = sizeof(irq_set),
> > > +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> > > +        .index = VFIO_PCI_INTX_IRQ_INDEX,
> > > +        .start = 0,
> > > +        .count = 1,
> > > +    };
> > > +
> > > +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> > > +}
> > > +#endif
> > > +
> > >  /*
> > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > >   * also be a huge overhead.  We try to get the best of both worlds by
> > > @@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev)
> > >      vfio_unmask_intx(vdev);
> > >  }
> > >  
> > > +static void vfio_enable_intx_kvm(VFIODevice *vdev)
> > > +{
> > > +#ifdef CONFIG_KVM
> > > +    struct kvm_irqfd irqfd = {
> > > +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> > > +        .gsi = vdev->intx.route.irq,
> > > +        .flags = KVM_IRQFD_FLAG_RESAMPLE,
> > 
> > 
> > Should not kvm ioctl handling be localized in kvm-all.c?
> > E.g. extend kvm_irqchip_add_irqfd_notifier in
> > some way? Same question for KVM_CAP_IRQFD_RESAMPLE use above ...
> 
> Possibly, I took a survey of existing code and found examples of both.

These are bugs really - this prevents adding a device to libhw
which slows build for everyone. So pls do not copy such examples.

> Therefore I decided to host it myself first and push it out to common
> helpers later, which will also help me get rid of the #ifdefs here.
> Thanks,
> 
> Alex

I think it is better to put it in the final place straight away.

> > > +    };
> > > +    struct vfio_irq_set *irq_set;
> > > +    int ret, argsz;
> > > +    int32_t *pfd;
> > > +
> > > +    if (!kvm_irqchip_in_kernel() ||
> > > +        vdev->intx.route.mode != PCI_INTX_ENABLED ||
> > > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > > +        return;
> > > +    }
> > > +
> > > +    /* Get to a known interrupt state */
> > > +    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
> > > +    vfio_mask_intx(vdev);
> > > +    vdev->intx.pending = false;
> > > +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > > +
> > > +    /* Get an eventfd for resample/unmask */
> > > +    if (event_notifier_init(&vdev->intx.unmask, 0)) {
> > > +        error_report("vfio: Error: event_notifier_init failed eoi\n");
> > > +        goto fail;
> > > +    }
> > > +
> > > +    /* KVM triggers it, VFIO listens for it */
> > > +    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
> > > +
> > > +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> > > +        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
> > > +        goto fail_irqfd;
> > > +    }
> > > +
> > > +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> > > +
> > > +    irq_set = g_malloc0(argsz);
> > > +    irq_set->argsz = argsz;
> > > +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
> > > +    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> > > +    irq_set->start = 0;
> > > +    irq_set->count = 1;
> > > +    pfd = (int32_t *)&irq_set->data;
> > > +
> > > +    *pfd = irqfd.resamplefd;
> > > +
> > > +    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
> > > +    g_free(irq_set);
> > > +    if (ret) {
> > > +        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
> > > +        goto fail_vfio;
> > > +    }
> > > +
> > > +    /* Let'em rip */
> > > +    vfio_unmask_intx(vdev);
> > > +
> > > +    vdev->intx.kvm_accel = true;
> > > +
> > > +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
> > > +            __func__, vdev->host.domain, vdev->host.bus,
> > > +            vdev->host.slot, vdev->host.function);
> > > +
> > > +    return;
> > > +
> > > +fail_vfio:
> > > +    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
> > > +    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
> > > +fail_irqfd:
> > > +    event_notifier_cleanup(&vdev->intx.unmask);
> > > +fail:
> > > +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> > > +    vfio_unmask_intx(vdev);
> > > +#endif
> > > +}
> > > +
> > > +static void vfio_disable_intx_kvm(VFIODevice *vdev)
> > > +{
> > > +#ifdef CONFIG_KVM
> > > +    struct kvm_irqfd irqfd = {
> > > +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> > > +        .gsi = vdev->intx.route.irq,
> > > +        .flags = KVM_IRQFD_FLAG_DEASSIGN,
> > > +    };
> > > +
> > > +    if (!vdev->intx.kvm_accel) {
> > > +        return;
> > > +    }
> > > +
> > > +    /*
> > > +     * Get to a known state, hardware masked, QEMU ready to accept new
> > > +     * interrupts, QEMU IRQ de-asserted.
> > > +     */
> > > +    vfio_mask_intx(vdev);
> > > +    vdev->intx.pending = false;
> > > +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > > +
> > > +    /* Tell KVM to stop listening for an INTx irqfd */
> > > +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> > > +        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
> > > +    }
> > > +
> > > +    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
> > > +    event_notifier_cleanup(&vdev->intx.unmask);
> > > +
> > > +    /* QEMU starts listening for interrupt events. */
> > > +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> > > +
> > > +    vdev->intx.kvm_accel = false;
> > > +
> > > +    /* If we've missed an event, let it re-fire through QEMU */
> > > +    vfio_unmask_intx(vdev);
> > > +
> > > +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
> > > +            __func__, vdev->host.domain, vdev->host.bus,
> > > +            vdev->host.slot, vdev->host.function);
> > > +#endif
> > > +}
> > > +
> > > +static void vfio_update_irq(PCIDevice *pdev)
> > > +{
> > > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > > +    PCIINTxRoute route;
> > > +
> > > +    if (vdev->interrupt != VFIO_INT_INTx) {
> > > +        return;
> > > +    }
> > > +
> > > +    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> > > +
> > > +    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
> > > +        return; /* Nothing changed */
> > > +    }
> > > +
> > > +    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
> > > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > > +            vdev->host.function, vdev->intx.route.irq, route.irq);
> > > +
> > > +    vfio_disable_intx_kvm(vdev);
> > > +
> > > +    vdev->intx.route = route;
> > > +
> > > +    if (route.mode != PCI_INTX_ENABLED) {
> > > +        return;
> > > +    }
> > > +
> > > +    vfio_enable_intx_kvm(vdev);
> > > +
> > > +    /* Re-enable the interrupt in cased we missed an EOI */
> > > +    vfio_eoi(vdev);
> > > +}
> > > +
> > >  static int vfio_enable_intx(VFIODevice *vdev)
> > >  {
> > >      uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> > > @@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev)
> > >      vfio_disable_interrupts(vdev);
> > >  
> > >      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> > > +    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
> > > +                                                         vdev->intx.pin);
> > > +
> > >      ret = event_notifier_init(&vdev->intx.interrupt, 0);
> > >      if (ret) {
> > >          error_report("vfio: Error: event_notifier_init failed\n");
> > > @@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
> > >          return -errno;
> > >      }
> > >  
> > > +    vfio_enable_intx_kvm(vdev);
> > > +
> > >      vdev->interrupt = VFIO_INT_INTx;
> > >  
> > >      DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> > > @@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
> > >      int fd;
> > >  
> > >      qemu_del_timer(vdev->intx.mmap_timer);
> > > +    vfio_disable_intx_kvm(vdev);
> > >      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
> > >      vdev->intx.pending = false;
> > >      qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > > @@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev)
> > >      if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> > >          vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
> > >                                                    vfio_intx_mmap_enable, vdev);
> > > +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> > >          ret = vfio_enable_intx(vdev);
> > >          if (ret) {
> > >              goto out_teardown;
> 
>
Alex Williamson - Oct. 16, 2012, 2:48 p.m.
On Tue, 2012-10-16 at 16:14 +0200, Michael S. Tsirkin wrote:
> On Tue, Oct 16, 2012 at 07:51:43AM -0600, Alex Williamson wrote:
> > On Tue, 2012-10-16 at 08:39 +0200, Michael S. Tsirkin wrote:
> > > On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> > > > This makes use of the new level irqfd support enabling bypass of
> > > > qemu userspace both on INTx injection and unmask.  This significantly
> > > > boosts the performance of devices making use of legacy interrupts.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > > 
> > > > My INTx routing workaround below will probably raise some eyebrows,
> > > > but I don't feel it's worth subjecting users to core dumps if they
> > > > want to try vfio-pci on new platforms.  INTx routing is part of some
> > > > larger plan, but until that plan materializes we have to try to avoid
> > > > the API unless we think there's a good chance it might be there.
> > > > I'll accept the maintenance of updating a whitelist in the interim.
> > > > Thanks,
> > > > 
> > > > Alex
> > > > 
> > > >  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 224 insertions(+)
> > > > 
> > > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> > > > index 639371e..777a5f8 100644
> > > > --- a/hw/vfio_pci.c
> > > > +++ b/hw/vfio_pci.c
> > > > @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > > >  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
> > > >  
> > > >  /*
> > > > + * PCI code refuses to make it possible to probe whether the chipset
> > > > + * supports pci_device_route_intx_to_irq() and booby traps the call
> > > > + * to assert if doesn't.  For us, this is just an optimization, so
> > > > + * only enable it when we know it's present.  Unfortunately PCIBus is
> > > > + * private, so we can't just look at the function pointer.
> > > > + */
> > > > +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> > > > +{
> > > > +#ifdef CONFIG_KVM
> > > > +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> > > > +    DeviceState *dev;
> > > > +
> > > > +    if (!kvm_irqchip_in_kernel() ||
> > > > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > > > +	return false;
> > > > +    }
> > > 
> > > 
> > > Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
> > > Also for KVM_IRQFD_FLAG_RESAMPLE.
> > 
> > I posted the patch for that separately yesterday.  I'll only request a
> > pull once that's in.
> 
> OK missed that. In the future, might be a good idea to note dependencies
> in the patchset or repost them as part of patchset with appropriate
> tagging.
> 
> > > > +
> > > > +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> > > > +
> > > > +        dev = bus->parent;
> > > > +
> > > > +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> > > > +            return true;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> > > > +                 "using slow INTx mode\n");
> > > 
> > > When does this code trigger? It seems irqchip implies piix ATM -
> > > is this just dead code?
> > 
> > Unused, but not unnecessary.  Another chipset is under development,
> > which means very quickly irqchip will not imply piix.
> 
> So this is for purposes of an out of tree stuff, let's
> keep these hacks out of tree too. My guess is
> q35 can just be added with pci_device_route_intx straight away.
> 
> >  Likewise irqfd
> > support is being added to other architectures, so I don't know how long
> > the kvm specific tests will hold up.  Testing for a specific chipset
> > could of course be avoided if we were willing to support:
> > 
> > bool pci_device_intx_route_supported(PCIDevice *pdev)
> > 
> > or the NOROUTE option I posted previously.
> 
> This is just moving the pain to users who will
> get bad performance and will have to debug it. Injecting
> through userspace is slow, new architectures should
> simply add irqfd straight away instead of adding
> work arounds in userspace.
> 
> This is exactly why we have the assert in pci core.

Let's compare user experience:

As coded here:

- Error message, only runs slower if the driver actually uses INTx.
Result: file bug, continue using vfio-pci, maybe do something useful,
maybe find new bugs to file.

Blindly calling PCI code w/ assert:

- Assert.  Result: file bug, full stop.

Maybe I do too much kernel programming, but I don't take asserts lightly
and prefer they be saved for "something is really broken and I can't
safely continue".  This is not such a case.

> > > > +#endif
> > > > +    return false;
> > > > +}
> > > > +
> > > > +static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
> > > > +{
> > > > +    if (!vfio_pci_bus_has_intx_route(pdev)) {
> > > > +        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
> > > > +    }
> > > > +
> > > > +    return pci_device_route_intx_to_irq(pdev, pin);
> > > > +}
> > > > +
> > > > +static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
> > > > +{
> > > > +    return old->mode != new->mode || old->irq != new->irq;
> > > > +}
> > > > +
> > > 
> > > Didn't you add an API for this? It's on pci branch but I can drop
> > > it if not needed.
> > 
> > I did and I'll switch to it when available, but I have no idea when that
> > will be, so I've hedged my bets by re-implementing it here.  2 week+
> > turnover for a patch makes it difficult to coordinate dependent changes
> > on short qemu release cycles.
> 
> It's available on pci branch, please base on that instead of master.
> Yes I merge at about 2 week intervals but the point is using
> subtrees. You do not need to block waiting for stuff to land
> in master.

If it wasn't such a trivial function to re-implement I'd do as you
suggest, but note that I'm still blocked doing a pull request to master
until you get your changes merged.  There are only about 8 weeks of open
qemu 1.3 development, so a 2 week interval doesn't allow many cycles.

> > > > +/*
> > > >   * Common VFIO interrupt disable
> > > >   */
> > > >  static void vfio_disable_irqindex(VFIODevice *vdev, int index)
> > > > @@ -185,6 +232,21 @@ static void vfio_unmask_intx(VFIODevice *vdev)
> > > >      ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> > > >  }
> > > >  
> > > > +#ifdef CONFIG_KVM
> > > > +static void vfio_mask_intx(VFIODevice *vdev)
> > > > +{
> > > > +    struct vfio_irq_set irq_set = {
> > > > +        .argsz = sizeof(irq_set),
> > > > +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> > > > +        .index = VFIO_PCI_INTX_IRQ_INDEX,
> > > > +        .start = 0,
> > > > +        .count = 1,
> > > > +    };
> > > > +
> > > > +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> > > > +}
> > > > +#endif
> > > > +
> > > >  /*
> > > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > >   * also be a huge overhead.  We try to get the best of both worlds by
> > > > @@ -248,6 +310,161 @@ static void vfio_eoi(VFIODevice *vdev)
> > > >      vfio_unmask_intx(vdev);
> > > >  }
> > > >  
> > > > +static void vfio_enable_intx_kvm(VFIODevice *vdev)
> > > > +{
> > > > +#ifdef CONFIG_KVM
> > > > +    struct kvm_irqfd irqfd = {
> > > > +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> > > > +        .gsi = vdev->intx.route.irq,
> > > > +        .flags = KVM_IRQFD_FLAG_RESAMPLE,
> > > 
> > > 
> > > Should not kvm ioctl handling be localized in kvm-all.c?
> > > E.g. extend kvm_irqchip_add_irqfd_notifier in
> > > some way? Same question for KVM_CAP_IRQFD_RESAMPLE use above ...
> > 
> > Possibly, I took a survey of existing code and found examples of both.
> 
> These are bugs really - this prevents adding a device to libhw
> which slows build for everyone. So pls do not copy such examples.
> 
> > Therefore I decided to host it myself first and push it out to common
> > helpers later, which will also help me get rid of the #ifdefs here.
> > Thanks,
> > 
> > Alex
> 
> I think it is better to put it in the final place straight away.

Timing doesn't readily allow for that.  The qemu 1.3 soft freeze is only
about 2 weeks away and I'm on vacation for most of that time.  Adding it
locally has precedence, provides acceleration to users now, and adds an
in-tree user when I start splitting it out later.  Given my history, I'm
just as likely to get push back for adding an API without a user as I am
for adding a user prior to the API.  Thanks,

Alex

> > > > +    };
> > > > +    struct vfio_irq_set *irq_set;
> > > > +    int ret, argsz;
> > > > +    int32_t *pfd;
> > > > +
> > > > +    if (!kvm_irqchip_in_kernel() ||
> > > > +        vdev->intx.route.mode != PCI_INTX_ENABLED ||
> > > > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    /* Get to a known interrupt state */
> > > > +    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
> > > > +    vfio_mask_intx(vdev);
> > > > +    vdev->intx.pending = false;
> > > > +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > > > +
> > > > +    /* Get an eventfd for resample/unmask */
> > > > +    if (event_notifier_init(&vdev->intx.unmask, 0)) {
> > > > +        error_report("vfio: Error: event_notifier_init failed eoi\n");
> > > > +        goto fail;
> > > > +    }
> > > > +
> > > > +    /* KVM triggers it, VFIO listens for it */
> > > > +    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
> > > > +
> > > > +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> > > > +        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
> > > > +        goto fail_irqfd;
> > > > +    }
> > > > +
> > > > +    argsz = sizeof(*irq_set) + sizeof(*pfd);
> > > > +
> > > > +    irq_set = g_malloc0(argsz);
> > > > +    irq_set->argsz = argsz;
> > > > +    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
> > > > +    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
> > > > +    irq_set->start = 0;
> > > > +    irq_set->count = 1;
> > > > +    pfd = (int32_t *)&irq_set->data;
> > > > +
> > > > +    *pfd = irqfd.resamplefd;
> > > > +
> > > > +    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
> > > > +    g_free(irq_set);
> > > > +    if (ret) {
> > > > +        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
> > > > +        goto fail_vfio;
> > > > +    }
> > > > +
> > > > +    /* Let'em rip */
> > > > +    vfio_unmask_intx(vdev);
> > > > +
> > > > +    vdev->intx.kvm_accel = true;
> > > > +
> > > > +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
> > > > +            __func__, vdev->host.domain, vdev->host.bus,
> > > > +            vdev->host.slot, vdev->host.function);
> > > > +
> > > > +    return;
> > > > +
> > > > +fail_vfio:
> > > > +    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
> > > > +    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
> > > > +fail_irqfd:
> > > > +    event_notifier_cleanup(&vdev->intx.unmask);
> > > > +fail:
> > > > +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> > > > +    vfio_unmask_intx(vdev);
> > > > +#endif
> > > > +}
> > > > +
> > > > +static void vfio_disable_intx_kvm(VFIODevice *vdev)
> > > > +{
> > > > +#ifdef CONFIG_KVM
> > > > +    struct kvm_irqfd irqfd = {
> > > > +        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
> > > > +        .gsi = vdev->intx.route.irq,
> > > > +        .flags = KVM_IRQFD_FLAG_DEASSIGN,
> > > > +    };
> > > > +
> > > > +    if (!vdev->intx.kvm_accel) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    /*
> > > > +     * Get to a known state, hardware masked, QEMU ready to accept new
> > > > +     * interrupts, QEMU IRQ de-asserted.
> > > > +     */
> > > > +    vfio_mask_intx(vdev);
> > > > +    vdev->intx.pending = false;
> > > > +    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > > > +
> > > > +    /* Tell KVM to stop listening for an INTx irqfd */
> > > > +    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
> > > > +        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
> > > > +    }
> > > > +
> > > > +    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
> > > > +    event_notifier_cleanup(&vdev->intx.unmask);
> > > > +
> > > > +    /* QEMU starts listening for interrupt events. */
> > > > +    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> > > > +
> > > > +    vdev->intx.kvm_accel = false;
> > > > +
> > > > +    /* If we've missed an event, let it re-fire through QEMU */
> > > > +    vfio_unmask_intx(vdev);
> > > > +
> > > > +    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
> > > > +            __func__, vdev->host.domain, vdev->host.bus,
> > > > +            vdev->host.slot, vdev->host.function);
> > > > +#endif
> > > > +}
> > > > +
> > > > +static void vfio_update_irq(PCIDevice *pdev)
> > > > +{
> > > > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > > > +    PCIINTxRoute route;
> > > > +
> > > > +    if (vdev->interrupt != VFIO_INT_INTx) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
> > > > +
> > > > +    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
> > > > +        return; /* Nothing changed */
> > > > +    }
> > > > +
> > > > +    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
> > > > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > > > +            vdev->host.function, vdev->intx.route.irq, route.irq);
> > > > +
> > > > +    vfio_disable_intx_kvm(vdev);
> > > > +
> > > > +    vdev->intx.route = route;
> > > > +
> > > > +    if (route.mode != PCI_INTX_ENABLED) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    vfio_enable_intx_kvm(vdev);
> > > > +
> > > > +    /* Re-enable the interrupt in cased we missed an EOI */
> > > > +    vfio_eoi(vdev);
> > > > +}
> > > > +
> > > >  static int vfio_enable_intx(VFIODevice *vdev)
> > > >  {
> > > >      uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
> > > > @@ -262,6 +479,9 @@ static int vfio_enable_intx(VFIODevice *vdev)
> > > >      vfio_disable_interrupts(vdev);
> > > >  
> > > >      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> > > > +    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
> > > > +                                                         vdev->intx.pin);
> > > > +
> > > >      ret = event_notifier_init(&vdev->intx.interrupt, 0);
> > > >      if (ret) {
> > > >          error_report("vfio: Error: event_notifier_init failed\n");
> > > > @@ -290,6 +510,8 @@ static int vfio_enable_intx(VFIODevice *vdev)
> > > >          return -errno;
> > > >      }
> > > >  
> > > > +    vfio_enable_intx_kvm(vdev);
> > > > +
> > > >      vdev->interrupt = VFIO_INT_INTx;
> > > >  
> > > >      DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
> > > > @@ -303,6 +525,7 @@ static void vfio_disable_intx(VFIODevice *vdev)
> > > >      int fd;
> > > >  
> > > >      qemu_del_timer(vdev->intx.mmap_timer);
> > > > +    vfio_disable_intx_kvm(vdev);
> > > >      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
> > > >      vdev->intx.pending = false;
> > > >      qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
> > > > @@ -1870,6 +2093,7 @@ static int vfio_initfn(PCIDevice *pdev)
> > > >      if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> > > >          vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
> > > >                                                    vfio_intx_mmap_enable, vdev);
> > > > +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> > > >          ret = vfio_enable_intx(vdev);
> > > >          if (ret) {
> > > >              goto out_teardown;
> > 
> >
Michael S. Tsirkin - Oct. 16, 2012, 3:08 p.m.
On Tue, Oct 16, 2012 at 08:48:04AM -0600, Alex Williamson wrote:
> On Tue, 2012-10-16 at 16:14 +0200, Michael S. Tsirkin wrote:
> > On Tue, Oct 16, 2012 at 07:51:43AM -0600, Alex Williamson wrote:
> > > On Tue, 2012-10-16 at 08:39 +0200, Michael S. Tsirkin wrote:
> > > > On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> > > > > This makes use of the new level irqfd support enabling bypass of
> > > > > qemu userspace both on INTx injection and unmask.  This significantly
> > > > > boosts the performance of devices making use of legacy interrupts.
> > > > > 
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > ---
> > > > > 
> > > > > My INTx routing workaround below will probably raise some eyebrows,
> > > > > but I don't feel it's worth subjecting users to core dumps if they
> > > > > want to try vfio-pci on new platforms.  INTx routing is part of some
> > > > > larger plan, but until that plan materializes we have to try to avoid
> > > > > the API unless we think there's a good chance it might be there.
> > > > > I'll accept the maintenance of updating a whitelist in the interim.
> > > > > Thanks,
> > > > > 
> > > > > Alex
> > > > > 
> > > > >  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  1 file changed, 224 insertions(+)
> > > > > 
> > > > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> > > > > index 639371e..777a5f8 100644
> > > > > --- a/hw/vfio_pci.c
> > > > > +++ b/hw/vfio_pci.c
> > > > > @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > > > >  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
> > > > >  
> > > > >  /*
> > > > > + * PCI code refuses to make it possible to probe whether the chipset
> > > > > + * supports pci_device_route_intx_to_irq() and booby traps the call
> > > > > + * to assert if doesn't.  For us, this is just an optimization, so
> > > > > + * only enable it when we know it's present.  Unfortunately PCIBus is
> > > > > + * private, so we can't just look at the function pointer.
> > > > > + */
> > > > > +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> > > > > +{
> > > > > +#ifdef CONFIG_KVM
> > > > > +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> > > > > +    DeviceState *dev;
> > > > > +
> > > > > +    if (!kvm_irqchip_in_kernel() ||
> > > > > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > > > > +	return false;
> > > > > +    }
> > > > 
> > > > 
> > > > Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
> > > > Also for KVM_IRQFD_FLAG_RESAMPLE.
> > > 
> > > I posted the patch for that separately yesterday.  I'll only request a
> > > pull once that's in.
> > 
> > OK missed that. In the future, might be a good idea to note dependencies
> > in the patchset or repost them as part of patchset with appropriate
> > tagging.
> > 
> > > > > +
> > > > > +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> > > > > +
> > > > > +        dev = bus->parent;
> > > > > +
> > > > > +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> > > > > +            return true;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> > > > > +                 "using slow INTx mode\n");
> > > > 
> > > > When does this code trigger? It seems irqchip implies piix ATM -
> > > > is this just dead code?
> > > 
> > > Unused, but not unnecessary.  Another chipset is under development,
> > > which means very quickly irqchip will not imply piix.
> > 
> > So this is for purposes of an out of tree stuff, let's
> > keep these hacks out of tree too. My guess is
> > q35 can just be added with pci_device_route_intx straight away.
> > 
> > >  Likewise irqfd
> > > support is being added to other architectures, so I don't know how long
> > > the kvm specific tests will hold up.  Testing for a specific chipset
> > > could of course be avoided if we were willing to support:
> > > 
> > > bool pci_device_intx_route_supported(PCIDevice *pdev)
> > > 
> > > or the NOROUTE option I posted previously.
> > 
> > This is just moving the pain to users who will
> > get bad performance and will have to debug it. Injecting
> > through userspace is slow, new architectures should
> > simply add irqfd straight away instead of adding
> > work arounds in userspace.
> > 
> > This is exactly why we have the assert in pci core.
> 
> Let's compare user experience:
> 
> As coded here:
> 
> - Error message, only runs slower if the driver actually uses INTx.
> Result: file bug, continue using vfio-pci, maybe do something useful,
> maybe find new bugs to file.
> 
> Blindly calling PCI code w/ assert:
> 
> - Assert.  Result: file bug, full stop.
> 
> Maybe I do too much kernel programming, but I don't take asserts lightly
> and prefer they be saved for "something is really broken and I can't
> safely continue".  This is not such a case.

There's no chance we ship e.g. q35 by mistake without this API: since
there is no way this specific assert can be missed in even basic
testing:

So I see it differently:

As coded here:
	chipset authors get lazy and do not implement API.
	bad performance for all users.

With assert:
	chipset authors implement necessary API.
	good performance for all users.
Alex Williamson - Oct. 16, 2012, 3:13 p.m.
On Tue, 2012-10-16 at 17:08 +0200, Michael S. Tsirkin wrote:
> On Tue, Oct 16, 2012 at 08:48:04AM -0600, Alex Williamson wrote:
> > On Tue, 2012-10-16 at 16:14 +0200, Michael S. Tsirkin wrote:
> > > On Tue, Oct 16, 2012 at 07:51:43AM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-10-16 at 08:39 +0200, Michael S. Tsirkin wrote:
> > > > > On Mon, Oct 15, 2012 at 02:28:15PM -0600, Alex Williamson wrote:
> > > > > > This makes use of the new level irqfd support enabling bypass of
> > > > > > qemu userspace both on INTx injection and unmask.  This significantly
> > > > > > boosts the performance of devices making use of legacy interrupts.
> > > > > > 
> > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > ---
> > > > > > 
> > > > > > My INTx routing workaround below will probably raise some eyebrows,
> > > > > > but I don't feel it's worth subjecting users to core dumps if they
> > > > > > want to try vfio-pci on new platforms.  INTx routing is part of some
> > > > > > larger plan, but until that plan materializes we have to try to avoid
> > > > > > the API unless we think there's a good chance it might be there.
> > > > > > I'll accept the maintenance of updating a whitelist in the interim.
> > > > > > Thanks,
> > > > > > 
> > > > > > Alex
> > > > > > 
> > > > > >  hw/vfio_pci.c |  224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  1 file changed, 224 insertions(+)
> > > > > > 
> > > > > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> > > > > > index 639371e..777a5f8 100644
> > > > > > --- a/hw/vfio_pci.c
> > > > > > +++ b/hw/vfio_pci.c
> > > > > > @@ -154,6 +154,53 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
> > > > > >  static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
> > > > > >  
> > > > > >  /*
> > > > > > + * PCI code refuses to make it possible to probe whether the chipset
> > > > > > + * supports pci_device_route_intx_to_irq() and booby traps the call
> > > > > > + * to assert if doesn't.  For us, this is just an optimization, so
> > > > > > + * only enable it when we know it's present.  Unfortunately PCIBus is
> > > > > > + * private, so we can't just look at the function pointer.
> > > > > > + */
> > > > > > +static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
> > > > > > +{
> > > > > > +#ifdef CONFIG_KVM
> > > > > > +    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
> > > > > > +    DeviceState *dev;
> > > > > > +
> > > > > > +    if (!kvm_irqchip_in_kernel() ||
> > > > > > +        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
> > > > > > +	return false;
> > > > > > +    }
> > > > > 
> > > > > 
> > > > > Shouldn't we update linux-headers/ to get KVM_CAP_IRQFD_RESAMPLE?
> > > > > Also for KVM_IRQFD_FLAG_RESAMPLE.
> > > > 
> > > > I posted the patch for that separately yesterday.  I'll only request a
> > > > pull once that's in.
> > > 
> > > OK missed that. In the future, might be a good idea to note dependencies
> > > in the patchset or repost them as part of patchset with appropriate
> > > tagging.
> > > 
> > > > > > +
> > > > > > +    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
> > > > > > +
> > > > > > +        dev = bus->parent;
> > > > > > +
> > > > > > +        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
> > > > > > +            return true;
> > > > > > +        }
> > > > > > +    }
> > > > > > +
> > > > > > +    error_report("vfio-pci: VM chipset does not support INTx routing, "
> > > > > > +                 "using slow INTx mode\n");
> > > > > 
> > > > > When does this code trigger? It seems irqchip implies piix ATM -
> > > > > is this just dead code?
> > > > 
> > > > Unused, but not unnecessary.  Another chipset is under development,
> > > > which means very quickly irqchip will not imply piix.
> > > 
> > > So this is for purposes of an out of tree stuff, let's
> > > keep these hacks out of tree too. My guess is
> > > q35 can just be added with pci_device_route_intx straight away.
> > > 
> > > >  Likewise irqfd
> > > > support is being added to other architectures, so I don't know how long
> > > > the kvm specific tests will hold up.  Testing for a specific chipset
> > > > could of course be avoided if we were willing to support:
> > > > 
> > > > bool pci_device_intx_route_supported(PCIDevice *pdev)
> > > > 
> > > > or the NOROUTE option I posted previously.
> > > 
> > > This is just moving the pain to users who will
> > > get bad performance and will have to debug it. Injecting
> > > through userspace is slow, new architectures should
> > > simply add irqfd straight away instead of adding
> > > work arounds in userspace.
> > > 
> > > This is exactly why we have the assert in pci core.
> > 
> > Let's compare user experience:
> > 
> > As coded here:
> > 
> > - Error message, only runs slower if the driver actually uses INTx.
> > Result: file bug, continue using vfio-pci, maybe do something useful,
> > maybe find new bugs to file.
> > 
> > Blindly calling PCI code w/ assert:
> > 
> > - Assert.  Result: file bug, full stop.
> > 
> > Maybe I do too much kernel programming, but I don't take asserts lightly
> > and prefer they be saved for "something is really broken and I can't
> > safely continue".  This is not such a case.
> 
> There's no chance we ship e.g. q35 by mistake without this API: since
> there is no way this specific assert can be missed in even basic
> testing:
> 
> So I see it differently:
> 
> As coded here:
> 	chipset authors get lazy and do not implement API.
> 	bad performance for all users.
> 
> With assert:
> 	chipset authors implement necessary API.
> 	good performance for all users.

I prefer a carrot, not a whip.  Thanks,

Alex
Michael S. Tsirkin - Oct. 16, 2012, 3:23 p.m.
On Tue, Oct 16, 2012 at 09:13:15AM -0600, Alex Williamson wrote:
> > There's no chance we ship e.g. q35 by mistake without this API: since
> > there is no way this specific assert can be missed in even basic
> > testing:
> > 
> > So I see it differently:
> > 
> > As coded here:
> > 	chipset authors get lazy and do not implement API.
> > 	bad performance for all users.
> > 
> > With assert:
> > 	chipset authors implement necessary API.
> > 	good performance for all users.
> 
> I prefer a carrot, not a whip.  Thanks,
> 
> Alex
> 

It's not just that.
Problem is performance testing/fixing is hard.
Catching and fixing asserts is easy.
So working around buggy qemu code really backfires
as it reverses the motivation for writing well performing
code. History proves me right: for each API change where
we implemented a fallback old code stayed around for years.
Alex Williamson - Oct. 16, 2012, 4:49 p.m.
On Tue, 2012-10-16 at 17:23 +0200, Michael S. Tsirkin wrote:
> On Tue, Oct 16, 2012 at 09:13:15AM -0600, Alex Williamson wrote:
> > > There's no chance we ship e.g. q35 by mistake without this API: since
> > > there is no way this specific assert can be missed in even basic
> > > testing:
> > > 
> > > So I see it differently:
> > > 
> > > As coded here:
> > > 	chipset authors get lazy and do not implement API.
> > > 	bad performance for all users.
> > > 
> > > With assert:
> > > 	chipset authors implement necessary API.
> > > 	good performance for all users.
> > 
> > I prefer a carrot, not a whip.  Thanks,
> > 
> > Alex
> > 
> 
> It's not just that.
> Problem is performance testing/fixing is hard.

Getting an error_report from the driver saying it's using a slow path
and why makes that significantly easier.

> Catching and fixing asserts is easy.

Easy for who?  The user trying to test a feature?  Probably not.  Me,
who may not have access to the chipset documentation or understand the
platform?  Maybe, maybe not.

> So working around buggy qemu code really backfires
> as it reverses the motivation for writing well performing
> code. History proves me right: for each API change where
> we implemented a fallback old code stayed around for years.

Does that necessarily mean it was wrong?  How many of those API changes
added new features that may have been abandoned if the developer was
required to make sweeping changes to get their code accepted?  If not
abandoned, how much delayed?  How many land mines might we have in the
code for changes that were done incorrectly or missed?  I don't
understand why adding robustness to the API is such a contentious point,
but it's your prerogative, just as it's mine to avoid using that API
arbitrarily.  Thanks,

Alex
Michael S. Tsirkin - Oct. 16, 2012, 5:34 p.m.
On Tue, Oct 16, 2012 at 10:49:38AM -0600, Alex Williamson wrote:
> On Tue, 2012-10-16 at 17:23 +0200, Michael S. Tsirkin wrote:
> > On Tue, Oct 16, 2012 at 09:13:15AM -0600, Alex Williamson wrote:
> > > > There's no chance we ship e.g. q35 by mistake without this API: since
> > > > there is no way this specific assert can be missed in even basic
> > > > testing:
> > > > 
> > > > So I see it differently:
> > > > 
> > > > As coded here:
> > > > 	chipset authors get lazy and do not implement API.
> > > > 	bad performance for all users.
> > > > 
> > > > With assert:
> > > > 	chipset authors implement necessary API.
> > > > 	good performance for all users.
> > > 
> > > I prefer a carrot, not a whip.  Thanks,
> > > 
> > > Alex
> > > 
> > 
> > It's not just that.
> > Problem is performance testing/fixing is hard.
> 
> Getting an error_report from the driver saying it's using a slow path
> and why makes that significantly easier.
> 
> > Catching and fixing asserts is easy.
> 
> Easy for who?  The user trying to test a feature?  Probably not.  Me,
> who may not have access to the chipset documentation or understand the
> platform?  Maybe, maybe not.
> 
> > So working around buggy qemu code really backfires
> > as it reverses the motivation for writing well performing
> > code. History proves me right: for each API change where
> > we implemented a fallback old code stayed around for years.
> 
> Does that necessarily mean it was wrong?  How many of those API changes
> added new features that may have been abandoned if the developer was
> required to make sweeping changes to get their code accepted?  If not
> abandoned, how much delayed?  How many land mines might we have in the
> code for changes that were done incorrectly or missed?  I don't
> understand why adding robustness to the API is such a contentious point,
> but it's your prerogative, just as it's mine to avoid using that API
> arbitrarily.  Thanks,
> 
> Alex

Yea. All I say is, I intend to fix things so you don't need
to probe for this API. I think it does not make sense to
add a temporary API as a stopgap for this since it will solve no
actual problem. If e.g. Jason finds it hard to add this to
q35, we could add a stopgap solution for vfio.

Patch

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 639371e..777a5f8 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -154,6 +154,53 @@  static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
 static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
 
 /*
+ * PCI code refuses to make it possible to probe whether the chipset
+ * supports pci_device_route_intx_to_irq() and booby traps the call
+ * to assert if doesn't.  For us, this is just an optimization, so
+ * only enable it when we know it's present.  Unfortunately PCIBus is
+ * private, so we can't just look at the function pointer.
+ */
+static bool vfio_pci_bus_has_intx_route(PCIDevice *pdev)
+{
+#ifdef CONFIG_KVM
+    BusState *bus = qdev_get_parent_bus(&pdev->qdev);
+    DeviceState *dev;
+
+    if (!kvm_irqchip_in_kernel() ||
+        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
+	return false;
+    }
+
+    for (; bus->parent; bus = qdev_get_parent_bus(dev)) {
+
+        dev = bus->parent;
+
+        if (!strncmp("i440FX-pcihost", object_get_typename(OBJECT(dev)), 14)) {
+            return true;
+        }
+    }
+
+    error_report("vfio-pci: VM chipset does not support INTx routing, "
+                 "using slow INTx mode\n");
+#endif
+    return false;
+}
+
+static PCIINTxRoute vfio_pci_device_route_intx_to_irq(PCIDevice *pdev, int pin)
+{
+    if (!vfio_pci_bus_has_intx_route(pdev)) {
+        return (PCIINTxRoute) { .mode = PCI_INTX_DISABLED, .irq = -1 };
+    }
+
+    return pci_device_route_intx_to_irq(pdev, pin);
+}
+
+static bool vfio_pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
+{
+    return old->mode != new->mode || old->irq != new->irq;
+}
+
+/*
  * Common VFIO interrupt disable
  */
 static void vfio_disable_irqindex(VFIODevice *vdev, int index)
@@ -185,6 +232,21 @@  static void vfio_unmask_intx(VFIODevice *vdev)
     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 }
 
+#ifdef CONFIG_KVM
+static void vfio_mask_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = VFIO_PCI_INTX_IRQ_INDEX,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+#endif
+
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
  * also be a huge overhead.  We try to get the best of both worlds by
@@ -248,6 +310,161 @@  static void vfio_eoi(VFIODevice *vdev)
     vfio_unmask_intx(vdev);
 }
 
+static void vfio_enable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    struct kvm_irqfd irqfd = {
+        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
+        .gsi = vdev->intx.route.irq,
+        .flags = KVM_IRQFD_FLAG_RESAMPLE,
+    };
+    struct vfio_irq_set *irq_set;
+    int ret, argsz;
+    int32_t *pfd;
+
+    if (!kvm_irqchip_in_kernel() ||
+        vdev->intx.route.mode != PCI_INTX_ENABLED ||
+        !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
+        return;
+    }
+
+    /* Get to a known interrupt state */
+    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
+    vfio_mask_intx(vdev);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+
+    /* Get an eventfd for resample/unmask */
+    if (event_notifier_init(&vdev->intx.unmask, 0)) {
+        error_report("vfio: Error: event_notifier_init failed eoi\n");
+        goto fail;
+    }
+
+    /* KVM triggers it, VFIO listens for it */
+    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
+
+    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
+        error_report("vfio: Error: Failed to setup resample irqfd: %m\n");
+        goto fail_irqfd;
+    }
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
+    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+
+    *pfd = irqfd.resamplefd;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    g_free(irq_set);
+    if (ret) {
+        error_report("vfio: Error: Failed to setup INTx unmask fd: %m\n");
+        goto fail_vfio;
+    }
+
+    /* Let'em rip */
+    vfio_unmask_intx(vdev);
+
+    vdev->intx.kvm_accel = true;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
+            __func__, vdev->host.domain, vdev->host.bus,
+            vdev->host.slot, vdev->host.function);
+
+    return;
+
+fail_vfio:
+    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
+    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
+fail_irqfd:
+    event_notifier_cleanup(&vdev->intx.unmask);
+fail:
+    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
+    vfio_unmask_intx(vdev);
+#endif
+}
+
+static void vfio_disable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    struct kvm_irqfd irqfd = {
+        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
+        .gsi = vdev->intx.route.irq,
+        .flags = KVM_IRQFD_FLAG_DEASSIGN,
+    };
+
+    if (!vdev->intx.kvm_accel) {
+        return;
+    }
+
+    /*
+     * Get to a known state, hardware masked, QEMU ready to accept new
+     * interrupts, QEMU IRQ de-asserted.
+     */
+    vfio_mask_intx(vdev);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+
+    /* Tell KVM to stop listening for an INTx irqfd */
+    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
+        error_report("vfio: Error: Failed to disable INTx irqfd: %m\n");
+    }
+
+    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
+    event_notifier_cleanup(&vdev->intx.unmask);
+
+    /* QEMU starts listening for interrupt events. */
+    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
+
+    vdev->intx.kvm_accel = false;
+
+    /* If we've missed an event, let it re-fire through QEMU */
+    vfio_unmask_intx(vdev);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
+            __func__, vdev->host.domain, vdev->host.bus,
+            vdev->host.slot, vdev->host.function);
+#endif
+}
+
+static void vfio_update_irq(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIINTxRoute route;
+
+    if (vdev->interrupt != VFIO_INT_INTx) {
+        return;
+    }
+
+    route = vfio_pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
+
+    if (!vfio_pci_intx_route_changed(&vdev->intx.route, &route)) {
+        return; /* Nothing changed */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->intx.route.irq, route.irq);
+
+    vfio_disable_intx_kvm(vdev);
+
+    vdev->intx.route = route;
+
+    if (route.mode != PCI_INTX_ENABLED) {
+        return;
+    }
+
+    vfio_enable_intx_kvm(vdev);
+
+    /* Re-enable the interrupt in cased we missed an EOI */
+    vfio_eoi(vdev);
+}
+
 static int vfio_enable_intx(VFIODevice *vdev)
 {
     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
@@ -262,6 +479,9 @@  static int vfio_enable_intx(VFIODevice *vdev)
     vfio_disable_interrupts(vdev);
 
     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+    vdev->intx.route = vfio_pci_device_route_intx_to_irq(&vdev->pdev,
+                                                         vdev->intx.pin);
+
     ret = event_notifier_init(&vdev->intx.interrupt, 0);
     if (ret) {
         error_report("vfio: Error: event_notifier_init failed\n");
@@ -290,6 +510,8 @@  static int vfio_enable_intx(VFIODevice *vdev)
         return -errno;
     }
 
+    vfio_enable_intx_kvm(vdev);
+
     vdev->interrupt = VFIO_INT_INTx;
 
     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
@@ -303,6 +525,7 @@  static void vfio_disable_intx(VFIODevice *vdev)
     int fd;
 
     qemu_del_timer(vdev->intx.mmap_timer);
+    vfio_disable_intx_kvm(vdev);
     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
     vdev->intx.pending = false;
     qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
@@ -1870,6 +2093,7 @@  static int vfio_initfn(PCIDevice *pdev)
     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
         vdev->intx.mmap_timer = qemu_new_timer_ms(vm_clock,
                                                   vfio_intx_mmap_enable, vdev);
+        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
         ret = vfio_enable_intx(vdev);
         if (ret) {
             goto out_teardown;