diff mbox series

[v2,4/4] vfio/quirks: Enable ioeventfd quirks to be handled by vfio directly

Message ID 20180501164346.28940.93328.stgit@gimli.home
State New
Headers show
Series vfio/quirks: ioeventfd support | expand

Commit Message

Alex Williamson May 1, 2018, 4:43 p.m. UTC
With vfio ioeventfd support, we can program vfio-pci to perform a
specified BAR write when an eventfd is triggered.  This allows the
KVM ioeventfd to be wired directly to vfio-pci, entirely avoiding
userspace handling for these events.  On the same micro-benchmark
where the ioeventfd got us to almost 90% of performance versus
disabling the GeForce quirks, this gets us to within 95%.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci-quirks.c |   50 +++++++++++++++++++++++++++++++++++++++++++-------
 hw/vfio/pci.c        |    2 ++
 hw/vfio/pci.h        |    2 ++
 hw/vfio/trace-events |    2 +-
 4 files changed, 48 insertions(+), 8 deletions(-)

Comments

Peter Xu May 3, 2018, 4:56 a.m. UTC | #1
On Tue, May 01, 2018 at 10:43:46AM -0600, Alex Williamson wrote:

[...]

> -static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> +static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
>  {
>      QLIST_REMOVE(ioeventfd, next);
> +
>      memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
>                                ioeventfd->match_data, ioeventfd->data,
>                                &ioeventfd->e);
> -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
> +
> +    if (ioeventfd->vfio) {
> +        struct vfio_device_ioeventfd vfio_ioeventfd;
> +
> +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> +        vfio_ioeventfd.flags = ioeventfd->size;
> +        vfio_ioeventfd.data = ioeventfd->data;
> +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> +                                ioeventfd->region_addr;
> +        vfio_ioeventfd.fd = -1;
> +
> +        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);

(If the series is going to respin, I am thinking whether it would
 worth it to capture this error to dump something.  But it's for sure
 optional since even error happened we should have something in dmesg
 so it only matters on whether we also want something to be dumped
 from QEMU side too... After all there aren't much we can do)

> +
> +    } else {
> +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> +                            NULL, NULL, NULL);
> +    }
> +
>      event_notifier_cleanup(&ioeventfd->e);
>      trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
>                                (uint64_t)ioeventfd->addr, ioeventfd->size,

[...]

> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ba1239551115..84e27c7bb2d1 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3177,6 +3177,8 @@ static Property vfio_pci_dev_properties[] = {
>                       no_geforce_quirks, false),
>      DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
>                       false),
> +    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> +                     false),

Here it looks more like a tri-state for me, so we can either:

- disable the acceleration in general, or
- enable QEMU-side acceleration only, or
- enable kernel-side acceleration

In other words, IIUC x-no-vfio-ioeventfd won't matter much if
x-no-kvm-ioeventfd is already set.  So not sure whether a single
parameter would be nicer.

Anyway, I'm even fine with two-parameter way, so I'll leave the
decision to Alex and other reviewers:

Reviewed-by: Peter Xu <peterx@redhat.com>

Thanks,
Eric Auger May 3, 2018, 3:20 p.m. UTC | #2
Hi Alex,

On 05/01/2018 06:43 PM, Alex Williamson wrote:
> With vfio ioeventfd support, we can program vfio-pci to perform a
> specified BAR write when an eventfd is triggered.  This allows the
> KVM ioeventfd to be wired directly to vfio-pci, entirely avoiding
> userspace handling for these events.  On the same micro-benchmark
> where the ioeventfd got us to almost 90% of performance versus
> disabling the GeForce quirks, this gets us to within 95%.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  hw/vfio/pci-quirks.c |   50 +++++++++++++++++++++++++++++++++++++++++++-------
>  hw/vfio/pci.c        |    2 ++
>  hw/vfio/pci.h        |    2 ++
>  hw/vfio/trace-events |    2 +-
>  4 files changed, 48 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> index 4cedc733bc0a..94be27dd0a3b 100644
> --- a/hw/vfio/pci-quirks.c
> +++ b/hw/vfio/pci-quirks.c
> @@ -16,6 +16,7 @@
>  #include "qemu/range.h"
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
> +#include <sys/ioctl.h>
>  #include "hw/nvram/fw_cfg.h"
>  #include "pci.h"
>  #include "trace.h"
> @@ -287,13 +288,31 @@ static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
>      return quirk;
>  }
>  
> -static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> +static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
>  {
>      QLIST_REMOVE(ioeventfd, next);
> +
nit: unrelated new line
>      memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
>                                ioeventfd->match_data, ioeventfd->data,
>                                &ioeventfd->e);
> -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
> +
> +    if (ioeventfd->vfio) {
> +        struct vfio_device_ioeventfd vfio_ioeventfd;
> +
> +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> +        vfio_ioeventfd.flags = ioeventfd->size;
> +        vfio_ioeventfd.data = ioeventfd->data;
> +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> +                                ioeventfd->region_addr;
> +        vfio_ioeventfd.fd = -1;
> +
> +        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
> +
> +    } else {
> +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> +                            NULL, NULL, NULL);
> +    }
> +
>      event_notifier_cleanup(&ioeventfd->e);
>      trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
>                                (uint64_t)ioeventfd->addr, ioeventfd->size,
> @@ -307,7 +326,7 @@ static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
>  
>      QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
>          if (ioeventfd->dynamic) {
> -            vfio_ioeventfd_exit(ioeventfd);
> +            vfio_ioeventfd_exit(vdev, ioeventfd);
>          }
>      }
>  }
> @@ -361,13 +380,30 @@ static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
>      ioeventfd->region = region;
>      ioeventfd->region_addr = region_addr;
>  
> -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> -                        vfio_ioeventfd_handler, NULL, ioeventfd);
> +    if (!vdev->no_vfio_ioeventfd) {
> +        struct vfio_device_ioeventfd vfio_ioeventfd;
> +
> +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> +        vfio_ioeventfd.flags = ioeventfd->size;
> +        vfio_ioeventfd.data = ioeventfd->data;
> +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> +                                ioeventfd->region_addr;
> +        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
> +
> +        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
> +                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
> +    }
> +
> +    if (!ioeventfd->vfio) {
> +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> +                            vfio_ioeventfd_handler, NULL, ioeventfd);
> +    }
> +
>      memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr,
>                                ioeventfd->size, ioeventfd->match_data,
>                                ioeventfd->data, &ioeventfd->e);
>      trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
> -                              size, data);
> +                              size, data, ioeventfd->vfio);
>  
>      return ioeventfd;
>  }
> @@ -1835,7 +1871,7 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
>  
>      QLIST_FOREACH(quirk, &bar->quirks, next) {
>          while (!QLIST_EMPTY(&quirk->ioeventfds)) {
> -            vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
> +            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
>          }
>  
>          for (i = 0; i < quirk->nr_mem; i++) {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ba1239551115..84e27c7bb2d1 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3177,6 +3177,8 @@ static Property vfio_pci_dev_properties[] = {
>                       no_geforce_quirks, false),
>      DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
>                       false),
> +    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> +                     false),
I tend to agree with Peter about the 2 options. Only the KVM
acceleration brings benefit here?

Thanks

Eric
>      DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
>      DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
>      DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index dbb3aca9b3d2..dbb3932b50ef 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -35,6 +35,7 @@ typedef struct VFIOIOEventFD {
>      hwaddr region_addr;
>      bool match_data;
>      bool dynamic;
> +    bool vfio;
>  } VFIOIOEventFD;
>  
>  typedef struct VFIOQuirk {
> @@ -164,6 +165,7 @@ typedef struct VFIOPCIDevice {
>      bool no_kvm_msix;
>      bool no_geforce_quirks;
>      bool no_kvm_ioeventfd;
> +    bool no_vfio_ioeventfd;
>      VFIODisplay *dpy;
>  } VFIOPCIDevice;
>  
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index f8f97d1ff90c..d2a74952e389 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -79,7 +79,7 @@ vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
>  vfio_quirk_ati_bonaire_reset(const char *name) "%s"
>  vfio_ioeventfd_exit(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
>  vfio_ioeventfd_handler(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d] -> 0x%"PRIx64
> -vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
> +vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data, bool vfio) "%s+0x%"PRIx64"[%d]:0x%"PRIx64" vfio:%d"
>  vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [0x%03x] 0x%08x -> 0x%08x"
>  vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
>  vfio_pci_igd_opregion_enabled(const char *name) "%s"
> 
>
Alex Williamson May 3, 2018, 4:29 p.m. UTC | #3
On Thu, 3 May 2018 12:56:03 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Tue, May 01, 2018 at 10:43:46AM -0600, Alex Williamson wrote:
> 
> [...]
> 
> > -static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> > +static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
> >  {
> >      QLIST_REMOVE(ioeventfd, next);
> > +
> >      memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
> >                                ioeventfd->match_data, ioeventfd->data,
> >                                &ioeventfd->e);
> > -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
> > +
> > +    if (ioeventfd->vfio) {
> > +        struct vfio_device_ioeventfd vfio_ioeventfd;
> > +
> > +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> > +        vfio_ioeventfd.flags = ioeventfd->size;
> > +        vfio_ioeventfd.data = ioeventfd->data;
> > +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> > +                                ioeventfd->region_addr;
> > +        vfio_ioeventfd.fd = -1;
> > +
> > +        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);  
> 
> (If the series is going to respin, I am thinking whether it would
>  worth it to capture this error to dump something.  But it's for sure
>  optional since even error happened we should have something in dmesg
>  so it only matters on whether we also want something to be dumped
>  from QEMU side too... After all there aren't much we can do)

I'm torn whether to use QEMU standard error handling here, ie.
abort().  If we failed to remove the KVM ioeventfd, we'd abort before
we get here, so there's no chance that the vfio ioeventfd will continue
to be triggered.  Obviously leaving a vfio ioeventfd that we can't
trigger and might interfere with future ioeventfds is not good, but do
we really want to kill the VM because we possibly can't add an
accelerator here later?  I'm inclined to say no, so I think I'll just
error_report() unless there are objections.

> > +
> > +    } else {
> > +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> > +                            NULL, NULL, NULL);
> > +    }
> > +
> >      event_notifier_cleanup(&ioeventfd->e);
> >      trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
> >                                (uint64_t)ioeventfd->addr, ioeventfd->size,  
> 
> [...]
> 
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index ba1239551115..84e27c7bb2d1 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -3177,6 +3177,8 @@ static Property vfio_pci_dev_properties[] = {
> >                       no_geforce_quirks, false),
> >      DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> >                       false),
> > +    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> > +                     false),  
> 
> Here it looks more like a tri-state for me, so we can either:
> 
> - disable the acceleration in general, or
> - enable QEMU-side acceleration only, or
> - enable kernel-side acceleration

So you're looking for a Auto/Off/KVM-only option?  Do you really think
it's worth defining a new tristate property for this sort of debugging
option...

> In other words, IIUC x-no-vfio-ioeventfd won't matter much if
> x-no-kvm-ioeventfd is already set.  So not sure whether a single
> parameter would be nicer.

That's correct, but who do we expect to be using this option and why?
I added enum OffAutoPCIBAR and the property to enable it for MSI-x
relocation because it is an option that a normal user might reasonably
need to use, given the right hardware on the right host, but it's an
unsupported option because we cannot programmatically validate it.
Support rests with the individual user, if it doesn't work, don't use
it, if it helps, great.  Here we have options that are really only for
debugging, to test whether something has gone wrong in this code,
disable this bypass to make all device interactions visible through
QEMU, or specifically to evaluate the performance of this path.  Is it
reasonable to impose yet another property type on the common code for
this use case when a couple bools work just fine, if perhaps not
absolutely ideal?  Am I overlooking an existing tri-state that might be
a reasonable match?

Tying Eric's thread so we can discuss this in one place:

On Thu, 3 May 2018 17:20:18 +0200
Auger Eric <eric.auger@redhat.com> wrote:
> I tend to agree with Peter about the 2 options. Only the KVM
> acceleration brings benefit here?

Not at all, KVM acceleration does half the job (perhaps more than
half), but if we want to completely bypass the userspace exit then we
need both KVM triggering the ioeventfd and vfio consuming it.  It's
useful for me at least to be able to compare completely disabled vs
just KVM vs also adding vfio.  As above, I don't expect these to be
used for anything other than debugging and performance evaluation, so I
don't think it's worth making non-trivial code changes to enable the
vfio-only path, nor do I think it's worthwhile to define a new
tri-state property to resolve the slight mismatch we have vs two
bools.  Again, let me know if you think I'm missing an existing
property or if there are trivial changes we could make to make this map
to an existing property.  Thanks,

Alex
Alex Williamson May 3, 2018, 4:30 p.m. UTC | #4
On Thu, 3 May 2018 17:20:18 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Alex,
> 
> On 05/01/2018 06:43 PM, Alex Williamson wrote:
> > With vfio ioeventfd support, we can program vfio-pci to perform a
> > specified BAR write when an eventfd is triggered.  This allows the
> > KVM ioeventfd to be wired directly to vfio-pci, entirely avoiding
> > userspace handling for these events.  On the same micro-benchmark
> > where the ioeventfd got us to almost 90% of performance versus
> > disabling the GeForce quirks, this gets us to within 95%.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> >  hw/vfio/pci-quirks.c |   50 +++++++++++++++++++++++++++++++++++++++++++-------
> >  hw/vfio/pci.c        |    2 ++
> >  hw/vfio/pci.h        |    2 ++
> >  hw/vfio/trace-events |    2 +-
> >  4 files changed, 48 insertions(+), 8 deletions(-)
> > 
> > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> > index 4cedc733bc0a..94be27dd0a3b 100644
> > --- a/hw/vfio/pci-quirks.c
> > +++ b/hw/vfio/pci-quirks.c
> > @@ -16,6 +16,7 @@
> >  #include "qemu/range.h"
> >  #include "qapi/error.h"
> >  #include "qapi/visitor.h"
> > +#include <sys/ioctl.h>
> >  #include "hw/nvram/fw_cfg.h"
> >  #include "pci.h"
> >  #include "trace.h"
> > @@ -287,13 +288,31 @@ static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
> >      return quirk;
> >  }
> >  
> > -static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> > +static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
> >  {
> >      QLIST_REMOVE(ioeventfd, next);
> > +  
> nit: unrelated new line

Fixed

> >      memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
> >                                ioeventfd->match_data, ioeventfd->data,
> >                                &ioeventfd->e);
> > -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
> > +
> > +    if (ioeventfd->vfio) {
> > +        struct vfio_device_ioeventfd vfio_ioeventfd;
> > +
> > +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> > +        vfio_ioeventfd.flags = ioeventfd->size;
> > +        vfio_ioeventfd.data = ioeventfd->data;
> > +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> > +                                ioeventfd->region_addr;
> > +        vfio_ioeventfd.fd = -1;
> > +
> > +        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
> > +
> > +    } else {
> > +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> > +                            NULL, NULL, NULL);
> > +    }
> > +
> >      event_notifier_cleanup(&ioeventfd->e);
> >      trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
> >                                (uint64_t)ioeventfd->addr, ioeventfd->size,
> > @@ -307,7 +326,7 @@ static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
> >  
> >      QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
> >          if (ioeventfd->dynamic) {
> > -            vfio_ioeventfd_exit(ioeventfd);
> > +            vfio_ioeventfd_exit(vdev, ioeventfd);
> >          }
> >      }
> >  }
> > @@ -361,13 +380,30 @@ static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
> >      ioeventfd->region = region;
> >      ioeventfd->region_addr = region_addr;
> >  
> > -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> > -                        vfio_ioeventfd_handler, NULL, ioeventfd);
> > +    if (!vdev->no_vfio_ioeventfd) {
> > +        struct vfio_device_ioeventfd vfio_ioeventfd;
> > +
> > +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> > +        vfio_ioeventfd.flags = ioeventfd->size;
> > +        vfio_ioeventfd.data = ioeventfd->data;
> > +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> > +                                ioeventfd->region_addr;
> > +        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
> > +
> > +        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
> > +                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
> > +    }
> > +
> > +    if (!ioeventfd->vfio) {
> > +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> > +                            vfio_ioeventfd_handler, NULL, ioeventfd);
> > +    }
> > +
> >      memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr,
> >                                ioeventfd->size, ioeventfd->match_data,
> >                                ioeventfd->data, &ioeventfd->e);
> >      trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
> > -                              size, data);
> > +                              size, data, ioeventfd->vfio);
> >  
> >      return ioeventfd;
> >  }
> > @@ -1835,7 +1871,7 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
> >  
> >      QLIST_FOREACH(quirk, &bar->quirks, next) {
> >          while (!QLIST_EMPTY(&quirk->ioeventfds)) {
> > -            vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
> > +            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
> >          }
> >  
> >          for (i = 0; i < quirk->nr_mem; i++) {
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index ba1239551115..84e27c7bb2d1 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -3177,6 +3177,8 @@ static Property vfio_pci_dev_properties[] = {
> >                       no_geforce_quirks, false),
> >      DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> >                       false),
> > +    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> > +                     false),  
> I tend to agree with Peter about the 2 options. Only the KVM
> acceleration brings benefit here?

Consolidated response in reply to Peter.  Thanks,

Alex
Peter Xu May 4, 2018, 2:16 a.m. UTC | #5
On Thu, May 03, 2018 at 10:29:42AM -0600, Alex Williamson wrote:
> On Thu, 3 May 2018 12:56:03 +0800
> Peter Xu <peterx@redhat.com> wrote:
> 
> > On Tue, May 01, 2018 at 10:43:46AM -0600, Alex Williamson wrote:
> > 
> > [...]
> > 
> > > -static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
> > > +static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
> > >  {
> > >      QLIST_REMOVE(ioeventfd, next);
> > > +
> > >      memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
> > >                                ioeventfd->match_data, ioeventfd->data,
> > >                                &ioeventfd->e);
> > > -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
> > > +
> > > +    if (ioeventfd->vfio) {
> > > +        struct vfio_device_ioeventfd vfio_ioeventfd;
> > > +
> > > +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
> > > +        vfio_ioeventfd.flags = ioeventfd->size;
> > > +        vfio_ioeventfd.data = ioeventfd->data;
> > > +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
> > > +                                ioeventfd->region_addr;
> > > +        vfio_ioeventfd.fd = -1;
> > > +
> > > +        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);  
> > 
> > (If the series is going to respin, I am thinking whether it would
> >  worth it to capture this error to dump something.  But it's for sure
> >  optional since even error happened we should have something in dmesg
> >  so it only matters on whether we also want something to be dumped
> >  from QEMU side too... After all there aren't much we can do)
> 
> I'm torn whether to use QEMU standard error handling here, ie.
> abort().  If we failed to remove the KVM ioeventfd, we'd abort before
> we get here, so there's no chance that the vfio ioeventfd will continue
> to be triggered.  Obviously leaving a vfio ioeventfd that we can't
> trigger and might interfere with future ioeventfds is not good, but do
> we really want to kill the VM because we possibly can't add an
> accelerator here later?  I'm inclined to say no, so I think I'll just
> error_report() unless there are objections.

I must be misleading when I said "dump something"... :) Yes the
error_report is exactly what I meant.

(Even an "error_report_once" but we don't have that yet)

> 
> > > +
> > > +    } else {
> > > +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
> > > +                            NULL, NULL, NULL);
> > > +    }
> > > +
> > >      event_notifier_cleanup(&ioeventfd->e);
> > >      trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
> > >                                (uint64_t)ioeventfd->addr, ioeventfd->size,  
> > 
> > [...]
> > 
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index ba1239551115..84e27c7bb2d1 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -3177,6 +3177,8 @@ static Property vfio_pci_dev_properties[] = {
> > >                       no_geforce_quirks, false),
> > >      DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
> > >                       false),
> > > +    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> > > +                     false),  
> > 
> > Here it looks more like a tri-state for me, so we can either:
> > 
> > - disable the acceleration in general, or
> > - enable QEMU-side acceleration only, or
> > - enable kernel-side acceleration
> 
> So you're looking for a Auto/Off/KVM-only option?  Do you really think
> it's worth defining a new tristate property for this sort of debugging
> option...
> 
> > In other words, IIUC x-no-vfio-ioeventfd won't matter much if
> > x-no-kvm-ioeventfd is already set.  So not sure whether a single
> > parameter would be nicer.
> 
> That's correct, but who do we expect to be using this option and why?
> I added enum OffAutoPCIBAR and the property to enable it for MSI-x
> relocation because it is an option that a normal user might reasonably
> need to use, given the right hardware on the right host, but it's an
> unsupported option because we cannot programmatically validate it.
> Support rests with the individual user, if it doesn't work, don't use
> it, if it helps, great.  Here we have options that are really only for
> debugging, to test whether something has gone wrong in this code,
> disable this bypass to make all device interactions visible through
> QEMU, or specifically to evaluate the performance of this path.  Is it
> reasonable to impose yet another property type on the common code for
> this use case when a couple bools work just fine, if perhaps not
> absolutely ideal?  Am I overlooking an existing tri-state that might be
> a reasonable match?

Oh so it's only for debugging.  Then I would be perfectly fine with
two parameters.

Actually I wasn't thinking about any tri-state property, I was
thinking about e.g. string-typed that can satisfy things like
tri-state.  But again now I don't think it'll worth it to repost with
that if only for debugging purpose.

Thanks!
Eric Auger May 4, 2018, 7:38 a.m. UTC | #6
Hi Alex,

On 05/03/2018 06:29 PM, Alex Williamson wrote:
> On Thu, 3 May 2018 12:56:03 +0800
> Peter Xu <peterx@redhat.com> wrote:
> 
>> On Tue, May 01, 2018 at 10:43:46AM -0600, Alex Williamson wrote:
>>
>> [...]
>>
>>> -static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
>>> +static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
>>>  {
>>>      QLIST_REMOVE(ioeventfd, next);
>>> +
>>>      memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
>>>                                ioeventfd->match_data, ioeventfd->data,
>>>                                &ioeventfd->e);
>>> -    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
>>> +
>>> +    if (ioeventfd->vfio) {
>>> +        struct vfio_device_ioeventfd vfio_ioeventfd;
>>> +
>>> +        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
>>> +        vfio_ioeventfd.flags = ioeventfd->size;
>>> +        vfio_ioeventfd.data = ioeventfd->data;
>>> +        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
>>> +                                ioeventfd->region_addr;
>>> +        vfio_ioeventfd.fd = -1;
>>> +
>>> +        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);  
>>
>> (If the series is going to respin, I am thinking whether it would
>>  worth it to capture this error to dump something.  But it's for sure
>>  optional since even error happened we should have something in dmesg
>>  so it only matters on whether we also want something to be dumped
>>  from QEMU side too... After all there aren't much we can do)
> 
> I'm torn whether to use QEMU standard error handling here, ie.
> abort().  If we failed to remove the KVM ioeventfd, we'd abort before
> we get here, so there's no chance that the vfio ioeventfd will continue
> to be triggered.  Obviously leaving a vfio ioeventfd that we can't
> trigger and might interfere with future ioeventfds is not good, but do
> we really want to kill the VM because we possibly can't add an
> accelerator here later?  I'm inclined to say no, so I think I'll just
> error_report() unless there are objections.
> 
>>> +
>>> +    } else {
>>> +        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
>>> +                            NULL, NULL, NULL);
>>> +    }
>>> +
>>>      event_notifier_cleanup(&ioeventfd->e);
>>>      trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
>>>                                (uint64_t)ioeventfd->addr, ioeventfd->size,  
>>
>> [...]
>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index ba1239551115..84e27c7bb2d1 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -3177,6 +3177,8 @@ static Property vfio_pci_dev_properties[] = {
>>>                       no_geforce_quirks, false),
>>>      DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
>>>                       false),
>>> +    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
>>> +                     false),  
>>
>> Here it looks more like a tri-state for me, so we can either:
>>
>> - disable the acceleration in general, or
>> - enable QEMU-side acceleration only, or
>> - enable kernel-side acceleration
> 
> So you're looking for a Auto/Off/KVM-only option?  Do you really think
> it's worth defining a new tristate property for this sort of debugging
> option...
> 
>> In other words, IIUC x-no-vfio-ioeventfd won't matter much if
>> x-no-kvm-ioeventfd is already set.  So not sure whether a single
>> parameter would be nicer.
> 
> That's correct, but who do we expect to be using this option and why?
> I added enum OffAutoPCIBAR and the property to enable it for MSI-x
> relocation because it is an option that a normal user might reasonably
> need to use, given the right hardware on the right host, but it's an
> unsupported option because we cannot programmatically validate it.
> Support rests with the individual user, if it doesn't work, don't use
> it, if it helps, great.  Here we have options that are really only for
> debugging, to test whether something has gone wrong in this code,
> disable this bypass to make all device interactions visible through
> QEMU, or specifically to evaluate the performance of this path.  Is it
> reasonable to impose yet another property type on the common code for
> this use case when a couple bools work just fine, if perhaps not
> absolutely ideal?  Am I overlooking an existing tri-state that might be
> a reasonable match?
> 
> Tying Eric's thread so we can discuss this in one place:
> 
> On Thu, 3 May 2018 17:20:18 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
>> I tend to agree with Peter about the 2 options. Only the KVM
>> acceleration brings benefit here?
> 
> Not at all, KVM acceleration does half the job (perhaps more than
> half), but if we want to completely bypass the userspace exit then we
> need both KVM triggering the ioeventfd and vfio consuming it.  It's
> useful for me at least to be able to compare completely disabled vs
> just KVM vs also adding vfio.  As above, I don't expect these to be
> used for anything other than debugging and performance evaluation, so I
> don't think it's worth making non-trivial code changes to enable the
> vfio-only path, nor do I think it's worthwhile to define a new
> tri-state property to resolve the slight mismatch we have vs two
> bools.  Again, let me know if you think I'm missing an existing
> property or if there are trivial changes we could make to make this map
> to an existing property.  Thanks,

Yes sorry I was not clear enough. I understood actual acceleration was
due to the fact the ioeventfd is handled on kernel side and not on user
side. So I thought we generally wanted both or nothing, hence a single
option. Nevertheless I am fine as well to leave the 2 debug options.

Thanks

Eric
> 
> Alex
>
diff mbox series

Patch

diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index 4cedc733bc0a..94be27dd0a3b 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -16,6 +16,7 @@ 
 #include "qemu/range.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include <sys/ioctl.h>
 #include "hw/nvram/fw_cfg.h"
 #include "pci.h"
 #include "trace.h"
@@ -287,13 +288,31 @@  static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
     return quirk;
 }
 
-static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
+static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
 {
     QLIST_REMOVE(ioeventfd, next);
+
     memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                               ioeventfd->match_data, ioeventfd->data,
                               &ioeventfd->e);
-    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
+
+    if (ioeventfd->vfio) {
+        struct vfio_device_ioeventfd vfio_ioeventfd;
+
+        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
+        vfio_ioeventfd.flags = ioeventfd->size;
+        vfio_ioeventfd.data = ioeventfd->data;
+        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
+                                ioeventfd->region_addr;
+        vfio_ioeventfd.fd = -1;
+
+        ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
+
+    } else {
+        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
+                            NULL, NULL, NULL);
+    }
+
     event_notifier_cleanup(&ioeventfd->e);
     trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
                               (uint64_t)ioeventfd->addr, ioeventfd->size,
@@ -307,7 +326,7 @@  static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 
     QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
         if (ioeventfd->dynamic) {
-            vfio_ioeventfd_exit(ioeventfd);
+            vfio_ioeventfd_exit(vdev, ioeventfd);
         }
     }
 }
@@ -361,13 +380,30 @@  static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
     ioeventfd->region = region;
     ioeventfd->region_addr = region_addr;
 
-    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
-                        vfio_ioeventfd_handler, NULL, ioeventfd);
+    if (!vdev->no_vfio_ioeventfd) {
+        struct vfio_device_ioeventfd vfio_ioeventfd;
+
+        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
+        vfio_ioeventfd.flags = ioeventfd->size;
+        vfio_ioeventfd.data = ioeventfd->data;
+        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
+                                ioeventfd->region_addr;
+        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
+
+        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
+                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
+    }
+
+    if (!ioeventfd->vfio) {
+        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
+                            vfio_ioeventfd_handler, NULL, ioeventfd);
+    }
+
     memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr,
                               ioeventfd->size, ioeventfd->match_data,
                               ioeventfd->data, &ioeventfd->e);
     trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
-                              size, data);
+                              size, data, ioeventfd->vfio);
 
     return ioeventfd;
 }
@@ -1835,7 +1871,7 @@  void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
 
     QLIST_FOREACH(quirk, &bar->quirks, next) {
         while (!QLIST_EMPTY(&quirk->ioeventfds)) {
-            vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
+            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
         }
 
         for (i = 0; i < quirk->nr_mem; i++) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ba1239551115..84e27c7bb2d1 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3177,6 +3177,8 @@  static Property vfio_pci_dev_properties[] = {
                      no_geforce_quirks, false),
     DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
                      false),
+    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
+                     false),
     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index dbb3aca9b3d2..dbb3932b50ef 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -35,6 +35,7 @@  typedef struct VFIOIOEventFD {
     hwaddr region_addr;
     bool match_data;
     bool dynamic;
+    bool vfio;
 } VFIOIOEventFD;
 
 typedef struct VFIOQuirk {
@@ -164,6 +165,7 @@  typedef struct VFIOPCIDevice {
     bool no_kvm_msix;
     bool no_geforce_quirks;
     bool no_kvm_ioeventfd;
+    bool no_vfio_ioeventfd;
     VFIODisplay *dpy;
 } VFIOPCIDevice;
 
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index f8f97d1ff90c..d2a74952e389 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -79,7 +79,7 @@  vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset(const char *name) "%s"
 vfio_ioeventfd_exit(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
 vfio_ioeventfd_handler(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d] -> 0x%"PRIx64
-vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
+vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data, bool vfio) "%s+0x%"PRIx64"[%d]:0x%"PRIx64" vfio:%d"
 vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [0x%03x] 0x%08x -> 0x%08x"
 vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
 vfio_pci_igd_opregion_enabled(const char *name) "%s"