diff mbox

[v10,16/26] intel_iommu: add support for split irqchip

Message ID 1466495274-5011-17-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu June 21, 2016, 7:47 a.m. UTC
In split irqchip mode, IOAPIC is working in user space, only update
kernel irq routes when entry changed. When IR is enabled, we directly
update the kernel with translated messages. It works just like a kernel
cache for the remapping entries.

Since KVM irqfd is using kernel gsi routes to deliver interrupts, as
long as we can support split irqchip, we will support irqfd as
well. Also, since kernel gsi routes will cache translated interrupts,
irqfd delivery will not suffer from any performance impact due to IR.

And, since we supported irqfd, vhost devices will be able to work
seamlessly with IR now. Logically this should contain both vhost-net and
vhost-user case.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c         |  7 +++++++
 include/hw/i386/intel_iommu.h |  1 +
 include/hw/i386/x86-iommu.h   |  4 ++++
 target-i386/kvm.c             | 27 +++++++++++++++++++++++++++
 trace-events                  |  3 +++
 5 files changed, 42 insertions(+)

Comments

Jan Kiszka June 25, 2016, 8:08 a.m. UTC | #1
On 2016-06-21 09:47, Peter Xu wrote:
> In split irqchip mode, IOAPIC is working in user space, only update
> kernel irq routes when entry changed. When IR is enabled, we directly
> update the kernel with translated messages. It works just like a kernel
> cache for the remapping entries.
> 
> Since KVM irqfd is using kernel gsi routes to deliver interrupts, as
> long as we can support split irqchip, we will support irqfd as
> well. Also, since kernel gsi routes will cache translated interrupts,
> irqfd delivery will not suffer from any performance impact due to IR.
> 
> And, since we supported irqfd, vhost devices will be able to work
> seamlessly with IR now. Logically this should contain both vhost-net and
> vhost-user case.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  hw/i386/intel_iommu.c         |  7 +++++++
>  include/hw/i386/intel_iommu.h |  1 +
>  include/hw/i386/x86-iommu.h   |  4 ++++
>  target-i386/kvm.c             | 27 +++++++++++++++++++++++++++
>  trace-events                  |  3 +++
>  5 files changed, 42 insertions(+)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index d874596..0eaffc6 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -2149,6 +2149,12 @@ do_not_translate:
>      return 0;
>  }
>  
> +static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
> +                         MSIMessage *dst, uint16_t sid)
> +{
> +    return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), src, dst);
> +}
> +
>  static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
>                                     uint64_t *data, unsigned size,
>                                     MemTxAttrs attrs)
> @@ -2393,6 +2399,7 @@ static void vtd_class_init(ObjectClass *klass, void *data)
>      dc->props = vtd_properties;
>      x86_class->realize = vtd_realize;
>      x86_class->find_add_as = vtd_find_add_as;
> +    x86_class->int_remap = vtd_int_remap;
>  }
>  
>  static const TypeInfo vtd_info = {
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b3f17d7..3bca390 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -26,6 +26,7 @@
>  #include "hw/i386/x86-iommu.h"
>  #include "hw/i386/ioapic.h"
>  #include "hw/pci/msi.h"
> +#include "hw/sysbus.h"
>  
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h
> index 07199be..b419ae5 100644
> --- a/include/hw/i386/x86-iommu.h
> +++ b/include/hw/i386/x86-iommu.h
> @@ -22,6 +22,7 @@
>  
>  #include "hw/sysbus.h"
>  #include "exec/memory.h"
> +#include "hw/pci/pci.h"
>  
>  #define  TYPE_X86_IOMMU_DEVICE  ("x86-iommu")
>  #define  X86_IOMMU_DEVICE(obj) \
> @@ -43,6 +44,9 @@ struct X86IOMMUClass {
>      DeviceRealize realize;
>      /* Find/Add IOMMU address space for specific PCI device */
>      AddressSpace *(*find_add_as)(X86IOMMUState *s, PCIBus *bus, int devfn);
> +    /* MSI-based interrupt remapping */
> +    int (*int_remap)(X86IOMMUState *iommu, MSIMessage *src,
> +                     MSIMessage *dst, uint16_t sid);
>  };
>  
>  struct X86IOMMUState {
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index f3698f1..bfa40b2 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -35,6 +35,7 @@
>  #include "hw/i386/apic.h"
>  #include "hw/i386/apic_internal.h"
>  #include "hw/i386/apic-msidef.h"
> +#include "hw/i386/intel_iommu.h"
>  
>  #include "exec/ioport.h"
>  #include "standard-headers/asm-x86/hyperv.h"
> @@ -42,6 +43,7 @@
>  #include "hw/pci/msi.h"
>  #include "migration/migration.h"
>  #include "exec/memattrs.h"
> +#include "trace.h"
>  
>  //#define DEBUG_KVM
>  
> @@ -3323,6 +3325,31 @@ int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
>  int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
>                               uint64_t address, uint32_t data, PCIDevice *dev)
>  {
> +    X86IOMMUState *iommu = x86_iommu_get_default();
> +
> +    if (iommu) {
> +        int ret;
> +        MSIMessage src, dst;
> +        X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu);
> +
> +        src.address = route->u.msi.address_hi;
> +        src.address <<= VTD_MSI_ADDR_HI_SHIFT;
> +        src.address |= route->u.msi.address_lo;
> +        src.data = route->u.msi.data;
> +
> +        ret = class->int_remap(iommu, &src, &dst, dev ? \
> +                               pci_requester_id(dev) : \
> +                               X86_IOMMU_SID_INVALID);
> +        if (ret) {
> +            trace_kvm_x86_fixup_msi_error(route->gsi);
> +            return 1;
> +        }
> +
> +        route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
> +        route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
> +        route->u.msi.data = dst.data;
> +    }
> +
>      return 0;
>  }
>  
> diff --git a/trace-events b/trace-events
> index da0d060..2982f64 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -2206,3 +2206,6 @@ gicv3_redist_write(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size,
>  gicv3_redist_badwrite(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size, bool secure) "GICv3 redistributor %x write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u secure %d: error"
>  gicv3_redist_set_irq(uint32_t cpu, int irq, int level) "GICv3 redistributor %x interrupt %d level changed to %d"
>  gicv3_redist_send_sgi(uint32_t cpu, int irq) "GICv3 redistributor %x pending SGI %d"
> +
> +# target-i386/kvm.c
> +kvm_x86_fixup_msi_error(uint32_t gsi) "VT-d failed to remap interrupt for GSI %" PRIu32
> 

For successful remappings, this is fine - it just caches the result in
an interrupt route. But what will happen with invalid interrupts?

My current understanding is that, because the translation happens on
activation of that interrupt source, not on actual signalling, the IOMMU
will report an error too early and none when the interrupt is actually
sent. That will lead to unwanted results, in the worst case
false-positiv IR error reports to the guest, no?

I think we need to do this:
- silently remap broken sources to an error sink
- hook up the error sink with the actual IOMMU model (Intel or AMD)
- when that source actually fires, let the sink report an IR
  translation error to the guest

Am I right?

Jan
Peter Xu June 25, 2016, 1:18 p.m. UTC | #2
On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:

[...]

> For successful remappings, this is fine - it just caches the result in
> an interrupt route. But what will happen with invalid interrupts?
> 
> My current understanding is that, because the translation happens on
> activation of that interrupt source, not on actual signalling, the IOMMU
> will report an error too early and none when the interrupt is actually
> sent. That will lead to unwanted results, in the worst case
> false-positiv IR error reports to the guest, no?
> 
> I think we need to do this:
> - silently remap broken sources to an error sink
> - hook up the error sink with the actual IOMMU model (Intel or AMD)
> - when that source actually fires, let the sink report an IR
>   translation error to the guest
> 
> Am I right?

Right. I totally missed this one. :(

Currently when split irqchip is specified, IOAPIC interrupts are
cached in kernel with type KVM_IRQ_ROUTING_MSI (which is the same as
irqfds). When guest specify a fault interrupt entry, it is possible
that we silently fail the update, and all further interrupts are still
the old and correct one.

I agree with your solution on this. First of all we update the
interrupt even if it's faulty, but we should mark it out. After that,
we should fire QEMU from kernel side when the fault interrupt is
triggered, so that QEMU IOMMU can still generate corresponding fault
report interrupt to guest (though for Intel IOMMU IR, we still haven't
handled any fault report yet, but we should be prepared for it).

So it seems that finally we cannot avoid touching KVM this time.

I have a thought on how to implement the "sink" you have mentioned:

First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
called:

  KVM_IRQ_ROUTING_EVENTFD

When KVM got this kind of interrupt, KVM does not trigger any real
interrupt to guest. Instead, it just do eventfd_signal() to a
pre-defined fd (maybe also with some data along with the notification,
so that we can put the error inside?), which is set during
KVM_SET_GSI_ROUTING ioctl().

After that, QEMU register all fault interrupts using this new
KVM_IRQ_ROUTING_EVENTFD type (rather than original
KVM_IRQ_ROUTING_MSI), assign a specific handler to handle the events
from these interrupts, and trigger IOMMU fault report path in that
handler.

(Here I used KVM_IRQ_ROUTING_EVENTFD rather than something like
 KVM_IRQ_ROUTING_FAULT_MSI to make the API a more general one, in case
 we can leverage it in other cases in the future)

Do you think the above workable?

No matter which solution we will have, I would still suggest we add
this as an "enhancement" after this series, since:

- there are works that depend on this series, so I would appreciate if
  this series can be merged first, so that other people can have a
  good basement (Radim's x2apic, David's AMD IOMMU). Though this is
  based on the assumption that the basic design of this series is
  workable...

- this problem will only exist for guest driver developers and should
  not happen for generic users (right?), so only a small subset of
  users might be affected.

Thanks,

-- peterx
Jan Kiszka June 25, 2016, 3:18 p.m. UTC | #3
On 2016-06-25 15:18, Peter Xu wrote:
> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
> 
> [...]
> 
>> For successful remappings, this is fine - it just caches the result in
>> an interrupt route. But what will happen with invalid interrupts?
>>
>> My current understanding is that, because the translation happens on
>> activation of that interrupt source, not on actual signalling, the IOMMU
>> will report an error too early and none when the interrupt is actually
>> sent. That will lead to unwanted results, in the worst case
>> false-positiv IR error reports to the guest, no?
>>
>> I think we need to do this:
>> - silently remap broken sources to an error sink
>> - hook up the error sink with the actual IOMMU model (Intel or AMD)
>> - when that source actually fires, let the sink report an IR
>>   translation error to the guest
>>
>> Am I right?
> 
> Right. I totally missed this one. :(
> 
> Currently when split irqchip is specified, IOAPIC interrupts are
> cached in kernel with type KVM_IRQ_ROUTING_MSI (which is the same as
> irqfds). When guest specify a fault interrupt entry, it is possible
> that we silently fail the update, and all further interrupts are still
> the old and correct one.
> 
> I agree with your solution on this. First of all we update the
> interrupt even if it's faulty, but we should mark it out. After that,
> we should fire QEMU from kernel side when the fault interrupt is
> triggered, so that QEMU IOMMU can still generate corresponding fault
> report interrupt to guest (though for Intel IOMMU IR, we still haven't
> handled any fault report yet, but we should be prepared for it).
> 
> So it seems that finally we cannot avoid touching KVM this time.
> 
> I have a thought on how to implement the "sink" you have mentioned:
> 
> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
> called:
> 
>   KVM_IRQ_ROUTING_EVENTFD

Not really, because all sources are either using eventfds, which you can
also terminate in user space (already done for vhost and vfio in certain
scenarios - IIRC) or originate there anyway (IOAPIC).

> 
> When KVM got this kind of interrupt, KVM does not trigger any real
> interrupt to guest. Instead, it just do eventfd_signal() to a
> pre-defined fd (maybe also with some data along with the notification,
> so that we can put the error inside?), which is set during
> KVM_SET_GSI_ROUTING ioctl().
> 
> After that, QEMU register all fault interrupts using this new
> KVM_IRQ_ROUTING_EVENTFD type (rather than original
> KVM_IRQ_ROUTING_MSI), assign a specific handler to handle the events
> from these interrupts, and trigger IOMMU fault report path in that
> handler.
> 
> (Here I used KVM_IRQ_ROUTING_EVENTFD rather than something like
>  KVM_IRQ_ROUTING_FAULT_MSI to make the API a more general one, in case
>  we can leverage it in other cases in the future)
> 
> Do you think the above workable?
> 
> No matter which solution we will have, I would still suggest we add
> this as an "enhancement" after this series, since:
> 
> - there are works that depend on this series, so I would appreciate if
>   this series can be merged first, so that other people can have a
>   good basement (Radim's x2apic, David's AMD IOMMU). Though this is
>   based on the assumption that the basic design of this series is
>   workable...

I understand, and it is probably safe...

> 
> - this problem will only exist for guest driver developers and should
>   not happen for generic users (right?), so only a small subset of
>   users might be affected.

...provided there is only little risk that some guest programs some
half-backed or stale message that would be rejected prematurely. But
that risk is most likely low.

Jan
Peter Xu June 26, 2016, 1:48 a.m. UTC | #4
On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
> On 2016-06-25 15:18, Peter Xu wrote:
> > On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:

[...]

> > I have a thought on how to implement the "sink" you have mentioned:
> > 
> > First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
> > called:
> > 
> >   KVM_IRQ_ROUTING_EVENTFD
> 
> Not really, because all sources are either using eventfds, which you can
> also terminate in user space (already done for vhost and vfio in certain
> scenarios - IIRC) or originate there anyway (IOAPIC).

But how should we handle the cases when the interrupt path are all in
kernel?

For vhost, data should be transfered all inside kernel when split
irqchip and irqfd are used: when vhost got data, it triggers irqfd to
deliver the interrupt to KVM. Along the way, we should all in kernel.

For vfio, we have vfio_msihandler() who handles the hardware IRQ and
then triggers irqfd as well to KVM. Again, it seems all in kernel
space, no chance to stop that as well.

Please correct me if I was wrong.

[...]

> > - there are works that depend on this series, so I would appreciate if
> >   this series can be merged first, so that other people can have a
> >   good basement (Radim's x2apic, David's AMD IOMMU). Though this is
> >   based on the assumption that the basic design of this series is
> >   workable...
> 
> I understand, and it is probably safe...
> 
> > 
> > - this problem will only exist for guest driver developers and should
> >   not happen for generic users (right?), so only a small subset of
> >   users might be affected.
> 
> ...provided there is only little risk that some guest programs some
> half-backed or stale message that would be rejected prematurely. But
> that risk is most likely low.

Yes, thanks!

-- peterx
Jan Kiszka June 26, 2016, 1:27 p.m. UTC | #5
On 2016-06-26 03:48, Peter Xu wrote:
> On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
>> On 2016-06-25 15:18, Peter Xu wrote:
>>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
> 
> [...]
> 
>>> I have a thought on how to implement the "sink" you have mentioned:
>>>
>>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
>>> called:
>>>
>>>   KVM_IRQ_ROUTING_EVENTFD
>>
>> Not really, because all sources are either using eventfds, which you can
>> also terminate in user space (already done for vhost and vfio in certain
>> scenarios - IIRC) or originate there anyway (IOAPIC).
> 
> But how should we handle the cases when the interrupt path are all in
> kernel?

There are none which we can't redirect (only full in-kernel irqchip
would have, but that's unsupported anyway).

> 
> For vhost, data should be transfered all inside kernel when split
> irqchip and irqfd are used: when vhost got data, it triggers irqfd to
> deliver the interrupt to KVM. Along the way, we should all in kernel.
> 
> For vfio, we have vfio_msihandler() who handles the hardware IRQ and
> then triggers irqfd as well to KVM. Again, it seems all in kernel
> space, no chance to stop that as well.
> 
> Please correct me if I was wrong.

Look at what vhost is doing e.g.: when a virtqueue is masked, it
installs an event notifier that records incoming events in a pending
state field. When it's unmasked, the corresponding KVM irqfd is installed.

Jan
Michael S. Tsirkin June 28, 2016, 6:10 a.m. UTC | #6
On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote:
> On 2016-06-26 03:48, Peter Xu wrote:
> > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
> >> On 2016-06-25 15:18, Peter Xu wrote:
> >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
> > 
> > [...]
> > 
> >>> I have a thought on how to implement the "sink" you have mentioned:
> >>>
> >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
> >>> called:
> >>>
> >>>   KVM_IRQ_ROUTING_EVENTFD
> >>
> >> Not really, because all sources are either using eventfds, which you can
> >> also terminate in user space (already done for vhost and vfio in certain
> >> scenarios - IIRC) or originate there anyway (IOAPIC).
> > 
> > But how should we handle the cases when the interrupt path are all in
> > kernel?
> 
> There are none which we can't redirect (only full in-kernel irqchip
> would have, but that's unsupported anyway).

I agree but I kind of feel it's ok to work on this
as a patch on top.
Additionally, some kind of test would have to be written
for these error cases, which is non-negligeable amount of worl.
So I'm inlined to merge this patchset - I feel it'll
help things make progress.

Thoughts? Jan - if you agree it's a good idea, acks would be appreciated.

> > 
> > For vhost, data should be transfered all inside kernel when split
> > irqchip and irqfd are used: when vhost got data, it triggers irqfd to
> > deliver the interrupt to KVM. Along the way, we should all in kernel.
> > 
> > For vfio, we have vfio_msihandler() who handles the hardware IRQ and
> > then triggers irqfd as well to KVM. Again, it seems all in kernel
> > space, no chance to stop that as well.
> > 
> > Please correct me if I was wrong.
> 
> Look at what vhost is doing e.g.: when a virtqueue is masked, it
> installs an event notifier that records incoming events in a pending
> state field. When it's unmasked, the corresponding KVM irqfd is installed.
> 
> Jan
> 
>
Peter Xu June 28, 2016, 7:25 a.m. UTC | #7
On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote:
> On 2016-06-26 03:48, Peter Xu wrote:
> > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
> >> On 2016-06-25 15:18, Peter Xu wrote:
> >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
> > 
> > [...]
> > 
> >>> I have a thought on how to implement the "sink" you have mentioned:
> >>>
> >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
> >>> called:
> >>>
> >>>   KVM_IRQ_ROUTING_EVENTFD
> >>
> >> Not really, because all sources are either using eventfds, which you can
> >> also terminate in user space (already done for vhost and vfio in certain
> >> scenarios - IIRC) or originate there anyway (IOAPIC).
> > 
> > But how should we handle the cases when the interrupt path are all in
> > kernel?
> 
> There are none which we can't redirect (only full in-kernel irqchip
> would have, but that's unsupported anyway).
> 
> > 
> > For vhost, data should be transfered all inside kernel when split
> > irqchip and irqfd are used: when vhost got data, it triggers irqfd to
> > deliver the interrupt to KVM. Along the way, we should all in kernel.
> > 
> > For vfio, we have vfio_msihandler() who handles the hardware IRQ and
> > then triggers irqfd as well to KVM. Again, it seems all in kernel
> > space, no chance to stop that as well.
> > 
> > Please correct me if I was wrong.
> 
> Look at what vhost is doing e.g.: when a virtqueue is masked, it
> installs an event notifier that records incoming events in a pending
> state field. When it's unmasked, the corresponding KVM irqfd is installed.

You are right. Thanks for the explaination.

-- peterx
Paolo Bonzini July 4, 2016, 2:32 p.m. UTC | #8
On 21/06/2016 09:47, Peter Xu wrote:
> @@ -3323,6 +3325,31 @@ int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
>  int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
>                               uint64_t address, uint32_t data, PCIDevice *dev)
>  {
> +    X86IOMMUState *iommu = x86_iommu_get_default();
> +
> +    if (iommu) {
> +        int ret;
> +        MSIMessage src, dst;
> +        X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu);
> +
> +        src.address = route->u.msi.address_hi;
> +        src.address <<= VTD_MSI_ADDR_HI_SHIFT;
> +        src.address |= route->u.msi.address_lo;
> +        src.data = route->u.msi.data;
> +
> +        ret = class->int_remap(iommu, &src, &dst, dev ? \
> +                               pci_requester_id(dev) : \
> +                               X86_IOMMU_SID_INVALID);
> +        if (ret) {
> +            trace_kvm_x86_fixup_msi_error(route->gsi);
> +            return 1;
> +        }
> +
> +        route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
> +        route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
> +        route->u.msi.data = dst.data;
> +    }
> +
>      return 0;
>  }

I don't like this particularly.  Instead, I think the X86 IOMMU class
should implement a new interface "MSIRemapper", and PCIBus should have a
pointer to MSIRemapper*.  Then this can become:

    if (dev) {
        PCIBus *bus = PCI_BUS(qdev_get_parent_bus(DEVICE(dev));
        if (bus->remapper) {
            msi_remapper_fixup_route(bus->remapper, &src, &dst,
                                     pci_requester_id(dev));
        }
    }

That said, I'm okay with the patch as is because the issue is not with
the x86 implementation but with kvm_arch_fixup_msi_route.  S390 should
be able to do the same, by implementing MSIRemapper in S390pciState (I
think).

Paolo
Peter Xu Jan. 3, 2017, 6:15 a.m. UTC | #9
On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote:
> On 2016-06-26 03:48, Peter Xu wrote:
> > On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
> >> On 2016-06-25 15:18, Peter Xu wrote:
> >>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
> > 
> > [...]
> > 
> >>> I have a thought on how to implement the "sink" you have mentioned:
> >>>
> >>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
> >>> called:
> >>>
> >>>   KVM_IRQ_ROUTING_EVENTFD
> >>
> >> Not really, because all sources are either using eventfds, which you can
> >> also terminate in user space (already done for vhost and vfio in certain
> >> scenarios - IIRC) or originate there anyway (IOAPIC).
> > 
> > But how should we handle the cases when the interrupt path are all in
> > kernel?
> 
> There are none which we can't redirect (only full in-kernel irqchip
> would have, but that's unsupported anyway).
> 
> > 
> > For vhost, data should be transfered all inside kernel when split
> > irqchip and irqfd are used: when vhost got data, it triggers irqfd to
> > deliver the interrupt to KVM. Along the way, we should all in kernel.
> > 
> > For vfio, we have vfio_msihandler() who handles the hardware IRQ and
> > then triggers irqfd as well to KVM. Again, it seems all in kernel
> > space, no chance to stop that as well.
> > 
> > Please correct me if I was wrong.
> 
> Look at what vhost is doing e.g.: when a virtqueue is masked, it
> installs an event notifier that records incoming events in a pending
> state field. When it's unmasked, the corresponding KVM irqfd is installed.

Hmm I think it's time I pick up this topic up again... :)

Since it's been half a year from the last post of this thread (I
believe this thread is the so-called "cold data" and should be stored
on tapes already... and sorry fot the long delay), I'd like to do a
quick summary on this: interrupt remap still cannot work well when we
install fault interrupts - when that happens, we should inject VT-d
fault, rather than keeping silence.

The suggestion from Jan above should be a good solution that only need
to touch qemu part - that's the most benefit AFAIU. However, OTOH IMO
we need to modify all the kvm irqfd users with this fix (pci-assign,
ioapic, ivshmem, vfio-pci, virtio) - we need to have all these devices
init with an "fault sink" eventfd, then when we detected specific
irqfd install error, we install the "fault sink". What's worse, if we
add new devices with irqfd support, we need to implement the same
error handling logic as well. Am I understanding it correctly? If so,
isn't that awkward?

Now I am re-thinking about my KVM_IRQ_ROUTING_EVENTFD proposal to do
it - in that case, we should not need to worry about the users of kvm
irqfd, and the error handling is done automatically even with new
irqfd users coming in. The disadvantage is of course we need to touch
both qemu and kvm, also we need to touch KVM API for it (though I
think it'll only need very small change in KVM). And not sure whether
that would worth it.

Or, any better way to do it?

Hope I didn't miss anything. Comments are welcomed!

Regards,

-- peterx
Jan Kiszka Jan. 4, 2017, 10:33 a.m. UTC | #10
On 2017-01-03 07:15, Peter Xu wrote:
> On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote:
>> On 2016-06-26 03:48, Peter Xu wrote:
>>> On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
>>>> On 2016-06-25 15:18, Peter Xu wrote:
>>>>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
>>>
>>> [...]
>>>
>>>>> I have a thought on how to implement the "sink" you have mentioned:
>>>>>
>>>>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
>>>>> called:
>>>>>
>>>>>   KVM_IRQ_ROUTING_EVENTFD
>>>>
>>>> Not really, because all sources are either using eventfds, which you can
>>>> also terminate in user space (already done for vhost and vfio in certain
>>>> scenarios - IIRC) or originate there anyway (IOAPIC).
>>>
>>> But how should we handle the cases when the interrupt path are all in
>>> kernel?
>>
>> There are none which we can't redirect (only full in-kernel irqchip
>> would have, but that's unsupported anyway).
>>
>>>
>>> For vhost, data should be transfered all inside kernel when split
>>> irqchip and irqfd are used: when vhost got data, it triggers irqfd to
>>> deliver the interrupt to KVM. Along the way, we should all in kernel.
>>>
>>> For vfio, we have vfio_msihandler() who handles the hardware IRQ and
>>> then triggers irqfd as well to KVM. Again, it seems all in kernel
>>> space, no chance to stop that as well.
>>>
>>> Please correct me if I was wrong.
>>
>> Look at what vhost is doing e.g.: when a virtqueue is masked, it
>> installs an event notifier that records incoming events in a pending
>> state field. When it's unmasked, the corresponding KVM irqfd is installed.
> 
> Hmm I think it's time I pick up this topic up again... :)
> 
> Since it's been half a year from the last post of this thread (I
> believe this thread is the so-called "cold data" and should be stored
> on tapes already... and sorry fot the long delay), I'd like to do a
> quick summary on this: interrupt remap still cannot work well when we
> install fault interrupts - when that happens, we should inject VT-d
> fault, rather than keeping silence.
> 
> The suggestion from Jan above should be a good solution that only need
> to touch qemu part - that's the most benefit AFAIU. However, OTOH IMO
> we need to modify all the kvm irqfd users with this fix (pci-assign,
> ioapic, ivshmem, vfio-pci, virtio) - we need to have all these devices
> init with an "fault sink" eventfd, then when we detected specific
> irqfd install error, we install the "fault sink". What's worse, if we
> add new devices with irqfd support, we need to implement the same
> error handling logic as well. Am I understanding it correctly? If so,
> isn't that awkward?
> 
> Now I am re-thinking about my KVM_IRQ_ROUTING_EVENTFD proposal to do
> it - in that case, we should not need to worry about the users of kvm
> irqfd, and the error handling is done automatically even with new
> irqfd users coming in. The disadvantage is of course we need to touch
> both qemu and kvm, also we need to touch KVM API for it (though I
> think it'll only need very small change in KVM). And not sure whether
> that would worth it.
> 
> Or, any better way to do it?
> 
> Hope I didn't miss anything. Comments are welcomed!
> 

I don't have the details in mind again, but I suppose the only
alternative to fixing a QEMU boilerplate code issue with new KVM kernel
interface is abstracting the common patterns in QEMU that all the irqfd
users share and solve solve that topic once. Might turn out, though,
that the exiting kernel interface prevents this...

Jan
Peter Xu Jan. 5, 2017, 2:21 a.m. UTC | #11
On Wed, Jan 04, 2017 at 11:33:36AM +0100, Jan Kiszka wrote:
> On 2017-01-03 07:15, Peter Xu wrote:
> > On Sun, Jun 26, 2016 at 03:27:50PM +0200, Jan Kiszka wrote:
> >> On 2016-06-26 03:48, Peter Xu wrote:
> >>> On Sat, Jun 25, 2016 at 05:18:40PM +0200, Jan Kiszka wrote:
> >>>> On 2016-06-25 15:18, Peter Xu wrote:
> >>>>> On Sat, Jun 25, 2016 at 10:08:10AM +0200, Jan Kiszka wrote:
> >>>
> >>> [...]
> >>>
> >>>>> I have a thought on how to implement the "sink" you have mentioned:
> >>>>>
> >>>>> First of all, in KVM, we provide a new KVM_IRQ_ROUTING_* type, maybe
> >>>>> called:
> >>>>>
> >>>>>   KVM_IRQ_ROUTING_EVENTFD
> >>>>
> >>>> Not really, because all sources are either using eventfds, which you can
> >>>> also terminate in user space (already done for vhost and vfio in certain
> >>>> scenarios - IIRC) or originate there anyway (IOAPIC).
> >>>
> >>> But how should we handle the cases when the interrupt path are all in
> >>> kernel?
> >>
> >> There are none which we can't redirect (only full in-kernel irqchip
> >> would have, but that's unsupported anyway).
> >>
> >>>
> >>> For vhost, data should be transfered all inside kernel when split
> >>> irqchip and irqfd are used: when vhost got data, it triggers irqfd to
> >>> deliver the interrupt to KVM. Along the way, we should all in kernel.
> >>>
> >>> For vfio, we have vfio_msihandler() who handles the hardware IRQ and
> >>> then triggers irqfd as well to KVM. Again, it seems all in kernel
> >>> space, no chance to stop that as well.
> >>>
> >>> Please correct me if I was wrong.
> >>
> >> Look at what vhost is doing e.g.: when a virtqueue is masked, it
> >> installs an event notifier that records incoming events in a pending
> >> state field. When it's unmasked, the corresponding KVM irqfd is installed.
> > 
> > Hmm I think it's time I pick up this topic up again... :)
> > 
> > Since it's been half a year from the last post of this thread (I
> > believe this thread is the so-called "cold data" and should be stored
> > on tapes already... and sorry fot the long delay), I'd like to do a
> > quick summary on this: interrupt remap still cannot work well when we
> > install fault interrupts - when that happens, we should inject VT-d
> > fault, rather than keeping silence.
> > 
> > The suggestion from Jan above should be a good solution that only need
> > to touch qemu part - that's the most benefit AFAIU. However, OTOH IMO
> > we need to modify all the kvm irqfd users with this fix (pci-assign,
> > ioapic, ivshmem, vfio-pci, virtio) - we need to have all these devices
> > init with an "fault sink" eventfd, then when we detected specific
> > irqfd install error, we install the "fault sink". What's worse, if we
> > add new devices with irqfd support, we need to implement the same
> > error handling logic as well. Am I understanding it correctly? If so,
> > isn't that awkward?
> > 
> > Now I am re-thinking about my KVM_IRQ_ROUTING_EVENTFD proposal to do
> > it - in that case, we should not need to worry about the users of kvm
> > irqfd, and the error handling is done automatically even with new
> > irqfd users coming in. The disadvantage is of course we need to touch
> > both qemu and kvm, also we need to touch KVM API for it (though I
> > think it'll only need very small change in KVM). And not sure whether
> > that would worth it.
> > 
> > Or, any better way to do it?
> > 
> > Hope I didn't miss anything. Comments are welcomed!
> > 
> 
> I don't have the details in mind again, but I suppose the only
> alternative to fixing a QEMU boilerplate code issue with new KVM kernel
> interface is abstracting the common patterns in QEMU that all the irqfd
> users share and solve solve that topic once. Might turn out, though,
> that the exiting kernel interface prevents this...

Hmm, (after a quick glance) I was just afraid that I might need to
touch lots of codes in QEMU even to provide such a common layer for
this single fault tolerance feature.

Then let me think it over again... Thanks Jan!

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d874596..0eaffc6 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2149,6 +2149,12 @@  do_not_translate:
     return 0;
 }
 
+static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
+                         MSIMessage *dst, uint16_t sid)
+{
+    return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), src, dst);
+}
+
 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
                                    uint64_t *data, unsigned size,
                                    MemTxAttrs attrs)
@@ -2393,6 +2399,7 @@  static void vtd_class_init(ObjectClass *klass, void *data)
     dc->props = vtd_properties;
     x86_class->realize = vtd_realize;
     x86_class->find_add_as = vtd_find_add_as;
+    x86_class->int_remap = vtd_int_remap;
 }
 
 static const TypeInfo vtd_info = {
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b3f17d7..3bca390 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -26,6 +26,7 @@ 
 #include "hw/i386/x86-iommu.h"
 #include "hw/i386/ioapic.h"
 #include "hw/pci/msi.h"
+#include "hw/sysbus.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h
index 07199be..b419ae5 100644
--- a/include/hw/i386/x86-iommu.h
+++ b/include/hw/i386/x86-iommu.h
@@ -22,6 +22,7 @@ 
 
 #include "hw/sysbus.h"
 #include "exec/memory.h"
+#include "hw/pci/pci.h"
 
 #define  TYPE_X86_IOMMU_DEVICE  ("x86-iommu")
 #define  X86_IOMMU_DEVICE(obj) \
@@ -43,6 +44,9 @@  struct X86IOMMUClass {
     DeviceRealize realize;
     /* Find/Add IOMMU address space for specific PCI device */
     AddressSpace *(*find_add_as)(X86IOMMUState *s, PCIBus *bus, int devfn);
+    /* MSI-based interrupt remapping */
+    int (*int_remap)(X86IOMMUState *iommu, MSIMessage *src,
+                     MSIMessage *dst, uint16_t sid);
 };
 
 struct X86IOMMUState {
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index f3698f1..bfa40b2 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -35,6 +35,7 @@ 
 #include "hw/i386/apic.h"
 #include "hw/i386/apic_internal.h"
 #include "hw/i386/apic-msidef.h"
+#include "hw/i386/intel_iommu.h"
 
 #include "exec/ioport.h"
 #include "standard-headers/asm-x86/hyperv.h"
@@ -42,6 +43,7 @@ 
 #include "hw/pci/msi.h"
 #include "migration/migration.h"
 #include "exec/memattrs.h"
+#include "trace.h"
 
 //#define DEBUG_KVM
 
@@ -3323,6 +3325,31 @@  int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
                              uint64_t address, uint32_t data, PCIDevice *dev)
 {
+    X86IOMMUState *iommu = x86_iommu_get_default();
+
+    if (iommu) {
+        int ret;
+        MSIMessage src, dst;
+        X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu);
+
+        src.address = route->u.msi.address_hi;
+        src.address <<= VTD_MSI_ADDR_HI_SHIFT;
+        src.address |= route->u.msi.address_lo;
+        src.data = route->u.msi.data;
+
+        ret = class->int_remap(iommu, &src, &dst, dev ? \
+                               pci_requester_id(dev) : \
+                               X86_IOMMU_SID_INVALID);
+        if (ret) {
+            trace_kvm_x86_fixup_msi_error(route->gsi);
+            return 1;
+        }
+
+        route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
+        route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
+        route->u.msi.data = dst.data;
+    }
+
     return 0;
 }
 
diff --git a/trace-events b/trace-events
index da0d060..2982f64 100644
--- a/trace-events
+++ b/trace-events
@@ -2206,3 +2206,6 @@  gicv3_redist_write(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size,
 gicv3_redist_badwrite(uint32_t cpu, uint64_t offset, uint64_t data, unsigned size, bool secure) "GICv3 redistributor %x write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u secure %d: error"
 gicv3_redist_set_irq(uint32_t cpu, int irq, int level) "GICv3 redistributor %x interrupt %d level changed to %d"
 gicv3_redist_send_sgi(uint32_t cpu, int irq) "GICv3 redistributor %x pending SGI %d"
+
+# target-i386/kvm.c
+kvm_x86_fixup_msi_error(uint32_t gsi) "VT-d failed to remap interrupt for GSI %" PRIu32