Patchwork [RFC,12/45] msi: Introduce MSIRoutingCache

login
register
mail settings
Submitter Jan Kiszka
Date Oct. 17, 2011, 9:27 a.m.
Message ID <d2dd951083341c26ce23d4f24b8968c9d19a8c6b.1318843693.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/120187/
State New
Headers show

Comments

Jan Kiszka - Oct. 17, 2011, 9:27 a.m.
This cache will help us implementing KVM in-kernel irqchip support
without spreading hooks all over the place.

KVM requires us to register it first and then deliver it by raising a
pseudo IRQ line returned on registration. While this could be changed
for QEMU-originated MSI messages by adding direct MSI injection, we will
still need this translation for irqfd-originated messages. The
MSIRoutingCache will allow to track those registrations and update them
lazily before the actual delivery. This avoids having to track MSI
vectors at device level (like qemu-kvm currently does).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 hw/apic.c     |    5 +++--
 hw/apic.h     |    2 +-
 hw/msi.c      |   10 +++++++---
 hw/msi.h      |   14 +++++++++++++-
 hw/msix.c     |    7 ++++++-
 hw/pc.c       |    4 ++--
 hw/pci.h      |    4 ++++
 qemu-common.h |    1 +
 8 files changed, 37 insertions(+), 10 deletions(-)
Avi Kivity - Oct. 17, 2011, 12:17 p.m.
On 10/17/2011 01:31 PM, Jan Kiszka wrote:
> > 
> > Just to make sure I understand this completely:  a hash table indexed by
> > MSIMessage in kvm code would avoid this?  You'd just allocate on demand
> > when seeing a new MSIMessage and free on an LRU basis, avoiding pinned
> > entries.
> > 
> > I'm not advocating this (yet), just want to understand the tradeoffs.
>
> Practically, that may work. I just wanted to avoid searching. And for
> static routes (irqfd, device assignment) you still need caches anyway, so
> I decided to use them consistently.

Okay.  Even if we do decide to go for transparent caches, it should be
done after this is merged, to avoid excessive churn.
Michael S. Tsirkin - Oct. 17, 2011, 3:37 p.m.
On Mon, Oct 17, 2011 at 01:19:56PM +0200, Jan Kiszka wrote:
> On 2011-10-17 13:06, Avi Kivity wrote:
> > On 10/17/2011 11:27 AM, Jan Kiszka wrote:
> >> This cache will help us implementing KVM in-kernel irqchip support
> >> without spreading hooks all over the place.
> >>
> >> KVM requires us to register it first and then deliver it by raising a
> >> pseudo IRQ line returned on registration. While this could be changed
> >> for QEMU-originated MSI messages by adding direct MSI injection, we will
> >> still need this translation for irqfd-originated messages. The
> >> MSIRoutingCache will allow to track those registrations and update them
> >> lazily before the actual delivery. This avoid having to track MSI
> >> vectors at device level (like qemu-kvm currently does).
> >>
> >>
> >> +typedef enum {
> >> +    MSI_ROUTE_NONE = 0,
> >> +    MSI_ROUTE_STATIC,
> >> +} MSIRouteType;
> >> +
> >> +struct MSIRoutingCache {
> >> +    MSIMessage msg;
> >> +    MSIRouteType type;
> >> +    int kvm_gsi;
> >> +    int kvm_irqfd;
> >> +};
> >> +
> >> diff --git a/hw/pci.h b/hw/pci.h
> >> index 329ab32..5b5d2fd 100644
> >> --- a/hw/pci.h
> >> +++ b/hw/pci.h
> >> @@ -197,6 +197,10 @@ struct PCIDevice {
> >>      MemoryRegion rom;
> >>      uint32_t rom_bar;
> >>  
> >> +    /* MSI routing chaches */
> >> +    MSIRoutingCache *msi_cache;
> >> +    MSIRoutingCache *msix_cache;
> >> +
> >>      /* MSI entries */
> >>      int msi_entries_nr;
> >>      struct KVMMsiMessage *msi_irq_entries;
> > 
> > IMO this needlessly leaks kvm information into core qemu.  The cache
> > should be completely hidden in kvm code.
> > 
> > I think msi_deliver() can hide the use of the cache completely.  For
> > pre-registered events like kvm's irqfd, you can use something like
> > 
> >   qemu_irq qemu_msi_irq(MSIMessage msg)
> > 
> > for non-kvm, it simply returns a qemu_irq that triggers a stl_phys();
> > for kvm, it allocates an irqfd and a permanent entry in the cache and
> > returns a qemu_irq that triggers the irqfd.
> 
> See my previous mail: you want to track the life-cycle of an MSI
> source to avoid generating routes for identical sources. A message is
> not a source. Two identical messages can come from different sources.

Since MSI messages are edge triggered, I don't see how this
would work without losing interrupts. And AFAIK,
existing guests do not use the same message for
different sources.

> So
> we need a separate data structure for that purpose.
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Michael S. Tsirkin - Oct. 17, 2011, 3:43 p.m.
On Mon, Oct 17, 2011 at 11:27:46AM +0200, Jan Kiszka wrote:
> This cache will help us implementing KVM in-kernel irqchip support
> without spreading hooks all over the place.
> 
> KVM requires us to register it first and then deliver it by raising a
> pseudo IRQ line returned on registration. While this could be changed
> for QEMU-originated MSI messages by adding direct MSI injection, we will
> still need this translation for irqfd-originated messages. The
> MSIRoutingCache will allow to track those registrations and update them
> lazily before the actual delivery. This avoid having to track MSI
> vectors at device level (like qemu-kvm currently does).
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

So if many devices are added, exhausting the number of GSIs supported,
we get terrible performance instead of simply failing outright.

To me, this looks more like a bug than a feature ...


> ---
>  hw/apic.c     |    5 +++--
>  hw/apic.h     |    2 +-
>  hw/msi.c      |   10 +++++++---
>  hw/msi.h      |   14 +++++++++++++-
>  hw/msix.c     |    7 ++++++-
>  hw/pc.c       |    4 ++--
>  hw/pci.h      |    4 ++++
>  qemu-common.h |    1 +
>  8 files changed, 37 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/apic.c b/hw/apic.c
> index c1d557d..6811ae1 100644
> --- a/hw/apic.c
> +++ b/hw/apic.c
> @@ -804,7 +804,7 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>      return val;
>  }
>  
> -void apic_deliver_msi(MSIMessage *msg)
> +void apic_deliver_msi(MSIMessage *msg, MSIRoutingCache *cache)
>  {
>      uint8_t dest =
>          (msg->address & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> @@ -829,8 +829,9 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>           * Mapping them on the global bus happens to work because
>           * MSI registers are reserved in APIC MMIO and vice versa. */
>          MSIMessage msg = { .address = addr, .data = val };
> +        static MSIRoutingCache cache;
>  
> -        msi_deliver(&msg);
> +        msi_deliver(&msg, &cache);
>          return;
>      }
>  
> diff --git a/hw/apic.h b/hw/apic.h
> index fa848fd..353ea3a 100644
> --- a/hw/apic.h
> +++ b/hw/apic.h
> @@ -18,7 +18,7 @@ void cpu_set_apic_tpr(DeviceState *s, uint8_t val);
>  uint8_t cpu_get_apic_tpr(DeviceState *s);
>  void apic_init_reset(DeviceState *s);
>  void apic_sipi(DeviceState *s);
> -void apic_deliver_msi(MSIMessage *msg);
> +void apic_deliver_msi(MSIMessage *msg, MSIRoutingCache *cache);
>  
>  /* pc.c */
>  int cpu_is_bsp(CPUState *env);
> diff --git a/hw/msi.c b/hw/msi.c
> index 9055155..c8ccb17 100644
> --- a/hw/msi.c
> +++ b/hw/msi.c
> @@ -40,13 +40,13 @@
>  /* Flag for interrupt controller to declare MSI/MSI-X support */
>  bool msi_supported;
>  
> -static void msi_unsupported(MSIMessage *msg)
> +static void msi_unsupported(MSIMessage *msg, MSIRoutingCache *cache)
>  {
>      /* If we get here, the board failed to register a delivery handler. */
>      abort();
>  }
>  
> -void (*msi_deliver)(MSIMessage *msg) = msi_unsupported;
> +void (*msi_deliver)(MSIMessage *msg, MSIRoutingCache *cache) = msi_unsupported;
>  
>  /* If we get rid of cap allocator, we won't need this. */
>  static inline uint8_t msi_cap_sizeof(uint16_t flags)
> @@ -288,6 +288,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
>                       0xffffffff >> (PCI_MSI_VECTORS_MAX - nr_vectors));
>      }
>  
> +    dev->msi_cache = g_malloc0(nr_vectors * sizeof(*dev->msi_cache));
> +
>      if (kvm_enabled() && kvm_irqchip_in_kernel()) {
>          dev->msi_irq_entries = g_malloc(nr_vectors *
>                                          sizeof(*dev->msix_irq_entries));
> @@ -312,6 +314,8 @@ void msi_uninit(struct PCIDevice *dev)
>          g_free(dev->msi_irq_entries);
>      }
>  
> +    g_free(dev->msi_cache);
> +
>      pci_del_capability(dev, PCI_CAP_ID_MSI, cap_size);
>      dev->cap_present &= ~QEMU_PCI_CAP_MSI;
>  
> @@ -389,7 +393,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
>                     "notify vector 0x%x"
>                     " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
>                     vector, msg.address, msg.data);
> -    msi_deliver(&msg);
> +    msi_deliver(&msg, &dev->msi_cache[vector]);
>  }
>  
>  /* Normally called by pci_default_write_config(). */
> diff --git a/hw/msi.h b/hw/msi.h
> index f3152f3..20ae215 100644
> --- a/hw/msi.h
> +++ b/hw/msi.h
> @@ -29,6 +29,18 @@ struct MSIMessage {
>      uint32_t data;
>  };
>  
> +typedef enum {
> +    MSI_ROUTE_NONE = 0,
> +    MSI_ROUTE_STATIC,
> +} MSIRouteType;
> +
> +struct MSIRoutingCache {
> +    MSIMessage msg;
> +    MSIRouteType type;
> +    int kvm_gsi;
> +    int kvm_irqfd;
> +};
> +
>  extern bool msi_supported;
>  
>  bool msi_enabled(const PCIDevice *dev);
> @@ -46,6 +58,6 @@ static inline bool msi_present(const PCIDevice *dev)
>      return dev->cap_present & QEMU_PCI_CAP_MSI;
>  }
>  
> -extern void (*msi_deliver)(MSIMessage *msg);
> +extern void (*msi_deliver)(MSIMessage *msg, MSIRoutingCache *cache);
>  
>  #endif /* QEMU_MSI_H */
> diff --git a/hw/msix.c b/hw/msix.c
> index 08cc526..e824aef 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -358,6 +358,8 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>      if (ret)
>          goto err_config;
>  
> +    dev->msix_cache = g_malloc0(nentries * sizeof *dev->msix_cache);
> +
>      if (kvm_enabled() && kvm_irqchip_in_kernel()) {
>          dev->msix_irq_entries = g_malloc(nentries *
>                                           sizeof *dev->msix_irq_entries);
> @@ -409,6 +411,9 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
>      dev->msix_entry_used = NULL;
>      g_free(dev->msix_irq_entries);
>      dev->msix_irq_entries = NULL;
> +
> +    g_free(dev->msix_cache);
> +
>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
>      return 0;
>  }
> @@ -478,7 +483,7 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>  
>      msix_message_from_vector(dev, vector, &msg);
>  
> -    msi_deliver(&msg);
> +    msi_deliver(&msg, &dev->msix_cache[vector]);
>  }
>  
>  void msix_reset(PCIDevice *dev)
> diff --git a/hw/pc.c b/hw/pc.c
> index 7d29a4a..4d8b524 100644
> --- a/hw/pc.c
> +++ b/hw/pc.c
> @@ -103,10 +103,10 @@ void isa_irq_handler(void *opaque, int n, int level)
>          qemu_set_irq(isa->ioapic[n], level);
>  };
>  
> -static void pc_msi_deliver(MSIMessage *msg)
> +static void pc_msi_deliver(MSIMessage *msg, MSIRoutingCache *cache)
>  {
>      if ((msg->address & 0xfff00000) == MSI_ADDR_BASE) {
> -        apic_deliver_msi(msg);
> +        apic_deliver_msi(msg, cache);
>      } else {
>          stl_phys(msg->address, msg->data);
>      }
> diff --git a/hw/pci.h b/hw/pci.h
> index 329ab32..5b5d2fd 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -197,6 +197,10 @@ struct PCIDevice {
>      MemoryRegion rom;
>      uint32_t rom_bar;
>  
> +    /* MSI routing chaches */
> +    MSIRoutingCache *msi_cache;
> +    MSIRoutingCache *msix_cache;
> +
>      /* MSI entries */
>      int msi_entries_nr;
>      struct KVMMsiMessage *msi_irq_entries;
> diff --git a/qemu-common.h b/qemu-common.h
> index d3901bd..c1d1614 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -16,6 +16,7 @@ typedef struct QEMUFile QEMUFile;
>  typedef struct QEMUBH QEMUBH;
>  typedef struct DeviceState DeviceState;
>  typedef struct MSIMessage MSIMessage;
> +typedef struct MSIRoutingCache MSIRoutingCache;
>  
>  struct Monitor;
>  typedef struct Monitor Monitor;
> -- 
> 1.7.3.4
Jan Kiszka - Oct. 17, 2011, 7:19 p.m.
On 2011-10-17 17:37, Michael S. Tsirkin wrote:
> On Mon, Oct 17, 2011 at 01:19:56PM +0200, Jan Kiszka wrote:
>> On 2011-10-17 13:06, Avi Kivity wrote:
>>> On 10/17/2011 11:27 AM, Jan Kiszka wrote:
>>>> This cache will help us implementing KVM in-kernel irqchip support
>>>> without spreading hooks all over the place.
>>>>
>>>> KVM requires us to register it first and then deliver it by raising a
>>>> pseudo IRQ line returned on registration. While this could be changed
>>>> for QEMU-originated MSI messages by adding direct MSI injection, we will
>>>> still need this translation for irqfd-originated messages. The
>>>> MSIRoutingCache will allow to track those registrations and update them
>>>> lazily before the actual delivery. This avoid having to track MSI
>>>> vectors at device level (like qemu-kvm currently does).
>>>>
>>>>
>>>> +typedef enum {
>>>> +    MSI_ROUTE_NONE = 0,
>>>> +    MSI_ROUTE_STATIC,
>>>> +} MSIRouteType;
>>>> +
>>>> +struct MSIRoutingCache {
>>>> +    MSIMessage msg;
>>>> +    MSIRouteType type;
>>>> +    int kvm_gsi;
>>>> +    int kvm_irqfd;
>>>> +};
>>>> +
>>>> diff --git a/hw/pci.h b/hw/pci.h
>>>> index 329ab32..5b5d2fd 100644
>>>> --- a/hw/pci.h
>>>> +++ b/hw/pci.h
>>>> @@ -197,6 +197,10 @@ struct PCIDevice {
>>>>      MemoryRegion rom;
>>>>      uint32_t rom_bar;
>>>>  
>>>> +    /* MSI routing chaches */
>>>> +    MSIRoutingCache *msi_cache;
>>>> +    MSIRoutingCache *msix_cache;
>>>> +
>>>>      /* MSI entries */
>>>>      int msi_entries_nr;
>>>>      struct KVMMsiMessage *msi_irq_entries;
>>>
>>> IMO this needlessly leaks kvm information into core qemu.  The cache
>>> should be completely hidden in kvm code.
>>>
>>> I think msi_deliver() can hide the use of the cache completely.  For
>>> pre-registered events like kvm's irqfd, you can use something like
>>>
>>>   qemu_irq qemu_msi_irq(MSIMessage msg)
>>>
>>> for non-kvm, it simply returns a qemu_irq that triggers a stl_phys();
>>> for kvm, it allocates an irqfd and a permanent entry in the cache and
>>> returns a qemu_irq that triggers the irqfd.
>>
>> See my previously mail: you want to track the life-cycle of an MSI
>> source to avoid generating routes for identical sources. A messages is
>> not a source. Two identical messages can come from different sources.
> 
> Since MSI messages are edge triggered, I don't see how this
> would work without losing interrupts. And AFAIK,
> existing guests do not use the same message for
> different sources.

Just like we handle shared edge-triggered line-based IRQs, shared MSIs
are in principle feasible as well.

Jan
Jan Kiszka - Oct. 17, 2011, 7:23 p.m.
On 2011-10-17 17:43, Michael S. Tsirkin wrote:
> On Mon, Oct 17, 2011 at 11:27:46AM +0200, Jan Kiszka wrote:
>> This cache will help us implementing KVM in-kernel irqchip support
>> without spreading hooks all over the place.
>>
>> KVM requires us to register it first and then deliver it by raising a
>> pseudo IRQ line returned on registration. While this could be changed
>> for QEMU-originated MSI messages by adding direct MSI injection, we will
>> still need this translation for irqfd-originated messages. The
>> MSIRoutingCache will allow to track those registrations and update them
>> lazily before the actual delivery. This avoid having to track MSI
>> vectors at device level (like qemu-kvm currently does).
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> So if many devices are added, exhausting the number of GSIs supported,
> we get terrible performance intead of simply failing outright.
> 
> To me, this looks more like a bug than a feature ...

If that ever turns out to be a bottleneck, failing looks like the worst
we can do. Reporting excessive cache flushes would make some sense and
could still be added.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 12:17 p.m.
On Mon, Oct 17, 2011 at 09:19:34PM +0200, Jan Kiszka wrote:
> On 2011-10-17 17:37, Michael S. Tsirkin wrote:
> > On Mon, Oct 17, 2011 at 01:19:56PM +0200, Jan Kiszka wrote:
> >> On 2011-10-17 13:06, Avi Kivity wrote:
> >>> On 10/17/2011 11:27 AM, Jan Kiszka wrote:
> >>>> This cache will help us implementing KVM in-kernel irqchip support
> >>>> without spreading hooks all over the place.
> >>>>
> >>>> KVM requires us to register it first and then deliver it by raising a
> >>>> pseudo IRQ line returned on registration. While this could be changed
> >>>> for QEMU-originated MSI messages by adding direct MSI injection, we will
> >>>> still need this translation for irqfd-originated messages. The
> >>>> MSIRoutingCache will allow to track those registrations and update them
> >>>> lazily before the actual delivery. This avoid having to track MSI
> >>>> vectors at device level (like qemu-kvm currently does).
> >>>>
> >>>>
> >>>> +typedef enum {
> >>>> +    MSI_ROUTE_NONE = 0,
> >>>> +    MSI_ROUTE_STATIC,
> >>>> +} MSIRouteType;
> >>>> +
> >>>> +struct MSIRoutingCache {
> >>>> +    MSIMessage msg;
> >>>> +    MSIRouteType type;
> >>>> +    int kvm_gsi;
> >>>> +    int kvm_irqfd;
> >>>> +};
> >>>> +
> >>>> diff --git a/hw/pci.h b/hw/pci.h
> >>>> index 329ab32..5b5d2fd 100644
> >>>> --- a/hw/pci.h
> >>>> +++ b/hw/pci.h
> >>>> @@ -197,6 +197,10 @@ struct PCIDevice {
> >>>>      MemoryRegion rom;
> >>>>      uint32_t rom_bar;
> >>>>  
> >>>> +    /* MSI routing chaches */
> >>>> +    MSIRoutingCache *msi_cache;
> >>>> +    MSIRoutingCache *msix_cache;
> >>>> +
> >>>>      /* MSI entries */
> >>>>      int msi_entries_nr;
> >>>>      struct KVMMsiMessage *msi_irq_entries;
> >>>
> >>> IMO this needlessly leaks kvm information into core qemu.  The cache
> >>> should be completely hidden in kvm code.
> >>>
> >>> I think msi_deliver() can hide the use of the cache completely.  For
> >>> pre-registered events like kvm's irqfd, you can use something like
> >>>
> >>>   qemu_irq qemu_msi_irq(MSIMessage msg)
> >>>
> >>> for non-kvm, it simply returns a qemu_irq that triggers a stl_phys();
> >>> for kvm, it allocates an irqfd and a permanent entry in the cache and
> >>> returns a qemu_irq that triggers the irqfd.
> >>
> >> See my previously mail: you want to track the life-cycle of an MSI
> >> source to avoid generating routes for identical sources. A messages is
> >> not a source. Two identical messages can come from different sources.
> > 
> > Since MSI messages are edge triggered, I don't see how this
> > would work without losing interrupts. And AFAIK,
> > existing guests do not use the same message for
> > different sources.
> 
> Just like we handle shared edge-triggered line-base IRQs, shared MSIs
> are in principle feasible as well.
> 
> Jan
> 

For this case it seems quite harmless to use multiple
routes for identical sources. Yes it would use more resources
but it never happens in practice.
So what Avi said originally is still true.
Jan Kiszka - Oct. 18, 2011, 12:26 p.m.
On 2011-10-18 14:17, Michael S. Tsirkin wrote:
> On Mon, Oct 17, 2011 at 09:19:34PM +0200, Jan Kiszka wrote:
>> On 2011-10-17 17:37, Michael S. Tsirkin wrote:
>>> On Mon, Oct 17, 2011 at 01:19:56PM +0200, Jan Kiszka wrote:
>>>> On 2011-10-17 13:06, Avi Kivity wrote:
>>>>> On 10/17/2011 11:27 AM, Jan Kiszka wrote:
>>>>>> This cache will help us implementing KVM in-kernel irqchip support
>>>>>> without spreading hooks all over the place.
>>>>>>
>>>>>> KVM requires us to register it first and then deliver it by raising a
>>>>>> pseudo IRQ line returned on registration. While this could be changed
>>>>>> for QEMU-originated MSI messages by adding direct MSI injection, we will
>>>>>> still need this translation for irqfd-originated messages. The
>>>>>> MSIRoutingCache will allow to track those registrations and update them
>>>>>> lazily before the actual delivery. This avoid having to track MSI
>>>>>> vectors at device level (like qemu-kvm currently does).
>>>>>>
>>>>>>
>>>>>> +typedef enum {
>>>>>> +    MSI_ROUTE_NONE = 0,
>>>>>> +    MSI_ROUTE_STATIC,
>>>>>> +} MSIRouteType;
>>>>>> +
>>>>>> +struct MSIRoutingCache {
>>>>>> +    MSIMessage msg;
>>>>>> +    MSIRouteType type;
>>>>>> +    int kvm_gsi;
>>>>>> +    int kvm_irqfd;
>>>>>> +};
>>>>>> +
>>>>>> diff --git a/hw/pci.h b/hw/pci.h
>>>>>> index 329ab32..5b5d2fd 100644
>>>>>> --- a/hw/pci.h
>>>>>> +++ b/hw/pci.h
>>>>>> @@ -197,6 +197,10 @@ struct PCIDevice {
>>>>>>      MemoryRegion rom;
>>>>>>      uint32_t rom_bar;
>>>>>>  
>>>>>> +    /* MSI routing chaches */
>>>>>> +    MSIRoutingCache *msi_cache;
>>>>>> +    MSIRoutingCache *msix_cache;
>>>>>> +
>>>>>>      /* MSI entries */
>>>>>>      int msi_entries_nr;
>>>>>>      struct KVMMsiMessage *msi_irq_entries;
>>>>>
>>>>> IMO this needlessly leaks kvm information into core qemu.  The cache
>>>>> should be completely hidden in kvm code.
>>>>>
>>>>> I think msi_deliver() can hide the use of the cache completely.  For
>>>>> pre-registered events like kvm's irqfd, you can use something like
>>>>>
>>>>>   qemu_irq qemu_msi_irq(MSIMessage msg)
>>>>>
>>>>> for non-kvm, it simply returns a qemu_irq that triggers a stl_phys();
>>>>> for kvm, it allocates an irqfd and a permanent entry in the cache and
>>>>> returns a qemu_irq that triggers the irqfd.
>>>>
>>>> See my previously mail: you want to track the life-cycle of an MSI
>>>> source to avoid generating routes for identical sources. A messages is
>>>> not a source. Two identical messages can come from different sources.
>>>
>>> Since MSI messages are edge triggered, I don't see how this
>>> would work without losing interrupts. And AFAIK,
>>> existing guests do not use the same message for
>>> different sources.
>>
>> Just like we handle shared edge-triggered line-base IRQs, shared MSIs
>> are in principle feasible as well.
>>
>> Jan
>>
> 
> For this case it seems quite harmless to use multiple
> routes for identical sources.

Unless we track the source (via the MSIRoutingCache abstraction), there
can be no multiple routes. The core cannot differentiate between
identical messages, thus will not create multiple routes.

But that's actually a corner case, and we could probably live with it.
The real question is if we want to search for MSI routes on each message
delivery.

Jan

Patch

diff --git a/hw/apic.c b/hw/apic.c
index c1d557d..6811ae1 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -804,7 +804,7 @@  static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
     return val;
 }
 
-void apic_deliver_msi(MSIMessage *msg)
+void apic_deliver_msi(MSIMessage *msg, MSIRoutingCache *cache)
 {
     uint8_t dest =
         (msg->address & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
@@ -829,8 +829,9 @@  static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
          * Mapping them on the global bus happens to work because
          * MSI registers are reserved in APIC MMIO and vice versa. */
         MSIMessage msg = { .address = addr, .data = val };
+        static MSIRoutingCache cache;
 
-        msi_deliver(&msg);
+        msi_deliver(&msg, &cache);
         return;
     }
 
diff --git a/hw/apic.h b/hw/apic.h
index fa848fd..353ea3a 100644
--- a/hw/apic.h
+++ b/hw/apic.h
@@ -18,7 +18,7 @@  void cpu_set_apic_tpr(DeviceState *s, uint8_t val);
 uint8_t cpu_get_apic_tpr(DeviceState *s);
 void apic_init_reset(DeviceState *s);
 void apic_sipi(DeviceState *s);
-void apic_deliver_msi(MSIMessage *msg);
+void apic_deliver_msi(MSIMessage *msg, MSIRoutingCache *cache);
 
 /* pc.c */
 int cpu_is_bsp(CPUState *env);
diff --git a/hw/msi.c b/hw/msi.c
index 9055155..c8ccb17 100644
--- a/hw/msi.c
+++ b/hw/msi.c
@@ -40,13 +40,13 @@ 
 /* Flag for interrupt controller to declare MSI/MSI-X support */
 bool msi_supported;
 
-static void msi_unsupported(MSIMessage *msg)
+static void msi_unsupported(MSIMessage *msg, MSIRoutingCache *cache)
 {
     /* If we get here, the board failed to register a delivery handler. */
     abort();
 }
 
-void (*msi_deliver)(MSIMessage *msg) = msi_unsupported;
+void (*msi_deliver)(MSIMessage *msg, MSIRoutingCache *cache) = msi_unsupported;
 
 /* If we get rid of cap allocator, we won't need this. */
 static inline uint8_t msi_cap_sizeof(uint16_t flags)
@@ -288,6 +288,8 @@  int msi_init(struct PCIDevice *dev, uint8_t offset,
                      0xffffffff >> (PCI_MSI_VECTORS_MAX - nr_vectors));
     }
 
+    dev->msi_cache = g_malloc0(nr_vectors * sizeof(*dev->msi_cache));
+
     if (kvm_enabled() && kvm_irqchip_in_kernel()) {
         dev->msi_irq_entries = g_malloc(nr_vectors *
                                         sizeof(*dev->msix_irq_entries));
@@ -312,6 +314,8 @@  void msi_uninit(struct PCIDevice *dev)
         g_free(dev->msi_irq_entries);
     }
 
+    g_free(dev->msi_cache);
+
     pci_del_capability(dev, PCI_CAP_ID_MSI, cap_size);
     dev->cap_present &= ~QEMU_PCI_CAP_MSI;
 
@@ -389,7 +393,7 @@  void msi_notify(PCIDevice *dev, unsigned int vector)
                    "notify vector 0x%x"
                    " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
                    vector, msg.address, msg.data);
-    msi_deliver(&msg);
+    msi_deliver(&msg, &dev->msi_cache[vector]);
 }
 
 /* Normally called by pci_default_write_config(). */
diff --git a/hw/msi.h b/hw/msi.h
index f3152f3..20ae215 100644
--- a/hw/msi.h
+++ b/hw/msi.h
@@ -29,6 +29,18 @@  struct MSIMessage {
     uint32_t data;
 };
 
+typedef enum {
+    MSI_ROUTE_NONE = 0,
+    MSI_ROUTE_STATIC,
+} MSIRouteType;
+
+struct MSIRoutingCache {
+    MSIMessage msg;
+    MSIRouteType type;
+    int kvm_gsi;
+    int kvm_irqfd;
+};
+
 extern bool msi_supported;
 
 bool msi_enabled(const PCIDevice *dev);
@@ -46,6 +58,6 @@  static inline bool msi_present(const PCIDevice *dev)
     return dev->cap_present & QEMU_PCI_CAP_MSI;
 }
 
-extern void (*msi_deliver)(MSIMessage *msg);
+extern void (*msi_deliver)(MSIMessage *msg, MSIRoutingCache *cache);
 
 #endif /* QEMU_MSI_H */
diff --git a/hw/msix.c b/hw/msix.c
index 08cc526..e824aef 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -358,6 +358,8 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
     if (ret)
         goto err_config;
 
+    dev->msix_cache = g_malloc0(nentries * sizeof *dev->msix_cache);
+
     if (kvm_enabled() && kvm_irqchip_in_kernel()) {
         dev->msix_irq_entries = g_malloc(nentries *
                                          sizeof *dev->msix_irq_entries);
@@ -409,6 +411,9 @@  int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
     dev->msix_entry_used = NULL;
     g_free(dev->msix_irq_entries);
     dev->msix_irq_entries = NULL;
+
+    g_free(dev->msix_cache);
+
     dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
     return 0;
 }
@@ -478,7 +483,7 @@  void msix_notify(PCIDevice *dev, unsigned vector)
 
     msix_message_from_vector(dev, vector, &msg);
 
-    msi_deliver(&msg);
+    msi_deliver(&msg, &dev->msix_cache[vector]);
 }
 
 void msix_reset(PCIDevice *dev)
diff --git a/hw/pc.c b/hw/pc.c
index 7d29a4a..4d8b524 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -103,10 +103,10 @@  void isa_irq_handler(void *opaque, int n, int level)
         qemu_set_irq(isa->ioapic[n], level);
 };
 
-static void pc_msi_deliver(MSIMessage *msg)
+static void pc_msi_deliver(MSIMessage *msg, MSIRoutingCache *cache)
 {
     if ((msg->address & 0xfff00000) == MSI_ADDR_BASE) {
-        apic_deliver_msi(msg);
+        apic_deliver_msi(msg, cache);
     } else {
         stl_phys(msg->address, msg->data);
     }
diff --git a/hw/pci.h b/hw/pci.h
index 329ab32..5b5d2fd 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -197,6 +197,10 @@  struct PCIDevice {
     MemoryRegion rom;
     uint32_t rom_bar;
 
+    /* MSI routing chaches */
+    MSIRoutingCache *msi_cache;
+    MSIRoutingCache *msix_cache;
+
     /* MSI entries */
     int msi_entries_nr;
     struct KVMMsiMessage *msi_irq_entries;
diff --git a/qemu-common.h b/qemu-common.h
index d3901bd..c1d1614 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -16,6 +16,7 @@  typedef struct QEMUFile QEMUFile;
 typedef struct QEMUBH QEMUBH;
 typedef struct DeviceState DeviceState;
 typedef struct MSIMessage MSIMessage;
+typedef struct MSIRoutingCache MSIRoutingCache;
 
 struct Monitor;
 typedef struct Monitor Monitor;