Patchwork [v3,3/8] kvm: Introduce basic MSI support for in-kernel irqchips

login
register
mail settings
Submitter Jan Kiszka
Date May 10, 2012, 9:02 p.m.
Message ID <7268a4ec77db89147f33345019f0f52e3f12c231.1336683774.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/158405/
State New
Headers show

Comments

Jan Kiszka - May 10, 2012, 9:02 p.m.
This patch basically adds kvm_irqchip_send_msi, a service for sending
arbitrary MSI messages to KVM's in-kernel irqchip models.

As the current KVI API requires us to establish a static route from a
pseudo GSI to the target MSI message and inject the MSI via toggling
that GSI, we need to play some tricks to make this unfortunately
interface transparent. We create those routes on demand and keep them
in a hash table. Succeeding messages can then search for an existing
route in the table first and reuse it whenever possible. If we should
run out of limited GSIs, we simply flush the table and rebuild it as
messages are sent.

This approach is rather simple and could be optimized further. However,
future kernels will contain a more efficient MSI injection interface
that will obsolete the GSI-based dynamic injection.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 kvm-all.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 kvm.h     |    1 +
 2 files changed, 139 insertions(+), 1 deletions(-)
Marcelo Tosatti - May 16, 2012, 3:27 a.m.
On Thu, May 10, 2012 at 06:02:52PM -0300, Jan Kiszka wrote:
> This patch basically adds kvm_irqchip_send_msi, a service for sending
> arbitrary MSI messages to KVM's in-kernel irqchip models.
> 
> As the current KVI API requires us to establish a static route from a
> pseudo GSI to the target MSI message and inject the MSI via toggling
> that GSI, we need to play some tricks to make this unfortunately
> interface transparent. We create those routes on demand and keep them
> in a hash table. Succeeding messages can then search for an existing
> route in the table first and reuse it whenever possible. If we should
> run out of limited GSIs, we simply flush the table and rebuild it as
> messages are sent.
> 
> This approach is rather simple and could be optimized further. However,
> future kernels will contain a more efficient MSI injection interface
> that will obsolete the GSI-based dynamic injection.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  kvm-all.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  kvm.h     |    1 +
>  2 files changed, 139 insertions(+), 1 deletions(-)
> 
> diff --git a/kvm-all.c b/kvm-all.c
> index 2d82d54..e7ed510 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -24,6 +24,7 @@
>  #include "qemu-barrier.h"
>  #include "sysemu.h"
>  #include "hw/hw.h"
> +#include "hw/msi.h"
>  #include "gdbstub.h"
>  #include "kvm.h"
>  #include "bswap.h"
> @@ -48,6 +49,8 @@
>      do { } while (0)
>  #endif
>  
> +#define KVM_MSI_HASHTAB_SIZE    256
> +
>  typedef struct KVMSlot
>  {
>      target_phys_addr_t start_addr;
> @@ -59,6 +62,11 @@ typedef struct KVMSlot
>  
>  typedef struct kvm_dirty_log KVMDirtyLog;
>  
> +typedef struct KVMMSIRoute {
> +    struct kvm_irq_routing_entry kroute;
> +    QTAILQ_ENTRY(KVMMSIRoute) entry;
> +} KVMMSIRoute;
> +
>  struct KVMState
>  {
>      KVMSlot slots[32];
> @@ -87,6 +95,7 @@ struct KVMState
>      int nr_allocated_irq_routes;
>      uint32_t *used_gsi_bitmap;
>      unsigned int gsi_count;
> +    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>  #endif
>  };
>  
> @@ -862,9 +871,14 @@ static void set_gsi(KVMState *s, unsigned int gsi)
>      s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
>  }
>  
> +static void clear_gsi(KVMState *s, unsigned int gsi)
> +{
> +    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
> +}
> +
>  static void kvm_init_irq_routing(KVMState *s)
>  {
> -    int gsi_count;
> +    int gsi_count, i;
>  
>      gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
>      if (gsi_count > 0) {
> @@ -884,6 +898,10 @@ static void kvm_init_irq_routing(KVMState *s)
>      s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
>      s->nr_allocated_irq_routes = 0;
>  
> +    for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
> +        QTAILQ_INIT(&s->msi_hashtab[i]);
> +    }
> +
>      kvm_arch_init_irq_routing(s);
>  }
>  
> @@ -934,11 +952,130 @@ int kvm_irqchip_commit_routes(KVMState *s)
>      return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
>  }
>  
> +static void kvm_release_gsi(KVMState *s, int gsi)
> +{
> +    struct kvm_irq_routing_entry *e;
> +    int i;
> +
> +    for (i = 0; i < s->irq_routes->nr; i++) {
> +        e = &s->irq_routes->entries[i];
> +        if (e->gsi == gsi) {
> +            s->irq_routes->nr--;
> +            *e = s->irq_routes->entries[s->irq_routes->nr];
> +        }
> +    }
> +    clear_gsi(s, gsi);
> +}
> +
> +static unsigned int kvm_hash_msi(uint32_t data)
> +{
> +    /* This is optimized for IA32 MSI layout. However, no other arch shall
> +     * repeat the mistake of not providing a direct MSI injection API. */
> +    return data & 0xff;
> +}
> +
> +static void kvm_flush_dynamic_msi_routes(KVMState *s)
> +{
> +    KVMMSIRoute *route, *next;
> +    unsigned int hash;
> +
> +    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
> +        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
> +            kvm_release_gsi(s, route->kroute.gsi);
> +            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
> +            g_free(route);
> +        }
> +    }
> +}
> +
> +static int kvm_get_pseudo_gsi(KVMState *s)
> +{
> +    uint32_t *word = s->used_gsi_bitmap;
> +    int max_words = ALIGN(s->gsi_count, 32) / 32;
> +    int i, bit;
> +    bool retry = true;
> +
> +again:
> +    /* Return the lowest unused GSI in the bitmap */
> +    for (i = 0; i < max_words; i++) {
> +        bit = ffs(~word[i]);
> +        if (!bit) {
> +            continue;
> +        }
> +
> +        return bit - 1 + i * 32;
> +    }
> +    if (retry) {
> +        retry = false;
> +        kvm_flush_dynamic_msi_routes(s);
> +        goto again;
> +    }

Guests with more MSI vectors than GSIs available, multiplexing are going
to be so slow that its best to disallow this completly (set_irq_route ->
synchronize_rcu can take 10ms or more, each).

MSI vectors must be allocated before use, can't it report #nr of
available ones in MSI Multiple Message Capable field? Or allow one MSI
per device? Or any other way to avoid this multiplexing.
Jan Kiszka - May 16, 2012, 11:15 a.m.
On 2012-05-16 00:27, Marcelo Tosatti wrote:
> On Thu, May 10, 2012 at 06:02:52PM -0300, Jan Kiszka wrote:
>> This patch basically adds kvm_irqchip_send_msi, a service for sending
>> arbitrary MSI messages to KVM's in-kernel irqchip models.
>>
>> As the current KVI API requires us to establish a static route from a
>> pseudo GSI to the target MSI message and inject the MSI via toggling
>> that GSI, we need to play some tricks to make this unfortunately
>> interface transparent. We create those routes on demand and keep them
>> in a hash table. Succeeding messages can then search for an existing
>> route in the table first and reuse it whenever possible. If we should
>> run out of limited GSIs, we simply flush the table and rebuild it as
>> messages are sent.
>>
>> This approach is rather simple and could be optimized further. However,
>> future kernels will contain a more efficient MSI injection interface
>> that will obsolete the GSI-based dynamic injection.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  kvm-all.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  kvm.h     |    1 +
>>  2 files changed, 139 insertions(+), 1 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index 2d82d54..e7ed510 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -24,6 +24,7 @@
>>  #include "qemu-barrier.h"
>>  #include "sysemu.h"
>>  #include "hw/hw.h"
>> +#include "hw/msi.h"
>>  #include "gdbstub.h"
>>  #include "kvm.h"
>>  #include "bswap.h"
>> @@ -48,6 +49,8 @@
>>      do { } while (0)
>>  #endif
>>  
>> +#define KVM_MSI_HASHTAB_SIZE    256
>> +
>>  typedef struct KVMSlot
>>  {
>>      target_phys_addr_t start_addr;
>> @@ -59,6 +62,11 @@ typedef struct KVMSlot
>>  
>>  typedef struct kvm_dirty_log KVMDirtyLog;
>>  
>> +typedef struct KVMMSIRoute {
>> +    struct kvm_irq_routing_entry kroute;
>> +    QTAILQ_ENTRY(KVMMSIRoute) entry;
>> +} KVMMSIRoute;
>> +
>>  struct KVMState
>>  {
>>      KVMSlot slots[32];
>> @@ -87,6 +95,7 @@ struct KVMState
>>      int nr_allocated_irq_routes;
>>      uint32_t *used_gsi_bitmap;
>>      unsigned int gsi_count;
>> +    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>>  #endif
>>  };
>>  
>> @@ -862,9 +871,14 @@ static void set_gsi(KVMState *s, unsigned int gsi)
>>      s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
>>  }
>>  
>> +static void clear_gsi(KVMState *s, unsigned int gsi)
>> +{
>> +    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
>> +}
>> +
>>  static void kvm_init_irq_routing(KVMState *s)
>>  {
>> -    int gsi_count;
>> +    int gsi_count, i;
>>  
>>      gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
>>      if (gsi_count > 0) {
>> @@ -884,6 +898,10 @@ static void kvm_init_irq_routing(KVMState *s)
>>      s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
>>      s->nr_allocated_irq_routes = 0;
>>  
>> +    for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
>> +        QTAILQ_INIT(&s->msi_hashtab[i]);
>> +    }
>> +
>>      kvm_arch_init_irq_routing(s);
>>  }
>>  
>> @@ -934,11 +952,130 @@ int kvm_irqchip_commit_routes(KVMState *s)
>>      return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
>>  }
>>  
>> +static void kvm_release_gsi(KVMState *s, int gsi)
>> +{
>> +    struct kvm_irq_routing_entry *e;
>> +    int i;
>> +
>> +    for (i = 0; i < s->irq_routes->nr; i++) {
>> +        e = &s->irq_routes->entries[i];
>> +        if (e->gsi == gsi) {
>> +            s->irq_routes->nr--;
>> +            *e = s->irq_routes->entries[s->irq_routes->nr];
>> +        }
>> +    }
>> +    clear_gsi(s, gsi);
>> +}
>> +
>> +static unsigned int kvm_hash_msi(uint32_t data)
>> +{
>> +    /* This is optimized for IA32 MSI layout. However, no other arch shall
>> +     * repeat the mistake of not providing a direct MSI injection API. */
>> +    return data & 0xff;
>> +}
>> +
>> +static void kvm_flush_dynamic_msi_routes(KVMState *s)
>> +{
>> +    KVMMSIRoute *route, *next;
>> +    unsigned int hash;
>> +
>> +    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
>> +        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
>> +            kvm_release_gsi(s, route->kroute.gsi);
>> +            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
>> +            g_free(route);
>> +        }
>> +    }
>> +}
>> +
>> +static int kvm_get_pseudo_gsi(KVMState *s)
>> +{
>> +    uint32_t *word = s->used_gsi_bitmap;
>> +    int max_words = ALIGN(s->gsi_count, 32) / 32;
>> +    int i, bit;
>> +    bool retry = true;
>> +
>> +again:
>> +    /* Return the lowest unused GSI in the bitmap */
>> +    for (i = 0; i < max_words; i++) {
>> +        bit = ffs(~word[i]);
>> +        if (!bit) {
>> +            continue;
>> +        }
>> +
>> +        return bit - 1 + i * 32;
>> +    }
>> +    if (retry) {
>> +        retry = false;
>> +        kvm_flush_dynamic_msi_routes(s);
>> +        goto again;
>> +    }
> 
> Guests with more MSI vectors than GSIs available, multiplexing are going
> to be so slow that its best to disallow this completly (set_irq_route ->
> synchronize_rcu can take 10ms or more, each).

Well, it's still way better than exit(1) - what happens so far (but
obviously not in practice, or we would have heard about it). Note that
this code path is also required for MSI garbage collection, even if we
limited the vectors somehow.

> 
> MSI vectors must be allocated before use, can't it report #nr of
> available ones in MSI Multiple Message Capable field? Or allow one MSI
> per device? Or any other way to avoid this multiplexing.

I can't follow yet. All we have is the maximum number of vectors per
device on MSI[-X] init. There is no spec-conforming interface to
retrieve the actual vector usage from the guest, nor is there anything
to signal resource shortage once we exposed the MSI[-X] capability.
Also, in this design, we do not track vector origins back to the device,
so we can't apply per-device quotas.

But given that we haven't heard about such an overflow scenario in
practice so far (or did you?), I think there is no urgent need to worry.
And we could still increase the number of GSIs.

Jan
Marcelo Tosatti - May 16, 2012, 5:31 p.m.
On Wed, May 16, 2012 at 08:15:37AM -0300, Jan Kiszka wrote:
> On 2012-05-16 00:27, Marcelo Tosatti wrote:
> > On Thu, May 10, 2012 at 06:02:52PM -0300, Jan Kiszka wrote:
> >> This patch basically adds kvm_irqchip_send_msi, a service for sending
> >> arbitrary MSI messages to KVM's in-kernel irqchip models.
> >>
> >> As the current KVI API requires us to establish a static route from a
> >> pseudo GSI to the target MSI message and inject the MSI via toggling
> >> that GSI, we need to play some tricks to make this unfortunately
> >> interface transparent. We create those routes on demand and keep them
> >> in a hash table. Succeeding messages can then search for an existing
> >> route in the table first and reuse it whenever possible. If we should
> >> run out of limited GSIs, we simply flush the table and rebuild it as
> >> messages are sent.
> >>
> >> This approach is rather simple and could be optimized further. However,
> >> future kernels will contain a more efficient MSI injection interface
> >> that will obsolete the GSI-based dynamic injection.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> ---
> >>  kvm-all.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>  kvm.h     |    1 +
> >>  2 files changed, 139 insertions(+), 1 deletions(-)
> >>
> >> diff --git a/kvm-all.c b/kvm-all.c
> >> index 2d82d54..e7ed510 100644
> >> --- a/kvm-all.c
> >> +++ b/kvm-all.c
> >> @@ -24,6 +24,7 @@
> >>  #include "qemu-barrier.h"
> >>  #include "sysemu.h"
> >>  #include "hw/hw.h"
> >> +#include "hw/msi.h"
> >>  #include "gdbstub.h"
> >>  #include "kvm.h"
> >>  #include "bswap.h"
> >> @@ -48,6 +49,8 @@
> >>      do { } while (0)
> >>  #endif
> >>  
> >> +#define KVM_MSI_HASHTAB_SIZE    256
> >> +
> >>  typedef struct KVMSlot
> >>  {
> >>      target_phys_addr_t start_addr;
> >> @@ -59,6 +62,11 @@ typedef struct KVMSlot
> >>  
> >>  typedef struct kvm_dirty_log KVMDirtyLog;
> >>  
> >> +typedef struct KVMMSIRoute {
> >> +    struct kvm_irq_routing_entry kroute;
> >> +    QTAILQ_ENTRY(KVMMSIRoute) entry;
> >> +} KVMMSIRoute;
> >> +
> >>  struct KVMState
> >>  {
> >>      KVMSlot slots[32];
> >> @@ -87,6 +95,7 @@ struct KVMState
> >>      int nr_allocated_irq_routes;
> >>      uint32_t *used_gsi_bitmap;
> >>      unsigned int gsi_count;
> >> +    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
> >>  #endif
> >>  };
> >>  
> >> @@ -862,9 +871,14 @@ static void set_gsi(KVMState *s, unsigned int gsi)
> >>      s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
> >>  }
> >>  
> >> +static void clear_gsi(KVMState *s, unsigned int gsi)
> >> +{
> >> +    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
> >> +}
> >> +
> >>  static void kvm_init_irq_routing(KVMState *s)
> >>  {
> >> -    int gsi_count;
> >> +    int gsi_count, i;
> >>  
> >>      gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
> >>      if (gsi_count > 0) {
> >> @@ -884,6 +898,10 @@ static void kvm_init_irq_routing(KVMState *s)
> >>      s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
> >>      s->nr_allocated_irq_routes = 0;
> >>  
> >> +    for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
> >> +        QTAILQ_INIT(&s->msi_hashtab[i]);
> >> +    }
> >> +
> >>      kvm_arch_init_irq_routing(s);
> >>  }
> >>  
> >> @@ -934,11 +952,130 @@ int kvm_irqchip_commit_routes(KVMState *s)
> >>      return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
> >>  }
> >>  
> >> +static void kvm_release_gsi(KVMState *s, int gsi)
> >> +{
> >> +    struct kvm_irq_routing_entry *e;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < s->irq_routes->nr; i++) {
> >> +        e = &s->irq_routes->entries[i];
> >> +        if (e->gsi == gsi) {
> >> +            s->irq_routes->nr--;
> >> +            *e = s->irq_routes->entries[s->irq_routes->nr];
> >> +        }
> >> +    }
> >> +    clear_gsi(s, gsi);
> >> +}
> >> +
> >> +static unsigned int kvm_hash_msi(uint32_t data)
> >> +{
> >> +    /* This is optimized for IA32 MSI layout. However, no other arch shall
> >> +     * repeat the mistake of not providing a direct MSI injection API. */
> >> +    return data & 0xff;
> >> +}
> >> +
> >> +static void kvm_flush_dynamic_msi_routes(KVMState *s)
> >> +{
> >> +    KVMMSIRoute *route, *next;
> >> +    unsigned int hash;
> >> +
> >> +    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
> >> +        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
> >> +            kvm_release_gsi(s, route->kroute.gsi);
> >> +            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
> >> +            g_free(route);
> >> +        }
> >> +    }
> >> +}
> >> +
> >> +static int kvm_get_pseudo_gsi(KVMState *s)
> >> +{
> >> +    uint32_t *word = s->used_gsi_bitmap;
> >> +    int max_words = ALIGN(s->gsi_count, 32) / 32;
> >> +    int i, bit;
> >> +    bool retry = true;
> >> +
> >> +again:
> >> +    /* Return the lowest unused GSI in the bitmap */
> >> +    for (i = 0; i < max_words; i++) {
> >> +        bit = ffs(~word[i]);
> >> +        if (!bit) {
> >> +            continue;
> >> +        }
> >> +
> >> +        return bit - 1 + i * 32;
> >> +    }
> >> +    if (retry) {
> >> +        retry = false;
> >> +        kvm_flush_dynamic_msi_routes(s);
> >> +        goto again;
> >> +    }
> > 
> > Guests with more MSI vectors than GSIs available, multiplexing are going
> > to be so slow that its best to disallow this completly (set_irq_route ->
> > synchronize_rcu can take 10ms or more, each).
> 
> Well, it's still way better than exit(1) - what happens so far (but
> obviously not in practice, or we would have heard about it). Note that
> this code path is also required for MSI garbage collection, even if we
> limited the vectors somehow.
> 
> > 
> > MSI vectors must be allocated before use, can't it report #nr of
> > available ones in MSI Multiple Message Capable field? Or allow one MSI
> > per device? Or any other way to avoid this multiplexing.
> 
> I can't follow yet. All we have is the maximum number of vectors per
> device on MSI[-X] init. There is no spec-conforming interface to
> retrieve the actual vector usage from the guest, nor is there anything
> to signal resource shortage once we exposed the MSI[-X] capability.
> Also, in this design, we do not track vector origins back to the device,
> so we can't apply per-device quotas.

Wishful thinking.

> But given that we haven't heard about such an overflow scenario in
> practice so far (or did you?), I think there is no urgent need to worry.
> And we could still increase the number of GSIs.

I did not. With no route cache hits, guest will be limited to 100
interrupts per second (and hang waiting on synchronize_rcu during 
that second). Perhaps it is not a bad idea to exit().

Will apply the patch anyway, thanks.
Jan Kiszka - May 16, 2012, 6:18 p.m.
On 2012-05-16 08:15, Jan Kiszka wrote:
> On 2012-05-16 00:27, Marcelo Tosatti wrote:
>> On Thu, May 10, 2012 at 06:02:52PM -0300, Jan Kiszka wrote:
>>> This patch basically adds kvm_irqchip_send_msi, a service for sending
>>> arbitrary MSI messages to KVM's in-kernel irqchip models.
>>>
>>> As the current KVI API requires us to establish a static route from a
>>> pseudo GSI to the target MSI message and inject the MSI via toggling
>>> that GSI, we need to play some tricks to make this unfortunately
>>> interface transparent. We create those routes on demand and keep them
>>> in a hash table. Succeeding messages can then search for an existing
>>> route in the table first and reuse it whenever possible. If we should
>>> run out of limited GSIs, we simply flush the table and rebuild it as
>>> messages are sent.
>>>
>>> This approach is rather simple and could be optimized further. However,
>>> future kernels will contain a more efficient MSI injection interface
>>> that will obsolete the GSI-based dynamic injection.
>>>
>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>> ---
>>>  kvm-all.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>  kvm.h     |    1 +
>>>  2 files changed, 139 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index 2d82d54..e7ed510 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -24,6 +24,7 @@
>>>  #include "qemu-barrier.h"
>>>  #include "sysemu.h"
>>>  #include "hw/hw.h"
>>> +#include "hw/msi.h"
>>>  #include "gdbstub.h"
>>>  #include "kvm.h"
>>>  #include "bswap.h"
>>> @@ -48,6 +49,8 @@
>>>      do { } while (0)
>>>  #endif
>>>  
>>> +#define KVM_MSI_HASHTAB_SIZE    256
>>> +
>>>  typedef struct KVMSlot
>>>  {
>>>      target_phys_addr_t start_addr;
>>> @@ -59,6 +62,11 @@ typedef struct KVMSlot
>>>  
>>>  typedef struct kvm_dirty_log KVMDirtyLog;
>>>  
>>> +typedef struct KVMMSIRoute {
>>> +    struct kvm_irq_routing_entry kroute;
>>> +    QTAILQ_ENTRY(KVMMSIRoute) entry;
>>> +} KVMMSIRoute;
>>> +
>>>  struct KVMState
>>>  {
>>>      KVMSlot slots[32];
>>> @@ -87,6 +95,7 @@ struct KVMState
>>>      int nr_allocated_irq_routes;
>>>      uint32_t *used_gsi_bitmap;
>>>      unsigned int gsi_count;
>>> +    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>>>  #endif
>>>  };
>>>  
>>> @@ -862,9 +871,14 @@ static void set_gsi(KVMState *s, unsigned int gsi)
>>>      s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
>>>  }
>>>  
>>> +static void clear_gsi(KVMState *s, unsigned int gsi)
>>> +{
>>> +    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
>>> +}
>>> +
>>>  static void kvm_init_irq_routing(KVMState *s)
>>>  {
>>> -    int gsi_count;
>>> +    int gsi_count, i;
>>>  
>>>      gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
>>>      if (gsi_count > 0) {
>>> @@ -884,6 +898,10 @@ static void kvm_init_irq_routing(KVMState *s)
>>>      s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
>>>      s->nr_allocated_irq_routes = 0;
>>>  
>>> +    for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
>>> +        QTAILQ_INIT(&s->msi_hashtab[i]);
>>> +    }
>>> +
>>>      kvm_arch_init_irq_routing(s);
>>>  }
>>>  
>>> @@ -934,11 +952,130 @@ int kvm_irqchip_commit_routes(KVMState *s)
>>>      return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
>>>  }
>>>  
>>> +static void kvm_release_gsi(KVMState *s, int gsi)
>>> +{
>>> +    struct kvm_irq_routing_entry *e;
>>> +    int i;
>>> +
>>> +    for (i = 0; i < s->irq_routes->nr; i++) {
>>> +        e = &s->irq_routes->entries[i];
>>> +        if (e->gsi == gsi) {
>>> +            s->irq_routes->nr--;
>>> +            *e = s->irq_routes->entries[s->irq_routes->nr];
>>> +        }
>>> +    }
>>> +    clear_gsi(s, gsi);
>>> +}
>>> +
>>> +static unsigned int kvm_hash_msi(uint32_t data)
>>> +{
>>> +    /* This is optimized for IA32 MSI layout. However, no other arch shall
>>> +     * repeat the mistake of not providing a direct MSI injection API. */
>>> +    return data & 0xff;
>>> +}
>>> +
>>> +static void kvm_flush_dynamic_msi_routes(KVMState *s)
>>> +{
>>> +    KVMMSIRoute *route, *next;
>>> +    unsigned int hash;
>>> +
>>> +    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
>>> +        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
>>> +            kvm_release_gsi(s, route->kroute.gsi);
>>> +            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
>>> +            g_free(route);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static int kvm_get_pseudo_gsi(KVMState *s)
>>> +{
>>> +    uint32_t *word = s->used_gsi_bitmap;
>>> +    int max_words = ALIGN(s->gsi_count, 32) / 32;
>>> +    int i, bit;
>>> +    bool retry = true;
>>> +
>>> +again:
>>> +    /* Return the lowest unused GSI in the bitmap */
>>> +    for (i = 0; i < max_words; i++) {
>>> +        bit = ffs(~word[i]);
>>> +        if (!bit) {
>>> +            continue;
>>> +        }
>>> +
>>> +        return bit - 1 + i * 32;
>>> +    }
>>> +    if (retry) {
>>> +        retry = false;
>>> +        kvm_flush_dynamic_msi_routes(s);
>>> +        goto again;
>>> +    }
>>
>> Guests with more MSI vectors than GSIs available, multiplexing are going
>> to be so slow that its best to disallow this completly (set_irq_route ->
>> synchronize_rcu can take 10ms or more, each).
> 
> Well, it's still way better than exit(1) - what happens so far (but
> obviously not in practice, or we would have heard about it). Note that
> this code path is also required for MSI garbage collection, even if we
> limited the vectors somehow.
> 
>>
>> MSI vectors must be allocated before use, can't it report #nr of
>> available ones in MSI Multiple Message Capable field? Or allow one MSI
>> per device? Or any other way to avoid this multiplexing.
> 
> I can't follow yet. All we have is the maximum number of vectors per
> device on MSI[-X] init. There is no spec-conforming interface to
> retrieve the actual vector usage from the guest, nor is there anything
> to signal resource shortage once we exposed the MSI[-X] capability.
> Also, in this design, we do not track vector origins back to the device,
> so we can't apply per-device quotas.
> 
> But given that we haven't heard about such an overflow scenario in
> practice so far (or did you?), I think there is no urgent need to worry.
> And we could still increase the number of GSIs.

Oh, and forgot to remind that this route flushing code path will be
unused on current kernels with KVM_CAP_SIGNAL_MSI.

I will push a new version of this series soon as this one as a small
bisactability issue.

Jan

Patch

diff --git a/kvm-all.c b/kvm-all.c
index 2d82d54..e7ed510 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -24,6 +24,7 @@ 
 #include "qemu-barrier.h"
 #include "sysemu.h"
 #include "hw/hw.h"
+#include "hw/msi.h"
 #include "gdbstub.h"
 #include "kvm.h"
 #include "bswap.h"
@@ -48,6 +49,8 @@ 
     do { } while (0)
 #endif
 
+#define KVM_MSI_HASHTAB_SIZE    256
+
 typedef struct KVMSlot
 {
     target_phys_addr_t start_addr;
@@ -59,6 +62,11 @@  typedef struct KVMSlot
 
 typedef struct kvm_dirty_log KVMDirtyLog;
 
+typedef struct KVMMSIRoute {
+    struct kvm_irq_routing_entry kroute;
+    QTAILQ_ENTRY(KVMMSIRoute) entry;
+} KVMMSIRoute;
+
 struct KVMState
 {
     KVMSlot slots[32];
@@ -87,6 +95,7 @@  struct KVMState
     int nr_allocated_irq_routes;
     uint32_t *used_gsi_bitmap;
     unsigned int gsi_count;
+    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 #endif
 };
 
@@ -862,9 +871,14 @@  static void set_gsi(KVMState *s, unsigned int gsi)
     s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
 }
 
+static void clear_gsi(KVMState *s, unsigned int gsi)
+{
+    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
+}
+
 static void kvm_init_irq_routing(KVMState *s)
 {
-    int gsi_count;
+    int gsi_count, i;
 
     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
     if (gsi_count > 0) {
@@ -884,6 +898,10 @@  static void kvm_init_irq_routing(KVMState *s)
     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
     s->nr_allocated_irq_routes = 0;
 
+    for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
+        QTAILQ_INIT(&s->msi_hashtab[i]);
+    }
+
     kvm_arch_init_irq_routing(s);
 }
 
@@ -934,11 +952,130 @@  int kvm_irqchip_commit_routes(KVMState *s)
     return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
 }
 
+static void kvm_release_gsi(KVMState *s, int gsi)
+{
+    struct kvm_irq_routing_entry *e;
+    int i;
+
+    for (i = 0; i < s->irq_routes->nr; i++) {
+        e = &s->irq_routes->entries[i];
+        if (e->gsi == gsi) {
+            s->irq_routes->nr--;
+            *e = s->irq_routes->entries[s->irq_routes->nr];
+        }
+    }
+    clear_gsi(s, gsi);
+}
+
+static unsigned int kvm_hash_msi(uint32_t data)
+{
+    /* This is optimized for IA32 MSI layout. However, no other arch shall
+     * repeat the mistake of not providing a direct MSI injection API. */
+    return data & 0xff;
+}
+
+static void kvm_flush_dynamic_msi_routes(KVMState *s)
+{
+    KVMMSIRoute *route, *next;
+    unsigned int hash;
+
+    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
+        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
+            kvm_release_gsi(s, route->kroute.gsi);
+            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
+            g_free(route);
+        }
+    }
+}
+
+static int kvm_get_pseudo_gsi(KVMState *s)
+{
+    uint32_t *word = s->used_gsi_bitmap;
+    int max_words = ALIGN(s->gsi_count, 32) / 32;
+    int i, bit;
+    bool retry = true;
+
+again:
+    /* Return the lowest unused GSI in the bitmap */
+    for (i = 0; i < max_words; i++) {
+        bit = ffs(~word[i]);
+        if (!bit) {
+            continue;
+        }
+
+        return bit - 1 + i * 32;
+    }
+    if (retry) {
+        retry = false;
+        kvm_flush_dynamic_msi_routes(s);
+        goto again;
+    }
+    return -ENOSPC;
+
+}
+
+static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
+{
+    unsigned int hash = kvm_hash_msi(msg.data);
+    KVMMSIRoute *route;
+
+    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
+        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
+            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
+            route->kroute.u.msi.data == msg.data) {
+            return route;
+        }
+    }
+    return NULL;
+}
+
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
+{
+    KVMMSIRoute *route;
+
+    route = kvm_lookup_msi_route(s, msg);
+    if (!route) {
+        int gsi, ret;
+
+        gsi = kvm_get_pseudo_gsi(s);
+        if (gsi < 0) {
+            return gsi;
+        }
+
+        route = g_malloc(sizeof(KVMMSIRoute));
+        route->kroute.gsi = gsi;
+        route->kroute.type = KVM_IRQ_ROUTING_MSI;
+        route->kroute.flags = 0;
+        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
+        route->kroute.u.msi.address_hi = msg.address >> 32;
+        route->kroute.u.msi.data = msg.data;
+
+        kvm_add_routing_entry(s, &route->kroute);
+
+        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
+                           entry);
+
+        ret = kvm_irqchip_commit_routes(s);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
+
+    return kvm_irqchip_set_irq(s, route->kroute.gsi, 1);
+}
+
 #else /* !KVM_CAP_IRQ_ROUTING */
 
 static void kvm_init_irq_routing(KVMState *s)
 {
 }
+
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
+{
+    abort();
+}
 #endif /* !KVM_CAP_IRQ_ROUTING */
 
 static int kvm_irqchip_create(KVMState *s)
diff --git a/kvm.h b/kvm.h
index 4ccae8c..7857dbf 100644
--- a/kvm.h
+++ b/kvm.h
@@ -132,6 +132,7 @@  int kvm_arch_on_sigbus(int code, void *addr);
 void kvm_arch_init_irq_routing(KVMState *s);
 
 int kvm_irqchip_set_irq(KVMState *s, int irq, int level);
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg);
 
 void kvm_irqchip_add_route(KVMState *s, int gsi, int irqchip, int pin);
 int kvm_irqchip_commit_routes(KVMState *s);