Patchwork ivshmem: use irqfd to interrupt among VMs

login
register
mail settings
Submitter pingfan liu
Date Nov. 21, 2012, 6:02 a.m.
Message ID <1353477751-9846-1-git-send-email-qemulist@gmail.com>
Download mbox | patch
Permalink /patch/200586/
State New
Headers show

Comments

pingfan liu - Nov. 21, 2012, 6:02 a.m.
From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

Using irqfd, so we can avoid switch between kernel and user when
VMs interrupts each other.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 hw/ivshmem.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 47 insertions(+), 1 deletions(-)
Jan Kiszka - Nov. 21, 2012, 12:43 p.m.
On 2012-11-21 07:02, Liu Ping Fan wrote:
> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> 
> Using irqfd, so we can avoid switch between kernel and user when
> VMs interrupts each other.
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
>  hw/ivshmem.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 47 insertions(+), 1 deletions(-)
> 
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> index f6dbb21..81c7354 100644
> --- a/hw/ivshmem.c
> +++ b/hw/ivshmem.c
> @@ -19,6 +19,7 @@
>  #include "hw.h"
>  #include "pc.h"
>  #include "pci.h"
> +#include "msi.h"
>  #include "msix.h"
>  #include "kvm.h"
>  #include "migration.h"
> @@ -54,6 +55,11 @@ typedef struct EventfdEntry {
>      int vector;
>  } EventfdEntry;
>  
> +typedef struct IrqfdEntry {
> +    int virq;
> +    bool used;

used = (virq != -1), so it should be redundant, and you can reduce
IrqfdEntry to a plain int holding the virq.

> +} IrqfdEntry;
> +
>  typedef struct IVShmemState {
>      PCIDevice dev;
>      uint32_t intrmask;
> @@ -83,6 +89,8 @@ typedef struct IVShmemState {
>      uint32_t vectors;
>      uint32_t features;
>      EventfdEntry *eventfd_table;
> +    IrqfdEntry *vector_irqfd;
> +    bool irqfd_enable;
>  
>      Error *migration_blocker;
>  
> @@ -632,6 +640,38 @@ static void ivshmem_write_config(PCIDevice *pci_dev, uint32_t address,
>      msix_write_config(pci_dev, address, val, len);
>  }
>  
> +static int ivshmem_vector_use(PCIDevice *dev, unsigned vector,
> +                                     MSIMessage msg)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +    int virq;
> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
> +
> +    virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +    if (virq >= 0 && kvm_irqchip_add_irqfd_notifier(kvm_state, n, virq) >= 0) {
> +        s->vector_irqfd[vector].virq = virq;
> +        s->vector_irqfd[vector].used = true;
> +        qemu_chr_add_handlers(s->eventfd_chr[vector], NULL, NULL, NULL, NULL);
> +    } else if (virq >= 0) {
> +        kvm_irqchip_release_virq(kvm_state, virq);
> +    }
> +    return 0;

You drop the errors here. Better refactor the code to a scheme like this:

err = service();
if (err) {
    roll_back();
    return err;
    /* or: goto roll_back_... */
}

> +}
> +
> +static void ivshmem_vector_release(PCIDevice *dev, unsigned vector)
> +{
> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
> +    int virq = s->vector_irqfd[vector].virq;
> +
> +    if (s->vector_irqfd[vector].used) {
> +        kvm_irqchip_remove_irqfd_notifier(kvm_state, n, virq);
> +        kvm_irqchip_release_virq(kvm_state, virq);
> +        s->vector_irqfd[vector].virq = -1;
> +        s->vector_irqfd[vector].used = false;
> +    }
> +}
> +
>  static int pci_ivshmem_init(PCIDevice *dev)
>  {
>      IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
> @@ -759,7 +799,13 @@ static int pci_ivshmem_init(PCIDevice *dev)
>      }
>  
>      s->dev.config_write = ivshmem_write_config;
> -
> +    if (kvm_gsi_routing_enabled()) {
> +        s->irqfd_enable = msix_set_vector_notifiers(dev, ivshmem_vector_use,
> +                    ivshmem_vector_release) >= 0 ? true : false;
> +        if (s->irqfd_enable) {
> +            s->vector_irqfd = g_new0(IrqfdEntry, s->vectors);

Conceptually, msix_set_vector_notifiers can already call
ivshmem_vector_use, so this initialization would come too late. Doesn't
happen here as MSI-X is still off during device init. However, just
perform vector_irqfd allocation unconditionally before the registration.

And where do you free it again...?

> +        }
> +    }
>      return 0;
>  }
>  
> 

Jan
pingfan liu - Nov. 22, 2012, 2:48 a.m.
On Wed, Nov 21, 2012 at 8:43 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-11-21 07:02, Liu Ping Fan wrote:
>> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>
>> Using irqfd, so we can avoid switch between kernel and user when
>> VMs interrupts each other.
>>
>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>> ---
>>  hw/ivshmem.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 files changed, 47 insertions(+), 1 deletions(-)
>>
>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>> index f6dbb21..81c7354 100644
>> --- a/hw/ivshmem.c
>> +++ b/hw/ivshmem.c
>> @@ -19,6 +19,7 @@
>>  #include "hw.h"
>>  #include "pc.h"
>>  #include "pci.h"
>> +#include "msi.h"
>>  #include "msix.h"
>>  #include "kvm.h"
>>  #include "migration.h"
>> @@ -54,6 +55,11 @@ typedef struct EventfdEntry {
>>      int vector;
>>  } EventfdEntry;
>>
>> +typedef struct IrqfdEntry {
>> +    int virq;
>> +    bool used;
>
> used = (virq != -1), so it should be redundant, and you can reduce
> IrqfdEntry to a plain int holding the virq.
>
Applied,

>> +} IrqfdEntry;
>> +
>>  typedef struct IVShmemState {
>>      PCIDevice dev;
>>      uint32_t intrmask;
>> @@ -83,6 +89,8 @@ typedef struct IVShmemState {
>>      uint32_t vectors;
>>      uint32_t features;
>>      EventfdEntry *eventfd_table;
>> +    IrqfdEntry *vector_irqfd;
>> +    bool irqfd_enable;
>>
>>      Error *migration_blocker;
>>
>> @@ -632,6 +640,38 @@ static void ivshmem_write_config(PCIDevice *pci_dev, uint32_t address,
>>      msix_write_config(pci_dev, address, val, len);
>>  }
>>
>> +static int ivshmem_vector_use(PCIDevice *dev, unsigned vector,
>> +                                     MSIMessage msg)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> +    int virq;
>> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
>> +
>> +    virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +    if (virq >= 0 && kvm_irqchip_add_irqfd_notifier(kvm_state, n, virq) >= 0) {
>> +        s->vector_irqfd[vector].virq = virq;
>> +        s->vector_irqfd[vector].used = true;
>> +        qemu_chr_add_handlers(s->eventfd_chr[vector], NULL, NULL, NULL, NULL);
>> +    } else if (virq >= 0) {
>> +        kvm_irqchip_release_virq(kvm_state, virq);
>> +    }
>> +    return 0;
>
> You drop the errors here. Better refactor the code to a scheme like this:
>
In msix_fire_vector_notifier(), there is "assert(ret >= 0);".  But I
think ivshmem can work even if irqfd can not be established, and fall
back to the origin scheme. So here just ignore err. Is it proper?

> err = service();
> if (err) {
>     roll_back();
>     return err;
>     /* or: goto roll_back_... */
> }
>
>> +}
>> +
>> +static void ivshmem_vector_release(PCIDevice *dev, unsigned vector)
>> +{
>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
>> +    int virq = s->vector_irqfd[vector].virq;
>> +
>> +    if (s->vector_irqfd[vector].used) {
>> +        kvm_irqchip_remove_irqfd_notifier(kvm_state, n, virq);
>> +        kvm_irqchip_release_virq(kvm_state, virq);
>> +        s->vector_irqfd[vector].virq = -1;
>> +        s->vector_irqfd[vector].used = false;
>> +    }
>> +}
>> +
>>  static int pci_ivshmem_init(PCIDevice *dev)
>>  {
>>      IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>> @@ -759,7 +799,13 @@ static int pci_ivshmem_init(PCIDevice *dev)
>>      }
>>
>>      s->dev.config_write = ivshmem_write_config;
>> -
>> +    if (kvm_gsi_routing_enabled()) {
>> +        s->irqfd_enable = msix_set_vector_notifiers(dev, ivshmem_vector_use,
>> +                    ivshmem_vector_release) >= 0 ? true : false;
>> +        if (s->irqfd_enable) {
>> +            s->vector_irqfd = g_new0(IrqfdEntry, s->vectors);
>
> Conceptually, msix_set_vector_notifiers can already call
> ivshmem_vector_use, so this initialization would come too late. Doesn't
> happen here as MSI-X is still off during device init. However, just
> perform vector_irqfd allocation unconditionally before the registration.
>
Applied,
> And where do you free it again...?
>
Will fix it.

And I think, this is the way to push interrupt subsystem out of big
lock for KVM.  For TCG code, we can use something like
kvm_irqchip_add_msi_route(). How do you think about it?

Regards,
Pingfan
>> +        }
>> +    }
>>      return 0;
>>  }
>>
>>
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux
Jan Kiszka - Nov. 22, 2012, 11:40 a.m.
On 2012-11-22 03:48, liu ping fan wrote:
> On Wed, Nov 21, 2012 at 8:43 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2012-11-21 07:02, Liu Ping Fan wrote:
>>> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>>
>>> Using irqfd, so we can avoid switch between kernel and user when
>>> VMs interrupts each other.
>>>
>>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>> ---
>>>  hw/ivshmem.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>  1 files changed, 47 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>>> index f6dbb21..81c7354 100644
>>> --- a/hw/ivshmem.c
>>> +++ b/hw/ivshmem.c
>>> @@ -19,6 +19,7 @@
>>>  #include "hw.h"
>>>  #include "pc.h"
>>>  #include "pci.h"
>>> +#include "msi.h"
>>>  #include "msix.h"
>>>  #include "kvm.h"
>>>  #include "migration.h"
>>> @@ -54,6 +55,11 @@ typedef struct EventfdEntry {
>>>      int vector;
>>>  } EventfdEntry;
>>>
>>> +typedef struct IrqfdEntry {
>>> +    int virq;
>>> +    bool used;
>>
>> used = (virq != -1), so it should be redundant, and you can reduce
>> IrqfdEntry to a plain int holding the virq.
>>
> Applied,
> 
>>> +} IrqfdEntry;
>>> +
>>>  typedef struct IVShmemState {
>>>      PCIDevice dev;
>>>      uint32_t intrmask;
>>> @@ -83,6 +89,8 @@ typedef struct IVShmemState {
>>>      uint32_t vectors;
>>>      uint32_t features;
>>>      EventfdEntry *eventfd_table;
>>> +    IrqfdEntry *vector_irqfd;
>>> +    bool irqfd_enable;
>>>
>>>      Error *migration_blocker;
>>>
>>> @@ -632,6 +640,38 @@ static void ivshmem_write_config(PCIDevice *pci_dev, uint32_t address,
>>>      msix_write_config(pci_dev, address, val, len);
>>>  }
>>>
>>> +static int ivshmem_vector_use(PCIDevice *dev, unsigned vector,
>>> +                                     MSIMessage msg)
>>> +{
>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>>> +    int virq;
>>> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
>>> +
>>> +    virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>> +    if (virq >= 0 && kvm_irqchip_add_irqfd_notifier(kvm_state, n, virq) >= 0) {
>>> +        s->vector_irqfd[vector].virq = virq;
>>> +        s->vector_irqfd[vector].used = true;
>>> +        qemu_chr_add_handlers(s->eventfd_chr[vector], NULL, NULL, NULL, NULL);
>>> +    } else if (virq >= 0) {
>>> +        kvm_irqchip_release_virq(kvm_state, virq);
>>> +    }
>>> +    return 0;
>>
>> You drop the errors here. Better refactor the code to a scheme like this:
>>
> In msix_fire_vector_notifier(), there is "assert(ret >= 0);".  But I
> think ivshmem can work even if irqfd can not be established, and fall
> back to the origin scheme. So here just ignore err. Is it proper?

If you have an error here, something seriously went wrong. So better let
the user know than falling back to "slow" mode silently. That's what
other users do as well.

> 
>> err = service();
>> if (err) {
>>     roll_back();
>>     return err;
>>     /* or: goto roll_back_... */
>> }
>>
>>> +}
>>> +
>>> +static void ivshmem_vector_release(PCIDevice *dev, unsigned vector)
>>> +{
>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>>> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
>>> +    int virq = s->vector_irqfd[vector].virq;
>>> +
>>> +    if (s->vector_irqfd[vector].used) {
>>> +        kvm_irqchip_remove_irqfd_notifier(kvm_state, n, virq);
>>> +        kvm_irqchip_release_virq(kvm_state, virq);
>>> +        s->vector_irqfd[vector].virq = -1;
>>> +        s->vector_irqfd[vector].used = false;
>>> +    }
>>> +}
>>> +
>>>  static int pci_ivshmem_init(PCIDevice *dev)
>>>  {
>>>      IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>>> @@ -759,7 +799,13 @@ static int pci_ivshmem_init(PCIDevice *dev)
>>>      }
>>>
>>>      s->dev.config_write = ivshmem_write_config;
>>> -
>>> +    if (kvm_gsi_routing_enabled()) {
>>> +        s->irqfd_enable = msix_set_vector_notifiers(dev, ivshmem_vector_use,
>>> +                    ivshmem_vector_release) >= 0 ? true : false;
>>> +        if (s->irqfd_enable) {
>>> +            s->vector_irqfd = g_new0(IrqfdEntry, s->vectors);
>>
>> Conceptually, msix_set_vector_notifiers can already call
>> ivshmem_vector_use, so this initialization would come too late. Doesn't
>> happen here as MSI-X is still off during device init. However, just
>> perform vector_irqfd allocation unconditionally before the registration.
>>
> Applied,
>> And where do you free it again...?
>>
> Will fix it.
> 
> And I think, this is the way to push interrupt subsystem out of big
> lock for KVM.  For TCG code, we can use something like
> kvm_irqchip_add_msi_route(). How do you think about it?

Cannot follow what your idea is.

Also not that MSI on x86 is special anyway as it is not routed in any
user-space device model (so far - emulated interrupt remapping will
change this) but goes directly to the target VCPU via the corresponding
IOCTL.

Jan
pingfan liu - Nov. 23, 2012, 2:20 a.m.
On Thu, Nov 22, 2012 at 7:40 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2012-11-22 03:48, liu ping fan wrote:
>> On Wed, Nov 21, 2012 at 8:43 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>>> On 2012-11-21 07:02, Liu Ping Fan wrote:
[...]
>>>>
>>>> +static int ivshmem_vector_use(PCIDevice *dev, unsigned vector,
>>>> +                                     MSIMessage msg)
>>>> +{
>>>> +    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>>>> +    int virq;
>>>> +    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
>>>> +
>>>> +    virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>> +    if (virq >= 0 && kvm_irqchip_add_irqfd_notifier(kvm_state, n, virq) >= 0) {
>>>> +        s->vector_irqfd[vector].virq = virq;
>>>> +        s->vector_irqfd[vector].used = true;
>>>> +        qemu_chr_add_handlers(s->eventfd_chr[vector], NULL, NULL, NULL, NULL);
>>>> +    } else if (virq >= 0) {
>>>> +        kvm_irqchip_release_virq(kvm_state, virq);
>>>> +    }
>>>> +    return 0;
>>>
>>> You drop the errors here. Better refactor the code to a scheme like this:
>>>
>> In msix_fire_vector_notifier(), there is "assert(ret >= 0);".  But I
>> think ivshmem can work even if irqfd can not be established, and fall
>> back to the origin scheme. So here just ignore err. Is it proper?
>
> If you have an error here, something seriously went wrong. So better let
> the user know than falling back to "slow" mode silently. That's what
> other users do as well.
>
Refer to vfio_msix_vector_use(),  I think to avoid silently, we can
error_report() to notify user.  And because as msi_route is limited
resource, so we should hold with fail, and keep the Qemu still in
work.
Is it OK?
>>
>>> err = service();
>>> if (err) {
>>>     roll_back();
>>>     return err;
>>>     /* or: goto roll_back_... */
>>> }
>>>
>>>> +}
>>>> +
[...]
>>
>> And I think, this is the way to push interrupt subsystem out of big
>> lock for KVM.  For TCG code, we can use something like
>> kvm_irqchip_add_msi_route(). How do you think about it?
>
> Cannot follow what your idea is.
>
I mean to add RCU style interrupt dispatching table for TCG.

> Also not that MSI on x86 is special anyway as it is not routed in any
> user-space device model (so far - emulated interrupt remapping will
> change this) but goes directly to the target VCPU via the corresponding
> IOCTL.
>
Oh, yes, MSI is so special. The user space dispatch table can only
shortcut IRQ to something like "ioapic",   not directly to vcpu

Regards,
Pingfan
> Jan
>
> --
> Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> Corporate Competence Center Embedded Linux

Patch

diff --git a/hw/ivshmem.c b/hw/ivshmem.c
index f6dbb21..81c7354 100644
--- a/hw/ivshmem.c
+++ b/hw/ivshmem.c
@@ -19,6 +19,7 @@ 
 #include "hw.h"
 #include "pc.h"
 #include "pci.h"
+#include "msi.h"
 #include "msix.h"
 #include "kvm.h"
 #include "migration.h"
@@ -54,6 +55,11 @@  typedef struct EventfdEntry {
     int vector;
 } EventfdEntry;
 
+typedef struct IrqfdEntry {
+    int virq;
+    bool used;
+} IrqfdEntry;
+
 typedef struct IVShmemState {
     PCIDevice dev;
     uint32_t intrmask;
@@ -83,6 +89,8 @@  typedef struct IVShmemState {
     uint32_t vectors;
     uint32_t features;
     EventfdEntry *eventfd_table;
+    IrqfdEntry *vector_irqfd;
+    bool irqfd_enable;
 
     Error *migration_blocker;
 
@@ -632,6 +640,38 @@  static void ivshmem_write_config(PCIDevice *pci_dev, uint32_t address,
     msix_write_config(pci_dev, address, val, len);
 }
 
+static int ivshmem_vector_use(PCIDevice *dev, unsigned vector,
+                                     MSIMessage msg)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+    int virq;
+    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
+
+    virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+    if (virq >= 0 && kvm_irqchip_add_irqfd_notifier(kvm_state, n, virq) >= 0) {
+        s->vector_irqfd[vector].virq = virq;
+        s->vector_irqfd[vector].used = true;
+        qemu_chr_add_handlers(s->eventfd_chr[vector], NULL, NULL, NULL, NULL);
+    } else if (virq >= 0) {
+        kvm_irqchip_release_virq(kvm_state, virq);
+    }
+    return 0;
+}
+
+static void ivshmem_vector_release(PCIDevice *dev, unsigned vector)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
+    int virq = s->vector_irqfd[vector].virq;
+
+    if (s->vector_irqfd[vector].used) {
+        kvm_irqchip_remove_irqfd_notifier(kvm_state, n, virq);
+        kvm_irqchip_release_virq(kvm_state, virq);
+        s->vector_irqfd[vector].virq = -1;
+        s->vector_irqfd[vector].used = false;
+    }
+}
+
 static int pci_ivshmem_init(PCIDevice *dev)
 {
     IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
@@ -759,7 +799,13 @@  static int pci_ivshmem_init(PCIDevice *dev)
     }
 
     s->dev.config_write = ivshmem_write_config;
-
+    if (kvm_gsi_routing_enabled()) {
+        s->irqfd_enable = msix_set_vector_notifiers(dev, ivshmem_vector_use,
+                    ivshmem_vector_release) >= 0 ? true : false;
+        if (s->irqfd_enable) {
+            s->vector_irqfd = g_new0(IrqfdEntry, s->vectors);
+        }
+    }
     return 0;
 }