Patchwork Xen: enabling emulated MSI injection

login
register
mail settings
Submitter Jan Kiszka
Date May 26, 2011, 7:21 a.m.
Message ID <4DDDFF83.6010404@siemens.com>
Download mbox | patch
Permalink /patch/97503/
State New
Headers show

Comments

Jan Kiszka - May 26, 2011, 7:21 a.m.
On 2011-05-26 06:57, Wei Liu wrote:
> commit 0ad7c9289253b9fd496ac695ce5bf1f23027d97a
> Author: Wei Liu <liuw@liuw.name>
> Date:   Thu May 26 10:16:18 2011 +0800
> 
>     MSI injection for Xen.
> 
>     This is supposed to be used in conjunction with Xen's
>     new hypercall interface for emualted MSI injection.

That's not MSI but MSI-X only (we originally did the same mistake in KVM).

> 
>     Signed-off-by: Wei Liu <liuw@liuw.name>
> 
> diff --git a/hw/msix.c b/hw/msix.c
> index af40e26..1057eff 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -15,6 +15,7 @@
>  #include "msix.h"
>  #include "pci.h"
>  #include "range.h"
> +#include "xen.h"
> 
>  /* MSI-X capability structure */
>  #define MSIX_TABLE_OFFSET 4
> @@ -369,7 +370,12 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>      address = pci_get_long(table_entry + MSIX_MSG_UPPER_ADDR);
>      address = (address << 32) | pci_get_long(table_entry + MSIX_MSG_ADDR);
>      data = pci_get_long(table_entry + MSIX_MSG_DATA);
> -    stl_phys(address, data);
> +
> +    if (xen_enabled()) {
> +        xen_hvm_inj_msi(address, data);
> +    } else {
> +        stl_phys(address, data);
> +    }
>  }
> 
>  void msix_reset(PCIDevice *dev)
> diff --git a/hw/xen.h b/hw/xen.h
> index d435ca0..11a3c4a 100644
> --- a/hw/xen.h
> +++ b/hw/xen.h
> @@ -43,6 +43,7 @@ static inline int xen_mapcache_enabled(void)
>  int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num);
>  void xen_piix3_set_irq(void *opaque, int irq_num, int level);
>  void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len);
> +void xen_hvm_inj_msi(uint64_t addr, uint32_t data);
>  void xen_cmos_set_s3_resume(void *opaque, int irq, int level);
> 
>  qemu_irq *xen_interrupt_controller_init(void);
> diff --git a/xen-all.c b/xen-all.c
> index 0944673..a553992 100644
> --- a/xen-all.c
> +++ b/xen-all.c
> @@ -100,6 +100,11 @@ void xen_piix_pci_write_config_client(uint32_t
> address, uint32_t val, int len)
>      }
>  }
> 
> +void xen_hvm_inj_msi(uint64_t addr, uint32_t data)
> +{
> +    xc_hvm_inj_msi(xen_xc, xen_domid, addr, data);
> +}
> +
>  void xen_cmos_set_s3_resume(void *opaque, int irq, int level)
>  {
>      pc_cmos_set_s3_resume(opaque, irq, level);
> diff --git a/xen-stub.c b/xen-stub.c
> index a4f35a1..c18c62f 100644
> --- a/xen-stub.c
> +++ b/xen-stub.c
> @@ -39,3 +39,7 @@ int xen_init(void)
>  {
>      return -ENOSYS;
>  }
> +
> +void xen_hvm_inj_msi(uint64_t addr, uint32_t data)
> +{
> +}
> 

A cleaner way of hooking into this would be registering on the MSI MMIO
target region (replacing the APIC). That would give you MSI support as well.

However, KVM has much more complex requirements for MSI handling when
using the in-kernel irqchip. That's because we need to support, e.g.,
eventfd-based MSI injection in the hypervisor without looping over QEMU,
ie. we need to translate a binary event into a MSI address/data tuple.
This is currently solved a bit too pragmatic in our qemu-kvm tree -
not mergeable to upstream. So I started working out a better plan,
partly based on ideas Avi provided.

One element in this concept will be a hook into the MSI delivery path,
both for events triggered by msi/msix_notify as well as those raised by
weird DMA (to be architecturally correct). See below for a snapshot
(depends on other patches). That will obsolete any open-coded hooking
like this here.

If Xen support is urging, I could try to break out the relevant bits a
bit earlier. Otherwise I would prefer to postpone the topic by a few
weeks until we are done with evaluating the concept against KVM's
requirements so that we can find a truly generic solution.

Jan


------8<-------
Wei Liu - May 26, 2011, 7:35 a.m.
On Thu, May 26, 2011 at 3:21 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> A cleaner way of hooking into this would be registering on the MSI MMIO
> target region (replacing the APIC). That would give you MSI support as well.
>
> However, KVM has much more complex requirements for MSI handling when
> using the in-kernel irqchip. That's because we need to support, e.g.,
> eventfd-based MSI injection in the hypervisor without looping over QEMU,
> ie. we need to translate a binary event into a MSI address/data tuple.
> This is currently solved a bit too pragmatic in our qemu-kvm tree -
> not mergeable to upstream. So I started working out a better plan,
> partly based on ideas Avi provided.
>
> One element in this concept will be a hook into the MSI delivery path,
> both for events triggered by msi/msix_notify as well as those raised by
> weird DMA (to be architecturally correct). See below for a snapshot
> (depends on other patches). That will obsolete any open-coded hooking
> like this here.
>
> If Xen support is urging, I could try to break out the relevant bits a
> bit earlier. Otherwise I would prefer to postpone the topic by a few
> weeks until we are done with evaluating the concept against KVM's
> requirements so that we can find a truly generic solution.
>
> Jan
>
>
> ------8<-------
>
> diff --git a/hw/apic.c b/hw/apic.c
> index a45b57f..7447d91 100644
> --- a/hw/apic.c
> +++ b/hw/apic.c
> @@ -19,6 +19,7 @@
>  #include "hw.h"
>  #include "apic.h"
>  #include "ioapic.h"
> +#include "msi.h"
>  #include "qemu-timer.h"
>  #include "host-utils.h"
>  #include "sysbus.h"
> @@ -795,7 +796,7 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>     return val;
>  }
>
> -static void apic_send_msi(target_phys_addr_t addr, uint32_t data)
> +void apic_deliver_msi(uint64_t addr, uint32_t data)
>  {
>     uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
>     uint8_t vector = (data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
> @@ -817,7 +818,9 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>          * APIC is connected directly to the CPU.
>          * Mapping them on the global bus happens to work because
>          * MSI registers are reserved in APIC MMIO and vice versa. */
> -        apic_send_msi(addr, val);
> +        MSIMessage msg = { .address = addr, .data = val };
> +
> +        msi_deliver(&msg);
>         return;
>     }
>
> diff --git a/hw/apic.h b/hw/apic.h
> index c857d52..d0971ae 100644
> --- a/hw/apic.h
> +++ b/hw/apic.h
> @@ -20,6 +20,7 @@ void cpu_set_apic_tpr(DeviceState *s, uint8_t val);
>  uint8_t cpu_get_apic_tpr(DeviceState *s);
>  void apic_init_reset(DeviceState *s);
>  void apic_sipi(DeviceState *s);
> +void apic_deliver_msi(uint64_t addr, uint32_t data);
>
>  /* pc.c */
>  int cpu_is_bsp(CPUState *env);
> diff --git a/hw/msi.c b/hw/msi.c
> index e111ceb..b88dcc4 100644
> --- a/hw/msi.c
> +++ b/hw/msi.c
> @@ -37,6 +37,14 @@
>
>  #define PCI_MSI_VECTORS_MAX     32
>
> +static void msi_unsupported(MSIMessage *msg)
> +{
> +    /* If we get here, the board failed to register a delivery handler. */
> +    abort();
> +}
> +
> +void (*msi_deliver)(MSIMessage *msg) = msi_unsupported;
> +
>  /* If we get rid of cap allocator, we won't need this. */
>  static inline uint8_t msi_cap_sizeof(uint16_t flags)
>  {
> @@ -361,7 +369,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
>                    "notify vector 0x%x"
>                    " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
>                    vector, msg.address, msg.data);
> -    stl_phys(msg.address, msg.data);
> +    msi_deliver(&msg);
>  }
>
>  /* call this function after updating configs by pci_default_write_config(). */
> diff --git a/hw/msi.h b/hw/msi.h
> index 7a0e76f..ba76eca 100644
> --- a/hw/msi.h
> +++ b/hw/msi.h
> @@ -44,4 +44,6 @@ static inline bool msi_present(const PCIDevice *dev)
>     return dev->cap_present & QEMU_PCI_CAP_MSI;
>  }
>
> +extern void (*msi_deliver)(MSIMessage *msg);
> +
>  #endif /* QEMU_MSI_H */
> diff --git a/hw/msix.c b/hw/msix.c
> index 6e55422..d893f88 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -514,7 +514,7 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>
>     msix_message_from_vector(dev, vector, &msg);
>
> -    stl_phys(msg.address, msg.data);
> +    msi_deliver(&msg);
>  }
>
>  void msix_reset(PCIDevice *dev)
> diff --git a/hw/pc.c b/hw/pc.c
> index 73398eb..f88ef03 100644
> --- a/hw/pc.c
> +++ b/hw/pc.c
> @@ -24,6 +24,7 @@
>  #include "hw.h"
>  #include "pc.h"
>  #include "apic.h"
> +#include "msi.h"
>  #include "fdc.h"
>  #include "ide.h"
>  #include "pci.h"
> @@ -102,6 +103,15 @@ void isa_irq_handler(void *opaque, int n, int level)
>         qemu_set_irq(isa->ioapic[n], level);
>  };
>
> +static void pc_msi_deliver(MSIMessage *msg)
> +{
> +    if ((msg->address & 0xfff00000) == MSI_ADDR_BASE) {
> +        apic_deliver_msi(msg->address, msg->data);
> +    } else {
> +        stl_phys(msg->address, msg->data);
> +    }
> +}
> +
>  static void ioport80_write(void *opaque, uint32_t addr, uint32_t data)
>  {
>  }
> @@ -889,6 +899,7 @@ static DeviceState *apic_init(void *env, uint8_t apic_id)
>            on the global memory bus. */
>         /* XXX: what if the base changes? */
>         sysbus_mmio_map(d, 0, MSI_ADDR_BASE);
> +        msi_deliver = pc_msi_deliver;
>         apic_mapped = 1;
>     }
>
> --
> 1.7.1
>
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
>

Hmm... This patch is just part of GSoC project Virtio on Xen. AFAICS,
this is not so urgent.

I would prefer a cleaner way. A standalone function that doesn't
belong to any QEMU subsystem annoys me. And the hooking looks ugly.

Thanks for your patch, I think we can wait for a better design and
cleaner solution for both Xen and KVM.
Jan Kiszka - May 26, 2011, 7:41 a.m.
On 2011-05-26 09:35, Wei Liu wrote:
> On Thu, May 26, 2011 at 3:21 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> A cleaner way of hooking into this would be registering on the MSI MMIO
>> target region (replacing the APIC). That would give you MSI support as well.
>>
>> However, KVM has much more complex requirements for MSI handling when
>> using the in-kernel irqchip. That's because we need to support, e.g.,
>> eventfd-based MSI injection in the hypervisor without looping over QEMU,
>> ie. we need to translate a binary event into a MSI address/data tuple.
>> This is currently solved a bit too pragmatic in our qemu-kvm tree -
>> not mergeable to upstream. So I started working out a better plan,
>> partly based on ideas Avi provided.
>>
>> One element in this concept will be a hook into the MSI delivery path,
>> both for events triggered by msi/msix_notify as well as those raised by
>> weird DMA (to be architecturally correct). See below for a snapshot
>> (depends on other patches). That will obsolete any open-coded hooking
>> like this here.
>>
>> If Xen support is urging, I could try to break out the relevant bits a
>> bit earlier. Otherwise I would prefer to postpone the topic by a few
>> weeks until we are done with evaluating the concept against KVM's
>> requirements so that we can find a truly generic solution.
>>
>> Jan
>>
>>
>> ------8<-------
>>
>> diff --git a/hw/apic.c b/hw/apic.c
>> index a45b57f..7447d91 100644
>> --- a/hw/apic.c
>> +++ b/hw/apic.c
>> @@ -19,6 +19,7 @@
>>  #include "hw.h"
>>  #include "apic.h"
>>  #include "ioapic.h"
>> +#include "msi.h"
>>  #include "qemu-timer.h"
>>  #include "host-utils.h"
>>  #include "sysbus.h"
>> @@ -795,7 +796,7 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>>     return val;
>>  }
>>
>> -static void apic_send_msi(target_phys_addr_t addr, uint32_t data)
>> +void apic_deliver_msi(uint64_t addr, uint32_t data)
>>  {
>>     uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
>>     uint8_t vector = (data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
>> @@ -817,7 +818,9 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>          * APIC is connected directly to the CPU.
>>          * Mapping them on the global bus happens to work because
>>          * MSI registers are reserved in APIC MMIO and vice versa. */
>> -        apic_send_msi(addr, val);
>> +        MSIMessage msg = { .address = addr, .data = val };
>> +
>> +        msi_deliver(&msg);
>>         return;
>>     }
>>
>> diff --git a/hw/apic.h b/hw/apic.h
>> index c857d52..d0971ae 100644
>> --- a/hw/apic.h
>> +++ b/hw/apic.h
>> @@ -20,6 +20,7 @@ void cpu_set_apic_tpr(DeviceState *s, uint8_t val);
>>  uint8_t cpu_get_apic_tpr(DeviceState *s);
>>  void apic_init_reset(DeviceState *s);
>>  void apic_sipi(DeviceState *s);
>> +void apic_deliver_msi(uint64_t addr, uint32_t data);
>>
>>  /* pc.c */
>>  int cpu_is_bsp(CPUState *env);
>> diff --git a/hw/msi.c b/hw/msi.c
>> index e111ceb..b88dcc4 100644
>> --- a/hw/msi.c
>> +++ b/hw/msi.c
>> @@ -37,6 +37,14 @@
>>
>>  #define PCI_MSI_VECTORS_MAX     32
>>
>> +static void msi_unsupported(MSIMessage *msg)
>> +{
>> +    /* If we get here, the board failed to register a delivery handler. */
>> +    abort();
>> +}
>> +
>> +void (*msi_deliver)(MSIMessage *msg) = msi_unsupported;
>> +
>>  /* If we get rid of cap allocator, we won't need this. */
>>  static inline uint8_t msi_cap_sizeof(uint16_t flags)
>>  {
>> @@ -361,7 +369,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
>>                    "notify vector 0x%x"
>>                    " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
>>                    vector, msg.address, msg.data);
>> -    stl_phys(msg.address, msg.data);
>> +    msi_deliver(&msg);
>>  }
>>
>>  /* call this function after updating configs by pci_default_write_config(). */
>> diff --git a/hw/msi.h b/hw/msi.h
>> index 7a0e76f..ba76eca 100644
>> --- a/hw/msi.h
>> +++ b/hw/msi.h
>> @@ -44,4 +44,6 @@ static inline bool msi_present(const PCIDevice *dev)
>>     return dev->cap_present & QEMU_PCI_CAP_MSI;
>>  }
>>
>> +extern void (*msi_deliver)(MSIMessage *msg);
>> +
>>  #endif /* QEMU_MSI_H */
>> diff --git a/hw/msix.c b/hw/msix.c
>> index 6e55422..d893f88 100644
>> --- a/hw/msix.c
>> +++ b/hw/msix.c
>> @@ -514,7 +514,7 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>>
>>     msix_message_from_vector(dev, vector, &msg);
>>
>> -    stl_phys(msg.address, msg.data);
>> +    msi_deliver(&msg);
>>  }
>>
>>  void msix_reset(PCIDevice *dev)
>> diff --git a/hw/pc.c b/hw/pc.c
>> index 73398eb..f88ef03 100644
>> --- a/hw/pc.c
>> +++ b/hw/pc.c
>> @@ -24,6 +24,7 @@
>>  #include "hw.h"
>>  #include "pc.h"
>>  #include "apic.h"
>> +#include "msi.h"
>>  #include "fdc.h"
>>  #include "ide.h"
>>  #include "pci.h"
>> @@ -102,6 +103,15 @@ void isa_irq_handler(void *opaque, int n, int level)
>>         qemu_set_irq(isa->ioapic[n], level);
>>  };
>>
>> +static void pc_msi_deliver(MSIMessage *msg)
>> +{
>> +    if ((msg->address & 0xfff00000) == MSI_ADDR_BASE) {
>> +        apic_deliver_msi(msg->address, msg->data);
>> +    } else {
>> +        stl_phys(msg->address, msg->data);
>> +    }
>> +}
>> +
>>  static void ioport80_write(void *opaque, uint32_t addr, uint32_t data)
>>  {
>>  }
>> @@ -889,6 +899,7 @@ static DeviceState *apic_init(void *env, uint8_t apic_id)
>>            on the global memory bus. */
>>         /* XXX: what if the base changes? */
>>         sysbus_mmio_map(d, 0, MSI_ADDR_BASE);
>> +        msi_deliver = pc_msi_deliver;
>>         apic_mapped = 1;
>>     }
>>
>> --
>> 1.7.1
>>
>> --
>> Siemens AG, Corporate Technology, CT T DE IT 1
>> Corporate Competence Center Embedded Linux
>>
> 
> Hmm... This patch is just part of GSoC project Virtio on Xen. AFAICS,
> this is not so urgent.

Ah, I see. The Xen hypervisor part is actually new as well.

> 
> I would prefer a cleaner way. A standalone function that doesn't
> belong to any QEMU subsystem annoys me. And the hooking looks ugly.
> 
> Thanks for your patch, I think we can wait for a better design and
> cleaner solution for both Xen and KVM.

Perfect.

Jan

Patch

diff --git a/hw/apic.c b/hw/apic.c
index a45b57f..7447d91 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -19,6 +19,7 @@ 
 #include "hw.h"
 #include "apic.h"
 #include "ioapic.h"
+#include "msi.h"
 #include "qemu-timer.h"
 #include "host-utils.h"
 #include "sysbus.h"
@@ -795,7 +796,7 @@  static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
     return val;
 }
 
-static void apic_send_msi(target_phys_addr_t addr, uint32_t data)
+void apic_deliver_msi(uint64_t addr, uint32_t data)
 {
     uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
     uint8_t vector = (data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
@@ -817,7 +818,9 @@  static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
          * APIC is connected directly to the CPU.
          * Mapping them on the global bus happens to work because
          * MSI registers are reserved in APIC MMIO and vice versa. */
-        apic_send_msi(addr, val);
+        MSIMessage msg = { .address = addr, .data = val };
+
+        msi_deliver(&msg);
         return;
     }
 
diff --git a/hw/apic.h b/hw/apic.h
index c857d52..d0971ae 100644
--- a/hw/apic.h
+++ b/hw/apic.h
@@ -20,6 +20,7 @@  void cpu_set_apic_tpr(DeviceState *s, uint8_t val);
 uint8_t cpu_get_apic_tpr(DeviceState *s);
 void apic_init_reset(DeviceState *s);
 void apic_sipi(DeviceState *s);
+void apic_deliver_msi(uint64_t addr, uint32_t data);
 
 /* pc.c */
 int cpu_is_bsp(CPUState *env);
diff --git a/hw/msi.c b/hw/msi.c
index e111ceb..b88dcc4 100644
--- a/hw/msi.c
+++ b/hw/msi.c
@@ -37,6 +37,14 @@ 
 
 #define PCI_MSI_VECTORS_MAX     32
 
+static void msi_unsupported(MSIMessage *msg)
+{
+    /* If we get here, the board failed to register a delivery handler. */
+    abort();
+}
+
+void (*msi_deliver)(MSIMessage *msg) = msi_unsupported;
+
 /* If we get rid of cap allocator, we won't need this. */
 static inline uint8_t msi_cap_sizeof(uint16_t flags)
 {
@@ -361,7 +369,7 @@  void msi_notify(PCIDevice *dev, unsigned int vector)
                    "notify vector 0x%x"
                    " address: 0x%"PRIx64" data: 0x%"PRIx32"\n",
                    vector, msg.address, msg.data);
-    stl_phys(msg.address, msg.data);
+    msi_deliver(&msg);
 }
 
 /* call this function after updating configs by pci_default_write_config(). */
diff --git a/hw/msi.h b/hw/msi.h
index 7a0e76f..ba76eca 100644
--- a/hw/msi.h
+++ b/hw/msi.h
@@ -44,4 +44,6 @@  static inline bool msi_present(const PCIDevice *dev)
     return dev->cap_present & QEMU_PCI_CAP_MSI;
 }
 
+extern void (*msi_deliver)(MSIMessage *msg);
+
 #endif /* QEMU_MSI_H */
diff --git a/hw/msix.c b/hw/msix.c
index 6e55422..d893f88 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -514,7 +514,7 @@  void msix_notify(PCIDevice *dev, unsigned vector)
 
     msix_message_from_vector(dev, vector, &msg);
 
-    stl_phys(msg.address, msg.data);
+    msi_deliver(&msg);
 }
 
 void msix_reset(PCIDevice *dev)
diff --git a/hw/pc.c b/hw/pc.c
index 73398eb..f88ef03 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -24,6 +24,7 @@ 
 #include "hw.h"
 #include "pc.h"
 #include "apic.h"
+#include "msi.h"
 #include "fdc.h"
 #include "ide.h"
 #include "pci.h"
@@ -102,6 +103,15 @@  void isa_irq_handler(void *opaque, int n, int level)
         qemu_set_irq(isa->ioapic[n], level);
 };
 
+static void pc_msi_deliver(MSIMessage *msg)
+{
+    if ((msg->address & 0xfff00000) == MSI_ADDR_BASE) {
+        apic_deliver_msi(msg->address, msg->data);
+    } else {
+        stl_phys(msg->address, msg->data);
+    }
+}
+
 static void ioport80_write(void *opaque, uint32_t addr, uint32_t data)
 {
 }
@@ -889,6 +899,7 @@  static DeviceState *apic_init(void *env, uint8_t apic_id)
            on the global memory bus. */
         /* XXX: what if the base changes? */
         sysbus_mmio_map(d, 0, MSI_ADDR_BASE);
+        msi_deliver = pc_msi_deliver;
         apic_mapped = 1;
     }