[RFC,v2,20/22] i386/xen: HVMOP_set_param / HVM_PARAM_CALLBACK_IRQ

Message ID: 20221209095612.689243-21-dwmw2@infradead.org
State: New
Series: Xen HVM support under KVM

Commit Message

David Woodhouse Dec. 9, 2022, 9:56 a.m. UTC
From: Ankur Arora <ankur.a.arora@oracle.com>

The HVM_PARAM_CALLBACK_IRQ parameter controls the system-wide event
channel upcall method.  The vector support is handled by KVM internally,
when the evtchn_upcall_pending field in the vcpu_info is set.

The GSI and PCI_INTX delivery methods are not supported yet; those
need to simulate a level-triggered event on the I/OAPIC.
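
The top byte of the parameter selects the delivery type (see
xen/include/public/hvm/params.h); as an illustrative sketch, not part
of this patch, the decode is roughly:

    uint8_t type = param >> 56;        /* CALLBACK_VIA_TYPE_SHIFT */

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_GSI:      /* 0: val[55:0] is a GSI */
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX: /* 1: val[55:0] encodes a PCI INTx pin */
        /* Not implemented yet; needs a level-triggered GSI. */
        break;
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:   /* 2: val[7:0] is the vector */
        /* Handed to KVM as KVM_XEN_ATTR_TYPE_UPCALL_VECTOR. */
        break;
    }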

Add a 'xen_evtchn' device to host the migration state, as we'll shortly
be adding a full event channel table there too.

Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
[dwmw2: Rework for upstream kernel changes, split from per-VCPU vector]
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 hw/i386/kvm/meson.build  |   5 +-
 hw/i386/kvm/xen_evtchn.c | 117 +++++++++++++++++++++++++++++++++++++++
 hw/i386/kvm/xen_evtchn.h |  13 +++++
 hw/i386/pc_piix.c        |   2 +
 target/i386/xen.c        |  44 +++++++++++++--
 5 files changed, 174 insertions(+), 7 deletions(-)
 create mode 100644 hw/i386/kvm/xen_evtchn.c
 create mode 100644 hw/i386/kvm/xen_evtchn.h

Comments

Durrant, Paul Dec. 12, 2022, 4:16 p.m. UTC | #1
On 09/12/2022 09:56, David Woodhouse wrote:
> From: Ankur Arora <ankur.a.arora@oracle.com>
> 
> The HVM_PARAM_CALLBACK_IRQ parameter controls the system-wide event
> channel upcall method.  The vector support is handled by KVM internally,
> when the evtchn_upcall_pending field in the vcpu_info is set.
> 
> The GSI and PCI_INTX delivery methods are not supported yet; those
> need to simulate a level-triggered event on the I/OAPIC.

That's gonna be somewhat limiting if anyone runs a Windows guest with 
upcall vector support turned off... which is an option at:

https://xenbits.xen.org/gitweb/?p=pvdrivers/win/xenbus.git;a=blob;f=src/xenbus/evtchn.c;;hb=HEAD#l1928

> 
> Add a 'xen_evtchn' device to host the migration state, as we'll shortly
> be adding a full event channel table there too.
> 
> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> [dwmw2: Rework for upstream kernel changes, split from per-VCPU vector]
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
>   hw/i386/kvm/meson.build  |   5 +-
>   hw/i386/kvm/xen_evtchn.c | 117 +++++++++++++++++++++++++++++++++++++++
>   hw/i386/kvm/xen_evtchn.h |  13 +++++
>   hw/i386/pc_piix.c        |   2 +
>   target/i386/xen.c        |  44 +++++++++++++--
>   5 files changed, 174 insertions(+), 7 deletions(-)
>   create mode 100644 hw/i386/kvm/xen_evtchn.c
>   create mode 100644 hw/i386/kvm/xen_evtchn.h
> 
> diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
> index 6165cbf019..cab64df339 100644
> --- a/hw/i386/kvm/meson.build
> +++ b/hw/i386/kvm/meson.build
> @@ -4,6 +4,9 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
>   i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
>   i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
>   i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
> -i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen_overlay.c'))
> +i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
> +  'xen_overlay.c',
> +  'xen_evtchn.c',
> +  ))
>   
>   i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
> diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
> new file mode 100644
> index 0000000000..1ca0c034e7
> --- /dev/null
> +++ b/hw/i386/kvm/xen_evtchn.c
> @@ -0,0 +1,117 @@
> +/*
> + * QEMU Xen emulation: Event channel support
> + *
> + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> + *
> + * Authors: David Woodhouse <dwmw2@infradead.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/host-utils.h"
> +#include "qemu/module.h"
> +#include "qemu/main-loop.h"
> +#include "qapi/error.h"
> +#include "qom/object.h"
> +#include "exec/target_page.h"
> +#include "exec/address-spaces.h"
> +#include "migration/vmstate.h"
> +
> +#include "hw/sysbus.h"
> +#include "hw/xen/xen.h"
> +#include "xen_evtchn.h"
> +
> +#include "sysemu/kvm.h"
> +#include <linux/kvm.h>
> +
> +#include "standard-headers/xen/memory.h"
> +#include "standard-headers/xen/hvm/params.h"
> +
> +#define TYPE_XEN_EVTCHN "xenevtchn"
> +OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
> +
> +struct XenEvtchnState {
> +    /*< private >*/
> +    SysBusDevice busdev;
> +    /*< public >*/
> +
> +    uint64_t callback_param;
> +};
> +
> +struct XenEvtchnState *xen_evtchn_singleton;
> +
> +static int xen_evtchn_post_load(void *opaque, int version_id)
> +{
> +    XenEvtchnState *s = opaque;
> +
> +    if (s->callback_param) {
> +        xen_evtchn_set_callback_param(s->callback_param);
> +    }
> +
> +    return 0;
> +}
> +
> +static bool xen_evtchn_is_needed(void *opaque)
> +{
> +    return xen_mode == XEN_EMULATE;
> +}
> +
> +static const VMStateDescription xen_evtchn_vmstate = {
> +    .name = "xen_evtchn",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = xen_evtchn_is_needed,
> +    .post_load = xen_evtchn_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(callback_param, XenEvtchnState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void xen_evtchn_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    dc->vmsd = &xen_evtchn_vmstate;
> +}
> +
> +static const TypeInfo xen_evtchn_info = {
> +    .name          = TYPE_XEN_EVTCHN,
> +    .parent        = TYPE_SYS_BUS_DEVICE,
> +    .instance_size = sizeof(XenEvtchnState),
> +    .class_init    = xen_evtchn_class_init,
> +};
> +
> +void xen_evtchn_create(void)
> +{
> +    xen_evtchn_singleton = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN, -1, NULL));
> +}
> +
> +static void xen_evtchn_register_types(void)
> +{
> +    type_register_static(&xen_evtchn_info);
> +}
> +
> +type_init(xen_evtchn_register_types)
> +
> +
> +#define CALLBACK_VIA_TYPE_SHIFT       56
> +
> +int xen_evtchn_set_callback_param(uint64_t param)
> +{
> +    int ret = -ENOSYS;
> +
> +    if (param >> CALLBACK_VIA_TYPE_SHIFT == HVM_PARAM_CALLBACK_TYPE_VECTOR) {
> +        struct kvm_xen_hvm_attr xa = {
> +            .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
> +            .u.vector = (uint8_t)param,
> +        };
> +
> +        ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
> +        if (!ret && xen_evtchn_singleton)
> +            xen_evtchn_singleton->callback_param = param;
> +    }
> +    return ret;
> +}
> diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
> new file mode 100644
> index 0000000000..11c6ed22a0
> --- /dev/null
> +++ b/hw/i386/kvm/xen_evtchn.h
> @@ -0,0 +1,13 @@
> +/*
> + * QEMU Xen emulation: Event channel support
> + *
> + * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> + *
> + * Authors: David Woodhouse <dwmw2@infradead.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +void xen_evtchn_create(void);
> +int xen_evtchn_set_callback_param(uint64_t param);
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index c3c61eedde..18540084a0 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@
>   #endif
>   #ifdef CONFIG_XEN_EMU
>   #include "hw/i386/kvm/xen_overlay.h"
> +#include "hw/i386/kvm/xen_evtchn.h"
>   #endif
>   #include "migration/global_state.h"
>   #include "migration/misc.h"
> @@ -417,6 +418,7 @@ static void pc_xen_hvm_init(MachineState *machine)
>   #ifdef CONFIG_XEN_EMU
>       if (xen_mode == XEN_EMULATE) {
>               xen_overlay_create();
> +            xen_evtchn_create();
>       }
>   #endif
>   }
> diff --git a/target/i386/xen.c b/target/i386/xen.c
> index 2583c00a6b..1af336d9e5 100644
> --- a/target/i386/xen.c
> +++ b/target/i386/xen.c
> @@ -16,6 +16,8 @@
>   #include "xen.h"
>   #include "trace.h"
>   #include "hw/i386/kvm/xen_overlay.h"
> +#include "hw/i386/kvm/xen_evtchn.h"
> +
>   #include "standard-headers/xen/version.h"
>   #include "standard-headers/xen/memory.h"
>   #include "standard-headers/xen/hvm/hvm_op.h"
> @@ -287,24 +289,53 @@ static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit,
>       return true;
>   }
>   
> +static int handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
> +                            uint64_t arg)
> +{
> +    CPUState *cs = CPU(cpu);
> +    struct xen_hvm_param hp;
> +    int err = 0;
> +
> +    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
> +        err = -EFAULT;
> +        goto out;
> +    }
> +
> +    if (hp.domid != DOMID_SELF) {

Xen actually allows the domain's own id to be specified as well as the 
magic DOMID_SELF.

> +        err = -EINVAL;

And this should be -ESRCH.
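
Something like this, perhaps (just a sketch; 'xen_domid' here standing
in for wherever QEMU ends up keeping the emulated domain's own id):

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }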

> +        goto out;
> +    }
> +
> +    switch (hp.index) {
> +    case HVM_PARAM_CALLBACK_IRQ:
> +        err = xen_evtchn_set_callback_param(hp.value);
> +        break;
> +    default:
> +        return false;
> +    }
> +
> +out:
> +    exit->u.hcall.result = err;

This is a bit on the ugly side, isn't it? Why not return the err and have 
kvm_xen_hcall_hvm_op() deal with passing it back?

> +    return true;
> +}
> +
>   static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
>                                                 X86CPU *cpu, uint64_t arg)
>   {
> -    struct xen_hvm_evtchn_upcall_vector *up;
> +    struct xen_hvm_evtchn_upcall_vector up;
>       CPUState *target_cs;
>       int vector;
>   
> -    up = gva_to_hva(CPU(cpu), arg);
> -    if (!up) {
> +    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
>           return -EFAULT;
>       }
>   
> -    vector = up->vector;
> +    vector = up.vector;
>       if (vector < 0x10) {
>           return -EINVAL;
>       }
>   
> -    target_cs = qemu_get_cpu(up->vcpu);
> +    target_cs = qemu_get_cpu(up.vcpu);
>       if (!target_cs) {
>           return -EINVAL;
>       }

These changes to kvm_xen_hcall_evtchn_upcall_vector() seem to have 
nothing to do with the rest of the patch. Am I missing something?

   Paul

> @@ -325,7 +356,8 @@ static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
>       case HVMOP_pagetable_dying:
>               ret = -ENOSYS;
>               break;
> -
> +    case HVMOP_set_param:
> +            return handle_set_param(exit, cpu, arg);
>       default:
>               return false;
>       }
David Woodhouse Dec. 12, 2022, 4:26 p.m. UTC | #2
On Mon, 2022-12-12 at 16:16 +0000, Paul Durrant wrote:
> On 09/12/2022 09:56, David Woodhouse wrote:
> > From: Ankur Arora <ankur.a.arora@oracle.com>
> > The HVM_PARAM_CALLBACK_IRQ parameter controls the system-wide event
> > channel upcall method.  The vector support is handled by KVM internally,
> > when the evtchn_upcall_pending field in the vcpu_info is set.
> > The GSI and PCI_INTX delivery methods are not supported yet; those
> > need to simulate a level-triggered event on the I/OAPIC.
> 
> That's gonna be somewhat limiting if anyone runs a Windows guest with 
> upcall vector support turned off... which is an option at:
> 
> https://xenbits.xen.org/gitweb/?p=pvdrivers/win/xenbus.git;a=blob;f=src/xenbus/evtchn.c;;hb=HEAD#l1928

Sure. And as you know, I also added the 'xen_no_vector_callback' option
to the Linux command line to allow for that mode to be tested with
Linux too: https://git.kernel.org/torvalds/c/b36b0fe96a 

The GSI and PCI_INTX modes will be added in time, but not yet.
Durrant, Paul Dec. 12, 2022, 4:39 p.m. UTC | #3
On 12/12/2022 16:26, David Woodhouse wrote:
> On Mon, 2022-12-12 at 16:16 +0000, Paul Durrant wrote:
>> On 09/12/2022 09:56, David Woodhouse wrote:
>>> From: Ankur Arora <ankur.a.arora@oracle.com>
>>> The HVM_PARAM_CALLBACK_IRQ parameter controls the system-wide event
>>> channel upcall method.  The vector support is handled by KVM internally,
>>> when the evtchn_upcall_pending field in the vcpu_info is set.
>>> The GSI and PCI_INTX delivery methods are not supported yet; those
>>> need to simulate a level-triggered event on the I/OAPIC.
>>
>> That's gonna be somewhat limiting if anyone runs a Windows guest with
>> upcall vector support turned off... which is an option at:
>>
>> https://xenbits.xen.org/gitweb/?p=pvdrivers/win/xenbus.git;a=blob;f=src/xenbus/evtchn.c;;hb=HEAD#l1928
> 
> Sure. And as you know, I also added the 'xen_no_vector_callback' option
> to the Linux command line to allow for that mode to be tested with
> Linux too: https://git.kernel.org/torvalds/c/b36b0fe96a
> 
> The GSI and PCI_INTX modes will be added in time, but not yet.

Ok, but maybe worth calling out the limitation in the commit comment for 
those wishing to kick the tyres.

   Paul
David Woodhouse Dec. 15, 2022, 8:54 p.m. UTC | #4
On Mon, 2022-12-12 at 16:39 +0000, Paul Durrant wrote:
> On 12/12/2022 16:26, David Woodhouse wrote:
> > On Mon, 2022-12-12 at 16:16 +0000, Paul Durrant wrote:
> > > On 09/12/2022 09:56, David Woodhouse wrote:
> > > > From: Ankur Arora <ankur.a.arora@oracle.com>
> > > > The HVM_PARAM_CALLBACK_IRQ parameter controls the system-wide event
> > > > channel upcall method.  The vector support is handled by KVM internally,
> > > > when the evtchn_upcall_pending field in the vcpu_info is set.
> > > > The GSI and PCI_INTX delivery methods are not supported yet; those
> > > > need to simulate a level-triggered event on the I/OAPIC.
> > > 
> > > That's gonna be somewhat limiting if anyone runs a Windows guest with
> > > upcall vector support turned off... which is an option at:
> > > 
> > > https://xenbits.xen.org/gitweb/?p=pvdrivers/win/xenbus.git;a=blob;f=src/xenbus/evtchn.c;;hb=HEAD#l1928
> > > 
> > 
> > Sure. And as you know, I also added the 'xen_no_vector_callback' option
> > to the Linux command line to allow for that mode to be tested with
> > Linux too: 
> > https://git.kernel.org/torvalds/c/b36b0fe96a
> > 
> > 
> > The GSI and PCI_INTX modes will be added in time, but not yet.
> 
> Ok, but maybe worth calling out the limitation in the commit comment for 
> those wishing to kick the tyres.

Hm... this isn't as simple in QEMU as I hoped.

The way I want to handle it is like the way that VFIO eventfd pairs
work for level-triggered interrupts: the first eventfd is triggered on
a rising edge, and the other one is a 'resampler' which is triggered on
EOI, and causes the first one to be retriggered if the level is still
actually high.
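
(That's the KVM_IRQFD ioctl with KVM_IRQFD_FLAG_RESAMPLE; a minimal
sketch, with hypothetical fd names:

    struct kvm_irqfd irqfd = {
        .fd         = trigger_fd,   /* asserts the GSI on a rising edge */
        .gsi        = gsi,
        .flags      = KVM_IRQFD_FLAG_RESAMPLE,
        .resamplefd = resample_fd,  /* signalled by KVM on EOI */
    };
    ioctl(vm_fd, KVM_IRQFD, &irqfd);

The producer then re-signals trigger_fd from its resample_fd handler if
the line is still high.)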

However... unlike the kernel and another VMM that you and I are
familiar with, QEMU doesn't actually hook that up to the EOI in the
APIC/IOAPIC at all.

Instead, when VFIO devices assert a level-triggered interrupt, QEMU
*unmaps* that device's BARs from the guest so it can trap-and-emulate
them, and each MMIO read or write will also trigger the resampler
(whether that line is currently masked in the APIC or not).

I suppose we could try making the page containing the vcpu_info read-only, 
and trapping accesses so we spot when the guest clears its own
->evtchn_upcall_pending flag? That seems overly complex though.

So I've resorted to doing what Xen itself does: just poll the flag on
every vmexit. Patch is at the tip of my tree at 
https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
and below.

However, in the case of the in-kernel irqchip we might not even *get* a
vmexit all the way to userspace; can't a guest just get stuck in an
interrupt storm with it being handled entirely in-kernel? I might
decree that this works *only* with the split irqchip.

Then again, it'll work *nicely* in the kernel where the EOI
notification exists, so I might teach the kernel's evtchn code to
create those eventfd pairs like VFIO's, which can be hooked in as IRQFD
routing to the in-kernel {IOA,}PIC.

--------------------
From 15a91ff4833d07910abba1dec093e48580c2b4c4 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Subject: [PATCH] hw/xen: Support HVM_PARAM_CALLBACK_TYPE_GSI callback

The GSI callback (and later PCI_INTX) is a level triggered interrupt. It
is asserted when an event channel is delivered to vCPU0, and is supposed
to be cleared when the vcpu_info->evtchn_upcall_pending field for vCPU0
is cleared again.

Thankfully, Xen does *not* assert the GSI if the guest sets its own
evtchn_upcall_pending field; we only need to assert the GSI when we
have delivered an event for ourselves. So that's the easy part.

However, we *do* need to poll for the evtchn_upcall_pending flag being
cleared. In an ideal world we would poll that when the EOI happens on
the PIC/IOAPIC. That's how it works in the kernel with the VFIO eventfd
pairs — one is used to trigger the interrupt, and the other works in the
other direction to 'resample' on EOI, and trigger the first eventfd
again if the line is still active.

However, QEMU doesn't seem to do that. Even VFIO level interrupts seem
to be supported by temporarily unmapping the device's BARs from the
guest when an interrupt happens, then trapping *all* MMIO to the device
and sending the 'resample' event on *every* MMIO access until the IRQ
is cleared! Maybe in future we'll plumb the 'resample' concept through
QEMU's irq framework but for now we'll do what Xen itself does: just
check the flag on every vmexit if the upcall GSI is known to be
asserted.

This is barely tested; I did make it waggle IRQ1 up and down on *every*
delivery even through the vector method, and observed that it is indeed
visible to the guest (as lots of spurious keyboard interrupts). But if
the vector callback isn't in use, Linux guests won't actually use event
channels for anything except PV devices... which I haven't implemented
here yet, so it's hard to test delivery :)

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 hw/i386/kvm/xen_evtchn.c  | 88 +++++++++++++++++++++++++++++++++++----
 hw/i386/kvm/xen_evtchn.h  |  3 ++
 hw/i386/pc.c              | 10 ++++-
 include/sysemu/kvm_xen.h  |  2 +-
 target/i386/cpu.h         |  1 +
 target/i386/kvm/kvm.c     | 13 ++++++
 target/i386/kvm/xen-emu.c | 64 ++++++++++++++++++++--------
 target/i386/kvm/xen-emu.h |  1 +
 8 files changed, 154 insertions(+), 28 deletions(-)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 9292602c09..8ea8cf550e 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -24,6 +24,8 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/i386/x86.h"
+#include "hw/irq.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -102,6 +104,7 @@ struct XenEvtchnState {
     QemuMutex port_lock;
     uint32_t nr_ports;
     XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
+    qemu_irq gsis[GSI_NUM_PINS];
 };
 
 struct XenEvtchnState *xen_evtchn_singleton;
@@ -166,9 +169,29 @@ static const TypeInfo xen_evtchn_info = {
 void xen_evtchn_create(void)
 {
     XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN, -1, NULL));
+    int i;
+
     xen_evtchn_singleton = s;
 
     qemu_mutex_init(&s->port_lock);
+
+    for (i = 0; i < GSI_NUM_PINS; i++) {
+        sysbus_init_irq(SYS_BUS_DEVICE(s), &s->gsis[i]);
+    }
+}
+
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+    int i;
+
+    if (!s) {
+        return;
+    }
+
+    for (i = 0; i < GSI_NUM_PINS; i++) {
+        sysbus_connect_irq(SYS_BUS_DEVICE(s), i, system_gsis[i]);
+    }
 }
 
 static void xen_evtchn_register_types(void)
@@ -178,26 +201,75 @@ static void xen_evtchn_register_types(void)
 
 type_init(xen_evtchn_register_types)
 
-
 #define CALLBACK_VIA_TYPE_SHIFT       56
 
 int xen_evtchn_set_callback_param(uint64_t param)
 {
+    XenEvtchnState *s = xen_evtchn_singleton;
     int ret = -ENOSYS;
 
-    if (param >> CALLBACK_VIA_TYPE_SHIFT == HVM_PARAM_CALLBACK_TYPE_VECTOR) {
+    if (!s) {
+        return -ENOTSUP;
+    }
+
+    switch (param >> CALLBACK_VIA_TYPE_SHIFT) {
+    case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
         struct kvm_xen_hvm_attr xa = {
             .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
             .u.vector = (uint8_t)param,
         };
 
         ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
-        if (!ret && xen_evtchn_singleton)
-            xen_evtchn_singleton->callback_param = param;
+        break;
+    }
+    case HVM_PARAM_CALLBACK_TYPE_GSI:
+        ret = 0;
+        break;
     }
+
+    if (!ret) {
+        s->callback_param = param;
+    }
+
     return ret;
 }
 
+static void xen_evtchn_set_callback_level(XenEvtchnState *s, int level)
+{
+    uint32_t param = (uint32_t)s->callback_param;
+
+    switch (s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) {
+    case HVM_PARAM_CALLBACK_TYPE_GSI:
+        if (param < GSI_NUM_PINS) {
+            qemu_set_irq(s->gsis[param], level);
+        }
+        break;
+    }
+}
+
+static void inject_callback(XenEvtchnState *s, uint32_t vcpu)
+{
+    if (kvm_xen_inject_vcpu_callback_vector(vcpu, s->callback_param)) {
+        return;
+    }
+
+    /* GSI or PCI_INTX delivery is only for events on vCPU 0 */
+    if (vcpu) {
+        return;
+    }
+
+    xen_evtchn_set_callback_level(s, 1);
+}
+
+void xen_evtchn_deassert_callback(void)
+{
+    XenEvtchnState *s = xen_evtchn_singleton;
+
+    if (s) {
+        xen_evtchn_set_callback_level(s, 0);
+    }
+}
+
 static void deassign_kernel_port(evtchn_port_t port)
 {
     struct kvm_xen_hvm_attr ha;
@@ -359,7 +431,7 @@ static int do_unmask_port_lm(XenEvtchnState *s, evtchn_port_t port,
         return 0;
     }
 
-    kvm_xen_inject_vcpu_callback_vector(s->port_table[port].vcpu);
+    inject_callback(s, s->port_table[port].vcpu);
 
     return 0;
 }
@@ -413,7 +485,7 @@ static int do_unmask_port_compat(XenEvtchnState *s, evtchn_port_t port,
         return 0;
     }
 
-    kvm_xen_inject_vcpu_callback_vector(s->port_table[port].vcpu);
+    inject_callback(s, s->port_table[port].vcpu);
 
     return 0;
 }
@@ -481,7 +553,7 @@ static int do_set_port_lm(XenEvtchnState *s, evtchn_port_t port,
         return 0;
     }
 
-    kvm_xen_inject_vcpu_callback_vector(s->port_table[port].vcpu);
+    inject_callback(s, s->port_table[port].vcpu);
 
     return 0;
 }
@@ -524,7 +596,7 @@ static int do_set_port_compat(XenEvtchnState *s, evtchn_port_t port,
         return 0;
     }
 
-    kvm_xen_inject_vcpu_callback_vector(s->port_table[port].vcpu);
+    inject_callback(s, s->port_table[port].vcpu);
 
     return 0;
 }
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 2acbaeabaa..1176b67b91 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -9,9 +9,12 @@
  * See the COPYING file in the top-level directory.
  */
 
+#include "hw/sysbus.h"
 
 void xen_evtchn_create(void);
 int xen_evtchn_set_callback_param(uint64_t param);
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis);
+void xen_evtchn_deassert_callback(void);
 
 void hmp_xen_event_list(Monitor *mon, const QDict *qdict);
 void hmp_xen_event_inject(Monitor *mon, const QDict *qdict);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 427f79e6a8..1c4941de8f 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1303,6 +1303,12 @@ void pc_basic_device_init(struct PCMachineState *pcms,
     }
     *rtc_state = mc146818_rtc_init(isa_bus, 2000, rtc_irq);
 
+#ifdef CONFIG_XEN_EMU
+    if (xen_mode == XEN_EMULATE) {
+        xen_evtchn_connect_gsis(gsi);
+    }
+#endif
+
     qemu_register_boot_set(pc_boot_set, *rtc_state);
 
     if (!xen_enabled() &&
@@ -1848,8 +1854,8 @@ int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
 {
 #ifdef CONFIG_XEN_EMU
     if (xen_mode == XEN_EMULATE) {
-            xen_overlay_create();
-            xen_evtchn_create();
+        xen_overlay_create();
+        xen_evtchn_create();
     }
 #endif
     return 0;
diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index e5b14ffe8d..73fe5969b8 100644
--- a/include/sysemu/kvm_xen.h
+++ b/include/sysemu/kvm_xen.h
@@ -13,7 +13,7 @@
 #define QEMU_SYSEMU_KVM_XEN_H
 
 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id);
-void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id);
+bool kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, uint64_t callback_param);
 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port);
 
 #endif /* QEMU_SYSEMU_KVM_XEN_H */
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 846c738fd7..9330eb83fd 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1795,6 +1795,7 @@ typedef struct CPUArchState {
     uint64_t xen_vcpu_time_info_gpa;
     uint64_t xen_vcpu_runstate_gpa;
     uint8_t xen_vcpu_callback_vector;
+    bool xen_callback_asserted;
     uint16_t xen_virq[XEN_NR_VIRQS];
 #endif
 #if defined(CONFIG_HVF)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index cbf41d6f81..32d808da37 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5431,6 +5431,19 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
     char str[256];
     KVMState *state;
 
+#ifdef CONFIG_XEN_EMU
+    /*
+     * If the callback is asserted as a GSI (or PCI INTx) then check if
+     * vcpu_info->evtchn_upcall_pending has been cleared, and deassert
+     * the callback IRQ if so. Ideally we could hook into the PIC/IOAPIC
+     * EOI and only resample then, exactly how the VFIO eventfd pairs
+     * are designed to work for level triggered interrupts.
+     */
+    if (cpu->env.xen_callback_asserted) {
+        kvm_xen_maybe_deassert_callback(cs);
+    }
+#endif
+
     switch (run->exit_reason) {
     case KVM_EXIT_HLT:
         DPRINTF("handle_hlt\n");
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index a8c953e3ca..48ae47809a 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -240,18 +240,11 @@ static void *gpa_to_hva(uint64_t gpa)
                                              mrs.offset_within_region);
 }
 
-void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
+static void *vcpu_info_hva_from_cs(CPUState *cs)
 {
-    CPUState *cs = qemu_get_cpu(vcpu_id);
-    CPUX86State *env;
-    uint64_t gpa;
-
-    if (!cs) {
-        return NULL;
-    }
-    env = &X86_CPU(cs)->env;
+    CPUX86State *env = &X86_CPU(cs)->env;
+    uint64_t gpa = env->xen_vcpu_info_gpa;
 
-    gpa = env->xen_vcpu_info_gpa;
     if (gpa == UINT64_MAX)
         gpa = env->xen_vcpu_info_default_gpa;
     if (gpa == UINT64_MAX)
@@ -260,13 +253,38 @@ void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
     return gpa_to_hva(gpa);
 }
 
-void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id)
+void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
+{
+    CPUState *cs = qemu_get_cpu(vcpu_id);
+
+    if (!cs) {
+        return NULL;
+    }
+
+    return vcpu_info_hva_from_cs(cs);
+}
+
+void kvm_xen_maybe_deassert_callback(CPUState *cs)
+{
+    struct vcpu_info *vi = vcpu_info_hva_from_cs(cs);
+    if (!vi) {
+        return;
+    }
+
+    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
+    if (!vi->evtchn_upcall_pending) {
+        X86_CPU(cs)->env.xen_callback_asserted = false;
+        xen_evtchn_deassert_callback();
+    }
+}
+
+bool kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, uint64_t callback_param)
 {
     CPUState *cs = qemu_get_cpu(vcpu_id);
     uint8_t vector;
 
     if (!cs) {
-        return;
+        return false;
     }
     vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
 
@@ -278,13 +296,25 @@ void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id)
             .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
         };
         kvm_irqchip_send_msi(kvm_state, msg);
-        return;
+        return true;
     }
 
-    /* If the evtchn_upcall_pending field in the vcpu_info is set, then
-     * KVM will automatically deliver the vector on entering the vCPU
-     * so all we have to do is kick it out. */
-    qemu_cpu_kick(cs);
+    switch (callback_param >> 56) {
+    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
+            /* If the evtchn_upcall_pending field in the vcpu_info is set, then
+             * KVM will automatically deliver the vector on entering the vCPU
+             * so all we have to do is kick it out. */
+            qemu_cpu_kick(cs);
+            return true;
+
+    case HVM_PARAM_CALLBACK_TYPE_GSI:
+    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
+            if (vcpu_id == 0) {
+                X86_CPU(cs)->env.xen_callback_asserted = true;
+            }
+            return false;
+    }
+    return false;
 }
 
 static int kvm_xen_set_vcpu_timer(CPUState *cs)
diff --git a/target/i386/kvm/xen-emu.h b/target/i386/kvm/xen-emu.h
index 58e4748d80..0ff8bed350 100644
--- a/target/i386/kvm/xen-emu.h
+++ b/target/i386/kvm/xen-emu.h
@@ -27,5 +27,6 @@ int kvm_xen_init(KVMState *s, uint32_t hypercall_msr);
 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit);
 int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa);
 int kvm_xen_set_vcpu_callback_vector(CPUState *cs);
+void kvm_xen_maybe_deassert_callback(CPUState *cs);
 
 #endif /* QEMU_I386_KVM_XEN_EMU_H */
Durrant, Paul Dec. 20, 2022, 1:56 p.m. UTC | #5
On 15/12/2022 20:54, David Woodhouse wrote:
> On Mon, 2022-12-12 at 16:39 +0000, Paul Durrant wrote:
>> On 12/12/2022 16:26, David Woodhouse wrote:
>>> On Mon, 2022-12-12 at 16:16 +0000, Paul Durrant wrote:
>>>> On 09/12/2022 09:56, David Woodhouse wrote:
>>>>> From: Ankur Arora <ankur.a.arora@oracle.com>
>>>>> The HVM_PARAM_CALLBACK_IRQ parameter controls the system-wide event
>>>>> channel upcall method.  The vector support is handled by KVM internally,
>>>>> when the evtchn_upcall_pending field in the vcpu_info is set.
>>>>> The GSI and PCI_INTX delivery methods are not supported yet; those
>>>>> need to simulate a level-triggered event on the I/OAPIC.
>>>>
>>>> That's gonna be somewhat limiting if anyone runs a Windows guest with
>>>> upcall vector support turned off... which is an option at:
>>>>
>>>> https://xenbits.xen.org/gitweb/?p=pvdrivers/win/xenbus.git;a=blob;f=src/xenbus/evtchn.c;;hb=HEAD#l1928
>>>>
>>>
>>> Sure. And as you know, I also added the 'xen_no_vector_callback' option
>>> to the Linux command line to allow for that mode to be tested with
>>> Linux too:
>>> https://git.kernel.org/torvalds/c/b36b0fe96a
>>>
>>>
>>> The GSI and PCI_INTX modes will be added in time, but not yet.
>>
>> Ok, but maybe worth calling out the limitation in the commit comment for
>> those wishing to kick the tyres.
> 
> Hm... this isn't as simple in QEMU as I hoped.
> 
> The way I want to handle it is like the way that VFIO eventfd pairs
> work for level-triggered interrupts: the first eventfd is triggered on
> a rising edge, and the other one is a 'resampler' which is triggered on
> EOI, and causes the first one to be retriggered if the level is still
> actually high.
> 
> However... unlike the kernel and another VMM that you and I are
> familiar with, QEMU doesn't actually hook that up to the EOI in the
> APIC/IOAPIC at all.
> 
> Instead, when VFIO devices assert a level-triggered interrupt, QEMU
> *unmaps* that device's BARs from the guest so it can trap-and-emulate
> them, and each MMIO read or write will also trigger the resampler
> (whether that line is currently masked in the APIC or not).
> 
> I suppose we could try making the page containing the vcpu_info read-only,
> and trapping accesses so we spot when the guest clears its own
> ->evtchn_upcall_pending flag? That seems overly complex though.
> 
> So I've resorted to doing what Xen itself does: just poll the flag on
> every vmexit. Patch is at the tip of my tree at
> https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
> and below.
> 
> However, in the case of the in-kernel irqchip we might not even *get* a
> vmexit all the way to userspace; can't a guest just get stuck in an
> interrupt storm with it being handled entirely in-kernel? I might
> decree that this works *only* with the split irqchip.
> 
> Then again, it'll work *nicely* in the kernel where the EOI
> notification exists, so I might teach the kernel's evtchn code to
> create those eventfd pairs like VFIO's, which can be hooked in as IRQFD
> routing to the in-kernel {IOA,}PIC.

The callback via is essentially just another line interrupt but its 
assertion is closely coupled with the vcpu upcall_pending bit. Because 
that is being dealt with in-kernel then the via should be dealt with 
in-kernel, right?

   Paul
David Woodhouse Dec. 20, 2022, 4:27 p.m. UTC | #6
On 20 December 2022 13:56:39 GMT, Paul Durrant <xadimgnik@gmail.com> wrote:
>The callback via is essentially just another line interrupt but its assertion is closely coupled with the vcpu upcall_pending bit. Because that is being dealt with in-kernel then the via should be dealt with in-kernel, right?

Not right now, because there's not a lot of point in kernel acceleration in the case that it's delivered as GSI. There's no per-vCPU delivery, so it doesn't get used for IPI, for timers. None of the stuff we want to accelerate in-kernel. Only the actual PV drivers.

If we do FIFO event channels in the future then the kernel will probably need to own those; they need spinlocks and you can have *both* userspace and kernel delivering with test-and-set-bit sequences like you can with 2level.
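
For reference, that 2level delivery sequence is roughly what Xen's
evtchn_2l_set_pending() does, and either side can perform it safely:

    if (test_and_set_bit(port, shared_info->evtchn_pending))
        return;    /* already pending; nothing more to do */

    if (!test_bit(port, shared_info->evtchn_mask) &&
        !test_and_set_bit(port / BITS_PER_EVTCHN_WORD,
                          &vcpu_info->evtchn_pending_sel)) {
        vcpu_info->evtchn_upcall_pending = 1;
        /* ... and kick the vCPU to deliver the upcall */
    }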

Even so, I was tempted to add a VFIO-like eventfd pair for the vCPU0 evtchn_upcall_pending and let the kernel drive it... but qemu doesn't even do the EOI side of that as $DEITY intended, so I didn't.
Durrant, Paul Dec. 20, 2022, 5:25 p.m. UTC | #7
On 20/12/2022 16:27, David Woodhouse wrote:
> 
> 
> On 20 December 2022 13:56:39 GMT, Paul Durrant <xadimgnik@gmail.com> wrote:
>> The callback via is essentially just another line interrupt but its assertion is closely coupled with the vcpu upcall_pending bit. Because that is being dealt with in-kernel then the via should be dealt with in-kernel, right?
> 
> Not right now, because there's not a lot of point in kernel acceleration in the case that it's delivered as GSI. There's no per-vCPU delivery, so it doesn't get used for IPI, for timers. None of the stuff we want to accelerate in-kernel. Only the actual PV drivers.
> 
> If we do FIFO event channels in the future then the kernel will probably need to own those; they need spinlocks and you can have *both* userspace and kernel delivering with test-and-set-bit sequences like you can with 2level.
> 
> Even so, I was tempted to add a VFIO-like eventfd pair for the vCPU0 evtchn_upcall_pending and let the kernel drive it... but qemu doesn't even do the EOI side of that as $DEITY intended, so I didn't.

My point was that clearing upcall_pending is essentially the equivalent 
of EOI-at-device i.e. it's the thing that drops the line. So, without 
some form of interception, we need some way to check it at an 
appropriate time... and as you say, there may be no exit to VMM for EOI 
of the APIC. So when?

   Paul
David Woodhouse Dec. 20, 2022, 5:29 p.m. UTC | #8
On 20 December 2022 17:25:31 GMT, Paul Durrant <xadimgnik@gmail.com> wrote:
>On 20/12/2022 16:27, David Woodhouse wrote:
>> 
>> 
>> On 20 December 2022 13:56:39 GMT, Paul Durrant <xadimgnik@gmail.com> wrote:
>>> The callback via is essentially just another line interrupt but its assertion is closely coupled with the vcpu upcall_pending bit. Because that is being dealt with in-kernel then the via should be dealt with in-kernel, right?
>> 
>> Not right now, because there's not a lot of point in kernel acceleration in the case that it's delivered as GSI. There's no per-vCPU delivery, so it doesn't get used for IPI, for timers. None of the stuff we want to accelerate in-kernel. Only the actual PV drivers.
>> 
>> If we do FIFO event channels in the future then the kernel will probably need to own those; they need spinlocks and you can have *both* userspace and kernel delivering with test-and-set-bit sequences like you can with 2level.
>> 
>> Even so, I was tempted to add a VFIO-like eventfd pair for the vCPU0 evtchn_upcall_pending and let the kernel drive it... but qemu doesn't even do the EOI side of that as $DEITY intended, so I didn't.
>
>My point was that clearing upcall_pending is essentially the equivalent of EOI-at-device i.e. it's the thing that drops the line. So, without some form of interception, we need some way to check it at an appropriate time... and as you say, there may be no exit to VMM for EOI of the APIC. So when?

If we have an in-kernel APIC then it has a notifier callback on EOI and we can poll evtchn_upcall_pending then. That's another point in favour of handling it in kernel.

And for userspace APIC we *do* get an exit of course... but QEMU lacks that notifier mechanism for EOI of pseudo-level interrupts for that "should it still be pending?" check.
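
(In the kernel that hook is the kvm_irq_ack_notifier; roughly:

    struct kvm_irq_ack_notifier {
        struct hlist_node link;
        unsigned gsi;
        void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
    };

registered with kvm_register_irq_ack_notifier() and called back when
the guest EOIs the routed GSI.)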
David Woodhouse Dec. 21, 2022, 1:41 a.m. UTC | #9
On Mon, 2022-12-12 at 16:16 +0000, Paul Durrant wrote:
> 
> > @@ -287,24 +289,53 @@ static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit,
> >       return true;
> >   }
> >   
> > +static int handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
> > +                            uint64_t arg)
> > +{
> > +    CPUState *cs = CPU(cpu);
> > +    struct xen_hvm_param hp;
> > +    int err = 0;
> > +
> > +    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
> > +        err = -EFAULT;
> > +        goto out;
> > +    }
> > +
> > +    if (hp.domid != DOMID_SELF) {
> 
> Xen actually allows the domain's own id to be specified as well as the 
> magic DOMID_SELF.
> 
> > +        err = -EINVAL;
> 
> And this should be -ESRCH.
> 

Oops, fixed that after posting v4 series. Fixed in:

https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv

I fixed the similar -EPERM in evtchn_status_op() too.

> > +        goto out;
> > +    }
> > +
> > +    switch (hp.index) {
> > +    case HVM_PARAM_CALLBACK_IRQ:
> > +        err = xen_evtchn_set_callback_param(hp.value);
> > +        break;
> > +    default:
> > +        return false;
> > +    }
> > +
> > +out:
> > +    exit->u.hcall.result = err;
> 
> This is a bit on the ugly side, isn't it? Why not return the err and have
> kvm_xen_hcall_hvm_op() deal with passing it back?

Because 'return false' means qemu will whine about it being
unimplemented.
Durrant, Paul Dec. 21, 2022, 9:37 a.m. UTC | #10
On 21/12/2022 01:41, David Woodhouse wrote:
> On Mon, 2022-12-12 at 16:16 +0000, Paul Durrant wrote:
>>
>>> @@ -287,24 +289,53 @@ static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit,
>>>        return true;
>>>    }
>>>    
>>> +static int handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
>>> +                            uint64_t arg)
>>> +{
>>> +    CPUState *cs = CPU(cpu);
>>> +    struct xen_hvm_param hp;
>>> +    int err = 0;
>>> +
>>> +    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
>>> +        err = -EFAULT;
>>> +        goto out;
>>> +    }
>>> +
>>> +    if (hp.domid != DOMID_SELF) {
>>
>> Xen actually allows the domain's own id to be specified as well as the
>> magic DOMID_SELF.
>>
>>> +        err = -EINVAL;
>>
>> And this should be -ESRCH.
>>
> 
> Oops, fixed that after posting v4 series. Fixed in:
> 
> https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
> 
> I fixed the similar -EPERM in evtchn_status_op() too.
> 
>>> +        goto out;
>>> +    }
>>> +
>>> +    switch (hp.index) {
>>> +    case HVM_PARAM_CALLBACK_IRQ:
>>> +        err = xen_evtchn_set_callback_param(hp.value);
>>> +        break;
>>> +    default:
>>> +        return false;
>>> +    }
>>> +
>>> +out:
>>> +    exit->u.hcall.result = err;
>>
>> This is a bit on the ugly side, isn't it? Why not return the err and have
>> kvm_xen_hcall_hvm_op() deal with passing it back?
> 
> Because 'return false' means qemu will whine about it being
> unimplemented.
> 

Ah, ok. Yes, I did suggest turning that into a trace, which would mean 
that only those who cared would see such a whine.

   Paul
David Woodhouse Dec. 21, 2022, 12:16 p.m. UTC | #11
On 21 December 2022 09:37:36 GMT, Paul Durrant <xadimgnik@gmail.com> wrote:
>On 21/12/2022 01:41, David Woodhouse wrote:
>>>> +    exit->u.hcall.result = err;
>>> 
>>> This is a bit on the ugly side, isn't it? Why not return the err and have
>>> kvm_xen_hcall_hvm_op() deal with passing it back?
>> 
>> Because 'return false' means qemu will whine about it being
>> unimplemented.
>> 
>
>Ah, ok. Yes, I did suggest turning that into a trace, which would mean that only those who cared would see such a whine.

We have both a trace *and* the UNIMPL warning. Neither is enabled by default.
David Woodhouse Dec. 28, 2022, 10:45 a.m. UTC | #12
On Tue, 2022-12-20 at 17:29 +0000, David Woodhouse wrote:
> On 20 December 2022 17:25:31 GMT, Paul Durrant <xadimgnik@gmail.com> wrote:
> > On 20/12/2022 16:27, David Woodhouse wrote:
> > > 
> > > 
> > > On 20 December 2022 13:56:39 GMT, Paul Durrant
> > > <xadimgnik@gmail.com> wrote:
> > > > The callback via is essentially just another line interrupt but
> > > > its assertion is closely coupled with the vcpu upcall_pending
> > > > bit. Because that is being dealt with in-kernel then the via
> > > > should be dealt with in-kernel, right?
> > > 
> > > Not right now, because there's not a lot of point in kernel
> > > acceleration in the case that it's delivered as GSI. There's no
> > > per-vCPU delivery, so it doesn't get used for IPI, for timers.
> > > None of the stuff we want to accelerate in-kernel. Only the
> > > actual PV drivers.
> > > 
> > > If we do FIFO event channels in the future then the kernel will
> > > probably need to own those; they need spinlocks and you can have
> > > *both* userspace and kernel delivering with test-and-set-bit
> > > sequences like you can with 2level.
> > > 
> > > Even so, I was tempted to add a VFIO-like eventfd pair for the
> > > vCPU0 evtchn_upcall_pending and let the kernel drive it... but
> > > qemu doesn't even do the EOI side of that as $DEITY intended, so
> > > I didn't.
> > 
> > My point was that clearing upcall_pending is essentially the
> > equivalent of EOI-at-device i.e. it's the thing that drops the
> > line. So, without some form of interception, we need some way to
> > check it at an appropriate time... and as you say, there may be no
> > exit to VMM for EOI of the APIC. So when?
> 
> If we have an in-kernel APIC then it has a notifier callback on EOI
> and we can poll evtchn_upcall_pending then. That's another point in
> favour of handling it in kernel.
> 
> And for userspace APIC we *do* get an exit of course... but QEMU
> lacks that notifier mechanism for EOI of pseudo-level interrupts for
> that "should it still be pending?" check. 

So, I now have the qemu/backend side of event channels hooked up, and a
basic xenstore implementation that just returns ENOSYS to everything
for now. As before, at
https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv

I played with booting the guest with 'xen_no_vector_callback' on its
command line.

With kernel-irqchip=split (i.e. PIC/APIC in userspace) that seems to be
working OK. We *do* take that exit to qemu, and qemu spots that it
needs to deassert the GSI.

With the *kernel* PIC/APIC, we get a brief IRQ storm on the PCI INTX
every time:

[root@localhost ~]# grep xen /proc/interrupts 
 10:      80001          0   IO-APIC  10-fasteoi   xen-platform-pci
 24:          8          0   xen-dyn    -event     xenbus

In fact, unless CPU0 takes an exit, that storm might not be very brief
at all.

I *could* fix this up in the kernel via kvm_notify_acked_irq(), but I
suspect the better answer is just to declare that the Xen support (or
at least GSI evtchn delivery) requires split irqchip.
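
(For anyone kicking the tyres, that means running with something like

    qemu-system-x86_64 -accel kvm,xen-version=0x4000a,kernel-irqchip=split ...

where 'xen-version' is the accel property added earlier in this
series.)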

Patch

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 6165cbf019..cab64df339 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,6 +4,9 @@  i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
 i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
-i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen_overlay.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
+  'xen_overlay.c',
+  'xen_evtchn.c',
+  ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
new file mode 100644
index 0000000000..1ca0c034e7
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -0,0 +1,117 @@ 
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_evtchn.h"
+
+#include "sysemu/kvm.h"
+#include <linux/kvm.h>
+
+#include "standard-headers/xen/memory.h"
+#include "standard-headers/xen/hvm/params.h"
+
+#define TYPE_XEN_EVTCHN "xenevtchn"
+OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
+
+struct XenEvtchnState {
+    /*< private >*/
+    SysBusDevice busdev;
+    /*< public >*/
+
+    uint64_t callback_param;
+};
+
+struct XenEvtchnState *xen_evtchn_singleton;
+
+static int xen_evtchn_post_load(void *opaque, int version_id)
+{
+    XenEvtchnState *s = opaque;
+
+    if (s->callback_param) {
+        xen_evtchn_set_callback_param(s->callback_param);
+    }
+
+    return 0;
+}
+
+static bool xen_evtchn_is_needed(void *opaque)
+{
+    return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_evtchn_vmstate = {
+    .name = "xen_evtchn",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = xen_evtchn_is_needed,
+    .post_load = xen_evtchn_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(callback_param, XenEvtchnState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void xen_evtchn_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->vmsd = &xen_evtchn_vmstate;
+}
+
+static const TypeInfo xen_evtchn_info = {
+    .name          = TYPE_XEN_EVTCHN,
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(XenEvtchnState),
+    .class_init    = xen_evtchn_class_init,
+};
+
+void xen_evtchn_create(void)
+{
+    xen_evtchn_singleton = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN, -1, NULL));
+}
+
+static void xen_evtchn_register_types(void)
+{
+    type_register_static(&xen_evtchn_info);
+}
+
+type_init(xen_evtchn_register_types)
+
+
+#define CALLBACK_VIA_TYPE_SHIFT       56
+
+int xen_evtchn_set_callback_param(uint64_t param)
+{
+    int ret = -ENOSYS;
+
+    if (param >> CALLBACK_VIA_TYPE_SHIFT == HVM_PARAM_CALLBACK_TYPE_VECTOR) {
+        struct kvm_xen_hvm_attr xa = {
+            .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
+            .u.vector = (uint8_t)param,
+        };
+
+        ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+        if (!ret && xen_evtchn_singleton)
+            xen_evtchn_singleton->callback_param = param;
+    }
+    return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
new file mode 100644
index 0000000000..11c6ed22a0
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -0,0 +1,13 @@ 
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+void xen_evtchn_create(void);
+int xen_evtchn_set_callback_param(uint64_t param);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index c3c61eedde..18540084a0 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -60,6 +60,7 @@ 
 #endif
 #ifdef CONFIG_XEN_EMU
 #include "hw/i386/kvm/xen_overlay.h"
+#include "hw/i386/kvm/xen_evtchn.h"
 #endif
 #include "migration/global_state.h"
 #include "migration/misc.h"
@@ -417,6 +418,7 @@  static void pc_xen_hvm_init(MachineState *machine)
 #ifdef CONFIG_XEN_EMU
     if (xen_mode == XEN_EMULATE) {
             xen_overlay_create();
+            xen_evtchn_create();
     }
 #endif
 }
diff --git a/target/i386/xen.c b/target/i386/xen.c
index 2583c00a6b..1af336d9e5 100644
--- a/target/i386/xen.c
+++ b/target/i386/xen.c
@@ -16,6 +16,8 @@ 
 #include "xen.h"
 #include "trace.h"
 #include "hw/i386/kvm/xen_overlay.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+
 #include "standard-headers/xen/version.h"
 #include "standard-headers/xen/memory.h"
 #include "standard-headers/xen/hvm/hvm_op.h"
@@ -287,24 +289,53 @@  static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit,
     return true;
 }
 
+static int handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
+                            uint64_t arg)
+{
+    CPUState *cs = CPU(cpu);
+    struct xen_hvm_param hp;
+    int err = 0;
+
+    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
+        err = -EFAULT;
+        goto out;
+    }
+
+    if (hp.domid != DOMID_SELF) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    switch (hp.index) {
+    case HVM_PARAM_CALLBACK_IRQ:
+        err = xen_evtchn_set_callback_param(hp.value);
+        break;
+    default:
+        return false;
+    }
+
+out:
+    exit->u.hcall.result = err;
+    return true;
+}
+
 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                               X86CPU *cpu, uint64_t arg)
 {
-    struct xen_hvm_evtchn_upcall_vector *up;
+    struct xen_hvm_evtchn_upcall_vector up;
     CPUState *target_cs;
     int vector;
 
-    up = gva_to_hva(CPU(cpu), arg);
-    if (!up) {
+    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
         return -EFAULT;
     }
 
-    vector = up->vector;
+    vector = up.vector;
     if (vector < 0x10) {
         return -EINVAL;
     }
 
-    target_cs = qemu_get_cpu(up->vcpu);
+    target_cs = qemu_get_cpu(up.vcpu);
     if (!target_cs) {
         return -EINVAL;
     }
@@ -325,7 +356,8 @@  static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
     case HVMOP_pagetable_dying:
             ret = -ENOSYS;
             break;
-
+    case HVMOP_set_param:
+            return handle_set_param(exit, cpu, arg);
     default:
             return false;
     }