diff mbox

[6/6] xics-kvm: Support for in-kernel XICS interrupt controller

Message ID 1375777673-20274-7-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy Aug. 6, 2013, 8:27 a.m. UTC
From: David Gibson <david@gibson.dropbear.id.au>

Recent (host) kernels support emulating the PAPR defined "XICS" interrupt
controller system within KVM.  This patch allows qemu to initialize and
configure the in-kernel XICS, and keep its state in sync with qemu's XICS
state as necessary.

This should give considerable performance improvements.  e.g. on a simple
IPI ping-pong test between hardware threads, using qemu XICS gives us
around 5,000 irqs/second, whereas the in-kernel XICS gives us around
70,000 irqs/s on the same hardware configuration.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[Mike Qiu <qiudayu@linux.vnet.ibm.com>: fixed mistype which caused ics_set_kvm_state() to fail]
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 default-configs/ppc64-softmmu.mak |   1 +
 hw/intc/Makefile.objs             |   1 +
 hw/intc/xics_kvm.c                | 479 ++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr.c                    |  23 +-
 include/hw/ppc/xics.h             |  13 ++
 5 files changed, 515 insertions(+), 2 deletions(-)
 create mode 100644 hw/intc/xics_kvm.c

Comments

Andreas Färber Aug. 6, 2013, 10:12 a.m. UTC | #1
Am 06.08.2013 10:27, schrieb Alexey Kardashevskiy:
> From: David Gibson <david@gibson.dropbear.id.au>
> 
> Recent (host) kernels support emulating the PAPR defined "XICS" interrupt
> controller system within KVM.  This patch allows qemu to initialize and
> configure the in-kernel XICS, and keep its state in sync with qemu's XICS
> state as necessary.
> 
> This should give considerable performance improvements.  e.g. on a simple
> IPI ping-pong test between hardware threads, using qemu XICS gives us
> around 5,000 irqs/second, whereas the in-kernel XICS gives us around
> 70,000 irqs/s on the same hardware configuration.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> [Mike Qiu <qiudayu@linux.vnet.ibm.com>: fixed mistype which caused ics_set_kvm_state() to fail]
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  default-configs/ppc64-softmmu.mak |   1 +
>  hw/intc/Makefile.objs             |   1 +
>  hw/intc/xics_kvm.c                | 479 ++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr.c                    |  23 +-
>  include/hw/ppc/xics.h             |  13 ++
>  5 files changed, 515 insertions(+), 2 deletions(-)
>  create mode 100644 hw/intc/xics_kvm.c
> 
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 7831c2b..116f4ca 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -47,6 +47,7 @@ CONFIG_E500=y
>  CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
>  # For pSeries
>  CONFIG_XICS=$(CONFIG_PSERIES)
> +CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
>  # For PReP
>  CONFIG_I82378=y
>  CONFIG_I8259=y
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index 2851eed..47ac442 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -23,3 +23,4 @@ obj-$(CONFIG_OMAP) += omap_intc.o
>  obj-$(CONFIG_OPENPIC_KVM) += openpic_kvm.o
>  obj-$(CONFIG_SH4) += sh_intc.o
>  obj-$(CONFIG_XICS) += xics.o
> +obj-$(CONFIG_XICS_KVM) += xics_kvm.o
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> new file mode 100644
> index 0000000..d9313d4
> --- /dev/null
> +++ b/hw/intc/xics_kvm.c
> @@ -0,0 +1,479 @@
> +/*
> + * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
> + *
> + * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics, in-kernel emulation
> + *
> + * Copyright (c) 2013 David Gibson, IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + *
> + */
> +
> +#include "hw/hw.h"
> +#include "trace.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/xics.h"
> +#include "kvm_ppc.h"
> +#include "qemu/config-file.h"
> +#include "qemu/error-report.h"
> +
> +#include <sys/ioctl.h>
> +
> +typedef struct KVMXICSState {
> +    XICSState parent_obj;
> +
> +    uint32_t set_xive_token;
> +    uint32_t get_xive_token;
> +    uint32_t int_off_token;
> +    uint32_t int_on_token;
> +    int kernel_xics_fd;
> +} KVMXICSState;
> +
> +/*
> + * ICP-KVM
> + */
> +static void icp_get_kvm_state(ICPState *ss)
> +{
> +    uint64_t state;
> +    struct kvm_one_reg reg = {
> +        .id = KVM_REG_PPC_ICP_STATE,
> +        .addr = (uintptr_t)&state,
> +    };
> +    int ret;
> +
> +    /* ICP for this CPU thread is not in use, exiting */
> +    if (!ss->cs) {
> +        return;
> +    }
> +
> +    ret = kvm_vcpu_ioctl(ss->cs, KVM_GET_ONE_REG, &reg);
> +    if (ret != 0) {
> +        error_report("Unable to retrieve KVM interrupt controller state"
> +                " for CPU %d: %s", ss->cs->cpu_index, strerror(errno));
> +        exit(1);
> +    }
> +
> +    ss->xirr = state >> KVM_REG_PPC_ICP_XISR_SHIFT;
> +    ss->mfrr = (state >> KVM_REG_PPC_ICP_MFRR_SHIFT)
> +        & KVM_REG_PPC_ICP_MFRR_MASK;
> +    ss->pending_priority = (state >> KVM_REG_PPC_ICP_PPRI_SHIFT)
> +        & KVM_REG_PPC_ICP_PPRI_MASK;
> +}
> +
> +static int icp_set_kvm_state(ICPState *ss)
> +{
> +    uint64_t state;
> +    struct kvm_one_reg reg = {
> +        .id = KVM_REG_PPC_ICP_STATE,
> +        .addr = (uintptr_t)&state,
> +    };
> +    int ret;
> +
> +    /* ICP for this CPU thread is not in use, exiting */
> +    if (!ss->cs) {
> +        return 0;
> +    }
> +
> +    state = ((uint64_t)ss->xirr << KVM_REG_PPC_ICP_XISR_SHIFT)
> +        | ((uint64_t)ss->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT)
> +        | ((uint64_t)ss->pending_priority << KVM_REG_PPC_ICP_PPRI_SHIFT);
> +
> +    ret = kvm_vcpu_ioctl(ss->cs, KVM_SET_ONE_REG, &reg);
> +    if (ret != 0) {
> +        error_report("Unable to restore KVM interrupt controller state (0x%"
> +                PRIx64 ") for CPU %d: %s", state, ss->cs->cpu_index,
> +                strerror(errno));
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void icp_kvm_reset(DeviceState *dev)
> +{
> +    ICPState *icp = ICP(dev);
> +
> +    icp->xirr = 0;
> +    icp->pending_priority = 0xff;
> +    icp->mfrr = 0xff;
> +
> +    /* Make all outputs are deasserted */
> +    qemu_set_irq(icp->output, 0);
> +
> +    icp_set_kvm_state(icp);
> +}
> +
> +static void icp_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    ICPStateClass *icpc = ICP_CLASS(klass);
> +
> +    dc->reset = icp_kvm_reset;
> +    icpc->pre_save = icp_get_kvm_state;
> +    icpc->post_load = icp_set_kvm_state;
> +}
> +
> +static const TypeInfo icp_kvm_info = {
> +    .name = TYPE_ICP_KVM,
> +    .parent = TYPE_ICP,
> +    .instance_size = sizeof(ICPState),
> +    .class_init = icp_kvm_class_init,
> +    .class_size = sizeof(ICPStateClass),
> +};
> +
> +/*
> + * ICS-KVM
> + */
> +static void ics_get_kvm_state(ICSState *ics)
> +{
> +    KVMXICSState *icpkvm = KVM_XICS(ics->icp);
> +    uint64_t state;
> +    struct kvm_device_attr attr = {
> +        .flags = 0,
> +        .group = KVM_DEV_XICS_GRP_SOURCES,
> +        .addr = (uint64_t)(uintptr_t)&state,
> +    };
> +    int i;
> +
> +    for (i = 0; i < ics->nr_irqs; i++) {
> +        ICSIRQState *irq = &ics->irqs[i];
> +        int ret;
> +
> +        attr.attr = i + ics->offset;
> +
> +        ret = ioctl(icpkvm->kernel_xics_fd, KVM_GET_DEVICE_ATTR, &attr);
> +        if (ret != 0) {
> +            error_report("Unable to retrieve KVM interrupt controller state"
> +                    " for IRQ %d: %s", i + ics->offset, strerror(errno));
> +            exit(1);
> +        }
> +
> +        irq->server = state & KVM_XICS_DESTINATION_MASK;
> +        irq->saved_priority = (state >> KVM_XICS_PRIORITY_SHIFT)
> +            & KVM_XICS_PRIORITY_MASK;
> +        /*
> +         * To be consistent with the software emulation in xics.c, we
> +         * split out the masked state + priority that we get from the
> +         * kernel into 'current priority' (0xff if masked) and
> +         * 'saved priority' (if masked, this is the priority the
> +         * interrupt had before it was masked).  Masking and unmasking
> +         * are done with the ibm,int-off and ibm,int-on RTAS calls.
> +         */
> +        if (state & KVM_XICS_MASKED) {
> +            irq->priority = 0xff;
> +        } else {
> +            irq->priority = irq->saved_priority;
> +        }
> +
> +        if (state & KVM_XICS_PENDING) {
> +            if (state & KVM_XICS_LEVEL_SENSITIVE) {
> +                irq->status |= XICS_STATUS_ASSERTED;
> +            } else {
> +                /*
> +                 * A pending edge-triggered interrupt (or MSI)
> +                 * must have been rejected previously when we
> +                 * first detected it and tried to deliver it,
> +                 * so mark it as pending and previously rejected
> +                 * for consistency with how xics.c works.
> +                 */
> +                irq->status |= XICS_STATUS_MASKED_PENDING
> +                    | XICS_STATUS_REJECTED;
> +            }
> +        }
> +    }
> +}
> +
> +static int ics_set_kvm_state(ICSState *ics)
> +{
> +    KVMXICSState *icpkvm = KVM_XICS(ics->icp);
> +    uint64_t state;
> +    struct kvm_device_attr attr = {
> +        .flags = 0,
> +        .group = KVM_DEV_XICS_GRP_SOURCES,
> +        .addr = (uint64_t)(uintptr_t)&state,
> +    };
> +    int i;
> +
> +    for (i = 0; i < ics->nr_irqs; i++) {
> +        ICSIRQState *irq = &ics->irqs[i];
> +        int ret;
> +
> +        attr.attr = i + ics->offset;
> +
> +        state = irq->server;
> +        state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
> +            << KVM_XICS_PRIORITY_SHIFT;
> +        if (irq->priority != irq->saved_priority) {
> +            assert(irq->priority == 0xff);
> +            state |= KVM_XICS_MASKED;
> +        }
> +
> +        if (ics->islsi[i]) {
> +            state |= KVM_XICS_LEVEL_SENSITIVE;
> +            if (irq->status & XICS_STATUS_ASSERTED) {
> +                state |= KVM_XICS_PENDING;
> +            }
> +        } else {
> +            if (irq->status & XICS_STATUS_MASKED_PENDING) {
> +                state |= KVM_XICS_PENDING;
> +            }
> +        }
> +
> +        ret = ioctl(icpkvm->kernel_xics_fd, KVM_SET_DEVICE_ATTR, &attr);
> +        if (ret != 0) {
> +            error_report("Unable to restore KVM interrupt controller state"
> +                    " for IRQs %d: %s", i + ics->offset, strerror(errno));
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static void ics_kvm_set_irq(void *opaque, int srcno, int val)
> +{
> +    ICSState *ics = opaque;
> +    struct kvm_irq_level args;
> +    int rc;
> +
> +    args.irq = srcno + ics->offset;
> +    if (!ics->islsi[srcno]) {
> +        if (!val) {
> +            return;
> +        }
> +        args.level = KVM_INTERRUPT_SET;
> +    } else {
> +        args.level = val ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
> +    }
> +    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
> +    if (rc < 0) {
> +        perror("kvm_irq_line");
> +    }
> +}
> +
> +static void ics_kvm_reset(DeviceState *dev)
> +{
> +    ics_set_kvm_state(ICS(dev));
> +}
> +
> +static int ics_kvm_realize(DeviceState *dev)

This is still copying the realize error: It's an old-style initfn
disguised as realizefn.

> +{
> +    ICSState *ics = ICS(dev);
> +
> +    assert(ics->nr_irqs);

error_setg()

> +    ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
> +    ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool));
> +    ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs);
> +
> +    return 0;
> +}
> +
> +static void ics_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    ICSStateClass *icsc = ICS_CLASS(klass);
> +
> +    dc->init = ics_kvm_realize;
> +    dc->reset = ics_kvm_reset;
> +    icsc->pre_save = ics_get_kvm_state;
> +    icsc->post_load = ics_set_kvm_state;
> +}
> +
> +static const TypeInfo ics_kvm_info = {
> +    .name = TYPE_ICS_KVM,
> +    .parent = TYPE_ICS,
> +    .instance_size = sizeof(ICSState),
> +    .class_init = ics_kvm_class_init,
> +};
> +
> +/*
> + * XICS-KVM
> + */
> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
> +{
> +    CPUState *cs;
> +    ICPState *ss;
> +    KVMXICSState *icpkvm = KVM_XICS(icp);
> +    XICSStateClass *xics_info = XICS_GET_PARENT_CLASS(OBJECT(icp));
> +
> +    cs = CPU(cpu);
> +    ss = &icp->ss[cs->cpu_index];
> +
> +    assert(cs->cpu_index < icp->nr_servers);
> +    if (icpkvm->kernel_xics_fd == -1) {
> +        abort();
> +    }
> +
> +    if (icpkvm->kernel_xics_fd != -1) {
> +        int ret;
> +        struct kvm_enable_cap xics_enable_cap = {
> +            .cap = KVM_CAP_IRQ_XICS,
> +            .flags = 0,
> +            .args = {icpkvm->kernel_xics_fd, cs->cpu_index, 0, 0},
> +        };
> +
> +        ss->cs = cs;
> +
> +        ret = kvm_vcpu_ioctl(ss->cs, KVM_ENABLE_CAP, &xics_enable_cap);
> +        if (ret < 0) {
> +            error_report("Unable to connect CPU%d to kernel XICS: %s\n",
> +                    cs->cpu_index, strerror(errno));
> +            exit(1);
> +        }
> +    }
> +
> +    /* Call emulated XICS implementation for consistency */
> +    assert(xics_info && xics_info->cpu_setup);
> +    xics_info->cpu_setup(icp, cpu);
> +}
> +
> +void xics_kvm_set_nr_irqs(XICSState *icp, uint32_t nr_irqs)
> +{
> +    icp->ics = ICS(object_new(TYPE_ICS_KVM));
> +    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
> +    icp->ics->icp = icp;

instance_init + object_initialize()?

> +    icp->nr_irqs = icp->ics->nr_irqs = nr_irqs;
> +}
> +
> +void xics_kvm_set_nr_servers(XICSState *icp, uint32_t nr_servers)
> +{
> +    int i;
> +
> +    icp->nr_servers = nr_servers;
> +
> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
> +    for (i = 0; i < icp->nr_servers; i++) {
> +        char buffer[32];
> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
> +    }
> +}
> +
> +static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> +                       uint32_t token,
> +                       uint32_t nargs, target_ulong args,
> +                       uint32_t nret, target_ulong rets)
> +{
> +    error_report("pseries: %s must never be called for in-kernel XICS\n",
> +            __func__);
> +}
> +
> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
> +{
> +    KVMXICSState *icpkvm = KVM_XICS(dev);
> +    XICSState *icp = XICS_COMMON(dev);
> +    int i, rc;
> +    Error *error = NULL;
> +    struct kvm_create_device xics_create_device = {
> +        .type = KVM_DEV_TYPE_XICS,
> +        .flags = 0,
> +    };
> +
> +    assert(kvm_enabled());
> +    assert(kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS));

error_setg() - if device can be created without accel=kvm (which it
looks as if it can) then you should just error out the nice way here.

> +
> +    icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy);
> +    icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy);
> +    icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy);
> +    icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy);
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->set_xive_token, "ibm,set-xive");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,set-xive");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->get_xive_token, "ibm,get-xive");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,get-xive");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->int_on_token, "ibm,int-on");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-on");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->int_off_token, "ibm,int-off");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-off");
> +        goto fail;
> +    }
> +
> +    /* Create the kernel ICP */
> +    rc = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &xics_create_device);
> +    if (rc < 0) {
> +        error_setg_errno(errp, -rc, "Error on KVM_CREATE_DEVICE for XICS");
> +        goto fail;
> +    }
> +
> +    icpkvm->kernel_xics_fd = xics_create_device.fd;
> +
> +    object_property_set_bool(OBJECT(icp->ics), true, "realized", &error);
> +    if (error) {
> +        error_propagate(errp, error);
> +            goto fail;

Indentation?

> +    }
> +
> +    assert(icp->nr_servers);
> +    for (i = 0; i < icp->nr_servers; i++) {
> +        object_property_set_bool(OBJECT(&icp->ss[i]), true, "realized", &error);
> +        if (error) {
> +            error_propagate(errp, error);
> +            goto fail;
> +        }
> +    }
> +    return;
> +
> +fail:
> +    kvmppc_define_rtas_token(0, "ibm,set-xive");
> +    kvmppc_define_rtas_token(0, "ibm,get-xive");
> +    kvmppc_define_rtas_token(0, "ibm,int-on");
> +    kvmppc_define_rtas_token(0, "ibm,int-off");
> +    return;

This return is not needed.

> +}
> +
> +static void xics_kvm_class_init(ObjectClass *oc, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(oc);
> +    XICSStateClass *xsc = XICS_COMMON_CLASS(oc);
> +
> +    dc->realize = xics_kvm_realize;
> +    xsc->cpu_setup = xics_kvm_cpu_setup;
> +    xsc->set_nr_irqs = xics_kvm_set_nr_irqs;
> +    xsc->set_nr_servers = xics_kvm_set_nr_servers;
> +}
> +
> +static const TypeInfo xics_kvm_info = {
> +    .name          = TYPE_KVM_XICS,
> +    .parent        = TYPE_XICS_COMMON,
> +    .instance_size = sizeof(KVMXICSState),
> +    .class_init    = xics_kvm_class_init,
> +};
> +
> +static void xics_kvm_register_types(void)
> +{
> +    type_register_static(&xics_kvm_info);
> +    type_register_static(&ics_kvm_info);
> +    type_register_static(&icp_kvm_info);
> +}
> +
> +type_init(xics_kvm_register_types)
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 432f0d2..8dc8565 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -141,14 +141,33 @@ static XICSState *try_create_xics(const char *type, int nr_servers,
>          return NULL;
>      }
>  
> -    return XICS(dev);
> +    return XICS_COMMON(dev);
>  }
>  
>  static XICSState *xics_system_init(int nr_servers, int nr_irqs)
>  {
>      XICSState *icp = NULL;
>  
> -    icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
> +    if (kvm_enabled()) {
> +        QemuOpts *machine_opts = qemu_get_machine_opts();
> +        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
> +                                                "kernel_irqchip", true);
> +        bool irqchip_required = qemu_opt_get_bool(machine_opts,
> +                                                  "kernel_irqchip", false);
> +        if (irqchip_allowed) {
> +            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
> +        }
> +
> +        if (irqchip_required && !icp) {
> +            perror("Failed to create in-kernel XICS\n");
> +            abort();
> +        }
> +    }
> +
> +    if (!icp) {
> +        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
> +    }
> +
>      if (!icp) {
>          perror("Failed to create XICS\n");
>          abort();
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index 1066f69..e5889e9 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -35,6 +35,9 @@
>  #define TYPE_XICS "xics"
>  #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
>  
> +#define TYPE_KVM_XICS "xics-kvm"
> +#define KVM_XICS(obj) OBJECT_CHECK(KVMXICSState, (obj), TYPE_KVM_XICS)
> +
>  #define XICS_COMMON_CLASS(klass) \
>       OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS_COMMON)
>  #define XICS_CLASS(klass) \
> @@ -44,6 +47,9 @@
>  #define XICS_GET_CLASS(obj) \
>       OBJECT_GET_CLASS(XICSStateClass, (obj), TYPE_XICS)
>  
> +#define XICS_GET_PARENT_CLASS(obj) \
> +    (XICSStateClass *) object_class_get_parent(object_get_class(obj))

This is dangerous in that it takes the parent class of whatever type the
obj argument turns out to have. This has been discussed in lengths in
the context of Peter C.'s proposal for ISA/ARM realize cleanup.

This should therefore be a macro per type specifying the TYPE_*, either
for object_class_by_name() or to my proposed macro which abstracts the
implementation to core QOM code.

XICS_CLASS() would be nicer than open-coding, but why cast here?
DeviceClass can be needed just as well.

Also please check error_report() indentation and that they don't contain
trailing \n.

Regards,
Andreas

> +
>  #define XICS_IPI        0x2
>  #define XICS_BUID       0x1
>  #define XICS_IRQ_BASE   (XICS_BUID << 12)
> @@ -82,6 +88,9 @@ struct XICSState {
>  #define TYPE_ICP "icp"
>  #define ICP(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP)
>  
> +#define TYPE_ICP_KVM "icp-kvm"
> +#define ICP_KVM(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP_KVM)
> +
>  #define ICP_CLASS(klass) \
>       OBJECT_CLASS_CHECK(ICPStateClass, (klass), TYPE_ICP)
>  #define ICP_GET_CLASS(obj) \
> @@ -98,6 +107,7 @@ struct ICPState {
>      /*< private >*/
>      DeviceState parent_obj;
>      /*< public >*/
> +    CPUState *cs;
>      uint32_t xirr;
>      uint8_t pending_priority;
>      uint8_t mfrr;
> @@ -107,6 +117,9 @@ struct ICPState {
>  #define TYPE_ICS "ics"
>  #define ICS(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS)
>  
> +#define TYPE_ICS_KVM "icskvm"
> +#define ICS_KVM(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_KVM)
> +
>  #define ICS_CLASS(klass) \
>       OBJECT_CLASS_CHECK(ICSStateClass, (klass), TYPE_ICS)
>  #define ICS_GET_CLASS(obj) \
>
Alexey Kardashevskiy Aug. 6, 2013, 12:06 p.m. UTC | #2
On 08/06/2013 08:12 PM, Andreas Färber wrote:

>> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
>> index 1066f69..e5889e9 100644
>> --- a/include/hw/ppc/xics.h
>> +++ b/include/hw/ppc/xics.h
>> @@ -35,6 +35,9 @@
>>  #define TYPE_XICS "xics"
>>  #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
>>  
>> +#define TYPE_KVM_XICS "xics-kvm"
>> +#define KVM_XICS(obj) OBJECT_CHECK(KVMXICSState, (obj), TYPE_KVM_XICS)
>> +
>>  #define XICS_COMMON_CLASS(klass) \
>>       OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS_COMMON)
>>  #define XICS_CLASS(klass) \
>> @@ -44,6 +47,9 @@
>>  #define XICS_GET_CLASS(obj) \
>>       OBJECT_GET_CLASS(XICSStateClass, (obj), TYPE_XICS)
>>  
>> +#define XICS_GET_PARENT_CLASS(obj) \
>> +    (XICSStateClass *) object_class_get_parent(object_get_class(obj))
> 
> This is dangerous in that it takes the parent class of whatever type the
> obj argument turns out to have. This has been discussed in lengths in
> the context of Peter C.'s proposal for ISA/ARM realize cleanup.

How is it dangerous? I perfectly know what I call it with. And simple run
will crash QEMU if something is wrong.

> This should therefore be a macro per type specifying the TYPE_*, either
> for object_class_by_name() or to my proposed macro which abstracts the
> implementation to core QOM code.

Please, be more specific. What type should be used in macro here -
XICS_COMMON or KVM_XICS? I asked already in the other thread but somehow
you missed it. Neither really makes sense for me as I believe that
get_parent is exactly for the cases (and this is the case) when I do not
want to know exact parent type and I already know the current type. Thanks.

> XICS_CLASS() would be nicer than open-coding, but why cast here?
> DeviceClass can be needed just as well.

I do not need DeviceClass, at all, anywhere. I need a pointer to a my
specific cpu_setup callback, this is all about it.
Andreas Färber Aug. 6, 2013, 3:10 p.m. UTC | #3
Am 06.08.2013 14:06, schrieb Alexey Kardashevskiy:
> On 08/06/2013 08:12 PM, Andreas Färber wrote:
>>> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
>>> index 1066f69..e5889e9 100644
>>> --- a/include/hw/ppc/xics.h
>>> +++ b/include/hw/ppc/xics.h
>>> @@ -35,6 +35,9 @@
>>>  #define TYPE_XICS "xics"
>>>  #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
>>>  
>>> +#define TYPE_KVM_XICS "xics-kvm"
>>> +#define KVM_XICS(obj) OBJECT_CHECK(KVMXICSState, (obj), TYPE_KVM_XICS)
>>> +
>>>  #define XICS_COMMON_CLASS(klass) \
>>>       OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS_COMMON)
>>>  #define XICS_CLASS(klass) \
>>> @@ -44,6 +47,9 @@
>>>  #define XICS_GET_CLASS(obj) \
>>>       OBJECT_GET_CLASS(XICSStateClass, (obj), TYPE_XICS)
>>>  
>>> +#define XICS_GET_PARENT_CLASS(obj) \
>>> +    (XICSStateClass *) object_class_get_parent(object_get_class(obj))
>>
>> This is dangerous in that it takes the parent class of whatever type the
>> obj argument turns out to have. This has been discussed in lengths in
>> the context of Peter C.'s proposal for ISA/ARM realize cleanup.
> 
> How is it dangerous? I perfectly know what I call it with. And simple run
> will crash QEMU if something is wrong.

I did not say it was wrong or an error, I said it's dangerous. The
reason is that unlike C++ we do not have VTables in QOM:
KVM_XICS(obj) === XICS(obj) === obj.
I.e., object_get_class() always returns the actual, most specific type
rather than a type corresponding to the variable you are using it through.

So if someone derives TYPE_MY_KVM_XICS from your TYPE_KVM_XICS then the
function will silently do something different than it was doing before.
Same if someone inserts TYPE_VIRTUAL_XICS between TYPE_XICS and
TYPE_KVM_XICS.

The thought of both my macro (and Peter C.'s last version) and my
comments here is to keep the type information in a central place and to
make, say, a kvm_xics_realize C function behave like a KVMXICS::realize
C++ function would.

>> This should therefore be a macro per type specifying the TYPE_*, either
>> for object_class_by_name() or to my proposed macro which abstracts the
>> implementation to core QOM code.
> 
> Please, be more specific. What type should be used in macro here -
> XICS_COMMON or KVM_XICS? I asked already in the other thread but somehow
> you missed it.

Answered it now! ;)

> Neither really makes sense for me as I believe that
> get_parent is exactly for the cases (and this is the case) when I do not
> want to know exact parent type and I already know the current type. Thanks.

Correct. You do not know, you are making implicit assumptions that I
would like to avoid for safety reasons in favor of explicit
parent-of-my-type. That allows both to derive further types and to
insert intermediate types when using object_class_get_parent() the way I do.

>> XICS_CLASS() would be nicer than open-coding, but why cast here?
>> DeviceClass can be needed just as well.
> 
> I do not need DeviceClass, at all, anywhere. I need a pointer to a my
> specific cpu_setup callback, this is all about it.

You might not do so now - you might if there is common stuff to do in
DeviceClass::realize().

But that is something I suggest to discuss on my macro patch instead, as
it is a general design question for our parent-related macros. I also
suggest to search the archives for Peter's last two proposals that got
some discussion of corner cases and intentions as well.

Andreas
Alexey Kardashevskiy Aug. 7, 2013, 7:03 a.m. UTC | #4
On 08/06/2013 08:12 PM, Andreas Färber wrote:
>> +    /* Call emulated XICS implementation for consistency */
>> +    assert(xics_info && xics_info->cpu_setup);
>> +    xics_info->cpu_setup(icp, cpu);
>> +}
>> +
>> +void xics_kvm_set_nr_irqs(XICSState *icp, uint32_t nr_irqs)
>> +{
>> +    icp->ics = ICS(object_new(TYPE_ICS_KVM));
>> +    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
>> +    icp->ics->icp = icp;
> 
> instance_init + object_initialize()?

Create ICS_KVM in instance_init of XICS_KVM and initialize it here? Why split?

> 
>> +    icp->nr_irqs = icp->ics->nr_irqs = nr_irqs;
>> +}
>> +
>> +void xics_kvm_set_nr_servers(XICSState *icp, uint32_t nr_servers)
>> +{
>> +    int i;
>> +
>> +    icp->nr_servers = nr_servers;
>> +
>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>> +    for (i = 0; i < icp->nr_servers; i++) {
>> +        char buffer[32];
>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>> +    }
>> +}
>> +
>> +static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>> +                       uint32_t token,
>> +                       uint32_t nargs, target_ulong args,
>> +                       uint32_t nret, target_ulong rets)
>> +{
>> +    error_report("pseries: %s must never be called for in-kernel XICS\n",
>> +            __func__);
>> +}
>> +
>> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
>> +{
>> +    KVMXICSState *icpkvm = KVM_XICS(dev);
>> +    XICSState *icp = XICS_COMMON(dev);
>> +    int i, rc;
>> +    Error *error = NULL;
>> +    struct kvm_create_device xics_create_device = {
>> +        .type = KVM_DEV_TYPE_XICS,
>> +        .flags = 0,
>> +    };
>> +
>> +    assert(kvm_enabled());
>> +    assert(kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS));
> 
> error_setg() - if device can be created without accel=kvm (which it
> looks as if it can) then you should just error out the nice way here.


I check kvm_enabled() (I thought I check both but was wrong, will fix it)
where I try to create XICS_KVM so we should not be here if that check failed.
Andreas Färber Aug. 7, 2013, 7:08 a.m. UTC | #5
Am 07.08.2013 09:03, schrieb Alexey Kardashevskiy:
> On 08/06/2013 08:12 PM, Andreas Färber wrote:
>>> +    /* Call emulated XICS implementation for consistency */
>>> +    assert(xics_info && xics_info->cpu_setup);
>>> +    xics_info->cpu_setup(icp, cpu);
>>> +}
>>> +
>>> +void xics_kvm_set_nr_irqs(XICSState *icp, uint32_t nr_irqs)
>>> +{
>>> +    icp->ics = ICS(object_new(TYPE_ICS_KVM));
>>> +    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
>>> +    icp->ics->icp = icp;
>>
>> instance_init + object_initialize()?
> 
> Create ICS_KVM in instance_init of XICS_KVM and initialize it here? Why split?

Answered on 5/6 (mail bounced at first due to kernel.crashing.org).

>>> +    icp->nr_irqs = icp->ics->nr_irqs = nr_irqs;
>>> +}
>>> +
>>> +void xics_kvm_set_nr_servers(XICSState *icp, uint32_t nr_servers)
>>> +{
>>> +    int i;
>>> +
>>> +    icp->nr_servers = nr_servers;
>>> +
>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>> +        char buffer[32];
>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>> +    }
>>> +}
[...]
>>> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
>>> +{
>>> +    KVMXICSState *icpkvm = KVM_XICS(dev);
>>> +    XICSState *icp = XICS_COMMON(dev);
>>> +    int i, rc;
>>> +    Error *error = NULL;
>>> +    struct kvm_create_device xics_create_device = {
>>> +        .type = KVM_DEV_TYPE_XICS,
>>> +        .flags = 0,
>>> +    };
>>> +
>>> +    assert(kvm_enabled());
>>> +    assert(kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS));
>>
>> error_setg() - if device can be created without accel=kvm (which it
>> looks as if it can) then you should just error out the nice way here.
> 
> 
> I check kvm_enabled() (I thought I check both but was wrong, will fix it)
> where I try to create XICS_KVM so we should not be here if that check failed.

-device, -object and patch Anthony's patch QMP are other ways to
instantiate the same type. :)

Andreas
Alexey Kardashevskiy Aug. 7, 2013, 7:31 a.m. UTC | #6
On 08/07/2013 05:08 PM, Andreas Färber wrote:
>>>> +    icp->nr_irqs = icp->ics->nr_irqs = nr_irqs;
>>>> +}
>>>> +
>>>> +void xics_kvm_set_nr_servers(XICSState *icp, uint32_t nr_servers)
>>>> +{
>>>> +    int i;
>>>> +
>>>> +    icp->nr_servers = nr_servers;
>>>> +
>>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>>> +        char buffer[32];
>>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>>> +    }
>>>> +}
> [...]
>>>> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
>>>> +{
>>>> +    KVMXICSState *icpkvm = KVM_XICS(dev);
>>>> +    XICSState *icp = XICS_COMMON(dev);
>>>> +    int i, rc;
>>>> +    Error *error = NULL;
>>>> +    struct kvm_create_device xics_create_device = {
>>>> +        .type = KVM_DEV_TYPE_XICS,
>>>> +        .flags = 0,
>>>> +    };
>>>> +
>>>> +    assert(kvm_enabled());
>>>> +    assert(kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS));
>>>
>>> error_setg() - if device can be created without accel=kvm (which it
>>> looks as if it can) then you should just error out the nice way here.
>>
>>
>> I check kvm_enabled() (I thought I check both but was wrong, will fix it)
>> where I try to create XICS_KVM so we should not be here if that check failed.
> 
> -device, -object and patch Anthony's patch QMP are other ways to
> instantiate the same type. :)

Hmm. Nice. I thought "-device xics,nr_irqs=1,nr_servers=2" won't work but
it does (well, fails, but lot further, in realize()) :)
diff mbox

Patch

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index 7831c2b..116f4ca 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -47,6 +47,7 @@  CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
 # For pSeries
 CONFIG_XICS=$(CONFIG_PSERIES)
+CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
 # For PReP
 CONFIG_I82378=y
 CONFIG_I8259=y
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index 2851eed..47ac442 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -23,3 +23,4 @@  obj-$(CONFIG_OMAP) += omap_intc.o
 obj-$(CONFIG_OPENPIC_KVM) += openpic_kvm.o
 obj-$(CONFIG_SH4) += sh_intc.o
 obj-$(CONFIG_XICS) += xics.o
+obj-$(CONFIG_XICS_KVM) += xics_kvm.o
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
new file mode 100644
index 0000000..d9313d4
--- /dev/null
+++ b/hw/intc/xics_kvm.c
@@ -0,0 +1,479 @@ 
+/*
+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
+ *
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics, in-kernel emulation
+ *
+ * Copyright (c) 2013 David Gibson, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+#include "hw/hw.h"
+#include "trace.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/xics.h"
+#include "kvm_ppc.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+
+#include <sys/ioctl.h>
+
+typedef struct KVMXICSState {
+    XICSState parent_obj;
+
+    uint32_t set_xive_token;
+    uint32_t get_xive_token;
+    uint32_t int_off_token;
+    uint32_t int_on_token;
+    int kernel_xics_fd;
+} KVMXICSState;
+
+/*
+ * ICP-KVM
+ */
+static void icp_get_kvm_state(ICPState *ss)
+{
+    uint64_t state;
+    struct kvm_one_reg reg = {
+        .id = KVM_REG_PPC_ICP_STATE,
+        .addr = (uintptr_t)&state,
+    };
+    int ret;
+
+    /* ICP for this CPU thread is not in use, exiting */
+    if (!ss->cs) {
+        return;
+    }
+
+    ret = kvm_vcpu_ioctl(ss->cs, KVM_GET_ONE_REG, &reg);
+    if (ret != 0) {
+        error_report("Unable to retrieve KVM interrupt controller state"
+                " for CPU %d: %s", ss->cs->cpu_index, strerror(errno));
+        exit(1);
+    }
+
+    ss->xirr = state >> KVM_REG_PPC_ICP_XISR_SHIFT;
+    ss->mfrr = (state >> KVM_REG_PPC_ICP_MFRR_SHIFT)
+        & KVM_REG_PPC_ICP_MFRR_MASK;
+    ss->pending_priority = (state >> KVM_REG_PPC_ICP_PPRI_SHIFT)
+        & KVM_REG_PPC_ICP_PPRI_MASK;
+}
+
+static int icp_set_kvm_state(ICPState *ss)
+{
+    uint64_t state;
+    struct kvm_one_reg reg = {
+        .id = KVM_REG_PPC_ICP_STATE,
+        .addr = (uintptr_t)&state,
+    };
+    int ret;
+
+    /* ICP for this CPU thread is not in use, exiting */
+    if (!ss->cs) {
+        return 0;
+    }
+
+    state = ((uint64_t)ss->xirr << KVM_REG_PPC_ICP_XISR_SHIFT)
+        | ((uint64_t)ss->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT)
+        | ((uint64_t)ss->pending_priority << KVM_REG_PPC_ICP_PPRI_SHIFT);
+
+    ret = kvm_vcpu_ioctl(ss->cs, KVM_SET_ONE_REG, &reg);
+    if (ret != 0) {
+        error_report("Unable to restore KVM interrupt controller state (0x%"
+                PRIx64 ") for CPU %d: %s", state, ss->cs->cpu_index,
+                strerror(errno));
+        return ret;
+    }
+
+    return 0;
+}
+
+static void icp_kvm_reset(DeviceState *dev)
+{
+    ICPState *icp = ICP(dev);
+
+    icp->xirr = 0;
+    icp->pending_priority = 0xff;
+    icp->mfrr = 0xff;
+
+    /* Make all outputs are deasserted */
+    qemu_set_irq(icp->output, 0);
+
+    icp_set_kvm_state(icp);
+}
+
+static void icp_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    ICPStateClass *icpc = ICP_CLASS(klass);
+
+    dc->reset = icp_kvm_reset;
+    icpc->pre_save = icp_get_kvm_state;
+    icpc->post_load = icp_set_kvm_state;
+}
+
+static const TypeInfo icp_kvm_info = {
+    .name = TYPE_ICP_KVM,
+    .parent = TYPE_ICP,
+    .instance_size = sizeof(ICPState),
+    .class_init = icp_kvm_class_init,
+    .class_size = sizeof(ICPStateClass),
+};
+
+/*
+ * ICS-KVM
+ */
+static void ics_get_kvm_state(ICSState *ics)
+{
+    KVMXICSState *icpkvm = KVM_XICS(ics->icp);
+    uint64_t state;
+    struct kvm_device_attr attr = {
+        .flags = 0,
+        .group = KVM_DEV_XICS_GRP_SOURCES,
+        .addr = (uint64_t)(uintptr_t)&state,
+    };
+    int i;
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        ICSIRQState *irq = &ics->irqs[i];
+        int ret;
+
+        attr.attr = i + ics->offset;
+
+        ret = ioctl(icpkvm->kernel_xics_fd, KVM_GET_DEVICE_ATTR, &attr);
+        if (ret != 0) {
+            error_report("Unable to retrieve KVM interrupt controller state"
+                    " for IRQ %d: %s", i + ics->offset, strerror(errno));
+            exit(1);
+        }
+
+        irq->server = state & KVM_XICS_DESTINATION_MASK;
+        irq->saved_priority = (state >> KVM_XICS_PRIORITY_SHIFT)
+            & KVM_XICS_PRIORITY_MASK;
+        /*
+         * To be consistent with the software emulation in xics.c, we
+         * split out the masked state + priority that we get from the
+         * kernel into 'current priority' (0xff if masked) and
+         * 'saved priority' (if masked, this is the priority the
+         * interrupt had before it was masked).  Masking and unmasking
+         * are done with the ibm,int-off and ibm,int-on RTAS calls.
+         */
+        if (state & KVM_XICS_MASKED) {
+            irq->priority = 0xff;
+        } else {
+            irq->priority = irq->saved_priority;
+        }
+
+        if (state & KVM_XICS_PENDING) {
+            if (state & KVM_XICS_LEVEL_SENSITIVE) {
+                irq->status |= XICS_STATUS_ASSERTED;
+            } else {
+                /*
+                 * A pending edge-triggered interrupt (or MSI)
+                 * must have been rejected previously when we
+                 * first detected it and tried to deliver it,
+                 * so mark it as pending and previously rejected
+                 * for consistency with how xics.c works.
+                 */
+                irq->status |= XICS_STATUS_MASKED_PENDING
+                    | XICS_STATUS_REJECTED;
+            }
+        }
+    }
+}
+
+static int ics_set_kvm_state(ICSState *ics)
+{
+    KVMXICSState *icpkvm = KVM_XICS(ics->icp);
+    uint64_t state;
+    struct kvm_device_attr attr = {
+        .flags = 0,
+        .group = KVM_DEV_XICS_GRP_SOURCES,
+        .addr = (uint64_t)(uintptr_t)&state,
+    };
+    int i;
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        ICSIRQState *irq = &ics->irqs[i];
+        int ret;
+
+        attr.attr = i + ics->offset;
+
+        state = irq->server;
+        state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
+            << KVM_XICS_PRIORITY_SHIFT;
+        if (irq->priority != irq->saved_priority) {
+            assert(irq->priority == 0xff);
+            state |= KVM_XICS_MASKED;
+        }
+
+        if (ics->islsi[i]) {
+            state |= KVM_XICS_LEVEL_SENSITIVE;
+            if (irq->status & XICS_STATUS_ASSERTED) {
+                state |= KVM_XICS_PENDING;
+            }
+        } else {
+            if (irq->status & XICS_STATUS_MASKED_PENDING) {
+                state |= KVM_XICS_PENDING;
+            }
+        }
+
+        ret = ioctl(icpkvm->kernel_xics_fd, KVM_SET_DEVICE_ATTR, &attr);
+        if (ret != 0) {
+            error_report("Unable to restore KVM interrupt controller state"
+                    " for IRQs %d: %s", i + ics->offset, strerror(errno));
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static void ics_kvm_set_irq(void *opaque, int srcno, int val)
+{
+    ICSState *ics = opaque;
+    struct kvm_irq_level args;
+    int rc;
+
+    args.irq = srcno + ics->offset;
+    if (!ics->islsi[srcno]) {
+        if (!val) {
+            return;
+        }
+        args.level = KVM_INTERRUPT_SET;
+    } else {
+        args.level = val ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
+    }
+    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
+    if (rc < 0) {
+        perror("kvm_irq_line");
+    }
+}
+
+static void ics_kvm_reset(DeviceState *dev)
+{
+    ics_set_kvm_state(ICS(dev));
+}
+
+static int ics_kvm_realize(DeviceState *dev)
+{
+    ICSState *ics = ICS(dev);
+
+    assert(ics->nr_irqs);
+    ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
+    ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool));
+    ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs);
+
+    return 0;
+}
+
+static void ics_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    ICSStateClass *icsc = ICS_CLASS(klass);
+
+    dc->init = ics_kvm_realize;
+    dc->reset = ics_kvm_reset;
+    icsc->pre_save = ics_get_kvm_state;
+    icsc->post_load = ics_set_kvm_state;
+}
+
+static const TypeInfo ics_kvm_info = {
+    .name = TYPE_ICS_KVM,
+    .parent = TYPE_ICS,
+    .instance_size = sizeof(ICSState),
+    .class_init = ics_kvm_class_init,
+};
+
+/*
+ * XICS-KVM
+ */
+static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
+{
+    CPUState *cs;
+    ICPState *ss;
+    KVMXICSState *icpkvm = KVM_XICS(icp);
+    XICSStateClass *xics_info = XICS_GET_PARENT_CLASS(OBJECT(icp));
+
+    cs = CPU(cpu);
+    ss = &icp->ss[cs->cpu_index];
+
+    assert(cs->cpu_index < icp->nr_servers);
+    if (icpkvm->kernel_xics_fd == -1) {
+        abort();
+    }
+
+    if (icpkvm->kernel_xics_fd != -1) {
+        int ret;
+        struct kvm_enable_cap xics_enable_cap = {
+            .cap = KVM_CAP_IRQ_XICS,
+            .flags = 0,
+            .args = {icpkvm->kernel_xics_fd, cs->cpu_index, 0, 0},
+        };
+
+        ss->cs = cs;
+
+        ret = kvm_vcpu_ioctl(ss->cs, KVM_ENABLE_CAP, &xics_enable_cap);
+        if (ret < 0) {
+            error_report("Unable to connect CPU%d to kernel XICS: %s\n",
+                    cs->cpu_index, strerror(errno));
+            exit(1);
+        }
+    }
+
+    /* Call emulated XICS implementation for consistency */
+    assert(xics_info && xics_info->cpu_setup);
+    xics_info->cpu_setup(icp, cpu);
+}
+
+void xics_kvm_set_nr_irqs(XICSState *icp, uint32_t nr_irqs)
+{
+    icp->ics = ICS(object_new(TYPE_ICS_KVM));
+    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
+    icp->ics->icp = icp;
+    icp->nr_irqs = icp->ics->nr_irqs = nr_irqs;
+}
+
+void xics_kvm_set_nr_servers(XICSState *icp, uint32_t nr_servers)
+{
+    int i;
+
+    icp->nr_servers = nr_servers;
+
+    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
+    for (i = 0; i < icp->nr_servers; i++) {
+        char buffer[32];
+        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
+        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
+        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
+    }
+}
+
+static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
+                       uint32_t token,
+                       uint32_t nargs, target_ulong args,
+                       uint32_t nret, target_ulong rets)
+{
+    error_report("pseries: %s must never be called for in-kernel XICS\n",
+            __func__);
+}
+
+static void xics_kvm_realize(DeviceState *dev, Error **errp)
+{
+    KVMXICSState *icpkvm = KVM_XICS(dev);
+    XICSState *icp = XICS_COMMON(dev);
+    int i, rc;
+    Error *error = NULL;
+    struct kvm_create_device xics_create_device = {
+        .type = KVM_DEV_TYPE_XICS,
+        .flags = 0,
+    };
+
+    assert(kvm_enabled());
+    assert(kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS));
+
+    icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy);
+    icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy);
+    icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy);
+    icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy);
+
+    rc = kvmppc_define_rtas_token(icpkvm->set_xive_token, "ibm,set-xive");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,set-xive");
+        goto fail;
+    }
+
+    rc = kvmppc_define_rtas_token(icpkvm->get_xive_token, "ibm,get-xive");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,get-xive");
+        goto fail;
+    }
+
+    rc = kvmppc_define_rtas_token(icpkvm->int_on_token, "ibm,int-on");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-on");
+        goto fail;
+    }
+
+    rc = kvmppc_define_rtas_token(icpkvm->int_off_token, "ibm,int-off");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-off");
+        goto fail;
+    }
+
+    /* Create the kernel ICP */
+    rc = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &xics_create_device);
+    if (rc < 0) {
+        error_setg_errno(errp, -rc, "Error on KVM_CREATE_DEVICE for XICS");
+        goto fail;
+    }
+
+    icpkvm->kernel_xics_fd = xics_create_device.fd;
+
+    object_property_set_bool(OBJECT(icp->ics), true, "realized", &error);
+    if (error) {
+        error_propagate(errp, error);
+            goto fail;
+    }
+
+    assert(icp->nr_servers);
+    for (i = 0; i < icp->nr_servers; i++) {
+        object_property_set_bool(OBJECT(&icp->ss[i]), true, "realized", &error);
+        if (error) {
+            error_propagate(errp, error);
+            goto fail;
+        }
+    }
+    return;
+
+fail:
+    kvmppc_define_rtas_token(0, "ibm,set-xive");
+    kvmppc_define_rtas_token(0, "ibm,get-xive");
+    kvmppc_define_rtas_token(0, "ibm,int-on");
+    kvmppc_define_rtas_token(0, "ibm,int-off");
+    return;
+}
+
+static void xics_kvm_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+    XICSStateClass *xsc = XICS_COMMON_CLASS(oc);
+
+    dc->realize = xics_kvm_realize;
+    xsc->cpu_setup = xics_kvm_cpu_setup;
+    xsc->set_nr_irqs = xics_kvm_set_nr_irqs;
+    xsc->set_nr_servers = xics_kvm_set_nr_servers;
+}
+
+static const TypeInfo xics_kvm_info = {
+    .name          = TYPE_KVM_XICS,
+    .parent        = TYPE_XICS_COMMON,
+    .instance_size = sizeof(KVMXICSState),
+    .class_init    = xics_kvm_class_init,
+};
+
+static void xics_kvm_register_types(void)
+{
+    type_register_static(&xics_kvm_info);
+    type_register_static(&ics_kvm_info);
+    type_register_static(&icp_kvm_info);
+}
+
+type_init(xics_kvm_register_types)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 432f0d2..8dc8565 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -141,14 +141,33 @@  static XICSState *try_create_xics(const char *type, int nr_servers,
         return NULL;
     }
 
-    return XICS(dev);
+    return XICS_COMMON(dev);
 }
 
 static XICSState *xics_system_init(int nr_servers, int nr_irqs)
 {
     XICSState *icp = NULL;
 
-    icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
+    if (kvm_enabled()) {
+        QemuOpts *machine_opts = qemu_get_machine_opts();
+        bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
+                                                "kernel_irqchip", true);
+        bool irqchip_required = qemu_opt_get_bool(machine_opts,
+                                                  "kernel_irqchip", false);
+        if (irqchip_allowed) {
+            icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
+        }
+
+        if (irqchip_required && !icp) {
+            perror("Failed to create in-kernel XICS\n");
+            abort();
+        }
+    }
+
+    if (!icp) {
+        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
+    }
+
     if (!icp) {
         perror("Failed to create XICS\n");
         abort();
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index 1066f69..e5889e9 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -35,6 +35,9 @@ 
 #define TYPE_XICS "xics"
 #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
 
+#define TYPE_KVM_XICS "xics-kvm"
+#define KVM_XICS(obj) OBJECT_CHECK(KVMXICSState, (obj), TYPE_KVM_XICS)
+
 #define XICS_COMMON_CLASS(klass) \
      OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS_COMMON)
 #define XICS_CLASS(klass) \
@@ -44,6 +47,9 @@ 
 #define XICS_GET_CLASS(obj) \
      OBJECT_GET_CLASS(XICSStateClass, (obj), TYPE_XICS)
 
+#define XICS_GET_PARENT_CLASS(obj) \
+    (XICSStateClass *) object_class_get_parent(object_get_class(obj))
+
 #define XICS_IPI        0x2
 #define XICS_BUID       0x1
 #define XICS_IRQ_BASE   (XICS_BUID << 12)
@@ -82,6 +88,9 @@  struct XICSState {
 #define TYPE_ICP "icp"
 #define ICP(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP)
 
+#define TYPE_ICP_KVM "icp-kvm"
+#define ICP_KVM(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP_KVM)
+
 #define ICP_CLASS(klass) \
      OBJECT_CLASS_CHECK(ICPStateClass, (klass), TYPE_ICP)
 #define ICP_GET_CLASS(obj) \
@@ -98,6 +107,7 @@  struct ICPState {
     /*< private >*/
     DeviceState parent_obj;
     /*< public >*/
+    CPUState *cs;
     uint32_t xirr;
     uint8_t pending_priority;
     uint8_t mfrr;
@@ -107,6 +117,9 @@  struct ICPState {
 #define TYPE_ICS "ics"
 #define ICS(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS)
 
+#define TYPE_ICS_KVM "icskvm"
+#define ICS_KVM(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_KVM)
+
 #define ICS_CLASS(klass) \
      OBJECT_CLASS_CHECK(ICSStateClass, (klass), TYPE_ICS)
 #define ICS_GET_CLASS(obj) \