diff mbox

[4/4] xics: Support for in-kernel XICS interrupt controller

Message ID 1374043057-27208-5-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy July 17, 2013, 6:37 a.m. UTC
From: David Gibson <david@gibson.dropbear.id.au>

Recent (host) kernels support emulating the PAPR defined "XICS" interrupt
controller system within KVM.  This patch allows qemu to initialize and
configure the in-kernel XICS, and keep its state in sync with qemu's XICS
state as necessary.

This should give considerable performance improvements.  e.g. on a simple
IPI ping-pong test between hardware threads, using qemu XICS gives us
around 5,000 irqs/second, whereas the in-kernel XICS gives us around
70,000 irqs/s on the same hardware configuration.

[Mike Qiu <qiudayu@linux.vnet.ibm.com>: fixed mistype which caused ics_set_kvm_state() to fail]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[aik: moved to a separate device, reworked QOM]

---
Changes:
2013/07/17:
* QOM rework

2013/07/01
* fixed VMState names in order to support xics-kvm migration to xics and vice versa

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 default-configs/ppc64-softmmu.mak |   1 +
 hw/intc/Makefile.objs             |   1 +
 hw/intc/xics_kvm.c                | 469 ++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr.c                    |  26 ++-
 include/hw/ppc/xics.h             |  10 +
 5 files changed, 506 insertions(+), 1 deletion(-)
 create mode 100644 hw/intc/xics_kvm.c

Comments

Anthony Liguori July 31, 2013, 6:05 p.m. UTC | #1
Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> From: David Gibson <david@gibson.dropbear.id.au>
>
> Recent (host) kernels support emulating the PAPR defined "XICS" interrupt
> controller system within KVM.  This patch allows qemu to initialize and
> configure the in-kernel XICS, and keep its state in sync with qemu's XICS
> state as necessary.
>
> This should give considerable performance improvements.  e.g. on a simple
> IPI ping-pong test between hardware threads, using qemu XICS gives us
> around 5,000 irqs/second, whereas the in-kernel XICS gives us around
> 70,000 irqs/s on the same hardware configuration.
>
> [Mike Qiu <qiudayu@linux.vnet.ibm.com>: fixed mistype which caused ics_set_kvm_state() to fail]
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> [aik: moved to a separate device, reworked QOM]
>
> ---
> Changes:
> 2013/07/17:
> * QOM rework
>
> 2013/07/01
> * fixed VMState names in order to support xics-kvm migration to xics and vice versa
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  default-configs/ppc64-softmmu.mak |   1 +
>  hw/intc/Makefile.objs             |   1 +
>  hw/intc/xics_kvm.c                | 469 ++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr.c                    |  26 ++-
>  include/hw/ppc/xics.h             |  10 +
>  5 files changed, 506 insertions(+), 1 deletion(-)
>  create mode 100644 hw/intc/xics_kvm.c
>
> diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
> index 6d1933b..1f4550a 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -47,5 +47,6 @@ CONFIG_E500=y
>  CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
>  # For pSeries
>  CONFIG_XICS=$(CONFIG_PSERIES)
> +CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
>  # For PReP
>  CONFIG_MC146818RTC=y
> diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
> index 2851eed..47ac442 100644
> --- a/hw/intc/Makefile.objs
> +++ b/hw/intc/Makefile.objs
> @@ -23,3 +23,4 @@ obj-$(CONFIG_OMAP) += omap_intc.o
>  obj-$(CONFIG_OPENPIC_KVM) += openpic_kvm.o
>  obj-$(CONFIG_SH4) += sh_intc.o
>  obj-$(CONFIG_XICS) += xics.o
> +obj-$(CONFIG_XICS_KVM) += xics_kvm.o
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> new file mode 100644
> index 0000000..04ce1be
> --- /dev/null
> +++ b/hw/intc/xics_kvm.c
> @@ -0,0 +1,469 @@
> +/*
> + * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
> + *
> + * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics, in-kernel emulation
> + *
> + * Copyright (c) 2013 David Gibson, IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + *
> + */
> +
> +#include "hw/hw.h"
> +#include "trace.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/xics.h"
> +#include "kvm_ppc.h"
> +#include "qemu/config-file.h"
> +
> +#include <sys/ioctl.h>
> +
> +typedef struct XICSStateKVM {
> +    struct XICSState parent;

'struct' is not necessary here.

> +
> +    uint32_t set_xive_token;
> +    uint32_t get_xive_token;
> +    uint32_t int_off_token;
> +    uint32_t int_on_token;
> +    int kernel_xics_fd;
> +} XICSStateKVM;
> +
> +/*
> + * ICP-KVM
> + */
> +static void icp_get_kvm_state(ICPState *ss)
> +{
> +    uint64_t state;
> +    struct kvm_one_reg reg = {
> +        .id = KVM_REG_PPC_ICP_STATE,
> +        .addr = (uintptr_t)&state,
> +    };
> +    int ret;
> +
> +    if (!ss->cs) {
> +        return; /* kernel irqchip not in use */
> +    }

Should this be an assert?

> +
> +    ret = kvm_vcpu_ioctl(ss->cs, KVM_GET_ONE_REG, &reg);
> +    if (ret != 0) {
> +        fprintf(stderr, "Unable to retrieve KVM interrupt controller state"
> +                " for CPU %d: %s\n", ss->cs->cpu_index, strerror(errno));
> +        exit(1);
> +    }
> +
> +    ss->xirr = state >> KVM_REG_PPC_ICP_XISR_SHIFT;
> +    ss->mfrr = (state >> KVM_REG_PPC_ICP_MFRR_SHIFT)
> +        & KVM_REG_PPC_ICP_MFRR_MASK;
> +    ss->pending_priority = (state >> KVM_REG_PPC_ICP_PPRI_SHIFT)
> +        & KVM_REG_PPC_ICP_PPRI_MASK;
> +}
> +
> +static int icp_set_kvm_state(ICPState *ss)
> +{
> +    uint64_t state;
> +    struct kvm_one_reg reg = {
> +        .id = KVM_REG_PPC_ICP_STATE,
> +        .addr = (uintptr_t)&state,
> +    };
> +    int ret;
> +
> +    if (!ss->cs) {
> +        return 0; /* kernel irqchip not in use */
> +    }

assert?

Regards,

Anthony Liguori


> +
> +    state = ((uint64_t)ss->xirr << KVM_REG_PPC_ICP_XISR_SHIFT)
> +        | ((uint64_t)ss->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT)
> +        | ((uint64_t)ss->pending_priority << KVM_REG_PPC_ICP_PPRI_SHIFT);
> +
> +    ret = kvm_vcpu_ioctl(ss->cs, KVM_SET_ONE_REG, &reg);
> +    if (ret != 0) {
> +        fprintf(stderr, "Unable to restore KVM interrupt controller state (0x%"
> +                PRIx64 ") for CPU %d: %s\n", state, ss->cs->cpu_index,
> +                strerror(errno));
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void icp_kvm_reset(DeviceState *dev)
> +{
> +    ICPState *icp = ICP(dev);
> +
> +    icp->xirr = 0;
> +    icp->pending_priority = 0xff;
> +    icp->mfrr = 0xff;
> +
> +    /* Make all outputs are deasserted */
> +    qemu_set_irq(icp->output, 0);
> +
> +    icp_set_kvm_state(icp);
> +}
> +
> +static void icp_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    ICPStateClass *k = ICP_CLASS(klass);
> +
> +    dc->reset = icp_kvm_reset;
> +    k->pre_save = icp_get_kvm_state;
> +    k->post_load = icp_set_kvm_state;
> +}
> +
> +static TypeInfo icp_kvm_info = {
> +    .name = TYPE_ICP_KVM,
> +    .parent = TYPE_ICP,
> +    .instance_size = sizeof(ICPState),
> +    .class_init = icp_kvm_class_init,
> +    .class_size = sizeof(ICPStateClass),
> +};
> +
> +/*
> + * ICS-KVM
> + */
> +static void ics_get_kvm_state(ICSState *ics)
> +{
> +    XICSStateKVM *icpkvm = XICS_KVM(ics->icp);
> +    uint64_t state;
> +    struct kvm_device_attr attr = {
> +        .flags = 0,
> +        .group = KVM_DEV_XICS_GRP_SOURCES,
> +        .addr = (uint64_t)(uintptr_t)&state,
> +    };
> +    int i;
> +
> +    for (i = 0; i < ics->nr_irqs; i++) {
> +        ICSIRQState *irq = &ics->irqs[i];
> +        int ret;
> +
> +        attr.attr = i + ics->offset;
> +
> +        ret = ioctl(icpkvm->kernel_xics_fd, KVM_GET_DEVICE_ATTR, &attr);
> +        if (ret != 0) {
> +            fprintf(stderr, "Unable to retrieve KVM interrupt controller state"
> +                    " for IRQ %d: %s\n", i + ics->offset, strerror(errno));
> +            exit(1);
> +        }
> +
> +        irq->server = state & KVM_XICS_DESTINATION_MASK;
> +        irq->saved_priority = (state >> KVM_XICS_PRIORITY_SHIFT)
> +            & KVM_XICS_PRIORITY_MASK;
> +        /*
> +         * To be consistent with the software emulation in xics.c, we
> +         * split out the masked state + priority that we get from the
> +         * kernel into 'current priority' (0xff if masked) and
> +         * 'saved priority' (if masked, this is the priority the
> +         * interrupt had before it was masked).  Masking and unmasking
> +         * are done with the ibm,int-off and ibm,int-on RTAS calls.
> +         */
> +        if (state & KVM_XICS_MASKED) {
> +            irq->priority = 0xff;
> +        } else {
> +            irq->priority = irq->saved_priority;
> +        }
> +
> +        if (state & KVM_XICS_PENDING) {
> +            if (state & KVM_XICS_LEVEL_SENSITIVE) {
> +                irq->status |= XICS_STATUS_ASSERTED;
> +            } else {
> +                /*
> +                 * A pending edge-triggered interrupt (or MSI)
> +                 * must have been rejected previously when we
> +                 * first detected it and tried to deliver it,
> +                 * so mark it as pending and previously rejected
> +                 * for consistency with how xics.c works.
> +                 */
> +                irq->status |= XICS_STATUS_MASKED_PENDING
> +                    | XICS_STATUS_REJECTED;
> +            }
> +        }
> +    }
> +}
> +
> +static int ics_set_kvm_state(ICSState *ics)
> +{
> +    XICSStateKVM *icpkvm = XICS_KVM(ics->icp);
> +    uint64_t state;
> +    struct kvm_device_attr attr = {
> +        .flags = 0,
> +        .group = KVM_DEV_XICS_GRP_SOURCES,
> +        .addr = (uint64_t)(uintptr_t)&state,
> +    };
> +    int i;
> +
> +    for (i = 0; i < ics->nr_irqs; i++) {
> +        ICSIRQState *irq = &ics->irqs[i];
> +        int ret;
> +
> +        attr.attr = i + ics->offset;
> +
> +        state = irq->server;
> +        state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
> +            << KVM_XICS_PRIORITY_SHIFT;
> +        if (irq->priority != irq->saved_priority) {
> +            assert(irq->priority == 0xff);
> +            state |= KVM_XICS_MASKED;
> +        }
> +
> +        if (ics->islsi[i]) {
> +            state |= KVM_XICS_LEVEL_SENSITIVE;
> +            if (irq->status & XICS_STATUS_ASSERTED) {
> +                state |= KVM_XICS_PENDING;
> +            }
> +        } else {
> +            if (irq->status & XICS_STATUS_MASKED_PENDING) {
> +                state |= KVM_XICS_PENDING;
> +            }
> +        }
> +
> +        ret = ioctl(icpkvm->kernel_xics_fd, KVM_SET_DEVICE_ATTR, &attr);
> +        if (ret != 0) {
> +            fprintf(stderr, "Unable to restore KVM interrupt controller state"
> +                    " for IRQs %d: %s\n", i + ics->offset, strerror(errno));
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static void ics_kvm_set_irq(void *opaque, int srcno, int val)
> +{
> +    ICSState *ics = opaque;
> +    struct kvm_irq_level args;
> +    int rc;
> +
> +    args.irq = srcno + ics->offset;
> +    if (!ics->islsi[srcno]) {
> +        if (!val) {
> +            return;
> +        }
> +        args.level = KVM_INTERRUPT_SET;
> +    } else {
> +        args.level = val ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
> +    }
> +    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
> +    if (rc < 0) {
> +        perror("kvm_irq_line");
> +    }
> +}
> +
> +static void ics_kvm_reset(DeviceState *dev)
> +{
> +    ics_set_kvm_state(ICS(dev));
> +}
> +
> +static int ics_kvm_realize(DeviceState *dev)
> +{
> +    ICSState *ics = ICS(dev);
> +
> +    ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
> +    ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool));
> +    ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs);
> +
> +    return 0;
> +}
> +
> +static void ics_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    ICSStateClass *k = ICS_CLASS(klass);
> +
> +    dc->init = ics_kvm_realize;
> +    dc->reset = ics_kvm_reset;
> +    k->pre_save = ics_get_kvm_state;
> +    k->post_load = ics_set_kvm_state;
> +}
> +
> +static TypeInfo ics_kvm_info = {
> +    .name = TYPE_ICS_KVM,
> +    .parent = TYPE_ICS,
> +    .instance_size = sizeof(ICSState),
> +    .class_init = ics_kvm_class_init,
> +};
> +
> +/*
> + * XICS-KVM
> + */
> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
> +{
> +    CPUState *cs;
> +    ICPState *ss;
> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
> +            OBJECT(icp), TYPE_XICS_KVM);
> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
> +
> +    if (!icpkvm) {
> +        return;
> +    }
> +
> +    cs = CPU(cpu);
> +    ss = &icp->ss[cs->cpu_index];
> +
> +    assert(cs->cpu_index < icp->nr_servers);
> +    if (icpkvm->kernel_xics_fd == -1) {
> +        abort();
> +    }
> +
> +    if (icpkvm->kernel_xics_fd != -1) {
> +        int ret;
> +        struct kvm_enable_cap xics_enable_cap = {
> +            .cap = KVM_CAP_IRQ_XICS,
> +            .flags = 0,
> +            .args = {icpkvm->kernel_xics_fd, cs->cpu_index, 0, 0},
> +        };
> +
> +        ss->cs = cs;
> +
> +        ret = kvm_vcpu_ioctl(ss->cs, KVM_ENABLE_CAP, &xics_enable_cap);
> +        if (ret < 0) {
> +            fprintf(stderr, "Unable to connect CPU%d to kernel XICS: %s\n",
> +                    cs->cpu_index, strerror(errno));
> +            exit(1);
> +        }
> +    }
> +
> +    /* Call emulated XICS implementation for consistency */
> +    assert(xics_info);
> +    xics_info->cpu_setup(icp, cpu);
> +}
> +
> +static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> +                       uint32_t token,
> +                       uint32_t nargs, target_ulong args,
> +                       uint32_t nret, target_ulong rets)
> +{
> +    fprintf(stderr, "pseries: %s must never be called for in-kernel XICS\n",
> +            __func__);
> +}
> +
> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
> +{
> +    XICSStateKVM *icpkvm = XICS_KVM(dev);
> +    XICSState *icp = XICS(dev);
> +    ICSState *ics;
> +    QemuOptsList *list = qemu_find_opts("machine");
> +    int i, rc;
> +    struct kvm_create_device xics_create_device = {
> +        .type = KVM_DEV_TYPE_XICS,
> +        .flags = 0,
> +    };
> +
> +    if (!kvm_enabled()) {
> +        error_setg(errp, "KVM must be enabled for in-kernel XICS");
> +        goto fail;
> +    }
> +
> +    if (QTAILQ_EMPTY(&list->head) ||
> +        !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
> +                           "kernel_irqchip", true) ||
> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> +        error_setg(errp, "KVM must be enabled for in-kernel XICS");
> +        return;
> +    }
> +
> +    icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy);
> +    icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy);
> +    icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy);
> +    icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy);
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->set_xive_token, "ibm,set-xive");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,set-xive");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->get_xive_token, "ibm,get-xive");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,get-xive");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->int_on_token, "ibm,int-on");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-on");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->int_off_token, "ibm,int-off");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-off");
> +        goto fail;
> +    }
> +
> +    /* Create the kernel ICP */
> +    rc = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &xics_create_device);
> +    if (rc < 0) {
> +        error_setg_errno(errp, -rc, "Error on KVM_CREATE_DEVICE for XICS");
> +        goto fail;
> +    }
> +
> +    icpkvm->kernel_xics_fd = xics_create_device.fd;
> +
> +    icp->ics = ICS(object_new(TYPE_ICS_KVM));
> +    ics = icp->ics;
> +    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
> +
> +    ics->nr_irqs = icp->nr_irqs;
> +    ics->offset = XICS_IRQ_BASE;
> +    ics->icp = icp;
> +    qdev_init_nofail(DEVICE(ics));
> +
> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
> +    for (i = 0; i < icp->nr_servers; i++) {
> +        char buffer[32];
> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
> +    }
> +    return;
> +
> +fail:
> +    kvmppc_define_rtas_token(0, "ibm,set-xive");
> +    kvmppc_define_rtas_token(0, "ibm,get-xive");
> +    kvmppc_define_rtas_token(0, "ibm,int-on");
> +    kvmppc_define_rtas_token(0, "ibm,int-off");
> +    return;
> +}
> +
> +static void xics_kvm_class_init(ObjectClass *oc, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(oc);
> +    XICSStateClass *k = XICS_CLASS(oc);
> +
> +    dc->realize = xics_kvm_realize;
> +    k->cpu_setup = xics_kvm_cpu_setup;
> +}
> +
> +static const TypeInfo xics_kvm_info = {
> +    .name          = TYPE_XICS_KVM,
> +    .parent        = TYPE_XICS,
> +    .instance_size = sizeof(XICSStateKVM),
> +    .class_init    = xics_kvm_class_init,
> +};
> +
> +static void xics_kvm_register_types(void)
> +{
> +    type_register_static(&xics_kvm_info);
> +    type_register_static(&ics_kvm_info);
> +    type_register_static(&icp_kvm_info);
> +}
> +
> +type_init(xics_kvm_register_types)
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 432f0d2..84433ee 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -148,7 +148,31 @@ static XICSState *xics_system_init(int nr_servers, int nr_irqs)
>  {
>      XICSState *icp = NULL;
>  
> -    icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
> +    if (kvm_enabled()) {
> +        bool irqchip_allowed = true, irqchip_required = false;
> +        QemuOptsList *list = qemu_find_opts("machine");
> +
> +        if (!QTAILQ_EMPTY(&list->head)) {
> +            irqchip_allowed = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
> +                                                "kernel_irqchip", true);
> +            irqchip_required = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
> +                                                 "kernel_irqchip", false);
> +        }
> +
> +        if (irqchip_allowed) {
> +            icp = try_create_xics(TYPE_XICS_KVM, nr_servers, nr_irqs);
> +        }
> +
> +        if (irqchip_required && !icp) {
> +            perror("iFailed to create in-kernel XICS\n");
> +            abort();
> +        }
> +    }
> +
> +    if (!icp) {
> +        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
> +    }
> +
>      if (!icp) {
>          perror("Failed to create XICS\n");
>          abort();
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index 90ecaf8..835a3d6 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -32,6 +32,9 @@
>  #define TYPE_XICS "xics"
>  #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
>  
> +#define TYPE_XICS_KVM "xics-kvm"
> +#define XICS_KVM(obj) OBJECT_CHECK(XICSStateKVM, (obj), TYPE_XICS_KVM)
> +
>  #define XICS_CLASS(klass) \
>       OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS)
>  #define XICS_GET_CLASS(obj) \
> @@ -73,6 +76,9 @@ struct XICSState {
>  #define TYPE_ICP "icp"
>  #define ICP(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP)
>  
> +#define TYPE_ICP_KVM "icp-kvm"
> +#define ICP_KVM(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP_KVM)
> +
>  #define ICP_CLASS(klass) \
>       OBJECT_CLASS_CHECK(ICPStateClass, (klass), TYPE_ICP)
>  #define ICP_GET_CLASS(obj) \
> @@ -89,6 +95,7 @@ struct ICPState {
>      /*< private >*/
>      DeviceState parent_obj;
>      /*< public >*/
> +    CPUState *cs;
>      uint32_t xirr;
>      uint8_t pending_priority;
>      uint8_t mfrr;
> @@ -98,6 +105,9 @@ struct ICPState {
>  #define TYPE_ICS "ics"
>  #define ICS(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS)
>  
> +#define TYPE_ICS_KVM "icskvm"
> +#define ICS_KVM(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_KVM)
> +
>  #define ICS_CLASS(klass) \
>       OBJECT_CLASS_CHECK(ICSStateClass, (klass), TYPE_ICS)
>  #define ICS_GET_CLASS(obj) \
> -- 
> 1.8.3.2
Andreas Färber July 31, 2013, 7:52 p.m. UTC | #2
Hi,

Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
> From: David Gibson <david@gibson.dropbear.id.au>
> 
> Recent (host) kernels support emulating the PAPR defined "XICS" interrupt
> controller system within KVM.  This patch allows qemu to initialize and
> configure the in-kernel XICS, and keep its state in sync with qemu's XICS
> state as necessary.
> 
> This should give considerable performance improvements.  e.g. on a simple
> IPI ping-pong test between hardware threads, using qemu XICS gives us
> around 5,000 irqs/second, whereas the in-kernel XICS gives us around
> 70,000 irqs/s on the same hardware configuration.
> 
> [Mike Qiu <qiudayu@linux.vnet.ibm.com>: fixed mistype which caused ics_set_kvm_state() to fail]
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> [aik: moved to a separate device, reworked QOM]
> 
> ---
> Changes:
> 2013/07/17:
> * QOM rework
> 
> 2013/07/01
> * fixed VMState names in order to support xics-kvm migration to xics and vice versa
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Something went wrong here. ;)

[...]
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> new file mode 100644
> index 0000000..04ce1be
> --- /dev/null
> +++ b/hw/intc/xics_kvm.c
[...]
> +typedef struct XICSStateKVM {
> +    struct XICSState parent;

parent_obj to align with most new devices?

> +
> +    uint32_t set_xive_token;
> +    uint32_t get_xive_token;
> +    uint32_t int_off_token;
> +    uint32_t int_on_token;
> +    int kernel_xics_fd;
> +} XICSStateKVM;
[...]
> +static int ics_kvm_realize(DeviceState *dev)
> +{
> +    ICSState *ics = ICS(dev);
> +
> +    ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
> +    ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool));
> +    ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs);
> +
> +    return 0;
> +}
> +
> +static void ics_kvm_class_init(ObjectClass *klass, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +    ICSStateClass *k = ICS_CLASS(klass);
> +
> +    dc->init = ics_kvm_realize;

This is fishy - probably you want to assign to dc->realize and fix the
signature?

Andreas

> +    dc->reset = ics_kvm_reset;
> +    k->pre_save = ics_get_kvm_state;
> +    k->post_load = ics_set_kvm_state;
> +}
> +
> +static TypeInfo ics_kvm_info = {

static const for all TypeInfos.

> +    .name = TYPE_ICS_KVM,
> +    .parent = TYPE_ICS,
> +    .instance_size = sizeof(ICSState),
> +    .class_init = ics_kvm_class_init,
> +};
> +
> +/*
> + * XICS-KVM
> + */
> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
> +{
> +    CPUState *cs;
> +    ICPState *ss;
> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
> +            OBJECT(icp), TYPE_XICS_KVM);
> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));

Are you intentionally accessing that class by name rather than using
XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?

> +
> +    if (!icpkvm) {
> +        return;
> +    }
> +
> +    cs = CPU(cpu);
> +    ss = &icp->ss[cs->cpu_index];
> +
> +    assert(cs->cpu_index < icp->nr_servers);
> +    if (icpkvm->kernel_xics_fd == -1) {
> +        abort();
> +    }
> +
> +    if (icpkvm->kernel_xics_fd != -1) {
> +        int ret;
> +        struct kvm_enable_cap xics_enable_cap = {
> +            .cap = KVM_CAP_IRQ_XICS,
> +            .flags = 0,
> +            .args = {icpkvm->kernel_xics_fd, cs->cpu_index, 0, 0},
> +        };
> +
> +        ss->cs = cs;
> +
> +        ret = kvm_vcpu_ioctl(ss->cs, KVM_ENABLE_CAP, &xics_enable_cap);
> +        if (ret < 0) {
> +            fprintf(stderr, "Unable to connect CPU%d to kernel XICS: %s\n",
> +                    cs->cpu_index, strerror(errno));

error_report() in place of all fprintf(stderr, ...)? (without \n then)

> +            exit(1);
> +        }
> +    }
> +
> +    /* Call emulated XICS implementation for consistency */
> +    assert(xics_info);

If you want to go safe, you could add assert(xics_info->cpu_setup).

> +    xics_info->cpu_setup(icp, cpu);
> +}
> +
> +static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
> +                       uint32_t token,
> +                       uint32_t nargs, target_ulong args,
> +                       uint32_t nret, target_ulong rets)
> +{
> +    fprintf(stderr, "pseries: %s must never be called for in-kernel XICS\n",
> +            __func__);
> +}
> +
> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
> +{
> +    XICSStateKVM *icpkvm = XICS_KVM(dev);
> +    XICSState *icp = XICS(dev);
> +    ICSState *ics;
> +    QemuOptsList *list = qemu_find_opts("machine");
> +    int i, rc;
> +    struct kvm_create_device xics_create_device = {
> +        .type = KVM_DEV_TYPE_XICS,
> +        .flags = 0,
> +    };
> +
> +    if (!kvm_enabled()) {
> +        error_setg(errp, "KVM must be enabled for in-kernel XICS");
> +        goto fail;
> +    }
> +
> +    if (QTAILQ_EMPTY(&list->head) ||
> +        !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
> +                           "kernel_irqchip", true) ||

Is it safe to take a value from the first element in the list? Shouldn't
merging -machine spapr,accel=kvm -machine kernel_irqchip=on be taken
into account?

> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> +        error_setg(errp, "KVM must be enabled for in-kernel XICS");
> +        return;
> +    }
> +
> +    icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy);
> +    icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy);
> +    icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy);
> +    icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy);
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->set_xive_token, "ibm,set-xive");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,set-xive");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->get_xive_token, "ibm,get-xive");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,get-xive");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->int_on_token, "ibm,int-on");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-on");
> +        goto fail;
> +    }
> +
> +    rc = kvmppc_define_rtas_token(icpkvm->int_off_token, "ibm,int-off");
> +    if (rc < 0) {
> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-off");
> +        goto fail;
> +    }
> +
> +    /* Create the kernel ICP */
> +    rc = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &xics_create_device);
> +    if (rc < 0) {
> +        error_setg_errno(errp, -rc, "Error on KVM_CREATE_DEVICE for XICS");
> +        goto fail;
> +    }
> +
> +    icpkvm->kernel_xics_fd = xics_create_device.fd;
> +
> +    icp->ics = ICS(object_new(TYPE_ICS_KVM));
> +    ics = icp->ics;
> +    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
> +
> +    ics->nr_irqs = icp->nr_irqs;
> +    ics->offset = XICS_IRQ_BASE;
> +    ics->icp = icp;
> +    qdev_init_nofail(DEVICE(ics));

Since this is in a realize function, please use
object_property_set_bool(), which gives you access to the Error - to
catch it you can use a local Error *err variable.

> +
> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
> +    for (i = 0; i < icp->nr_servers; i++) {
> +        char buffer[32];
> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
> +        qdev_init_nofail(DEVICE(&icp->ss[i]));

object_property_set_bool()

Where does icp->nr_servers come from? Is there no way to split this into
instance_init and realize?

> +    }
> +    return;
> +
> +fail:
> +    kvmppc_define_rtas_token(0, "ibm,set-xive");
> +    kvmppc_define_rtas_token(0, "ibm,get-xive");
> +    kvmppc_define_rtas_token(0, "ibm,int-on");
> +    kvmppc_define_rtas_token(0, "ibm,int-off");
> +    return;
> +}
> +
> +static void xics_kvm_class_init(ObjectClass *oc, void *data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(oc);
> +    XICSStateClass *k = XICS_CLASS(oc);

xsc?

> +
> +    dc->realize = xics_kvm_realize;
> +    k->cpu_setup = xics_kvm_cpu_setup;
> +}
> +
> +static const TypeInfo xics_kvm_info = {
> +    .name          = TYPE_XICS_KVM,
> +    .parent        = TYPE_XICS,
> +    .instance_size = sizeof(XICSStateKVM),
> +    .class_init    = xics_kvm_class_init,
> +};
> +
> +static void xics_kvm_register_types(void)
> +{
> +    type_register_static(&xics_kvm_info);
> +    type_register_static(&ics_kvm_info);
> +    type_register_static(&icp_kvm_info);
> +}
> +
> +type_init(xics_kvm_register_types)
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 432f0d2..84433ee 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -148,7 +148,31 @@ static XICSState *xics_system_init(int nr_servers, int nr_irqs)
>  {
>      XICSState *icp = NULL;
>  
> -    icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
> +    if (kvm_enabled()) {
> +        bool irqchip_allowed = true, irqchip_required = false;
> +        QemuOptsList *list = qemu_find_opts("machine");
> +
> +        if (!QTAILQ_EMPTY(&list->head)) {
> +            irqchip_allowed = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
> +                                                "kernel_irqchip", true);
> +            irqchip_required = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
> +                                                 "kernel_irqchip", false);
> +        }

Again, don't we have APIs to help with opts access?

> +
> +        if (irqchip_allowed) {
> +            icp = try_create_xics(TYPE_XICS_KVM, nr_servers, nr_irqs);
> +        }
> +
> +        if (irqchip_required && !icp) {
> +            perror("iFailed to create in-kernel XICS\n");

"Failed to ..."?

> +            abort();
> +        }
> +    }
> +
> +    if (!icp) {
> +        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
> +    }
> +
>      if (!icp) {
>          perror("Failed to create XICS\n");
>          abort();
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index 90ecaf8..835a3d6 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -32,6 +32,9 @@
>  #define TYPE_XICS "xics"
>  #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
>  
> +#define TYPE_XICS_KVM "xics-kvm"
> +#define XICS_KVM(obj) OBJECT_CHECK(XICSStateKVM, (obj), TYPE_XICS_KVM)

TYPE_KVM_XICS, KVM_XICS() please - from the specific to base.

> +
>  #define XICS_CLASS(klass) \
>       OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS)
>  #define XICS_GET_CLASS(obj) \
> @@ -73,6 +76,9 @@ struct XICSState {
>  #define TYPE_ICP "icp"
>  #define ICP(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP)
>  
> +#define TYPE_ICP_KVM "icp-kvm"
> +#define ICP_KVM(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP_KVM)
> +
>  #define ICP_CLASS(klass) \
>       OBJECT_CLASS_CHECK(ICPStateClass, (klass), TYPE_ICP)
>  #define ICP_GET_CLASS(obj) \
> @@ -89,6 +95,7 @@ struct ICPState {
>      /*< private >*/
>      DeviceState parent_obj;
>      /*< public >*/
> +    CPUState *cs;
>      uint32_t xirr;
>      uint8_t pending_priority;
>      uint8_t mfrr;
> @@ -98,6 +105,9 @@ struct ICPState {
>  #define TYPE_ICS "ics"
>  #define ICS(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS)
>  
> +#define TYPE_ICS_KVM "icskvm"
> +#define ICS_KVM(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_KVM)
> +
>  #define ICS_CLASS(klass) \
>       OBJECT_CLASS_CHECK(ICSStateClass, (klass), TYPE_ICS)
>  #define ICS_GET_CLASS(obj) \

Regards,
Andreas
Peter Maydell July 31, 2013, 8:47 p.m. UTC | #3
On 31 July 2013 20:52, Andreas Färber <afaerber@suse.de> wrote:
> Hi,
>
> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>> +            perror("iFailed to create in-kernel XICS\n");
>
> "Failed to ..."?

Perhaps

#ifdef __APPLE__
     perror("iFailed ...");
#else
     perror("Failed ...");
#endif

:-)

-- PMM
Alexey Kardashevskiy Aug. 1, 2013, 12:14 a.m. UTC | #4
On 08/01/2013 05:52 AM, Andreas Färber wrote:
> Hi,
> 
> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>> From: David Gibson <david@gibson.dropbear.id.au>
>>
>> Recent (host) kernels support emulating the PAPR defined "XICS" interrupt
>> controller system within KVM.  This patch allows qemu to initialize and
>> configure the in-kernel XICS, and keep its state in sync with qemu's XICS
>> state as necessary.
>>
>> This should give considerable performance improvements.  e.g. on a simple
>> IPI ping-pong test between hardware threads, using qemu XICS gives us
>> around 5,000 irqs/second, whereas the in-kernel XICS gives us around
>> 70,000 irqs/s on the same hardware configuration.
>>
>> [Mike Qiu <qiudayu@linux.vnet.ibm.com>: fixed mistype which caused ics_set_kvm_state() to fail]
>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>> [aik: moved to a separate device, reworked QOM]
>>
>> ---
>> Changes:
>> 2013/07/17:
>> * QOM rework
>>
>> 2013/07/01
>> * fixed VMState names in order to support xics-kvm migration to xics and vice versa
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> Something went wrong here. ;)


Yeah. git format-patch -s gets confused if commit message has parts
separated with '---' :)


> [...]
>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>> new file mode 100644
>> index 0000000..04ce1be
>> --- /dev/null
>> +++ b/hw/intc/xics_kvm.c
> [...]
>> +typedef struct XICSStateKVM {
>> +    struct XICSState parent;
> 
> parent_obj to align with most new devices?
> 
>> +
>> +    uint32_t set_xive_token;
>> +    uint32_t get_xive_token;
>> +    uint32_t int_off_token;
>> +    uint32_t int_on_token;
>> +    int kernel_xics_fd;
>> +} XICSStateKVM;
> [...]
>> +static int ics_kvm_realize(DeviceState *dev)
>> +{
>> +    ICSState *ics = ICS(dev);
>> +
>> +    ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
>> +    ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool));
>> +    ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs);
>> +
>> +    return 0;
>> +}
>> +
>> +static void ics_kvm_class_init(ObjectClass *klass, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +    ICSStateClass *k = ICS_CLASS(klass);
>> +
>> +    dc->init = ics_kvm_realize;
> 
> This is fishy - probably you want to assign to dc->realize and fix the
> signature?
> 
> Andreas
> 
>> +    dc->reset = ics_kvm_reset;
>> +    k->pre_save = ics_get_kvm_state;
>> +    k->post_load = ics_set_kvm_state;
>> +}
>> +
>> +static TypeInfo ics_kvm_info = {
> 
> static const for all TypeInfos.
> 
>> +    .name = TYPE_ICS_KVM,
>> +    .parent = TYPE_ICS,
>> +    .instance_size = sizeof(ICSState),
>> +    .class_init = ics_kvm_class_init,
>> +};
>> +
>> +/*
>> + * XICS-KVM
>> + */
>> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
>> +{
>> +    CPUState *cs;
>> +    ICPState *ss;
>> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
>> +            OBJECT(icp), TYPE_XICS_KVM);
>> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
> 
> Are you intentionally accessing that class by name rather than using
> XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?


This is KVM's CPU_setup(). I want to call non-KVM CPU_setup afterwards,
i.e. "call parent method". XICS_GET_CLASS will return XICS_KVM class but
not XICS, no?



>> +
>> +    if (!icpkvm) {
>> +        return;
>> +    }
>> +
>> +    cs = CPU(cpu);
>> +    ss = &icp->ss[cs->cpu_index];
>> +
>> +    assert(cs->cpu_index < icp->nr_servers);
>> +    if (icpkvm->kernel_xics_fd == -1) {
>> +        abort();
>> +    }
>> +
>> +    if (icpkvm->kernel_xics_fd != -1) {
>> +        int ret;
>> +        struct kvm_enable_cap xics_enable_cap = {
>> +            .cap = KVM_CAP_IRQ_XICS,
>> +            .flags = 0,
>> +            .args = {icpkvm->kernel_xics_fd, cs->cpu_index, 0, 0},
>> +        };
>> +
>> +        ss->cs = cs;
>> +
>> +        ret = kvm_vcpu_ioctl(ss->cs, KVM_ENABLE_CAP, &xics_enable_cap);
>> +        if (ret < 0) {
>> +            fprintf(stderr, "Unable to connect CPU%d to kernel XICS: %s\n",
>> +                    cs->cpu_index, strerror(errno));
> 
> error_report() in place of all fprintf(stderr, ...)? (without \n then)
> 
>> +            exit(1);
>> +        }
>> +    }
>> +
>> +    /* Call emulated XICS implementation for consistency */
>> +    assert(xics_info);
> 
> If you want to go safe, you could add assert(xics_info->cpu_setup).
> 
>> +    xics_info->cpu_setup(icp, cpu);
>> +}
>> +
>> +static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>> +                       uint32_t token,
>> +                       uint32_t nargs, target_ulong args,
>> +                       uint32_t nret, target_ulong rets)
>> +{
>> +    fprintf(stderr, "pseries: %s must never be called for in-kernel XICS\n",
>> +            __func__);
>> +}
>> +
>> +static void xics_kvm_realize(DeviceState *dev, Error **errp)
>> +{
>> +    XICSStateKVM *icpkvm = XICS_KVM(dev);
>> +    XICSState *icp = XICS(dev);
>> +    ICSState *ics;
>> +    QemuOptsList *list = qemu_find_opts("machine");
>> +    int i, rc;
>> +    struct kvm_create_device xics_create_device = {
>> +        .type = KVM_DEV_TYPE_XICS,
>> +        .flags = 0,
>> +    };
>> +
>> +    if (!kvm_enabled()) {
>> +        error_setg(errp, "KVM must be enabled for in-kernel XICS");
>> +        goto fail;
>> +    }
>> +
>> +    if (QTAILQ_EMPTY(&list->head) ||
>> +        !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
>> +                           "kernel_irqchip", true) ||
> 
> Is it safe to take a value from the first element in the list? Shouldn't
> merging -machine spapr,accel=kvm -machine kernel_irqchip=on be taken
> into account?
> 
>> +        !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>> +        error_setg(errp, "KVM must be enabled for in-kernel XICS");
>> +        return;
>> +    }
>> +
>> +    icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy);
>> +    icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy);
>> +    icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy);
>> +    icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy);
>> +
>> +    rc = kvmppc_define_rtas_token(icpkvm->set_xive_token, "ibm,set-xive");
>> +    if (rc < 0) {
>> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,set-xive");
>> +        goto fail;
>> +    }
>> +
>> +    rc = kvmppc_define_rtas_token(icpkvm->get_xive_token, "ibm,get-xive");
>> +    if (rc < 0) {
>> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,get-xive");
>> +        goto fail;
>> +    }
>> +
>> +    rc = kvmppc_define_rtas_token(icpkvm->int_on_token, "ibm,int-on");
>> +    if (rc < 0) {
>> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-on");
>> +        goto fail;
>> +    }
>> +
>> +    rc = kvmppc_define_rtas_token(icpkvm->int_off_token, "ibm,int-off");
>> +    if (rc < 0) {
>> +        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-off");
>> +        goto fail;
>> +    }
>> +
>> +    /* Create the kernel ICP */
>> +    rc = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &xics_create_device);
>> +    if (rc < 0) {
>> +        error_setg_errno(errp, -rc, "Error on KVM_CREATE_DEVICE for XICS");
>> +        goto fail;
>> +    }
>> +
>> +    icpkvm->kernel_xics_fd = xics_create_device.fd;
>> +
>> +    icp->ics = ICS(object_new(TYPE_ICS_KVM));
>> +    ics = icp->ics;
>> +    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
>> +
>> +    ics->nr_irqs = icp->nr_irqs;
>> +    ics->offset = XICS_IRQ_BASE;
>> +    ics->icp = icp;
>> +    qdev_init_nofail(DEVICE(ics));
> 
> Since this is in a realize function, please use
> object_property_set_bool(), which gives you access to the Error - to
> catch it you can use a local Error *err variable.
> 
>> +
>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>> +    for (i = 0; i < icp->nr_servers; i++) {
>> +        char buffer[32];
>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
> 
> object_property_set_bool()


? Anthony did XICS refactoring recently and that has qdev_init_nofail().


> Where does icp->nr_servers come from?

Via properties in try_create_xics() (hw/ppc/spapr.c).

> Is there no way to split this into
> instance_init and realize?


Why would we want to split?



>> +    }
>> +    return;
>> +
>> +fail:
>> +    kvmppc_define_rtas_token(0, "ibm,set-xive");
>> +    kvmppc_define_rtas_token(0, "ibm,get-xive");
>> +    kvmppc_define_rtas_token(0, "ibm,int-on");
>> +    kvmppc_define_rtas_token(0, "ibm,int-off");
>> +    return;
>> +}
>> +
>> +static void xics_kvm_class_init(ObjectClass *oc, void *data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(oc);
>> +    XICSStateClass *k = XICS_CLASS(oc);
> 
> xsc?


Whyyyy? Many places keep using "k".


>> +
>> +    dc->realize = xics_kvm_realize;
>> +    k->cpu_setup = xics_kvm_cpu_setup;
>> +}
>> +
>> +static const TypeInfo xics_kvm_info = {
>> +    .name          = TYPE_XICS_KVM,
>> +    .parent        = TYPE_XICS,
>> +    .instance_size = sizeof(XICSStateKVM),
>> +    .class_init    = xics_kvm_class_init,
>> +};
>> +
>> +static void xics_kvm_register_types(void)
>> +{
>> +    type_register_static(&xics_kvm_info);
>> +    type_register_static(&ics_kvm_info);
>> +    type_register_static(&icp_kvm_info);
>> +}
>> +
>> +type_init(xics_kvm_register_types)
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 432f0d2..84433ee 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -148,7 +148,31 @@ static XICSState *xics_system_init(int nr_servers, int nr_irqs)
>>  {
>>      XICSState *icp = NULL;
>>  
>> -    icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
>> +    if (kvm_enabled()) {
>> +        bool irqchip_allowed = true, irqchip_required = false;
>> +        QemuOptsList *list = qemu_find_opts("machine");
>> +
>> +        if (!QTAILQ_EMPTY(&list->head)) {
>> +            irqchip_allowed = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
>> +                                                "kernel_irqchip", true);
>> +            irqchip_required = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
>> +                                                 "kernel_irqchip", false);
>> +        }
> 
> Again, don't we have APIs to help with opts access?


Hm, I copied this from openpic but it went further since then :)


>> +
>> +        if (irqchip_allowed) {
>> +            icp = try_create_xics(TYPE_XICS_KVM, nr_servers, nr_irqs);
>> +        }
>> +
>> +        if (irqchip_required && !icp) {
>> +            perror("iFailed to create in-kernel XICS\n");
> 
> "Failed to ..."?


vim...
Andreas Färber Aug. 1, 2013, 1:29 a.m. UTC | #5
Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>>> +/*
>>> + * XICS-KVM
>>> + */
>>> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
>>> +{
>>> +    CPUState *cs;
>>> +    ICPState *ss;
>>> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
>>> +            OBJECT(icp), TYPE_XICS_KVM);
>>> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
>>
>> Are you intentionally accessing that class by name rather than using
>> XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?
> 
> 
> This is KVM's CPU_setup(). I want to call non-KVM CPU_setup afterwards,
> i.e. "call parent method". XICS_GET_CLASS will return XICS_KVM class but
> not XICS, no?

OK, then I'll CC you on my upcoming virtio v2 series that introduces a
more comprehensable macro for this purpose: I would/will recommend to
use a local macro KVM_XICS_GET_PARENT_CLASS(obj) - where you could move
your current inline implementation - to make more obvious that it's not
a mistake.

>>> +
>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>> +        char buffer[32];
>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
>>
>> object_property_set_bool()
> 
> 
> ? Anthony did XICS refactoring recently and that has qdev_init_nofail().

Nobody is perfect. ;)

The point is, this is an object, and in realize you shouldn't abort but
set errp and leave error printing and handling to your caller. The QOM
API as opposed to qdev works with an Error object that you can
error_propagate() to your caller.

(Also using qdev_* for something that is new-style QOM is ugly IMO.)

>> Where does icp->nr_servers come from?
> 
> Via properties in try_create_xics() (hw/ppc/spapr.c).

Sounds tricky... Peter introduced static array properties for a similar
purpose, I believe. Don't know if that would help here.

> 
>> Is there no way to split this into
>> instance_init and realize?
> 
> 
> Why would we want to split?

Because realize is too late to create new devices: With our targetted
late, recursive realization model it will not be possible to see and
modify such objects from management interface - only before realize.

I even have a patch on the list that would assert when that happens
during final recursive realization.

>>> +    }
>>> +    return;
>>> +
>>> +fail:
>>> +    kvmppc_define_rtas_token(0, "ibm,set-xive");
>>> +    kvmppc_define_rtas_token(0, "ibm,get-xive");
>>> +    kvmppc_define_rtas_token(0, "ibm,int-on");
>>> +    kvmppc_define_rtas_token(0, "ibm,int-off");
>>> +    return;
>>> +}
>>> +
>>> +static void xics_kvm_class_init(ObjectClass *oc, void *data)
>>> +{
>>> +    DeviceClass *dc = DEVICE_CLASS(oc);
>>> +    XICSStateClass *k = XICS_CLASS(oc);
>>
>> xsc?
> 
> 
> Whyyyy? Many places keep using "k".

Many places also still use "klass" and you took the new "oc". ;)

There's no "k" in XICSStateClass, and we adopted using the initial
letters of the name components, like dc for DeviceClass and oc for
ObjectClass above, just like we sort variables from abstract to
concrete. That way, when the type hierarchy gets changed you can more
naturally add variables.
It's not a requirement, but there's no real reason to call it "k"
either, since "c" unlike "class" is not a reserved word in C++. :)

Cheers,
Andreas
Alexey Kardashevskiy Aug. 1, 2013, 2:08 a.m. UTC | #6
On 08/01/2013 11:29 AM, Andreas Färber wrote:
> Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
>> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>>>> +/*
>>>> + * XICS-KVM
>>>> + */
>>>> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
>>>> +{
>>>> +    CPUState *cs;
>>>> +    ICPState *ss;
>>>> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
>>>> +            OBJECT(icp), TYPE_XICS_KVM);
>>>> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
>>>
>>> Are you intentionally accessing that class by name rather than using
>>> XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?
>>
>>
>> This is KVM's CPU_setup(). I want to call non-KVM CPU_setup afterwards,
>> i.e. "call parent method". XICS_GET_CLASS will return XICS_KVM class but
>> not XICS, no?
> 
> OK, then I'll CC you on my upcoming virtio v2 series that introduces a
> more comprehensable macro for this purpose: I would/will recommend to
> use a local macro KVM_XICS_GET_PARENT_CLASS(obj) - where you could move
> your current inline implementation - to make more obvious that it's not
> a mistake.

Oh. So. This has to wait till that virtio thing gets to upstream. Correct?


>>>> +
>>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>>> +        char buffer[32];
>>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
>>>
>>> object_property_set_bool()
>>
>>
>> ? Anthony did XICS refactoring recently and that has qdev_init_nofail().
> 
> Nobody is perfect. ;)

That's ok, my question is more about whether I should use set_bool here and
leave emulated XICS as is or you expect me to fix emulated XICS as well and
post an additional patch or what?


> The point is, this is an object, and in realize you shouldn't abort but
> set errp and leave error printing and handling to your caller. The QOM
> API as opposed to qdev works with an Error object that you can
> error_propagate() to your caller.
> 
> (Also using qdev_* for something that is new-style QOM is ugly IMO.)
> 
>>> Where does icp->nr_servers come from?
>>
>> Via properties in try_create_xics() (hw/ppc/spapr.c).
> 
> Sounds tricky... Peter introduced static array properties for a similar
> purpose, I believe. Don't know if that would help here.
> 
>>
>>> Is there no way to split this into
>>> instance_init and realize?
>>
>>
>> Why would we want to split?
> 
> Because realize is too late to create new devices: With our targetted
> late, recursive realization model it will not be possible to see and
> modify such objects from management interface - only before realize.
> 
> I even have a patch on the list that would assert when that happens
> during final recursive realization.


So most this stuff has to go to instance_init and since there is no way to
prevent parent's instance_init from being called, you are basically forcing
me to introduce an abstract XICS class and inherit emulated XICS and KVM
XICS from it. Besides that, I do not any use of it. Is that correct?
Andreas Färber Aug. 1, 2013, 3:07 a.m. UTC | #7
Am 01.08.2013 04:08, schrieb Alexey Kardashevskiy:
> On 08/01/2013 11:29 AM, Andreas Färber wrote:
>> Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
>>> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>>>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>>>>> +/*
>>>>> + * XICS-KVM
>>>>> + */
>>>>> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
>>>>> +{
>>>>> +    CPUState *cs;
>>>>> +    ICPState *ss;
>>>>> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
>>>>> +            OBJECT(icp), TYPE_XICS_KVM);
>>>>> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
>>>>
>>>> Are you intentionally accessing that class by name rather than using
>>>> XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?
>>>
>>>
>>> This is KVM's CPU_setup(). I want to call non-KVM CPU_setup afterwards,
>>> i.e. "call parent method". XICS_GET_CLASS will return XICS_KVM class but
>>> not XICS, no?
>>
>> OK, then I'll CC you on my upcoming virtio v2 series that introduces a
>> more comprehensable macro for this purpose: I would/will recommend to
>> use a local macro KVM_XICS_GET_PARENT_CLASS(obj) - where you could move
>> your current inline implementation - to make more obvious that it's not
>> a mistake.
> 
> Oh. So. This has to wait till that virtio thing gets to upstream. Correct?

Not quite, there is no dependency on virtio.

a) You could do
#define KVM_XICS_GET_PARENT_CLASS(obj) \
    object_class_by_name(TYPE_KVM_XICS)
for now, where you do #define TYPE_KVM_XICS etc.
Note that this is a proposal and not a rule. So far we agreed that
adding classes with fields was not working well for variable depths of
hierarchy and that open-coding the access was not ideal either. Nothing
more, nothing less.

b) You could cherry-pick just http://patchwork.ozlabs.org/patch/263863/
to update the implementation of a), but since that has not yet been
acked I would advise against doing that immediately.
But upstream is in freeze for the next two weeks anyway, so no need to rush.

c) You should participate in the review of Peter's and my proposals
rather than silently inventing your own solution. :)
Advantage of our approaches is hardcoding the current type rather than
the parent's outside of TypeInfo.

>>>>> +
>>>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>>>> +        char buffer[32];
>>>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>>>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
>>>>
>>>> object_property_set_bool()
>>>
>>>
>>> ? Anthony did XICS refactoring recently and that has qdev_init_nofail().
>>
>> Nobody is perfect. ;)
> 
> That's ok, my question is more about whether I should use set_bool here and
> leave emulated XICS as is or you expect me to fix emulated XICS as well and
> post an additional patch or what?

If that is so then yes, cleaning up your existing emulation in a patch
before this one would be a good idea.

>>>> Is there no way to split this into
>>>> instance_init and realize?
>>>
>>> Why would we want to split?
>>
>> Because realize is too late to create new devices: With our targetted
>> late, recursive realization model it will not be possible to see and
>> modify such objects from management interface - only before realize.
>>
>> I even have a patch on the list that would assert when that happens
>> during final recursive realization.
> 
> 
> So most this stuff has to go to instance_init and since there is no way to
> prevent parent's instance_init from being called, you are basically forcing
> me to introduce an abstract XICS class and inherit emulated XICS and KVM
> XICS from it. Besides that, I do not any use of it. Is that correct?

Sorry, I don't follow. x86 and arm do use an abstract base class, e500
doesn't iirc. But whether instance_init or realize, the parent's
implementation will/should be called.

Andreas
Alexey Kardashevskiy Aug. 1, 2013, 3:22 a.m. UTC | #8
On 08/01/2013 01:07 PM, Andreas Färber wrote:
> Am 01.08.2013 04:08, schrieb Alexey Kardashevskiy:
>> On 08/01/2013 11:29 AM, Andreas Färber wrote:
>>> Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
>>>> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>>>>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>>>>>> +/*
>>>>>> + * XICS-KVM
>>>>>> + */
>>>>>> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
>>>>>> +{
>>>>>> +    CPUState *cs;
>>>>>> +    ICPState *ss;
>>>>>> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
>>>>>> +            OBJECT(icp), TYPE_XICS_KVM);
>>>>>> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
>>>>>
>>>>> Are you intentionally accessing that class by name rather than using
>>>>> XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?
>>>>
>>>>
>>>> This is KVM's CPU_setup(). I want to call non-KVM CPU_setup afterwards,
>>>> i.e. "call parent method". XICS_GET_CLASS will return XICS_KVM class but
>>>> not XICS, no?
>>>
>>> OK, then I'll CC you on my upcoming virtio v2 series that introduces a
>>> more comprehensable macro for this purpose: I would/will recommend to
>>> use a local macro KVM_XICS_GET_PARENT_CLASS(obj) - where you could move
>>> your current inline implementation - to make more obvious that it's not
>>> a mistake.
>>
>> Oh. So. This has to wait till that virtio thing gets to upstream. Correct?
> 
> Not quite, there is no dependency on virtio.
> 
> a) You could do
> #define KVM_XICS_GET_PARENT_CLASS(obj) \
>     object_class_by_name(TYPE_KVM_XICS)

TYPE_KVM_XICS or TYPE_XICS?


> for now, where you do #define TYPE_KVM_XICS etc.
> Note that this is a proposal and not a rule. So far we agreed that
> adding classes with fields was not working well for variable depths of
> hierarchy and that open-coding the access was not ideal either. Nothing
> more, nothing less.
> 
> b) You could cherry-pick just http://patchwork.ozlabs.org/patch/263863/
> to update the implementation of a), but since that has not yet been
> acked I would advise against doing that immediately.
> But upstream is in freeze for the next two weeks anyway, so no need to rush.
> 
> c) You should participate in the review of Peter's and my proposals
> rather than silently inventing your own solution. :)
> Advantage of our approaches is hardcoding the current type rather than
> the parent's outside of TypeInfo.

I do not understand most of the stuff you do for QOM. I mean I know
C++/RTTI pretty good and even worse - I still remember M$ DCOM/OLE2 but I
do not know to what extent you want to implement that C++ in QEMU :)



>>>>>> +
>>>>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>>>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>>>>> +        char buffer[32];
>>>>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>>>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>>>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>>>>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
>>>>>
>>>>> object_property_set_bool()
>>>>
>>>>
>>>> ? Anthony did XICS refactoring recently and that has qdev_init_nofail().
>>>
>>> Nobody is perfect. ;)
>>
>> That's ok, my question is more about whether I should use set_bool here and
>> leave emulated XICS as is or you expect me to fix emulated XICS as well and
>> post an additional patch or what?
> 
> If that is so then yes, cleaning up your existing emulation in a patch
> before this one would be a good idea.
> 
>>>>> Is there no way to split this into
>>>>> instance_init and realize?
>>>>
>>>> Why would we want to split?
>>>
>>> Because realize is too late to create new devices: With our targetted
>>> late, recursive realization model it will not be possible to see and
>>> modify such objects from management interface - only before realize.
>>>
>>> I even have a patch on the list that would assert when that happens
>>> during final recursive realization.
>>
>>
>> So most this stuff has to go to instance_init and since there is no way to
>> prevent parent's instance_init from being called, you are basically forcing
>> me to introduce an abstract XICS class and inherit emulated XICS and KVM
>> XICS from it. Besides that, I do not any use of it. Is that correct?
> 
> Sorry, I don't follow. x86 and arm do use an abstract base class, e500
> doesn't iirc. But whether instance_init or realize, the parent's
> implementation will/should be called.

Openpic does not but its init() does not do much and openpic does not have
child devices but XICS does now (after Anthony's rework).

Unlike instance_init(), parent's realize() is not called automatically,
this was the main reason why I put everything into realize().
Alexey Kardashevskiy Aug. 2, 2013, 2:57 p.m. UTC | #9
On 08/01/2013 11:29 AM, Andreas Färber wrote:
> Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
>> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:

>>>> +
>>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>>> +        char buffer[32];
>>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
>>>
>>> object_property_set_bool()
>>
>>
>> ? Anthony did XICS refactoring recently and that has qdev_init_nofail().
> 
> Nobody is perfect. ;)
> 
> The point is, this is an object, and in realize you shouldn't abort but
> set errp and leave error printing and handling to your caller. The QOM
> API as opposed to qdev works with an Error object that you can
> error_propagate() to your caller.
> 
> (Also using qdev_* for something that is new-style QOM is ugly IMO.)
> 
>>> Where does icp->nr_servers come from?
>>
>> Via properties in try_create_xics() (hw/ppc/spapr.c).
> 
> Sounds tricky... Peter introduced static array properties for a similar
> purpose, I believe. Don't know if that would help here.

Peter who? For what purpose? Could you please point to it? Cannot find it
anywhere.


>>
>>> Is there no way to split this into
>>> instance_init and realize?
>>
>>
>> Why would we want to split?
> 
> Because realize is too late to create new devices: With our targetted
> late, recursive realization model it will not be possible to see and
> modify such objects from management interface - only before realize.


Oh. I lost you here. I need to create XICS with some child ICP devices (now
they are devices). The number of child devices comes from spapr.c via
properties. Now you are saying it is too late to create devices (even so
lame ones) in realize(). So I have to put the creation in instance_init().
Could you please point now I can create device-and-pass-propetries in one
shot? Thanks!
Alexey Kardashevskiy Aug. 3, 2013, 2:45 a.m. UTC | #10
On 08/03/2013 12:57 AM, Alexey Kardashevskiy wrote:
> On 08/01/2013 11:29 AM, Andreas Färber wrote:
>> Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
>>> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>>>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
> 
>>>>> +
>>>>> +    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
>>>>> +    for (i = 0; i < icp->nr_servers; i++) {
>>>>> +        char buffer[32];
>>>>> +        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
>>>>> +        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
>>>>> +        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
>>>>> +        qdev_init_nofail(DEVICE(&icp->ss[i]));
>>>>
>>>> object_property_set_bool()
>>>
>>>
>>> ? Anthony did XICS refactoring recently and that has qdev_init_nofail().
>>
>> Nobody is perfect. ;)
>>
>> The point is, this is an object, and in realize you shouldn't abort but
>> set errp and leave error printing and handling to your caller. The QOM
>> API as opposed to qdev works with an Error object that you can
>> error_propagate() to your caller.
>>
>> (Also using qdev_* for something that is new-style QOM is ugly IMO.)
>>
>>>> Where does icp->nr_servers come from?
>>>
>>> Via properties in try_create_xics() (hw/ppc/spapr.c).
>>
>> Sounds tricky... Peter introduced static array properties for a similar
>> purpose, I believe. Don't know if that would help here.
> 
> Peter who? For what purpose? Could you please point to it? Cannot find it
> anywhere.
> 
> 
>>>
>>>> Is there no way to split this into
>>>> instance_init and realize?
>>>
>>>
>>> Why would we want to split?
>>
>> Because realize is too late to create new devices: With our targetted
>> late, recursive realization model it will not be possible to see and
>> modify such objects from management interface - only before realize.
> 
> 
> Oh. I lost you here. I need to create XICS with some child ICP devices (now
> they are devices). The number of child devices comes from spapr.c via
> properties. Now you are saying it is too late to create devices (even so
> lame ones) in realize(). So I have to put the creation in instance_init().
> Could you please point now I can create device-and-pass-propetries in one
> shot? Thanks!


No, I am still missing some bits.
In spapr we try to initialize xics-kvm and if we cannot have it, we switch
to emulated xics.

I.e.:
dev = qdev_create(NULL, type);
qdev_prop_set_uint32(dev, "nr_servers", nr_servers);
qdev_prop_set_uint32(dev, "nr_irqs", nr_irqs);
if (qdev_init(dev) < 0) {
    return NULL;
}

qdev_create() calls instance_init() (which cannot fail) and qdev_init()
calls realize() (which can fail and this is how I know that xics-kvm cannot
work).

instance_init() and realize() are called at once, right here. But as I
understand, realize() is not supposed to be called this way, it should be
called by QOM, no?


And about properties - before I create child devices, I need both
properties set. So it looks like I need a third property such as
"realized", call it "real_instance_init" and assign a callback on it as it
is done for realize(). Correct?
Andreas Färber Aug. 6, 2013, 2:14 p.m. UTC | #11
Am 01.08.2013 05:22, schrieb Alexey Kardashevskiy:
> On 08/01/2013 01:07 PM, Andreas Färber wrote:
>> Am 01.08.2013 04:08, schrieb Alexey Kardashevskiy:
>>> On 08/01/2013 11:29 AM, Andreas Färber wrote:
>>>> Am 01.08.2013 02:14, schrieb Alexey Kardashevskiy:
>>>>> On 08/01/2013 05:52 AM, Andreas Färber wrote:
>>>>>> Am 17.07.2013 08:37, schrieb Alexey Kardashevskiy:
>>>>>>> +/*
>>>>>>> + * XICS-KVM
>>>>>>> + */
>>>>>>> +static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
>>>>>>> +{
>>>>>>> +    CPUState *cs;
>>>>>>> +    ICPState *ss;
>>>>>>> +    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
>>>>>>> +            OBJECT(icp), TYPE_XICS_KVM);
>>>>>>> +    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
>>>>>>
>>>>>> Are you intentionally accessing that class by name rather than using
>>>>>> XICS_GET_CLASS(icp), which allows the KVM variant to overwrite things?
>>>>>
>>>>>
>>>>> This is KVM's CPU_setup(). I want to call non-KVM CPU_setup afterwards,
>>>>> i.e. "call parent method". XICS_GET_CLASS will return XICS_KVM class but
>>>>> not XICS, no?
>>>>
>>>> OK, then I'll CC you on my upcoming virtio v2 series that introduces a
>>>> more comprehensable macro for this purpose: I would/will recommend to
>>>> use a local macro KVM_XICS_GET_PARENT_CLASS(obj) - where you could move
>>>> your current inline implementation - to make more obvious that it's not
>>>> a mistake.
>>>
>>> Oh. So. This has to wait till that virtio thing gets to upstream. Correct?
>>
>> Not quite, there is no dependency on virtio.
>>
>> a) You could do
>> #define KVM_XICS_GET_PARENT_CLASS(obj) \
>>     object_class_by_name(TYPE_KVM_XICS)
> 
> TYPE_KVM_XICS or TYPE_XICS?

Sorry for late reply: When using object_class_by_name() directly, then
TYPE_XICS (or now TYPE_COMMON_XICS); when using object_get_parent() or
the wrapping OBJECT_GET_PARENT_CLASS() then TYPE_KVM_XICS, i.e. the QOM
type to which the C function belongs.

Andreas
diff mbox

Patch

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index 6d1933b..1f4550a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -47,5 +47,6 @@  CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
 # For pSeries
 CONFIG_XICS=$(CONFIG_PSERIES)
+CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
 # For PReP
 CONFIG_MC146818RTC=y
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index 2851eed..47ac442 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -23,3 +23,4 @@  obj-$(CONFIG_OMAP) += omap_intc.o
 obj-$(CONFIG_OPENPIC_KVM) += openpic_kvm.o
 obj-$(CONFIG_SH4) += sh_intc.o
 obj-$(CONFIG_XICS) += xics.o
+obj-$(CONFIG_XICS_KVM) += xics_kvm.o
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
new file mode 100644
index 0000000..04ce1be
--- /dev/null
+++ b/hw/intc/xics_kvm.c
@@ -0,0 +1,469 @@ 
+/*
+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
+ *
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics, in-kernel emulation
+ *
+ * Copyright (c) 2013 David Gibson, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+#include "hw/hw.h"
+#include "trace.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/xics.h"
+#include "kvm_ppc.h"
+#include "qemu/config-file.h"
+
+#include <sys/ioctl.h>
+
+typedef struct XICSStateKVM {
+    struct XICSState parent;
+
+    uint32_t set_xive_token;
+    uint32_t get_xive_token;
+    uint32_t int_off_token;
+    uint32_t int_on_token;
+    int kernel_xics_fd;
+} XICSStateKVM;
+
+/*
+ * ICP-KVM
+ */
+static void icp_get_kvm_state(ICPState *ss)
+{
+    uint64_t state;
+    struct kvm_one_reg reg = {
+        .id = KVM_REG_PPC_ICP_STATE,
+        .addr = (uintptr_t)&state,
+    };
+    int ret;
+
+    if (!ss->cs) {
+        return; /* kernel irqchip not in use */
+    }
+
+    ret = kvm_vcpu_ioctl(ss->cs, KVM_GET_ONE_REG, &reg);
+    if (ret != 0) {
+        fprintf(stderr, "Unable to retrieve KVM interrupt controller state"
+                " for CPU %d: %s\n", ss->cs->cpu_index, strerror(errno));
+        exit(1);
+    }
+
+    ss->xirr = state >> KVM_REG_PPC_ICP_XISR_SHIFT;
+    ss->mfrr = (state >> KVM_REG_PPC_ICP_MFRR_SHIFT)
+        & KVM_REG_PPC_ICP_MFRR_MASK;
+    ss->pending_priority = (state >> KVM_REG_PPC_ICP_PPRI_SHIFT)
+        & KVM_REG_PPC_ICP_PPRI_MASK;
+}
+
+static int icp_set_kvm_state(ICPState *ss)
+{
+    uint64_t state;
+    struct kvm_one_reg reg = {
+        .id = KVM_REG_PPC_ICP_STATE,
+        .addr = (uintptr_t)&state,
+    };
+    int ret;
+
+    if (!ss->cs) {
+        return 0; /* kernel irqchip not in use */
+    }
+
+    state = ((uint64_t)ss->xirr << KVM_REG_PPC_ICP_XISR_SHIFT)
+        | ((uint64_t)ss->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT)
+        | ((uint64_t)ss->pending_priority << KVM_REG_PPC_ICP_PPRI_SHIFT);
+
+    ret = kvm_vcpu_ioctl(ss->cs, KVM_SET_ONE_REG, &reg);
+    if (ret != 0) {
+        fprintf(stderr, "Unable to restore KVM interrupt controller state (0x%"
+                PRIx64 ") for CPU %d: %s\n", state, ss->cs->cpu_index,
+                strerror(errno));
+        return ret;
+    }
+
+    return 0;
+}
+
+static void icp_kvm_reset(DeviceState *dev)
+{
+    ICPState *icp = ICP(dev);
+
+    icp->xirr = 0;
+    icp->pending_priority = 0xff;
+    icp->mfrr = 0xff;
+
+    /* Make all outputs are deasserted */
+    qemu_set_irq(icp->output, 0);
+
+    icp_set_kvm_state(icp);
+}
+
+static void icp_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    ICPStateClass *k = ICP_CLASS(klass);
+
+    dc->reset = icp_kvm_reset;
+    k->pre_save = icp_get_kvm_state;
+    k->post_load = icp_set_kvm_state;
+}
+
+static TypeInfo icp_kvm_info = {
+    .name = TYPE_ICP_KVM,
+    .parent = TYPE_ICP,
+    .instance_size = sizeof(ICPState),
+    .class_init = icp_kvm_class_init,
+    .class_size = sizeof(ICPStateClass),
+};
+
+/*
+ * ICS-KVM
+ */
+static void ics_get_kvm_state(ICSState *ics)
+{
+    XICSStateKVM *icpkvm = XICS_KVM(ics->icp);
+    uint64_t state;
+    struct kvm_device_attr attr = {
+        .flags = 0,
+        .group = KVM_DEV_XICS_GRP_SOURCES,
+        .addr = (uint64_t)(uintptr_t)&state,
+    };
+    int i;
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        ICSIRQState *irq = &ics->irqs[i];
+        int ret;
+
+        attr.attr = i + ics->offset;
+
+        ret = ioctl(icpkvm->kernel_xics_fd, KVM_GET_DEVICE_ATTR, &attr);
+        if (ret != 0) {
+            fprintf(stderr, "Unable to retrieve KVM interrupt controller state"
+                    " for IRQ %d: %s\n", i + ics->offset, strerror(errno));
+            exit(1);
+        }
+
+        irq->server = state & KVM_XICS_DESTINATION_MASK;
+        irq->saved_priority = (state >> KVM_XICS_PRIORITY_SHIFT)
+            & KVM_XICS_PRIORITY_MASK;
+        /*
+         * To be consistent with the software emulation in xics.c, we
+         * split out the masked state + priority that we get from the
+         * kernel into 'current priority' (0xff if masked) and
+         * 'saved priority' (if masked, this is the priority the
+         * interrupt had before it was masked).  Masking and unmasking
+         * are done with the ibm,int-off and ibm,int-on RTAS calls.
+         */
+        if (state & KVM_XICS_MASKED) {
+            irq->priority = 0xff;
+        } else {
+            irq->priority = irq->saved_priority;
+        }
+
+        if (state & KVM_XICS_PENDING) {
+            if (state & KVM_XICS_LEVEL_SENSITIVE) {
+                irq->status |= XICS_STATUS_ASSERTED;
+            } else {
+                /*
+                 * A pending edge-triggered interrupt (or MSI)
+                 * must have been rejected previously when we
+                 * first detected it and tried to deliver it,
+                 * so mark it as pending and previously rejected
+                 * for consistency with how xics.c works.
+                 */
+                irq->status |= XICS_STATUS_MASKED_PENDING
+                    | XICS_STATUS_REJECTED;
+            }
+        }
+    }
+}
+
+static int ics_set_kvm_state(ICSState *ics)
+{
+    XICSStateKVM *icpkvm = XICS_KVM(ics->icp);
+    uint64_t state;
+    struct kvm_device_attr attr = {
+        .flags = 0,
+        .group = KVM_DEV_XICS_GRP_SOURCES,
+        .addr = (uint64_t)(uintptr_t)&state,
+    };
+    int i;
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        ICSIRQState *irq = &ics->irqs[i];
+        int ret;
+
+        attr.attr = i + ics->offset;
+
+        state = irq->server;
+        state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK)
+            << KVM_XICS_PRIORITY_SHIFT;
+        if (irq->priority != irq->saved_priority) {
+            assert(irq->priority == 0xff);
+            state |= KVM_XICS_MASKED;
+        }
+
+        if (ics->islsi[i]) {
+            state |= KVM_XICS_LEVEL_SENSITIVE;
+            if (irq->status & XICS_STATUS_ASSERTED) {
+                state |= KVM_XICS_PENDING;
+            }
+        } else {
+            if (irq->status & XICS_STATUS_MASKED_PENDING) {
+                state |= KVM_XICS_PENDING;
+            }
+        }
+
+        ret = ioctl(icpkvm->kernel_xics_fd, KVM_SET_DEVICE_ATTR, &attr);
+        if (ret != 0) {
+            fprintf(stderr, "Unable to restore KVM interrupt controller state"
+                    " for IRQs %d: %s\n", i + ics->offset, strerror(errno));
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static void ics_kvm_set_irq(void *opaque, int srcno, int val)
+{
+    ICSState *ics = opaque;
+    struct kvm_irq_level args;
+    int rc;
+
+    args.irq = srcno + ics->offset;
+    if (!ics->islsi[srcno]) {
+        if (!val) {
+            return;
+        }
+        args.level = KVM_INTERRUPT_SET;
+    } else {
+        args.level = val ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
+    }
+    rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args);
+    if (rc < 0) {
+        perror("kvm_irq_line");
+    }
+}
+
+static void ics_kvm_reset(DeviceState *dev)
+{
+    ics_set_kvm_state(ICS(dev));
+}
+
+static int ics_kvm_realize(DeviceState *dev)
+{
+    ICSState *ics = ICS(dev);
+
+    ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState));
+    ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool));
+    ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs);
+
+    return 0;
+}
+
+static void ics_kvm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    ICSStateClass *k = ICS_CLASS(klass);
+
+    dc->init = ics_kvm_realize;
+    dc->reset = ics_kvm_reset;
+    k->pre_save = ics_get_kvm_state;
+    k->post_load = ics_set_kvm_state;
+}
+
+static TypeInfo ics_kvm_info = {
+    .name = TYPE_ICS_KVM,
+    .parent = TYPE_ICS,
+    .instance_size = sizeof(ICSState),
+    .class_init = ics_kvm_class_init,
+};
+
+/*
+ * XICS-KVM
+ */
+static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
+{
+    CPUState *cs;
+    ICPState *ss;
+    XICSStateKVM *icpkvm = (XICSStateKVM *) object_dynamic_cast(
+            OBJECT(icp), TYPE_XICS_KVM);
+    XICSStateClass *xics_info = XICS_CLASS(object_class_by_name(TYPE_XICS));
+
+    if (!icpkvm) {
+        return;
+    }
+
+    cs = CPU(cpu);
+    ss = &icp->ss[cs->cpu_index];
+
+    assert(cs->cpu_index < icp->nr_servers);
+    if (icpkvm->kernel_xics_fd == -1) {
+        abort();
+    }
+
+    if (icpkvm->kernel_xics_fd != -1) {
+        int ret;
+        struct kvm_enable_cap xics_enable_cap = {
+            .cap = KVM_CAP_IRQ_XICS,
+            .flags = 0,
+            .args = {icpkvm->kernel_xics_fd, cs->cpu_index, 0, 0},
+        };
+
+        ss->cs = cs;
+
+        ret = kvm_vcpu_ioctl(ss->cs, KVM_ENABLE_CAP, &xics_enable_cap);
+        if (ret < 0) {
+            fprintf(stderr, "Unable to connect CPU%d to kernel XICS: %s\n",
+                    cs->cpu_index, strerror(errno));
+            exit(1);
+        }
+    }
+
+    /* Call emulated XICS implementation for consistency */
+    assert(xics_info);
+    xics_info->cpu_setup(icp, cpu);
+}
+
+static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr,
+                       uint32_t token,
+                       uint32_t nargs, target_ulong args,
+                       uint32_t nret, target_ulong rets)
+{
+    fprintf(stderr, "pseries: %s must never be called for in-kernel XICS\n",
+            __func__);
+}
+
+static void xics_kvm_realize(DeviceState *dev, Error **errp)
+{
+    XICSStateKVM *icpkvm = XICS_KVM(dev);
+    XICSState *icp = XICS(dev);
+    ICSState *ics;
+    QemuOptsList *list = qemu_find_opts("machine");
+    int i, rc;
+    struct kvm_create_device xics_create_device = {
+        .type = KVM_DEV_TYPE_XICS,
+        .flags = 0,
+    };
+
+    if (!kvm_enabled()) {
+        error_setg(errp, "KVM must be enabled for in-kernel XICS");
+        goto fail;
+    }
+
+    if (QTAILQ_EMPTY(&list->head) ||
+        !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
+                           "kernel_irqchip", true) ||
+        !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
+        error_setg(errp, "KVM must be enabled for in-kernel XICS");
+        return;
+    }
+
+    icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy);
+    icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy);
+    icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy);
+    icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy);
+
+    rc = kvmppc_define_rtas_token(icpkvm->set_xive_token, "ibm,set-xive");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,set-xive");
+        goto fail;
+    }
+
+    rc = kvmppc_define_rtas_token(icpkvm->get_xive_token, "ibm,get-xive");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,get-xive");
+        goto fail;
+    }
+
+    rc = kvmppc_define_rtas_token(icpkvm->int_on_token, "ibm,int-on");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-on");
+        goto fail;
+    }
+
+    rc = kvmppc_define_rtas_token(icpkvm->int_off_token, "ibm,int-off");
+    if (rc < 0) {
+        error_setg(errp, "kvmppc_define_rtas_token: ibm,int-off");
+        goto fail;
+    }
+
+    /* Create the kernel ICP */
+    rc = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &xics_create_device);
+    if (rc < 0) {
+        error_setg_errno(errp, -rc, "Error on KVM_CREATE_DEVICE for XICS");
+        goto fail;
+    }
+
+    icpkvm->kernel_xics_fd = xics_create_device.fd;
+
+    icp->ics = ICS(object_new(TYPE_ICS_KVM));
+    ics = icp->ics;
+    object_property_add_child(OBJECT(icp), "ics", OBJECT(icp->ics), NULL);
+
+    ics->nr_irqs = icp->nr_irqs;
+    ics->offset = XICS_IRQ_BASE;
+    ics->icp = icp;
+    qdev_init_nofail(DEVICE(ics));
+
+    icp->ss = g_malloc0(icp->nr_servers*sizeof(ICPState));
+    for (i = 0; i < icp->nr_servers; i++) {
+        char buffer[32];
+        object_initialize(&icp->ss[i], TYPE_ICP_KVM);
+        snprintf(buffer, sizeof(buffer), "icp[%d]", i);
+        object_property_add_child(OBJECT(icp), buffer, OBJECT(&icp->ss[i]), NULL);
+        qdev_init_nofail(DEVICE(&icp->ss[i]));
+    }
+    return;
+
+fail:
+    kvmppc_define_rtas_token(0, "ibm,set-xive");
+    kvmppc_define_rtas_token(0, "ibm,get-xive");
+    kvmppc_define_rtas_token(0, "ibm,int-on");
+    kvmppc_define_rtas_token(0, "ibm,int-off");
+    return;
+}
+
+static void xics_kvm_class_init(ObjectClass *oc, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(oc);
+    XICSStateClass *k = XICS_CLASS(oc);
+
+    dc->realize = xics_kvm_realize;
+    k->cpu_setup = xics_kvm_cpu_setup;
+}
+
+static const TypeInfo xics_kvm_info = {
+    .name          = TYPE_XICS_KVM,
+    .parent        = TYPE_XICS,
+    .instance_size = sizeof(XICSStateKVM),
+    .class_init    = xics_kvm_class_init,
+};
+
+static void xics_kvm_register_types(void)
+{
+    type_register_static(&xics_kvm_info);
+    type_register_static(&ics_kvm_info);
+    type_register_static(&icp_kvm_info);
+}
+
+type_init(xics_kvm_register_types)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 432f0d2..84433ee 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -148,7 +148,31 @@  static XICSState *xics_system_init(int nr_servers, int nr_irqs)
 {
     XICSState *icp = NULL;
 
-    icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
+    if (kvm_enabled()) {
+        bool irqchip_allowed = true, irqchip_required = false;
+        QemuOptsList *list = qemu_find_opts("machine");
+
+        if (!QTAILQ_EMPTY(&list->head)) {
+            irqchip_allowed = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
+                                                "kernel_irqchip", true);
+            irqchip_required = qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
+                                                 "kernel_irqchip", false);
+        }
+
+        if (irqchip_allowed) {
+            icp = try_create_xics(TYPE_XICS_KVM, nr_servers, nr_irqs);
+        }
+
+        if (irqchip_required && !icp) {
+            perror("iFailed to create in-kernel XICS\n");
+            abort();
+        }
+    }
+
+    if (!icp) {
+        icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
+    }
+
     if (!icp) {
         perror("Failed to create XICS\n");
         abort();
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index 90ecaf8..835a3d6 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -32,6 +32,9 @@ 
 #define TYPE_XICS "xics"
 #define XICS(obj) OBJECT_CHECK(XICSState, (obj), TYPE_XICS)
 
+#define TYPE_XICS_KVM "xics-kvm"
+#define XICS_KVM(obj) OBJECT_CHECK(XICSStateKVM, (obj), TYPE_XICS_KVM)
+
 #define XICS_CLASS(klass) \
      OBJECT_CLASS_CHECK(XICSStateClass, (klass), TYPE_XICS)
 #define XICS_GET_CLASS(obj) \
@@ -73,6 +76,9 @@  struct XICSState {
 #define TYPE_ICP "icp"
 #define ICP(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP)
 
+#define TYPE_ICP_KVM "icp-kvm"
+#define ICP_KVM(obj) OBJECT_CHECK(ICPState, (obj), TYPE_ICP_KVM)
+
 #define ICP_CLASS(klass) \
      OBJECT_CLASS_CHECK(ICPStateClass, (klass), TYPE_ICP)
 #define ICP_GET_CLASS(obj) \
@@ -89,6 +95,7 @@  struct ICPState {
     /*< private >*/
     DeviceState parent_obj;
     /*< public >*/
+    CPUState *cs;
     uint32_t xirr;
     uint8_t pending_priority;
     uint8_t mfrr;
@@ -98,6 +105,9 @@  struct ICPState {
 #define TYPE_ICS "ics"
 #define ICS(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS)
 
+#define TYPE_ICS_KVM "icskvm"
+#define ICS_KVM(obj) OBJECT_CHECK(ICSState, (obj), TYPE_ICS_KVM)
+
 #define ICS_CLASS(klass) \
      OBJECT_CLASS_CHECK(ICSStateClass, (klass), TYPE_ICS)
 #define ICS_GET_CLASS(obj) \