diff mbox series

[v2,12/13] spapr: add KVM support to the 'dual' machine

Message ID 20190222131322.26079-13-clg@kaod.org
State New
Headers show
Series spapr: add KVM support to the XIVE interrupt mode | expand

Commit Message

Cédric Le Goater Feb. 22, 2019, 1:13 p.m. UTC
The interrupt mode is chosen by the CAS negotiation process and
activated after a reset to take into account the required changes in
the machine. This brings new constraints on how the associated KVM IRQ
device is initialized.

Currently, each model takes care of the initialization of the KVM
device in their realize method but this is not possible anymore as the
initialization needs to be done globaly when the interrupt mode is
known, i.e. when machine is reseted. It also means that we need a way
to delete a KVM device when another mode is chosen.

Also, to support migration, the QEMU objects holding the state to
transfer should always be available but not necessarily activated.

The overall approach of this proposal is to initialize both interrupt
mode at the QEMU level and keep the IRQ number space in sync to allow
switching from one mode to another. For the KVM side of things, the
whole initialization of the KVM device, sources and presenters, is
grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
handlers are modified accordingly to handle the init and the delete
sequences of the KVM device.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 include/hw/ppc/spapr_xive.h |  1 +
 hw/intc/spapr_xive.c        | 19 +++++++-
 hw/intc/spapr_xive_kvm.c    | 27 +++++++++++
 hw/intc/xics_kvm.c          | 26 ++++++++++
 hw/intc/xive.c              |  4 --
 hw/ppc/spapr_irq.c          | 97 ++++++++++++++++++++++++++++---------
 6 files changed, 145 insertions(+), 29 deletions(-)

Comments

David Gibson Feb. 28, 2019, 5:15 a.m. UTC | #1
On Fri, Feb 22, 2019 at 02:13:21PM +0100, Cédric Le Goater wrote:
> The interrupt mode is chosen by the CAS negotiation process and
> activated after a reset to take into account the required changes in
> the machine. This brings new constraints on how the associated KVM IRQ
> device is initialized.
> 
> Currently, each model takes care of the initialization of the KVM
> device in their realize method but this is not possible anymore as the
> initialization needs to be done globaly when the interrupt mode is
> known, i.e. when machine is reseted. It also means that we need a way
> to delete a KVM device when another mode is chosen.
> 
> Also, to support migration, the QEMU objects holding the state to
> transfer should always be available but not necessarily activated.
> 
> The overall approach of this proposal is to initialize both interrupt
> mode at the QEMU level and keep the IRQ number space in sync to allow
> switching from one mode to another. For the KVM side of things, the
> whole initialization of the KVM device, sources and presenters, is
> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> handlers are modified accordingly to handle the init and the delete
> sequences of the KVM device.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  include/hw/ppc/spapr_xive.h |  1 +
>  hw/intc/spapr_xive.c        | 19 +++++++-
>  hw/intc/spapr_xive_kvm.c    | 27 +++++++++++
>  hw/intc/xics_kvm.c          | 26 ++++++++++
>  hw/intc/xive.c              |  4 --
>  hw/ppc/spapr_irq.c          | 97 ++++++++++++++++++++++++++++---------
>  6 files changed, 145 insertions(+), 29 deletions(-)
> 
> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> index a7c4c275a747..a1593ac2fcf0 100644
> --- a/include/hw/ppc/spapr_xive.h
> +++ b/include/hw/ppc/spapr_xive.h
> @@ -66,6 +66,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
>  
>  int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
>                               uint32_t *out_server, uint8_t *out_prio);
> +void spapr_xive_late_realize(sPAPRXive *xive, Error **errp);
>  
>  /*
>   * KVM XIVE device helpers
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 21fe5e1aa39f..b0cbc2fe21ee 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -278,7 +278,6 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>      XiveSource *xsrc = &xive->source;
>      XiveENDSource *end_xsrc = &xive->end_source;
>      Error *local_err = NULL;
> -    MachineState *machine = MACHINE(qdev_get_machine());
>  
>      if (!xive->nr_irqs) {
>          error_setg(errp, "Number of interrupt needs to be greater 0");
> @@ -329,6 +328,15 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>                             xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT));
>  
>      qemu_register_reset(spapr_xive_reset, dev);
> +}
> +
> +void spapr_xive_late_realize(sPAPRXive *xive, Error **errp)
> +{
> +    Error *local_err = NULL;
> +    MachineState *machine = MACHINE(qdev_get_machine());
> +    XiveSource *xsrc = &xive->source;
> +    XiveENDSource *end_xsrc = &xive->end_source;
> +    static bool once;
>  
>      if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>          kvmppc_xive_connect(xive, &local_err);
> @@ -351,6 +359,15 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>          error_report_err(local_err);
>      }
>  
> +    /*
> +     * TODO: Emulated mode can only be initialized once. Should we
> +     * store the information under the device model for later usage ?
> +     */
> +    if (once) {
> +        return;
> +    }
> +    once = true;

Urgh.  static locals are a bad smell.  I think at least this flag
should go into the instance structure (if we can't deduce it from
something else in there).

> +
>      /* TIMA initialization */
>      memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>                            "xive.tima", 4ull << TM_SHIFT);
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index cd81cdb23a5e..99a829fb3f60 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -657,6 +657,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      Error *local_err = NULL;
>      size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
>      size_t tima_len = 4ull << TM_SHIFT;
> +    CPUState *cs;
> +
> +    /*
> +     * The KVM XIVE device already in use. This is the case when
> +     * rebooting XIVE -> XIVE

I take it from this we're leaving the XIVE KVM device initialized if
we don't change to XICS during CAS.  Would it make things simpler if
we always removed both the XICS and XIVE KVM devices at reset and
recreated the one we need at CAS?

> +     */
> +    if (xive->fd != -1) {
> +        return;
> +    }
>  
>      if (!kvmppc_has_cap_xive()) {
>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> @@ -705,6 +714,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      xive->change = qemu_add_vm_change_state_handler(
>          kvmppc_xive_change_state_handler, xive);
>  
> +    /* Connect the presenters to the initial VCPUs of the machine */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        kvmppc_xive_cpu_connect(spapr_cpu_state(cpu)->tctx, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +
> +    /* Update the KVM sources */
> +    kvmppc_xive_source_reset(xsrc, &local_err);
> +    if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +    }
> +
>      kvm_kernel_irqchip = true;
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index 9855316e4831..8ffd4c7a36f8 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -33,6 +33,7 @@
>  #include "trace.h"
>  #include "sysemu/kvm.h"
>  #include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_cpu_core.h"
>  #include "hw/ppc/xics.h"
>  #include "hw/ppc/xics_spapr.h"
>  #include "kvm_ppc.h"
> @@ -337,6 +338,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>  {
>      int rc;
> +    CPUState *cs;
> +    Error *local_err = NULL;
> +
> +    /*
> +     * The KVM XICS device already in use. This is the case when
> +     * rebooting XICS -> XICS
> +     */
> +    if (kernel_xics_fd != -1) {
> +        return 0;
> +    }
>  
>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>          error_setg(errp,
> @@ -385,6 +396,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
>  
> +    /* Connect the presenters to the initial VCPUs of the machine */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        icp_kvm_realize(DEVICE(spapr_cpu_state(cpu)->icp), &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            goto fail;
> +        }
> +        icp_set_kvm_state(spapr_cpu_state(cpu)->icp);
> +    }
> +
> +    /* Update the KVM sources */
> +    ics_set_kvm_state(spapr->ics);
> +
>      return 0;
>  
>  fail:
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index 1f8e923ca654..715d5a7e65ed 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -929,10 +929,6 @@ static void xive_source_reset(void *dev)
>  
>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> -
> -    if (kvm_irqchip_in_kernel()) {
> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> -    }
>  }
>  
>  static void xive_source_realize(DeviceState *dev, Error **errp)
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index 3176098b9f7c..f8260c14aecd 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -92,35 +92,55 @@ error:
>      return NULL;
>  }
>  
> -static void spapr_irq_init_xics(sPAPRMachineState *spapr, int nr_irqs,
> -                                Error **errp)
> +static void spapr_ics_late_realize(sPAPRMachineState *spapr, Error **errp)
>  {
>      MachineState *machine = MACHINE(spapr);
>      Error *local_err = NULL;
> -    bool xics_kvm = false;
> +    static bool once;
>  
> -    if (kvm_enabled()) {
> -        if (machine_kernel_irqchip_allowed(machine) &&
> -            !xics_kvm_init(spapr, &local_err)) {
> -            xics_kvm = true;
> -        }
> -        if (machine_kernel_irqchip_required(machine) && !xics_kvm) {
> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        xics_kvm_init(spapr, &local_err);
> +        if (local_err && machine_kernel_irqchip_required(machine)) {
>              error_prepend(&local_err,
>                            "kernel_irqchip requested but unavailable: ");
> -            goto error;
> +            error_propagate(errp, local_err);
> +            return;
>          }
> -        error_free(local_err);
> -        local_err = NULL;
> +
> +        if (!local_err) {
> +            return;
> +        }
> +
> +        /*
> +         * We failed to initialize the XIVE KVM device, fallback to
> +         * emulated mode
> +         */
> +        error_prepend(&local_err, "kernel_irqchip allowed but unavailable: ");
> +        error_report_err(local_err);

Should only warn (at most) in this case, since fallback is permitted
and should work.

>      }
>  
> -    if (!xics_kvm) {
> -        xics_spapr_init(spapr);
> +    /*
> +     * TODO: Emulated mode can only be initialized once. Should we
> +     * store the information under the device model for later usage ?
> +     */
> +    if (once) {
> +        return;
>      }
> +    once = true;
>  
> -    spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err);
> +    xics_spapr_init(spapr);
> +}
>  
> -error:
> -    error_propagate(errp, local_err);
> +static void spapr_irq_init_xics(sPAPRMachineState *spapr, int nr_irqs,
> +                                Error **errp)
> +{
> +    Error *local_err = NULL;
> +
> +    spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
>  }
>  
>  #define ICS_IRQ_FREE(ics, srcno)   \
> @@ -227,7 +247,13 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>  
>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>  {
> -    /* TODO: create the KVM XICS device */
> +    Error *local_err = NULL;
> +
> +    spapr_ics_late_realize(spapr, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
>  }
>  
>  static const char *spapr_irq_get_nodename_xics(sPAPRMachineState *spapr)
> @@ -386,6 +412,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>  {
>      CPUState *cs;
> +    Error *local_err = NULL;
>  
>      CPU_FOREACH(cs) {
>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> @@ -394,6 +421,12 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>          spapr_xive_set_tctx_os_cam(spapr_cpu_state(cpu)->tctx);
>      }
>  
> +    spapr_xive_late_realize(spapr->xive, &local_err);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        return;
> +    }
> +
>      /* Activate the XIVE MMIOs */
>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>  }
> @@ -462,14 +495,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, int nr_irqs,
>                                  Error **errp)
>  {
> -    MachineState *machine = MACHINE(spapr);
>      Error *local_err = NULL;
>  
> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> -        error_setg(errp, "No KVM support for the 'dual' machine");
> -        return;
> -    }
> -
>      spapr_irq_xics.init(spapr, spapr_irq_xics.nr_irqs, &local_err);
>      if (local_err) {
>          error_propagate(errp, local_err);
> @@ -548,6 +575,9 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>       * defaults to XICS at startup.
>       */
>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        if (kvm_irqchip_in_kernel()) {
> +            xics_kvm_disconnect(spapr, &error_fatal);

Yeah, this is kinda nasty.  I'm wondering if we could make things
simpler by always using emulated irqchip until CAS and only setting up
kernel irqchip then.

> +        }
>          spapr_irq_xive.reset(spapr, &error_fatal);
>      }
>  
> @@ -556,12 +586,30 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>  
>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>  {
> +    Error *local_err = NULL;
> +
>      /*
>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>       * if selected.
>       */
>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>  
> +    /* Destroy all KVM devices */
> +    if (kvm_irqchip_in_kernel()) {
> +        xics_kvm_disconnect(spapr, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> +            return;
> +        }
> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> +            return;
> +        }
> +    }
> +
>      spapr_irq_current(spapr)->reset(spapr, errp);
>  }
>  
> @@ -748,6 +796,7 @@ sPAPRIrq spapr_irq_xics_legacy = {
>      .dt_populate = spapr_dt_xics,
>      .cpu_intc_create = spapr_irq_cpu_intc_create_xics,
>      .post_load   = spapr_irq_post_load_xics,
> +    .reset       = spapr_irq_reset_xics,
>      .set_irq     = spapr_irq_set_irq_xics,
>      .get_nodename = spapr_irq_get_nodename_xics,
>  };
Cédric Le Goater March 12, 2019, 9:40 a.m. UTC | #2
On 2/28/19 6:15 AM, David Gibson wrote:
> On Fri, Feb 22, 2019 at 02:13:21PM +0100, Cédric Le Goater wrote:
>> The interrupt mode is chosen by the CAS negotiation process and
>> activated after a reset to take into account the required changes in
>> the machine. This brings new constraints on how the associated KVM IRQ
>> device is initialized.
>>
>> Currently, each model takes care of the initialization of the KVM
>> device in their realize method but this is not possible anymore as the
>> initialization needs to be done globaly when the interrupt mode is
>> known, i.e. when machine is reseted. It also means that we need a way
>> to delete a KVM device when another mode is chosen.
>>
>> Also, to support migration, the QEMU objects holding the state to
>> transfer should always be available but not necessarily activated.
>>
>> The overall approach of this proposal is to initialize both interrupt
>> mode at the QEMU level and keep the IRQ number space in sync to allow
>> switching from one mode to another. For the KVM side of things, the
>> whole initialization of the KVM device, sources and presenters, is
>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>> handlers are modified accordingly to handle the init and the delete
>> sequences of the KVM device.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  include/hw/ppc/spapr_xive.h |  1 +
>>  hw/intc/spapr_xive.c        | 19 +++++++-
>>  hw/intc/spapr_xive_kvm.c    | 27 +++++++++++
>>  hw/intc/xics_kvm.c          | 26 ++++++++++
>>  hw/intc/xive.c              |  4 --
>>  hw/ppc/spapr_irq.c          | 97 ++++++++++++++++++++++++++++---------
>>  6 files changed, 145 insertions(+), 29 deletions(-)
>>
>> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
>> index a7c4c275a747..a1593ac2fcf0 100644
>> --- a/include/hw/ppc/spapr_xive.h
>> +++ b/include/hw/ppc/spapr_xive.h
>> @@ -66,6 +66,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
>>  
>>  int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
>>                               uint32_t *out_server, uint8_t *out_prio);
>> +void spapr_xive_late_realize(sPAPRXive *xive, Error **errp);
>>  
>>  /*
>>   * KVM XIVE device helpers
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 21fe5e1aa39f..b0cbc2fe21ee 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -278,7 +278,6 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>      XiveSource *xsrc = &xive->source;
>>      XiveENDSource *end_xsrc = &xive->end_source;
>>      Error *local_err = NULL;
>> -    MachineState *machine = MACHINE(qdev_get_machine());
>>  
>>      if (!xive->nr_irqs) {
>>          error_setg(errp, "Number of interrupt needs to be greater 0");
>> @@ -329,6 +328,15 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>                             xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT));
>>  
>>      qemu_register_reset(spapr_xive_reset, dev);
>> +}
>> +
>> +void spapr_xive_late_realize(sPAPRXive *xive, Error **errp)
>> +{
>> +    Error *local_err = NULL;
>> +    MachineState *machine = MACHINE(qdev_get_machine());
>> +    XiveSource *xsrc = &xive->source;
>> +    XiveENDSource *end_xsrc = &xive->end_source;
>> +    static bool once;
>>  
>>      if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>          kvmppc_xive_connect(xive, &local_err);
>> @@ -351,6 +359,15 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>          error_report_err(local_err);
>>      }
>>  
>> +    /*
>> +     * TODO: Emulated mode can only be initialized once. Should we
>> +     * store the information under the device model for later usage ?
>> +     */
>> +    if (once) {
>> +        return;
>> +    }
>> +    once = true;
> 
> Urgh.  static locals are a bad smell.  I think at least this flag
> should go into the instance structure (if we can't deduce it from
> something else in there).

Yes. I have the same feeling. Maybe we can deduce it from some 
object state. I will check. 

> 
>> +
>>      /* TIMA initialization */
>>      memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>                            "xive.tima", 4ull << TM_SHIFT);
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index cd81cdb23a5e..99a829fb3f60 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -657,6 +657,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>      Error *local_err = NULL;
>>      size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
>>      size_t tima_len = 4ull << TM_SHIFT;
>> +    CPUState *cs;
>> +
>> +    /*
>> +     * The KVM XIVE device already in use. This is the case when
>> +     * rebooting XIVE -> XIVE
> 
> I take it from this we're leaving the XIVE KVM device initialized if
> we don't change to XICS during CAS. 

No. This is for a XIVE only machine rebooting, which uses the same code. 

> Would it make things simpler if
> we always removed both the XICS and XIVE KVM devices at reset and
> recreated the one we need at CAS?

On a 'dual' mode machine, we remove both XICS and XIVE KVM devices at
reset and (re)create the XICS KVM device by default because the OV5 is 
reseted to its default. From there, CAS negotiates a new interrupt mode,
or nor.

It would be good to keep the XIVE mode activated once selected by CAS
because it is unlikely to be changed afterwards. I haven't worked on 
that optimization yet.

> 
>> +     */
>> +    if (xive->fd != -1) {
>> +        return;
>> +    }
>>  
>>      if (!kvmppc_has_cap_xive()) {
>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>> @@ -705,6 +714,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>      xive->change = qemu_add_vm_change_state_handler(
>>          kvmppc_xive_change_state_handler, xive);
>>  
>> +    /* Connect the presenters to the initial VCPUs of the machine */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        kvmppc_xive_cpu_connect(spapr_cpu_state(cpu)->tctx, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* Update the KVM sources */
>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>> +    if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +    }
>> +
>>      kvm_kernel_irqchip = true;
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>> index 9855316e4831..8ffd4c7a36f8 100644
>> --- a/hw/intc/xics_kvm.c
>> +++ b/hw/intc/xics_kvm.c
>> @@ -33,6 +33,7 @@
>>  #include "trace.h"
>>  #include "sysemu/kvm.h"
>>  #include "hw/ppc/spapr.h"
>> +#include "hw/ppc/spapr_cpu_core.h"
>>  #include "hw/ppc/xics.h"
>>  #include "hw/ppc/xics_spapr.h"
>>  #include "kvm_ppc.h"
>> @@ -337,6 +338,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      int rc;
>> +    CPUState *cs;
>> +    Error *local_err = NULL;
>> +
>> +    /*
>> +     * The KVM XICS device already in use. This is the case when
>> +     * rebooting XICS -> XICS
>> +     */
>> +    if (kernel_xics_fd != -1) {
>> +        return 0;
>> +    }
>>  
>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>          error_setg(errp,
>> @@ -385,6 +396,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>>  
>> +    /* Connect the presenters to the initial VCPUs of the machine */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        icp_kvm_realize(DEVICE(spapr_cpu_state(cpu)->icp), &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            goto fail;
>> +        }
>> +        icp_set_kvm_state(spapr_cpu_state(cpu)->icp);
>> +    }
>> +
>> +    /* Update the KVM sources */
>> +    ics_set_kvm_state(spapr->ics);
>> +
>>      return 0;
>>  
>>  fail:
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 1f8e923ca654..715d5a7e65ed 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -929,10 +929,6 @@ static void xive_source_reset(void *dev)
>>  
>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>> -
>> -    if (kvm_irqchip_in_kernel()) {
>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>> -    }
>>  }
>>  
>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index 3176098b9f7c..f8260c14aecd 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -92,35 +92,55 @@ error:
>>      return NULL;
>>  }
>>  
>> -static void spapr_irq_init_xics(sPAPRMachineState *spapr, int nr_irqs,
>> -                                Error **errp)
>> +static void spapr_ics_late_realize(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      MachineState *machine = MACHINE(spapr);
>>      Error *local_err = NULL;
>> -    bool xics_kvm = false;
>> +    static bool once;
>>  
>> -    if (kvm_enabled()) {
>> -        if (machine_kernel_irqchip_allowed(machine) &&
>> -            !xics_kvm_init(spapr, &local_err)) {
>> -            xics_kvm = true;
>> -        }
>> -        if (machine_kernel_irqchip_required(machine) && !xics_kvm) {
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        xics_kvm_init(spapr, &local_err);
>> +        if (local_err && machine_kernel_irqchip_required(machine)) {
>>              error_prepend(&local_err,
>>                            "kernel_irqchip requested but unavailable: ");
>> -            goto error;
>> +            error_propagate(errp, local_err);
>> +            return;
>>          }
>> -        error_free(local_err);
>> -        local_err = NULL;
>> +
>> +        if (!local_err) {
>> +            return;
>> +        }
>> +
>> +        /*
>> +         * We failed to initialize the XIVE KVM device, fallback to
>> +         * emulated mode
>> +         */
>> +        error_prepend(&local_err, "kernel_irqchip allowed but unavailable: ");
>> +        error_report_err(local_err);
> 
> Should only warn (at most) in this case, since fallback is permitted
> and should work.

yes.

> 
>>      }
>>  
>> -    if (!xics_kvm) {
>> -        xics_spapr_init(spapr);
>> +    /*
>> +     * TODO: Emulated mode can only be initialized once. Should we
>> +     * store the information under the device model for later usage ?
>> +     */
>> +    if (once) {
>> +        return;
>>      }
>> +    once = true;
>>  
>> -    spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err);
>> +    xics_spapr_init(spapr);
>> +}
>>  
>> -error:
>> -    error_propagate(errp, local_err);
>> +static void spapr_irq_init_xics(sPAPRMachineState *spapr, int nr_irqs,
>> +                                Error **errp)
>> +{
>> +    Error *local_err = NULL;
>> +
>> +    spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>>  }
>>  
>>  #define ICS_IRQ_FREE(ics, srcno)   \
>> @@ -227,7 +247,13 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>>  
>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>>  {
>> -    /* TODO: create the KVM XICS device */
>> +    Error *local_err = NULL;
>> +
>> +    spapr_ics_late_realize(spapr, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>>  }
>>  
>>  static const char *spapr_irq_get_nodename_xics(sPAPRMachineState *spapr)
>> @@ -386,6 +412,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      CPUState *cs;
>> +    Error *local_err = NULL;
>>  
>>      CPU_FOREACH(cs) {
>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>> @@ -394,6 +421,12 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>          spapr_xive_set_tctx_os_cam(spapr_cpu_state(cpu)->tctx);
>>      }
>>  
>> +    spapr_xive_late_realize(spapr->xive, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        return;
>> +    }
>> +
>>      /* Activate the XIVE MMIOs */
>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>>  }
>> @@ -462,14 +495,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, int nr_irqs,
>>                                  Error **errp)
>>  {
>> -    MachineState *machine = MACHINE(spapr);
>>      Error *local_err = NULL;
>>  
>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> -        error_setg(errp, "No KVM support for the 'dual' machine");
>> -        return;
>> -    }
>> -
>>      spapr_irq_xics.init(spapr, spapr_irq_xics.nr_irqs, &local_err);
>>      if (local_err) {
>>          error_propagate(errp, local_err);
>> @@ -548,6 +575,9 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>       * defaults to XICS at startup.
>>       */
>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        if (kvm_irqchip_in_kernel()) {
>> +            xics_kvm_disconnect(spapr, &error_fatal);
> 
> Yeah, this is kinda nasty.  I'm wondering if we could make things
> simpler by always using emulated irqchip until CAS and only setting up
> kernel irqchip then.

ah. good idea. We could be running with an emulated device before CAS
and switch to the KVM device once a mode has been selected. I even think 
that an emulated XICS device and a KVM XIVE device can cohabite.

I need to look closer at this idea. We might have to add some 'bool' to
know at which stage the machine is running : before or after CAS.

Thanks,

C.
 
>> +        }
>>          spapr_irq_xive.reset(spapr, &error_fatal);
>>      }
>>  
>> @@ -556,12 +586,30 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>  
>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>>  {
>> +    Error *local_err = NULL;
>> +
>>      /*
>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>>       * if selected.
>>       */
>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>>  
>> +    /* Destroy all KVM devices */
>> +    if (kvm_irqchip_in_kernel()) {
>> +        xics_kvm_disconnect(spapr, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
>> +            return;
>> +        }
>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
>> +            return;
>> +        }
>> +    }
>> +
>>      spapr_irq_current(spapr)->reset(spapr, errp);
>>  }
>>  
>> @@ -748,6 +796,7 @@ sPAPRIrq spapr_irq_xics_legacy = {
>>      .dt_populate = spapr_dt_xics,
>>      .cpu_intc_create = spapr_irq_cpu_intc_create_xics,
>>      .post_load   = spapr_irq_post_load_xics,
>> +    .reset       = spapr_irq_reset_xics,
>>      .set_irq     = spapr_irq_set_irq_xics,
>>      .get_nodename = spapr_irq_get_nodename_xics,
>>  };
>
David Gibson March 13, 2019, 4:18 a.m. UTC | #3
On Tue, Mar 12, 2019 at 10:40:19AM +0100, Cédric Le Goater wrote:
> On 2/28/19 6:15 AM, David Gibson wrote:
> > On Fri, Feb 22, 2019 at 02:13:21PM +0100, Cédric Le Goater wrote:
> >> The interrupt mode is chosen by the CAS negotiation process and
> >> activated after a reset to take into account the required changes in
> >> the machine. This brings new constraints on how the associated KVM IRQ
> >> device is initialized.
> >>
> >> Currently, each model takes care of the initialization of the KVM
> >> device in their realize method but this is not possible anymore as the
> >> initialization needs to be done globaly when the interrupt mode is
> >> known, i.e. when machine is reseted. It also means that we need a way
> >> to delete a KVM device when another mode is chosen.
> >>
> >> Also, to support migration, the QEMU objects holding the state to
> >> transfer should always be available but not necessarily activated.
> >>
> >> The overall approach of this proposal is to initialize both interrupt
> >> mode at the QEMU level and keep the IRQ number space in sync to allow
> >> switching from one mode to another. For the KVM side of things, the
> >> whole initialization of the KVM device, sources and presenters, is
> >> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >> handlers are modified accordingly to handle the init and the delete
> >> sequences of the KVM device.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  include/hw/ppc/spapr_xive.h |  1 +
> >>  hw/intc/spapr_xive.c        | 19 +++++++-
> >>  hw/intc/spapr_xive_kvm.c    | 27 +++++++++++
> >>  hw/intc/xics_kvm.c          | 26 ++++++++++
> >>  hw/intc/xive.c              |  4 --
> >>  hw/ppc/spapr_irq.c          | 97 ++++++++++++++++++++++++++++---------
> >>  6 files changed, 145 insertions(+), 29 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
> >> index a7c4c275a747..a1593ac2fcf0 100644
> >> --- a/include/hw/ppc/spapr_xive.h
> >> +++ b/include/hw/ppc/spapr_xive.h
> >> @@ -66,6 +66,7 @@ void spapr_xive_map_mmio(sPAPRXive *xive);
> >>  
> >>  int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
> >>                               uint32_t *out_server, uint8_t *out_prio);
> >> +void spapr_xive_late_realize(sPAPRXive *xive, Error **errp);
> >>  
> >>  /*
> >>   * KVM XIVE device helpers
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index 21fe5e1aa39f..b0cbc2fe21ee 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -278,7 +278,6 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>      XiveSource *xsrc = &xive->source;
> >>      XiveENDSource *end_xsrc = &xive->end_source;
> >>      Error *local_err = NULL;
> >> -    MachineState *machine = MACHINE(qdev_get_machine());
> >>  
> >>      if (!xive->nr_irqs) {
> >>          error_setg(errp, "Number of interrupt needs to be greater 0");
> >> @@ -329,6 +328,15 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>                             xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT));
> >>  
> >>      qemu_register_reset(spapr_xive_reset, dev);
> >> +}
> >> +
> >> +void spapr_xive_late_realize(sPAPRXive *xive, Error **errp)
> >> +{
> >> +    Error *local_err = NULL;
> >> +    MachineState *machine = MACHINE(qdev_get_machine());
> >> +    XiveSource *xsrc = &xive->source;
> >> +    XiveENDSource *end_xsrc = &xive->end_source;
> >> +    static bool once;
> >>  
> >>      if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>          kvmppc_xive_connect(xive, &local_err);
> >> @@ -351,6 +359,15 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>          error_report_err(local_err);
> >>      }
> >>  
> >> +    /*
> >> +     * TODO: Emulated mode can only be initialized once. Should we
> >> +     * store the information under the device model for later usage ?
> >> +     */
> >> +    if (once) {
> >> +        return;
> >> +    }
> >> +    once = true;
> > 
> > Urgh.  static locals are a bad smell.  I think at least this flag
> > should go into the instance structure (if we can't deduce it from
> > something else in there).
> 
> Yes. I have the same feeling. Maybe we can deduce it from some 
> object state. I will check.

Ok.

> >> +
> >>      /* TIMA initialization */
> >>      memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>                            "xive.tima", 4ull << TM_SHIFT);
> >> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >> index cd81cdb23a5e..99a829fb3f60 100644
> >> --- a/hw/intc/spapr_xive_kvm.c
> >> +++ b/hw/intc/spapr_xive_kvm.c
> >> @@ -657,6 +657,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>      Error *local_err = NULL;
> >>      size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
> >>      size_t tima_len = 4ull << TM_SHIFT;
> >> +    CPUState *cs;
> >> +
> >> +    /*
> >> +     * The KVM XIVE device already in use. This is the case when
> >> +     * rebooting XIVE -> XIVE
> > 
> > I take it from this we're leaving the XIVE KVM device initialized if
> > we don't change to XICS during CAS. 
> 
> No. This is for a XIVE only machine rebooting, which uses the same code. 

Ok.

> > Would it make things simpler if
> > we always removed both the XICS and XIVE KVM devices at reset and
> > recreated the one we need at CAS?
> 
> On a 'dual' mode machine, we remove both XICS and XIVE KVM devices at
> reset and (re)create the XICS KVM device by default because the OV5 is 
> reseted to its default. From there, CAS negotiates a new interrupt mode,
> or nor.
> 
> It would be good to keep the XIVE mode activated once selected by CAS
> because it is unlikely to be changed afterwards. I haven't worked on 
> that optimization yet.

Yes, handling this as a later optimization is just fine by me.

I'm even fine destroying and recreating the PIC in non-dual mode if it
makes the overall code simpler.

> >> @@ -462,14 +495,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
> >>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, int nr_irqs,
> >>                                  Error **errp)
> >>  {
> >> -    MachineState *machine = MACHINE(spapr);
> >>      Error *local_err = NULL;
> >>  
> >> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> -        error_setg(errp, "No KVM support for the 'dual' machine");
> >> -        return;
> >> -    }
> >> -
> >>      spapr_irq_xics.init(spapr, spapr_irq_xics.nr_irqs, &local_err);
> >>      if (local_err) {
> >>          error_propagate(errp, local_err);
> >> @@ -548,6 +575,9 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>       * defaults to XICS at startup.
> >>       */
> >>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        if (kvm_irqchip_in_kernel()) {
> >> +            xics_kvm_disconnect(spapr, &error_fatal);
> > 
> > Yeah, this is kinda nasty.  I'm wondering if we could make things
> > simpler by always using emulated irqchip until CAS and only setting up
> > kernel irqchip then.
> 
> ah. good idea. We could be running with an emulated device before CAS
> and switch to the KVM device once a mode has been selected. I even think 
> that an emulated XICS device and a KVM XIVE device can cohabite.

> I need to look closer at this idea. We might have to add some 'bool' to
> know at which stage the machine is running : before or after CAS.

I thought we already had some places testing for that, but now I can't
find them :/.  Possibly we could have ov5_case just be NULL from reset
until CAS.
diff mbox series

Patch

diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h
index a7c4c275a747..a1593ac2fcf0 100644
--- a/include/hw/ppc/spapr_xive.h
+++ b/include/hw/ppc/spapr_xive.h
@@ -66,6 +66,7 @@  void spapr_xive_map_mmio(sPAPRXive *xive);
 
 int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx,
                              uint32_t *out_server, uint8_t *out_prio);
+void spapr_xive_late_realize(sPAPRXive *xive, Error **errp);
 
 /*
  * KVM XIVE device helpers
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 21fe5e1aa39f..b0cbc2fe21ee 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -278,7 +278,6 @@  static void spapr_xive_realize(DeviceState *dev, Error **errp)
     XiveSource *xsrc = &xive->source;
     XiveENDSource *end_xsrc = &xive->end_source;
     Error *local_err = NULL;
-    MachineState *machine = MACHINE(qdev_get_machine());
 
     if (!xive->nr_irqs) {
         error_setg(errp, "Number of interrupt needs to be greater 0");
@@ -329,6 +328,15 @@  static void spapr_xive_realize(DeviceState *dev, Error **errp)
                            xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT));
 
     qemu_register_reset(spapr_xive_reset, dev);
+}
+
+void spapr_xive_late_realize(sPAPRXive *xive, Error **errp)
+{
+    Error *local_err = NULL;
+    MachineState *machine = MACHINE(qdev_get_machine());
+    XiveSource *xsrc = &xive->source;
+    XiveENDSource *end_xsrc = &xive->end_source;
+    static bool once;
 
     if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
         kvmppc_xive_connect(xive, &local_err);
@@ -351,6 +359,15 @@  static void spapr_xive_realize(DeviceState *dev, Error **errp)
         error_report_err(local_err);
     }
 
+    /*
+     * TODO: Emulated mode can only be initialized once. Should we
+     * store the information under the device model for later usage ?
+     */
+    if (once) {
+        return;
+    }
+    once = true;
+
     /* TIMA initialization */
     memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
                           "xive.tima", 4ull << TM_SHIFT);
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index cd81cdb23a5e..99a829fb3f60 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -657,6 +657,15 @@  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     Error *local_err = NULL;
     size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs;
     size_t tima_len = 4ull << TM_SHIFT;
+    CPUState *cs;
+
+    /*
+     * The KVM XIVE device already in use. This is the case when
+     * rebooting XIVE -> XIVE
+     */
+    if (xive->fd != -1) {
+        return;
+    }
 
     if (!kvmppc_has_cap_xive()) {
         error_setg(errp, "IRQ_XIVE capability must be present for KVM");
@@ -705,6 +714,24 @@  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     xive->change = qemu_add_vm_change_state_handler(
         kvmppc_xive_change_state_handler, xive);
 
+    /* Connect the presenters to the initial VCPUs of the machine */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        kvmppc_xive_cpu_connect(spapr_cpu_state(cpu)->tctx, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    /* Update the KVM sources */
+    kvmppc_xive_source_reset(xsrc, &local_err);
+    if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+    }
+
     kvm_kernel_irqchip = true;
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index 9855316e4831..8ffd4c7a36f8 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -33,6 +33,7 @@ 
 #include "trace.h"
 #include "sysemu/kvm.h"
 #include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_cpu_core.h"
 #include "hw/ppc/xics.h"
 #include "hw/ppc/xics_spapr.h"
 #include "kvm_ppc.h"
@@ -337,6 +338,16 @@  static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
 {
     int rc;
+    CPUState *cs;
+    Error *local_err = NULL;
+
+    /*
+     * The KVM XICS device already in use. This is the case when
+     * rebooting XICS -> XICS
+     */
+    if (kernel_xics_fd != -1) {
+        return 0;
+    }
 
     if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
         error_setg(errp,
@@ -385,6 +396,21 @@  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
 
+    /* Connect the presenters to the initial VCPUs of the machine */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        icp_kvm_realize(DEVICE(spapr_cpu_state(cpu)->icp), &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            goto fail;
+        }
+        icp_set_kvm_state(spapr_cpu_state(cpu)->icp);
+    }
+
+    /* Update the KVM sources */
+    ics_set_kvm_state(spapr->ics);
+
     return 0;
 
 fail:
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index 1f8e923ca654..715d5a7e65ed 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -929,10 +929,6 @@  static void xive_source_reset(void *dev)
 
     /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
     memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
-
-    if (kvm_irqchip_in_kernel()) {
-        kvmppc_xive_source_reset(xsrc, &error_fatal);
-    }
 }
 
 static void xive_source_realize(DeviceState *dev, Error **errp)
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index 3176098b9f7c..f8260c14aecd 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -92,35 +92,55 @@  error:
     return NULL;
 }
 
-static void spapr_irq_init_xics(sPAPRMachineState *spapr, int nr_irqs,
-                                Error **errp)
+static void spapr_ics_late_realize(sPAPRMachineState *spapr, Error **errp)
 {
     MachineState *machine = MACHINE(spapr);
     Error *local_err = NULL;
-    bool xics_kvm = false;
+    static bool once;
 
-    if (kvm_enabled()) {
-        if (machine_kernel_irqchip_allowed(machine) &&
-            !xics_kvm_init(spapr, &local_err)) {
-            xics_kvm = true;
-        }
-        if (machine_kernel_irqchip_required(machine) && !xics_kvm) {
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        xics_kvm_init(spapr, &local_err);
+        if (local_err && machine_kernel_irqchip_required(machine)) {
             error_prepend(&local_err,
                           "kernel_irqchip requested but unavailable: ");
-            goto error;
+            error_propagate(errp, local_err);
+            return;
         }
-        error_free(local_err);
-        local_err = NULL;
+
+        if (!local_err) {
+            return;
+        }
+
+        /*
+         * We failed to initialize the XIVE KVM device, fallback to
+         * emulated mode
+         */
+        error_prepend(&local_err, "kernel_irqchip allowed but unavailable: ");
+        error_report_err(local_err);
     }
 
-    if (!xics_kvm) {
-        xics_spapr_init(spapr);
+    /*
+     * TODO: Emulated mode can only be initialized once. Should we
+     * store the information under the device model for later usage ?
+     */
+    if (once) {
+        return;
     }
+    once = true;
 
-    spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err);
+    xics_spapr_init(spapr);
+}
 
-error:
-    error_propagate(errp, local_err);
+static void spapr_irq_init_xics(sPAPRMachineState *spapr, int nr_irqs,
+                                Error **errp)
+{
+    Error *local_err = NULL;
+
+    spapr->ics = spapr_ics_create(spapr, nr_irqs, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 }
 
 #define ICS_IRQ_FREE(ics, srcno)   \
@@ -227,7 +247,13 @@  static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
 
 static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
 {
-    /* TODO: create the KVM XICS device */
+    Error *local_err = NULL;
+
+    spapr_ics_late_realize(spapr, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 }
 
 static const char *spapr_irq_get_nodename_xics(sPAPRMachineState *spapr)
@@ -386,6 +412,7 @@  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
 static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
 {
     CPUState *cs;
+    Error *local_err = NULL;
 
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -394,6 +421,12 @@  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
         spapr_xive_set_tctx_os_cam(spapr_cpu_state(cpu)->tctx);
     }
 
+    spapr_xive_late_realize(spapr->xive, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
     /* Activate the XIVE MMIOs */
     spapr_xive_mmio_set_enabled(spapr->xive, true);
 }
@@ -462,14 +495,8 @@  static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
 static void spapr_irq_init_dual(sPAPRMachineState *spapr, int nr_irqs,
                                 Error **errp)
 {
-    MachineState *machine = MACHINE(spapr);
     Error *local_err = NULL;
 
-    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
-        error_setg(errp, "No KVM support for the 'dual' machine");
-        return;
-    }
-
     spapr_irq_xics.init(spapr, spapr_irq_xics.nr_irqs, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
@@ -548,6 +575,9 @@  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
      * defaults to XICS at startup.
      */
     if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        if (kvm_irqchip_in_kernel()) {
+            xics_kvm_disconnect(spapr, &error_fatal);
+        }
         spapr_irq_xive.reset(spapr, &error_fatal);
     }
 
@@ -556,12 +586,30 @@  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
 
 static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
 {
+    Error *local_err = NULL;
+
     /*
      * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
      * if selected.
      */
     spapr_xive_mmio_set_enabled(spapr->xive, false);
 
+    /* Destroy all KVM devices */
+    if (kvm_irqchip_in_kernel()) {
+        xics_kvm_disconnect(spapr, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XICS disconnect failed: ");
+            return;
+        }
+        kvmppc_xive_disconnect(spapr->xive, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XIVE disconnect failed: ");
+            return;
+        }
+    }
+
     spapr_irq_current(spapr)->reset(spapr, errp);
 }
 
@@ -748,6 +796,7 @@  sPAPRIrq spapr_irq_xics_legacy = {
     .dt_populate = spapr_dt_xics,
     .cpu_intc_create = spapr_irq_cpu_intc_create_xics,
     .post_load   = spapr_irq_post_load_xics,
+    .reset       = spapr_irq_reset_xics,
     .set_irq     = spapr_irq_set_irq_xics,
     .get_nodename = spapr_irq_get_nodename_xics,
 };