diff mbox series

[13/13] spapr: add KVM support to the 'dual' machine

Message ID 20190107183946.7230-14-clg@kaod.org
State New
Headers show
Series spapr: add KVM support to the XIVE interrupt mode | expand

Commit Message

Cédric Le Goater Jan. 7, 2019, 6:39 p.m. UTC
The interrupt mode is chosen by the CAS negotiation process and
activated after a reset to take into account the required changes in
the machine. This brings new constraints on how the associated KVM IRQ
device is initialized.

Currently, each model takes care of the initialization of the KVM
device in their realize method but this is not possible anymore as the
initialization needs to be done globaly when the interrupt mode is
known, i.e. when machine is reseted. It also means that we need a way
to delete a KVM device when another mode is chosen.

Also, to support migration, the QEMU objects holding the state to
transfer should always be available but not necessarily activated.

The overall approach of this proposal is to initialize both interrupt
mode at the QEMU level and keep the IRQ number space in sync to allow
switching from one mode to another. For the KVM side of things, the
whole initialization of the KVM device, sources and presenters, is
grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
handlers are modified accordingly to handle the init and the delete
sequences of the KVM device.

As KVM is now initialized at reset, we loose the possiblity to
fallback to the QEMU emulated mode in case of failure and failures
become fatal to the machine.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/intc/spapr_xive.c     |  8 +---
 hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
 hw/intc/xics_kvm.c       | 25 +++++++++++++
 hw/intc/xive.c           |  4 --
 hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
 5 files changed, 109 insertions(+), 34 deletions(-)

Comments

David Gibson Feb. 12, 2019, 1:11 a.m. UTC | #1
On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
> The interrupt mode is chosen by the CAS negotiation process and
> activated after a reset to take into account the required changes in
> the machine. This brings new constraints on how the associated KVM IRQ
> device is initialized.
> 
> Currently, each model takes care of the initialization of the KVM
> device in their realize method but this is not possible anymore as the
> initialization needs to be done globaly when the interrupt mode is
> known, i.e. when machine is reseted. It also means that we need a way
> to delete a KVM device when another mode is chosen.
> 
> Also, to support migration, the QEMU objects holding the state to
> transfer should always be available but not necessarily activated.
> 
> The overall approach of this proposal is to initialize both interrupt
> mode at the QEMU level and keep the IRQ number space in sync to allow
> switching from one mode to another. For the KVM side of things, the
> whole initialization of the KVM device, sources and presenters, is
> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> handlers are modified accordingly to handle the init and the delete
> sequences of the KVM device.
> 
> As KVM is now initialized at reset, we loose the possiblity to
> fallback to the QEMU emulated mode in case of failure and failures
> become fatal to the machine.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/intc/spapr_xive.c     |  8 +---
>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>  hw/intc/xive.c           |  4 --
>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>  5 files changed, 109 insertions(+), 34 deletions(-)
> 
> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> index 21f3c1ef0901..0661aca35900 100644
> --- a/hw/intc/spapr_xive.c
> +++ b/hw/intc/spapr_xive.c
> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>  
> -    if (kvmppc_xive_enabled()) {
> -        kvmppc_xive_connect(xive, &local_err);
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            return;
> -        }
> -    } else {
> +    if (!kvmppc_xive_enabled()) {
>          /* TIMA initialization */
>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>                                "xive.tima", 4ull << TM_SHIFT);
> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> index d35814c1992e..3ebc947f2be7 100644
> --- a/hw/intc/spapr_xive_kvm.c
> +++ b/hw/intc/spapr_xive_kvm.c
> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      Error *local_err = NULL;
>      size_t esb_len;
>      size_t tima_len;
> +    CPUState *cs;
> +
> +    /*
> +     * The KVM XIVE device already in use. This is the case when
> +     * rebooting XIVE -> XIVE

Can this case actually occur?  Further down you appear to
unconditionally destroy both KVM devices at reset time.

> +     */
> +    if (xive->fd != -1) {
> +        return;
> +    }
>  
>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>      xive->change = qemu_add_vm_change_state_handler(
>          kvmppc_xive_change_state_handler, xive);
>  
> +    /* Connect the presenters to the initial VCPUs of the machine */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +    }
> +
> +    /* Update the KVM sources */
> +    kvmppc_xive_source_reset(xsrc, &local_err);
> +    if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +    }
> +
>      kvm_kernel_irqchip = true;
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index 1d21ff217b82..bfc35d71df7f 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>  {
>      int rc;
> +    CPUState *cs;
> +    Error *local_err = NULL;
> +
> +    /*
> +     * The KVM XICS device already in use. This is the case when
> +     * rebooting XICS -> XICS
> +     */
> +    if (kernel_xics_fd != -1) {
> +        return 0;
> +    }
>  
>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>          error_setg(errp,
> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>      kvm_msi_via_irqfd_allowed = true;
>      kvm_gsi_direct_mapping = true;
>  
> +    /* Connect the presenters to the initial VCPUs of the machine */
> +    CPU_FOREACH(cs) {
> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +        icp_kvm_connect(cpu->icp, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            goto fail;
> +        }
> +        icp_set_kvm_state(cpu->icp, 1);
> +    }
> +
> +    /* Update the KVM sources */
> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> +
>      return 0;
>  
>  fail:
> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> index c5c2fbc3f8bc..c166eab5b210 100644
> --- a/hw/intc/xive.c
> +++ b/hw/intc/xive.c
> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>  
>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> -
> -    if (kvmppc_xive_enabled()) {
> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> -    }
>  }
>  
>  static void xive_source_realize(DeviceState *dev, Error **errp)
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> index ba27d9d8e972..5592eec3787b 100644
> --- a/hw/ppc/spapr_irq.c
> +++ b/hw/ppc/spapr_irq.c
> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>      int nr_irqs = spapr->irq->nr_irqs;
>      Error *local_err = NULL;
>  
> -    if (kvm_enabled()) {
> -        if (machine_kernel_irqchip_allowed(machine) &&
> -            !xics_kvm_init(spapr, &local_err)) {
> -            spapr->icp_type = TYPE_KVM_ICP;
> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> -                                          &local_err);
> -        }
> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> -            error_prepend(&local_err,
> -                          "kernel_irqchip requested but unavailable: ");
> -            goto error;

I don't see anything that replaces the irqchip_required logic, which
doesn't seem right.

> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        spapr->icp_type = TYPE_KVM_ICP;
> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> +                                      &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
>          }
> -        error_free(local_err);
> -        local_err = NULL;
>      }
>  
>      if (!spapr->ics) {
> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>          spapr->icp_type = TYPE_ICP;
>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
>                                        &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
>      }
> -
> -error:
> -    error_propagate(errp, local_err);
>  }
>  
>  #define ICS_IRQ_FREE(ics, srcno)   \
> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>  
>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>  {
> -    /* TODO: create the KVM XICS device */
> +    MachineState *machine = MACHINE(spapr);
> +    Error *local_err = NULL;
> +
> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        xics_kvm_init(spapr, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XICS connect failed: ");
> +            return;
> +        }
> +    }
>  }
>  
>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>  {
>      CPUState *cs;
> +    Error *local_err = NULL;
>  
>      CPU_FOREACH(cs) {
>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>          spapr_xive_set_tctx_os_cam(cpu->tctx);
>      }
>  
> +    if (kvmppc_xive_enabled()) {
> +        kvmppc_xive_connect(spapr->xive, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XIVE connect failed: ");
> +            return;
> +        }
> +    }
> +
>      /* Activate the XIVE MMIOs */
>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>  }
> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>  
>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
>  {
> -    MachineState *machine = MACHINE(spapr);
>      Error *local_err = NULL;
>  
> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> -        error_setg(errp, "No KVM support for the 'dual' machine");
> -        return;
> -    }
> -
>      spapr_irq_xics.init(spapr, &local_err);
>      if (local_err) {
>          error_propagate(errp, local_err);
> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
>  
>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>  {
> +    MachineState *machine = MACHINE(spapr);
> +
>      /*
>       * Force a reset of the XIVE backend after migration. The machine
>       * defaults to XICS at startup.
>       */
>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +            xics_kvm_disconnect(spapr, &error_fatal);
> +        }
>          spapr_irq_xive.reset(spapr, &error_fatal);
>      }
>  
> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>  
>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>  {
> +    MachineState *machine = MACHINE(spapr);
> +    Error *local_err = NULL;
> +
>      /*
>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>       * if selected.
>       */
>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>  
> +    /* Destroy all KVM devices */
> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> +        xics_kvm_disconnect(spapr, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> +            return;
> +        }
> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> +            return;
> +        }
> +    }
> +
>      spapr_irq_current(spapr)->reset(spapr, errp);
>  }
>
Cédric Le Goater Feb. 12, 2019, 7:18 a.m. UTC | #2
On 2/12/19 2:11 AM, David Gibson wrote:
> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
>> The interrupt mode is chosen by the CAS negotiation process and
>> activated after a reset to take into account the required changes in
>> the machine. This brings new constraints on how the associated KVM IRQ
>> device is initialized.
>>
>> Currently, each model takes care of the initialization of the KVM
>> device in their realize method but this is not possible anymore as the
>> initialization needs to be done globaly when the interrupt mode is
>> known, i.e. when machine is reseted. It also means that we need a way
>> to delete a KVM device when another mode is chosen.
>>
>> Also, to support migration, the QEMU objects holding the state to
>> transfer should always be available but not necessarily activated.
>>
>> The overall approach of this proposal is to initialize both interrupt
>> mode at the QEMU level and keep the IRQ number space in sync to allow
>> switching from one mode to another. For the KVM side of things, the
>> whole initialization of the KVM device, sources and presenters, is
>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>> handlers are modified accordingly to handle the init and the delete
>> sequences of the KVM device.
>>
>> As KVM is now initialized at reset, we loose the possiblity to
>> fallback to the QEMU emulated mode in case of failure and failures
>> become fatal to the machine.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/intc/spapr_xive.c     |  8 +---
>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>  hw/intc/xive.c           |  4 --
>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>
>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>> index 21f3c1ef0901..0661aca35900 100644
>> --- a/hw/intc/spapr_xive.c
>> +++ b/hw/intc/spapr_xive.c
>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>  
>> -    if (kvmppc_xive_enabled()) {
>> -        kvmppc_xive_connect(xive, &local_err);
>> -        if (local_err) {
>> -            error_propagate(errp, local_err);
>> -            return;
>> -        }
>> -    } else {
>> +    if (!kvmppc_xive_enabled()) {
>>          /* TIMA initialization */
>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>                                "xive.tima", 4ull << TM_SHIFT);
>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>> index d35814c1992e..3ebc947f2be7 100644
>> --- a/hw/intc/spapr_xive_kvm.c
>> +++ b/hw/intc/spapr_xive_kvm.c
>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>      Error *local_err = NULL;
>>      size_t esb_len;
>>      size_t tima_len;
>> +    CPUState *cs;
>> +
>> +    /*
>> +     * The KVM XIVE device already in use. This is the case when
>> +     * rebooting XIVE -> XIVE
> 
> Can this case actually occur?  Further down you appear to
> unconditionally destroy both KVM devices at reset time.

I guess you are right. I will check.

>> +     */
>> +    if (xive->fd != -1) {
>> +        return;
>> +    }
>>  
>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>      xive->change = qemu_add_vm_change_state_handler(
>>          kvmppc_xive_change_state_handler, xive);
>>  
>> +    /* Connect the presenters to the initial VCPUs of the machine */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>> +    }
>> +
>> +    /* Update the KVM sources */
>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>> +    if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +    }
>> +
>>      kvm_kernel_irqchip = true;
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>> index 1d21ff217b82..bfc35d71df7f 100644
>> --- a/hw/intc/xics_kvm.c
>> +++ b/hw/intc/xics_kvm.c
>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      int rc;
>> +    CPUState *cs;
>> +    Error *local_err = NULL;
>> +
>> +    /*
>> +     * The KVM XICS device already in use. This is the case when
>> +     * rebooting XICS -> XICS
>> +     */
>> +    if (kernel_xics_fd != -1) {
>> +        return 0;
>> +    }
>>  
>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>          error_setg(errp,
>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>      kvm_msi_via_irqfd_allowed = true;
>>      kvm_gsi_direct_mapping = true;
>>  
>> +    /* Connect the presenters to the initial VCPUs of the machine */
>> +    CPU_FOREACH(cs) {
>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>> +
>> +        icp_kvm_connect(cpu->icp, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            goto fail;
>> +        }
>> +        icp_set_kvm_state(cpu->icp, 1);
>> +    }
>> +
>> +    /* Update the KVM sources */
>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
>> +
>>      return 0;
>>  
>>  fail:
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index c5c2fbc3f8bc..c166eab5b210 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>>  
>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>> -
>> -    if (kvmppc_xive_enabled()) {
>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>> -    }
>>  }
>>  
>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> index ba27d9d8e972..5592eec3787b 100644
>> --- a/hw/ppc/spapr_irq.c
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>      int nr_irqs = spapr->irq->nr_irqs;
>>      Error *local_err = NULL;
>>  
>> -    if (kvm_enabled()) {
>> -        if (machine_kernel_irqchip_allowed(machine) &&
>> -            !xics_kvm_init(spapr, &local_err)) {
>> -            spapr->icp_type = TYPE_KVM_ICP;
>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>> -                                          &local_err);
>> -        }
>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
>> -            error_prepend(&local_err,
>> -                          "kernel_irqchip requested but unavailable: ");
>> -            goto error;
> 
> I don't see anything that replaces the irqchip_required logic, which
> doesn't seem right.

Yes. We do loose the ability to fall back to the emulated device in case
of failure. It is not impossible to do but it will require more changes
to check what are the KVM capabilities before starting the machine.

Nevertheless, any failure in reset when setting the KVM backend will
result in machine abort.

C.       

> 
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        spapr->icp_type = TYPE_KVM_ICP;
>> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>> +                                      &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>>          }
>> -        error_free(local_err);
>> -        local_err = NULL;
>>      }
>>  
>>      if (!spapr->ics) {
>> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>          spapr->icp_type = TYPE_ICP;
>>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
>>                                        &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            return;
>> +        }
>>      }
>> -
>> -error:
>> -    error_propagate(errp, local_err);
>>  }
>>  
>>  #define ICS_IRQ_FREE(ics, srcno)   \
>> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>>  
>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>>  {
>> -    /* TODO: create the KVM XICS device */
>> +    MachineState *machine = MACHINE(spapr);
>> +    Error *local_err = NULL;
>> +
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        xics_kvm_init(spapr, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XICS connect failed: ");
>> +            return;
>> +        }
>> +    }
>>  }
>>  
>>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
>> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>  {
>>      CPUState *cs;
>> +    Error *local_err = NULL;
>>  
>>      CPU_FOREACH(cs) {
>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>          spapr_xive_set_tctx_os_cam(cpu->tctx);
>>      }
>>  
>> +    if (kvmppc_xive_enabled()) {
>> +        kvmppc_xive_connect(spapr->xive, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XIVE connect failed: ");
>> +            return;
>> +        }
>> +    }
>> +
>>      /* Activate the XIVE MMIOs */
>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>>  }
>> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>>  
>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
>>  {
>> -    MachineState *machine = MACHINE(spapr);
>>      Error *local_err = NULL;
>>  
>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> -        error_setg(errp, "No KVM support for the 'dual' machine");
>> -        return;
>> -    }
>> -
>>      spapr_irq_xics.init(spapr, &local_err);
>>      if (local_err) {
>>          error_propagate(errp, local_err);
>> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
>>  
>>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>  {
>> +    MachineState *machine = MACHINE(spapr);
>> +
>>      /*
>>       * Force a reset of the XIVE backend after migration. The machine
>>       * defaults to XICS at startup.
>>       */
>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +            xics_kvm_disconnect(spapr, &error_fatal);
>> +        }
>>          spapr_irq_xive.reset(spapr, &error_fatal);
>>      }
>>  
>> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>  
>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>>  {
>> +    MachineState *machine = MACHINE(spapr);
>> +    Error *local_err = NULL;
>> +
>>      /*
>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>>       * if selected.
>>       */
>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>>  
>> +    /* Destroy all KVM devices */
>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>> +        xics_kvm_disconnect(spapr, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
>> +            return;
>> +        }
>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
>> +        if (local_err) {
>> +            error_propagate(errp, local_err);
>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
>> +            return;
>> +        }
>> +    }
>> +
>>      spapr_irq_current(spapr)->reset(spapr, errp);
>>  }
>>  
>
David Gibson Feb. 13, 2019, 1:32 a.m. UTC | #3
On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
> On 2/12/19 2:11 AM, David Gibson wrote:
> > On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
> >> The interrupt mode is chosen by the CAS negotiation process and
> >> activated after a reset to take into account the required changes in
> >> the machine. This brings new constraints on how the associated KVM IRQ
> >> device is initialized.
> >>
> >> Currently, each model takes care of the initialization of the KVM
> >> device in their realize method but this is not possible anymore as the
> >> initialization needs to be done globaly when the interrupt mode is
> >> known, i.e. when machine is reseted. It also means that we need a way
> >> to delete a KVM device when another mode is chosen.
> >>
> >> Also, to support migration, the QEMU objects holding the state to
> >> transfer should always be available but not necessarily activated.
> >>
> >> The overall approach of this proposal is to initialize both interrupt
> >> mode at the QEMU level and keep the IRQ number space in sync to allow
> >> switching from one mode to another. For the KVM side of things, the
> >> whole initialization of the KVM device, sources and presenters, is
> >> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >> handlers are modified accordingly to handle the init and the delete
> >> sequences of the KVM device.
> >>
> >> As KVM is now initialized at reset, we loose the possiblity to
> >> fallback to the QEMU emulated mode in case of failure and failures
> >> become fatal to the machine.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  hw/intc/spapr_xive.c     |  8 +---
> >>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> >>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> >>  hw/intc/xive.c           |  4 --
> >>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> >>  5 files changed, 109 insertions(+), 34 deletions(-)
> >>
> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >> index 21f3c1ef0901..0661aca35900 100644
> >> --- a/hw/intc/spapr_xive.c
> >> +++ b/hw/intc/spapr_xive.c
> >> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> >>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> >>  
> >> -    if (kvmppc_xive_enabled()) {
> >> -        kvmppc_xive_connect(xive, &local_err);
> >> -        if (local_err) {
> >> -            error_propagate(errp, local_err);
> >> -            return;
> >> -        }
> >> -    } else {
> >> +    if (!kvmppc_xive_enabled()) {
> >>          /* TIMA initialization */
> >>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>                                "xive.tima", 4ull << TM_SHIFT);
> >> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >> index d35814c1992e..3ebc947f2be7 100644
> >> --- a/hw/intc/spapr_xive_kvm.c
> >> +++ b/hw/intc/spapr_xive_kvm.c
> >> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>      Error *local_err = NULL;
> >>      size_t esb_len;
> >>      size_t tima_len;
> >> +    CPUState *cs;
> >> +
> >> +    /*
> >> +     * The KVM XIVE device already in use. This is the case when
> >> +     * rebooting XIVE -> XIVE
> > 
> > Can this case actually occur?  Further down you appear to
> > unconditionally destroy both KVM devices at reset time.
> 
> I guess you are right. I will check.
> 
> >> +     */
> >> +    if (xive->fd != -1) {
> >> +        return;
> >> +    }
> >>  
> >>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> >>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> >> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>      xive->change = qemu_add_vm_change_state_handler(
> >>          kvmppc_xive_change_state_handler, xive);
> >>  
> >> +    /* Connect the presenters to the initial VCPUs of the machine */
> >> +    CPU_FOREACH(cs) {
> >> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +
> >> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >> +    /* Update the KVM sources */
> >> +    kvmppc_xive_source_reset(xsrc, &local_err);
> >> +    if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +    }
> >> +
> >>      kvm_kernel_irqchip = true;
> >>      kvm_msi_via_irqfd_allowed = true;
> >>      kvm_gsi_direct_mapping = true;
> >> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >> index 1d21ff217b82..bfc35d71df7f 100644
> >> --- a/hw/intc/xics_kvm.c
> >> +++ b/hw/intc/xics_kvm.c
> >> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >>      int rc;
> >> +    CPUState *cs;
> >> +    Error *local_err = NULL;
> >> +
> >> +    /*
> >> +     * The KVM XICS device already in use. This is the case when
> >> +     * rebooting XICS -> XICS
> >> +     */
> >> +    if (kernel_xics_fd != -1) {
> >> +        return 0;
> >> +    }
> >>  
> >>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> >>          error_setg(errp,
> >> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>      kvm_msi_via_irqfd_allowed = true;
> >>      kvm_gsi_direct_mapping = true;
> >>  
> >> +    /* Connect the presenters to the initial VCPUs of the machine */
> >> +    CPU_FOREACH(cs) {
> >> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> +
> >> +        icp_kvm_connect(cpu->icp, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            goto fail;
> >> +        }
> >> +        icp_set_kvm_state(cpu->icp, 1);
> >> +    }
> >> +
> >> +    /* Update the KVM sources */
> >> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> >> +
> >>      return 0;
> >>  
> >>  fail:
> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >> index c5c2fbc3f8bc..c166eab5b210 100644
> >> --- a/hw/intc/xive.c
> >> +++ b/hw/intc/xive.c
> >> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> >>  
> >>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> >>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> >> -
> >> -    if (kvmppc_xive_enabled()) {
> >> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> >> -    }
> >>  }
> >>  
> >>  static void xive_source_realize(DeviceState *dev, Error **errp)
> >> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >> index ba27d9d8e972..5592eec3787b 100644
> >> --- a/hw/ppc/spapr_irq.c
> >> +++ b/hw/ppc/spapr_irq.c
> >> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>      int nr_irqs = spapr->irq->nr_irqs;
> >>      Error *local_err = NULL;
> >>  
> >> -    if (kvm_enabled()) {
> >> -        if (machine_kernel_irqchip_allowed(machine) &&
> >> -            !xics_kvm_init(spapr, &local_err)) {
> >> -            spapr->icp_type = TYPE_KVM_ICP;
> >> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >> -                                          &local_err);
> >> -        }
> >> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> >> -            error_prepend(&local_err,
> >> -                          "kernel_irqchip requested but unavailable: ");
> >> -            goto error;
> > 
> > I don't see anything that replaces the irqchip_required logic, which
> > doesn't seem right.
> 
> Yes. We do loose the ability to fall back to the emulated device in case
> of failure. It is not impossible to do but it will require more changes
> to check what are the KVM capabilities before starting the machine.

Uh... it seems more like it's the other way around.  We'll always fall
back to emulated, even if we've explicitly said on the command line
that we don't want that.

> Nevertheless, any failure in reset when setting the KVM backend will
> result in machine abort.
> 
> C.       
> 
> > 
> >> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +        spapr->icp_type = TYPE_KVM_ICP;
> >> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >> +                                      &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >>          }
> >> -        error_free(local_err);
> >> -        local_err = NULL;
> >>      }
> >>  
> >>      if (!spapr->ics) {
> >> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>          spapr->icp_type = TYPE_ICP;
> >>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
> >>                                        &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            return;
> >> +        }
> >>      }
> >> -
> >> -error:
> >> -    error_propagate(errp, local_err);
> >>  }
> >>  
> >>  #define ICS_IRQ_FREE(ics, srcno)   \
> >> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
> >>  
> >>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >> -    /* TODO: create the KVM XICS device */
> >> +    MachineState *machine = MACHINE(spapr);
> >> +    Error *local_err = NULL;
> >> +
> >> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +        xics_kvm_init(spapr, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XICS connect failed: ");
> >> +            return;
> >> +        }
> >> +    }
> >>  }
> >>  
> >>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
> >> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
> >>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >>      CPUState *cs;
> >> +    Error *local_err = NULL;
> >>  
> >>      CPU_FOREACH(cs) {
> >>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> >> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>          spapr_xive_set_tctx_os_cam(cpu->tctx);
> >>      }
> >>  
> >> +    if (kvmppc_xive_enabled()) {
> >> +        kvmppc_xive_connect(spapr->xive, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XIVE connect failed: ");
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >>      /* Activate the XIVE MMIOs */
> >>      spapr_xive_mmio_set_enabled(spapr->xive, true);
> >>  }
> >> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
> >>  
> >>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >> -    MachineState *machine = MACHINE(spapr);
> >>      Error *local_err = NULL;
> >>  
> >> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> -        error_setg(errp, "No KVM support for the 'dual' machine");
> >> -        return;
> >> -    }
> >> -
> >>      spapr_irq_xics.init(spapr, &local_err);
> >>      if (local_err) {
> >>          error_propagate(errp, local_err);
> >> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
> >>  
> >>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>  {
> >> +    MachineState *machine = MACHINE(spapr);
> >> +
> >>      /*
> >>       * Force a reset of the XIVE backend after migration. The machine
> >>       * defaults to XICS at startup.
> >>       */
> >>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +            xics_kvm_disconnect(spapr, &error_fatal);
> >> +        }
> >>          spapr_irq_xive.reset(spapr, &error_fatal);
> >>      }
> >>  
> >> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>  
> >>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
> >>  {
> >> +    MachineState *machine = MACHINE(spapr);
> >> +    Error *local_err = NULL;
> >> +
> >>      /*
> >>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
> >>       * if selected.
> >>       */
> >>      spapr_xive_mmio_set_enabled(spapr->xive, false);
> >>  
> >> +    /* Destroy all KVM devices */
> >> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >> +        xics_kvm_disconnect(spapr, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> >> +            return;
> >> +        }
> >> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> >> +        if (local_err) {
> >> +            error_propagate(errp, local_err);
> >> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >>      spapr_irq_current(spapr)->reset(spapr, errp);
> >>  }
> >>  
> > 
>
Cédric Le Goater Feb. 13, 2019, 8:22 a.m. UTC | #4
On 2/13/19 2:32 AM, David Gibson wrote:
> On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
>> On 2/12/19 2:11 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
>>>> The interrupt mode is chosen by the CAS negotiation process and
>>>> activated after a reset to take into account the required changes in
>>>> the machine. This brings new constraints on how the associated KVM IRQ
>>>> device is initialized.
>>>>
>>>> Currently, each model takes care of the initialization of the KVM
>>>> device in their realize method but this is not possible anymore as the
>>>> initialization needs to be done globaly when the interrupt mode is
>>>> known, i.e. when machine is reseted. It also means that we need a way
>>>> to delete a KVM device when another mode is chosen.
>>>>
>>>> Also, to support migration, the QEMU objects holding the state to
>>>> transfer should always be available but not necessarily activated.
>>>>
>>>> The overall approach of this proposal is to initialize both interrupt
>>>> mode at the QEMU level and keep the IRQ number space in sync to allow
>>>> switching from one mode to another. For the KVM side of things, the
>>>> whole initialization of the KVM device, sources and presenters, is
>>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>>>> handlers are modified accordingly to handle the init and the delete
>>>> sequences of the KVM device.
>>>>
>>>> As KVM is now initialized at reset, we loose the possiblity to
>>>> fallback to the QEMU emulated mode in case of failure and failures
>>>> become fatal to the machine.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  hw/intc/spapr_xive.c     |  8 +---
>>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>>>  hw/intc/xive.c           |  4 --
>>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>> index 21f3c1ef0901..0661aca35900 100644
>>>> --- a/hw/intc/spapr_xive.c
>>>> +++ b/hw/intc/spapr_xive.c
>>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>>>  
>>>> -    if (kvmppc_xive_enabled()) {
>>>> -        kvmppc_xive_connect(xive, &local_err);
>>>> -        if (local_err) {
>>>> -            error_propagate(errp, local_err);
>>>> -            return;
>>>> -        }
>>>> -    } else {
>>>> +    if (!kvmppc_xive_enabled()) {
>>>>          /* TIMA initialization */
>>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>>>                                "xive.tima", 4ull << TM_SHIFT);
>>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>>> index d35814c1992e..3ebc947f2be7 100644
>>>> --- a/hw/intc/spapr_xive_kvm.c
>>>> +++ b/hw/intc/spapr_xive_kvm.c
>>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>      Error *local_err = NULL;
>>>>      size_t esb_len;
>>>>      size_t tima_len;
>>>> +    CPUState *cs;
>>>> +
>>>> +    /*
>>>> +     * The KVM XIVE device already in use. This is the case when
>>>> +     * rebooting XIVE -> XIVE
>>>
>>> Can this case actually occur?  Further down you appear to
>>> unconditionally destroy both KVM devices at reset time.
>>
>> I guess you are right. I will check.
>>
>>>> +     */
>>>> +    if (xive->fd != -1) {
>>>> +        return;
>>>> +    }
>>>>  
>>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>      xive->change = qemu_add_vm_change_state_handler(
>>>>          kvmppc_xive_change_state_handler, xive);
>>>>  
>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>> +    CPU_FOREACH(cs) {
>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> +
>>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Update the KVM sources */
>>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>>>> +    if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +    }
>>>> +
>>>>      kvm_kernel_irqchip = true;
>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>      kvm_gsi_direct_mapping = true;
>>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>>>> index 1d21ff217b82..bfc35d71df7f 100644
>>>> --- a/hw/intc/xics_kvm.c
>>>> +++ b/hw/intc/xics_kvm.c
>>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>>      int rc;
>>>> +    CPUState *cs;
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    /*
>>>> +     * The KVM XICS device already in use. This is the case when
>>>> +     * rebooting XICS -> XICS
>>>> +     */
>>>> +    if (kernel_xics_fd != -1) {
>>>> +        return 0;
>>>> +    }
>>>>  
>>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>>>          error_setg(errp,
>>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>      kvm_gsi_direct_mapping = true;
>>>>  
>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>> +    CPU_FOREACH(cs) {
>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> +
>>>> +        icp_kvm_connect(cpu->icp, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            goto fail;
>>>> +        }
>>>> +        icp_set_kvm_state(cpu->icp, 1);
>>>> +    }
>>>> +
>>>> +    /* Update the KVM sources */
>>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
>>>> +
>>>>      return 0;
>>>>  
>>>>  fail:
>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>> index c5c2fbc3f8bc..c166eab5b210 100644
>>>> --- a/hw/intc/xive.c
>>>> +++ b/hw/intc/xive.c
>>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>>>>  
>>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>>>> -
>>>> -    if (kvmppc_xive_enabled()) {
>>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>>>> -    }
>>>>  }
>>>>  
>>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>>>> index ba27d9d8e972..5592eec3787b 100644
>>>> --- a/hw/ppc/spapr_irq.c
>>>> +++ b/hw/ppc/spapr_irq.c
>>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>>>      int nr_irqs = spapr->irq->nr_irqs;
>>>>      Error *local_err = NULL;
>>>>  
>>>> -    if (kvm_enabled()) {
>>>> -        if (machine_kernel_irqchip_allowed(machine) &&
>>>> -            !xics_kvm_init(spapr, &local_err)) {
>>>> -            spapr->icp_type = TYPE_KVM_ICP;
>>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>>>> -                                          &local_err);
>>>> -        }
>>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
>>>> -            error_prepend(&local_err,
>>>> -                          "kernel_irqchip requested but unavailable: ");
>>>> -            goto error;
>>>
>>> I don't see anything that replaces the irqchip_required logic, which
>>> doesn't seem right.
>>
>> Yes. We do loose the ability to fall back to the emulated device in case
>> of failure. It is not impossible to do but it will require more changes
>> to check what are the KVM capabilities before starting the machine.
> 
> Uh... it seems more like it's the other way around.  We'll always fall
> back to emulated, even if we've explicitly said on the command line
> that we don't want that.

Ah yes. The init function might be also broken. 

XICS mode is a bit more difficult to handle than XIVE because we have 
different object type for the KVM device and the QEMU emulated device, 
and with the 'dual' mode, we activate the device at CAS reset time.

Failures being handled at reset time, should we keep the same logic and  
abort the machine at reset if the kernel irqchip is required ? 

But we won't be able to fall back on the QEMU emulated device if KVM 
XICS fails and if the kernel irqchip is only allowed. It should work for 
XIVE though.

Thanks,

C.


>> Nevertheless, any failure in reset when setting the KVM backend will
>> result in machine abort.
>>
>> C.       
>>
>>>
>>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +        spapr->icp_type = TYPE_KVM_ICP;
>>>> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>>>> +                                      &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>>          }
>>>> -        error_free(local_err);
>>>> -        local_err = NULL;
>>>>      }
>>>>  
>>>>      if (!spapr->ics) {
>>>> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>>>          spapr->icp_type = TYPE_ICP;
>>>>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
>>>>                                        &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            return;
>>>> +        }
>>>>      }
>>>> -
>>>> -error:
>>>> -    error_propagate(errp, local_err);
>>>>  }
>>>>  
>>>>  #define ICS_IRQ_FREE(ics, srcno)   \
>>>> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>>>>  
>>>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>> -    /* TODO: create the KVM XICS device */
>>>> +    MachineState *machine = MACHINE(spapr);
>>>> +    Error *local_err = NULL;
>>>> +
>>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +        xics_kvm_init(spapr, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XICS connect failed: ");
>>>> +            return;
>>>> +        }
>>>> +    }
>>>>  }
>>>>  
>>>>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
>>>> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
>>>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>>      CPUState *cs;
>>>> +    Error *local_err = NULL;
>>>>  
>>>>      CPU_FOREACH(cs) {
>>>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
>>>>          spapr_xive_set_tctx_os_cam(cpu->tctx);
>>>>      }
>>>>  
>>>> +    if (kvmppc_xive_enabled()) {
>>>> +        kvmppc_xive_connect(spapr->xive, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XIVE connect failed: ");
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>>      /* Activate the XIVE MMIOs */
>>>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
>>>>  }
>>>> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
>>>>  
>>>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>> -    MachineState *machine = MACHINE(spapr);
>>>>      Error *local_err = NULL;
>>>>  
>>>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> -        error_setg(errp, "No KVM support for the 'dual' machine");
>>>> -        return;
>>>> -    }
>>>> -
>>>>      spapr_irq_xics.init(spapr, &local_err);
>>>>      if (local_err) {
>>>>          error_propagate(errp, local_err);
>>>> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
>>>>  
>>>>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>>>  {
>>>> +    MachineState *machine = MACHINE(spapr);
>>>> +
>>>>      /*
>>>>       * Force a reset of the XIVE backend after migration. The machine
>>>>       * defaults to XICS at startup.
>>>>       */
>>>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
>>>> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +            xics_kvm_disconnect(spapr, &error_fatal);
>>>> +        }
>>>>          spapr_irq_xive.reset(spapr, &error_fatal);
>>>>      }
>>>>  
>>>> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
>>>>  
>>>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
>>>>  {
>>>> +    MachineState *machine = MACHINE(spapr);
>>>> +    Error *local_err = NULL;
>>>> +
>>>>      /*
>>>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
>>>>       * if selected.
>>>>       */
>>>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
>>>>  
>>>> +    /* Destroy all KVM devices */
>>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>>> +        xics_kvm_disconnect(spapr, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
>>>> +            return;
>>>> +        }
>>>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
>>>> +        if (local_err) {
>>>> +            error_propagate(errp, local_err);
>>>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>>      spapr_irq_current(spapr)->reset(spapr, errp);
>>>>  }
>>>>  
>>>
>>
>
Greg Kurz Feb. 13, 2019, 10:07 a.m. UTC | #5
On Wed, 13 Feb 2019 09:22:46 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> On 2/13/19 2:32 AM, David Gibson wrote:
> > On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:  
> >> On 2/12/19 2:11 AM, David Gibson wrote:  
> >>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:  
> >>>> The interrupt mode is chosen by the CAS negotiation process and
> >>>> activated after a reset to take into account the required changes in
> >>>> the machine. This brings new constraints on how the associated KVM IRQ
> >>>> device is initialized.
> >>>>
> >>>> Currently, each model takes care of the initialization of the KVM
> >>>> device in their realize method but this is not possible anymore as the
> >>>> initialization needs to be done globaly when the interrupt mode is
> >>>> known, i.e. when machine is reseted. It also means that we need a way
> >>>> to delete a KVM device when another mode is chosen.
> >>>>
> >>>> Also, to support migration, the QEMU objects holding the state to
> >>>> transfer should always be available but not necessarily activated.
> >>>>
> >>>> The overall approach of this proposal is to initialize both interrupt
> >>>> mode at the QEMU level and keep the IRQ number space in sync to allow
> >>>> switching from one mode to another. For the KVM side of things, the
> >>>> whole initialization of the KVM device, sources and presenters, is
> >>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >>>> handlers are modified accordingly to handle the init and the delete
> >>>> sequences of the KVM device.
> >>>>
> >>>> As KVM is now initialized at reset, we loose the possiblity to
> >>>> fallback to the QEMU emulated mode in case of failure and failures
> >>>> become fatal to the machine.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  hw/intc/spapr_xive.c     |  8 +---
> >>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> >>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> >>>>  hw/intc/xive.c           |  4 --
> >>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> >>>>  5 files changed, 109 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>> index 21f3c1ef0901..0661aca35900 100644
> >>>> --- a/hw/intc/spapr_xive.c
> >>>> +++ b/hw/intc/spapr_xive.c
> >>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> >>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> >>>>  
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_connect(xive, &local_err);
> >>>> -        if (local_err) {
> >>>> -            error_propagate(errp, local_err);
> >>>> -            return;
> >>>> -        }
> >>>> -    } else {
> >>>> +    if (!kvmppc_xive_enabled()) {
> >>>>          /* TIMA initialization */
> >>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>>>                                "xive.tima", 4ull << TM_SHIFT);
> >>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >>>> index d35814c1992e..3ebc947f2be7 100644
> >>>> --- a/hw/intc/spapr_xive_kvm.c
> >>>> +++ b/hw/intc/spapr_xive_kvm.c
> >>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      Error *local_err = NULL;
> >>>>      size_t esb_len;
> >>>>      size_t tima_len;
> >>>> +    CPUState *cs;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XIVE device already in use. This is the case when
> >>>> +     * rebooting XIVE -> XIVE  
> >>>
> >>> Can this case actually occur?  Further down you appear to
> >>> unconditionally destroy both KVM devices at reset time.  
> >>
> >> I guess you are right. I will check.
> >>  
> >>>> +     */
> >>>> +    if (xive->fd != -1) {
> >>>> +        return;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> >>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> >>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      xive->change = qemu_add_vm_change_state_handler(
> >>>>          kvmppc_xive_change_state_handler, xive);
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
> >>>> +    if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +    }
> >>>> +
> >>>>      kvm_kernel_irqchip = true;
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >>>> index 1d21ff217b82..bfc35d71df7f 100644
> >>>> --- a/hw/intc/xics_kvm.c
> >>>> +++ b/hw/intc/xics_kvm.c
> >>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>>      int rc;
> >>>> +    CPUState *cs;
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XICS device already in use. This is the case when
> >>>> +     * rebooting XICS -> XICS
> >>>> +     */
> >>>> +    if (kernel_xics_fd != -1) {
> >>>> +        return 0;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> >>>>          error_setg(errp,
> >>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        icp_kvm_connect(cpu->icp, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            goto fail;
> >>>> +        }
> >>>> +        icp_set_kvm_state(cpu->icp, 1);
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> >>>> +
> >>>>      return 0;
> >>>>  
> >>>>  fail:
> >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >>>> index c5c2fbc3f8bc..c166eab5b210 100644
> >>>> --- a/hw/intc/xive.c
> >>>> +++ b/hw/intc/xive.c
> >>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> >>>>  
> >>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> >>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> >>>> -
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> >>>> -    }
> >>>>  }
> >>>>  
> >>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
> >>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >>>> index ba27d9d8e972..5592eec3787b 100644
> >>>> --- a/hw/ppc/spapr_irq.c
> >>>> +++ b/hw/ppc/spapr_irq.c
> >>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>      int nr_irqs = spapr->irq->nr_irqs;
> >>>>      Error *local_err = NULL;
> >>>>  
> >>>> -    if (kvm_enabled()) {
> >>>> -        if (machine_kernel_irqchip_allowed(machine) &&
> >>>> -            !xics_kvm_init(spapr, &local_err)) {
> >>>> -            spapr->icp_type = TYPE_KVM_ICP;
> >>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >>>> -                                          &local_err);
> >>>> -        }
> >>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> >>>> -            error_prepend(&local_err,
> >>>> -                          "kernel_irqchip requested but unavailable: ");
> >>>> -            goto error;  
> >>>
> >>> I don't see anything that replaces the irqchip_required logic, which
> >>> doesn't seem right.  
> >>
> >> Yes. We do loose the ability to fall back to the emulated device in case
> >> of failure. It is not impossible to do but it will require more changes
> >> to check what are the KVM capabilities before starting the machine.  
> > 
> > Uh... it seems more like it's the other way around.  We'll always fall
> > back to emulated, even if we've explicitly said on the command line
> > that we don't want that.  
> 
> Ah yes. The init function might be also broken. 
> 
> XICS mode is a bit more difficult to handle than XIVE because we have 
> different object type for the KVM device and the QEMU emulated device, 

This is indeed a bit unfortunate, but I think there's still room for
improvement. Let's look at the base classes:

struct ICPStateClass {
    DeviceClass parent_class;

    DeviceRealize parent_realize;
    DeviceReset parent_reset;

    void (*pre_save)(ICPState *icp);
    int (*post_load)(ICPState *icp, int version_id);
    void (*synchronize_state)(ICPState *icp);
};

struct ICSStateClass {
    DeviceClass parent_class;

    DeviceRealize parent_realize;
    DeviceReset parent_reset;

    void (*pre_save)(ICSState *s);
    int (*post_load)(ICSState *s, int version_id);
    void (*reject)(ICSState *s, uint32_t irq);
    void (*resend)(ICSState *s);
    void (*eoi)(ICSState *s, uint32_t irq);
    void (*synchronize_state)(ICSState *s);
};

The pre_save and post_load callbacks are only used with
the KVM device. They could be explicitely called from
the corresponding VMStateDescription callbacks with a
kvm_enabled() && kvm_irqchip_in_kernel() check.

Same goes for the synchronize_state callbacks, which are only
needed for 'info pic'.

The reject, resend and eoi callbacks are only called by code that
belongs to the QEMU emulated device. Either the RTAS/hypercalls
or from the machine code with explicit checks like:

static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
{
    sPAPRMachineState *spapr = opaque;
    MachineState *machine = MACHINE(opaque);

    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
        ics_kvm_set_irq(spapr->ics, srcno, val);
    } else {
        ics_simple_set_irq(spapr->ics, srcno, val);
    }
}

or

static int spapr_irq_post_load_xics(sPAPRMachineState *spapr, int version_id)
{
    if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
        CPUState *cs;
        CPU_FOREACH(cs) {
            PowerPCCPU *cpu = POWERPC_CPU(cs);
            icp_resend(spapr_cpu_state(cpu)->icp);
        }
    }
    return 0;
}

Unless I'm missing something, the reject, resend and eoi callbacks could
simply be removed. This would allow to unify KVM and QEMU emulation in
the same ICP and ICS object types.

If this makes sense to you, I can have a look (already started actually ;-)

> and with the 'dual' mode, we activate the device at CAS reset time.
> 
> Failures being handled at reset time, should we keep the same logic and  
> abort the machine at reset if the kernel irqchip is required ? 
> 

If the user passed ic-mode=dual,kernel-irqchip=on, we should at least make
sure KVM supports both XICS and XIVE devices during machine init. Then
during reset if something goes wrong with KVM, it seems ok to abort.

If the user didn't pass kernel-irqchip, ie, kernel_irqchip_allowed is true
and kernel_irqchip_required is false, the current behavior for XICS is
to try KVM first and fallback to QEMU emulation. I guess it could be the
same for XIVE.

> But we won't be able to fall back on the QEMU emulated device if KVM 
> XICS fails and if the kernel irqchip is only allowed. It should work for 
> XIVE though.
> 
> Thanks,
> 
> C.
> 
> 
> >> Nevertheless, any failure in reset when setting the KVM backend will
> >> result in machine abort.
> >>
> >> C.       
> >>  
> >>>  
> >>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +        spapr->icp_type = TYPE_KVM_ICP;
> >>>> +        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >>>> +                                      &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>>          }
> >>>> -        error_free(local_err);
> >>>> -        local_err = NULL;
> >>>>      }
> >>>>  
> >>>>      if (!spapr->ics) {
> >>>> @@ -119,10 +113,11 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>          spapr->icp_type = TYPE_ICP;
> >>>>          spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
> >>>>                                        &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +        }
> >>>>      }
> >>>> -
> >>>> -error:
> >>>> -    error_propagate(errp, local_err);
> >>>>  }
> >>>>  
> >>>>  #define ICS_IRQ_FREE(ics, srcno)   \
> >>>> @@ -233,7 +228,17 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
> >>>>  
> >>>>  static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>> -    /* TODO: create the KVM XICS device */
> >>>> +    MachineState *machine = MACHINE(spapr);
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +        xics_kvm_init(spapr, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XICS connect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>>  }
> >>>>  
> >>>>  #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
> >>>> @@ -393,6 +398,7 @@ static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
> >>>>  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>>      CPUState *cs;
> >>>> +    Error *local_err = NULL;
> >>>>  
> >>>>      CPU_FOREACH(cs) {
> >>>>          PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> @@ -401,6 +407,15 @@ static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
> >>>>          spapr_xive_set_tctx_os_cam(cpu->tctx);
> >>>>      }
> >>>>  
> >>>> +    if (kvmppc_xive_enabled()) {
> >>>> +        kvmppc_xive_connect(spapr->xive, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XIVE connect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>>      /* Activate the XIVE MMIOs */
> >>>>      spapr_xive_mmio_set_enabled(spapr->xive, true);
> >>>>  }
> >>>> @@ -462,14 +477,8 @@ static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
> >>>>  
> >>>>  static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>> -    MachineState *machine = MACHINE(spapr);
> >>>>      Error *local_err = NULL;
> >>>>  
> >>>> -    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> -        error_setg(errp, "No KVM support for the 'dual' machine");
> >>>> -        return;
> >>>> -    }
> >>>> -
> >>>>      spapr_irq_xics.init(spapr, &local_err);
> >>>>      if (local_err) {
> >>>>          error_propagate(errp, local_err);
> >>>> @@ -568,11 +577,16 @@ static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
> >>>>  
> >>>>  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>>>  {
> >>>> +    MachineState *machine = MACHINE(spapr);
> >>>> +
> >>>>      /*
> >>>>       * Force a reset of the XIVE backend after migration. The machine
> >>>>       * defaults to XICS at startup.
> >>>>       */
> >>>>      if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
> >>>> +        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +            xics_kvm_disconnect(spapr, &error_fatal);
> >>>> +        }
> >>>>          spapr_irq_xive.reset(spapr, &error_fatal);
> >>>>      }
> >>>>  
> >>>> @@ -581,12 +595,31 @@ static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
> >>>>  
> >>>>  static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>> +    MachineState *machine = MACHINE(spapr);
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>>      /*
> >>>>       * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
> >>>>       * if selected.
> >>>>       */
> >>>>      spapr_xive_mmio_set_enabled(spapr->xive, false);
> >>>>  
> >>>> +    /* Destroy all KVM devices */
> >>>> +    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
> >>>> +        xics_kvm_disconnect(spapr, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XICS disconnect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +        kvmppc_xive_disconnect(spapr->xive, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            error_prepend(errp, "KVM XIVE disconnect failed: ");
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>>      spapr_irq_current(spapr)->reset(spapr, errp);
> >>>>  }
> >>>>    
> >>>  
> >>  
> >   
> 
>
David Gibson Feb. 14, 2019, 3:29 a.m. UTC | #6
On Wed, Feb 13, 2019 at 09:22:46AM +0100, Cédric Le Goater wrote:
> On 2/13/19 2:32 AM, David Gibson wrote:
> > On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
> >> On 2/12/19 2:11 AM, David Gibson wrote:
> >>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
> >>>> The interrupt mode is chosen by the CAS negotiation process and
> >>>> activated after a reset to take into account the required changes in
> >>>> the machine. This brings new constraints on how the associated KVM IRQ
> >>>> device is initialized.
> >>>>
> >>>> Currently, each model takes care of the initialization of the KVM
> >>>> device in their realize method but this is not possible anymore as the
> >>>> initialization needs to be done globaly when the interrupt mode is
> >>>> known, i.e. when machine is reseted. It also means that we need a way
> >>>> to delete a KVM device when another mode is chosen.
> >>>>
> >>>> Also, to support migration, the QEMU objects holding the state to
> >>>> transfer should always be available but not necessarily activated.
> >>>>
> >>>> The overall approach of this proposal is to initialize both interrupt
> >>>> mode at the QEMU level and keep the IRQ number space in sync to allow
> >>>> switching from one mode to another. For the KVM side of things, the
> >>>> whole initialization of the KVM device, sources and presenters, is
> >>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> >>>> handlers are modified accordingly to handle the init and the delete
> >>>> sequences of the KVM device.
> >>>>
> >>>> As KVM is now initialized at reset, we loose the possiblity to
> >>>> fallback to the QEMU emulated mode in case of failure and failures
> >>>> become fatal to the machine.
> >>>>
> >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >>>> ---
> >>>>  hw/intc/spapr_xive.c     |  8 +---
> >>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> >>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> >>>>  hw/intc/xive.c           |  4 --
> >>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> >>>>  5 files changed, 109 insertions(+), 34 deletions(-)
> >>>>
> >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> >>>> index 21f3c1ef0901..0661aca35900 100644
> >>>> --- a/hw/intc/spapr_xive.c
> >>>> +++ b/hw/intc/spapr_xive.c
> >>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> >>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> >>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> >>>>  
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_connect(xive, &local_err);
> >>>> -        if (local_err) {
> >>>> -            error_propagate(errp, local_err);
> >>>> -            return;
> >>>> -        }
> >>>> -    } else {
> >>>> +    if (!kvmppc_xive_enabled()) {
> >>>>          /* TIMA initialization */
> >>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> >>>>                                "xive.tima", 4ull << TM_SHIFT);
> >>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> >>>> index d35814c1992e..3ebc947f2be7 100644
> >>>> --- a/hw/intc/spapr_xive_kvm.c
> >>>> +++ b/hw/intc/spapr_xive_kvm.c
> >>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      Error *local_err = NULL;
> >>>>      size_t esb_len;
> >>>>      size_t tima_len;
> >>>> +    CPUState *cs;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XIVE device already in use. This is the case when
> >>>> +     * rebooting XIVE -> XIVE
> >>>
> >>> Can this case actually occur?  Further down you appear to
> >>> unconditionally destroy both KVM devices at reset time.
> >>
> >> I guess you are right. I will check.
> >>
> >>>> +     */
> >>>> +    if (xive->fd != -1) {
> >>>> +        return;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> >>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> >>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> >>>>      xive->change = qemu_add_vm_change_state_handler(
> >>>>          kvmppc_xive_change_state_handler, xive);
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
> >>>> +    if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            return;
> >>>> +    }
> >>>> +
> >>>>      kvm_kernel_irqchip = true;
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> >>>> index 1d21ff217b82..bfc35d71df7f 100644
> >>>> --- a/hw/intc/xics_kvm.c
> >>>> +++ b/hw/intc/xics_kvm.c
> >>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>  {
> >>>>      int rc;
> >>>> +    CPUState *cs;
> >>>> +    Error *local_err = NULL;
> >>>> +
> >>>> +    /*
> >>>> +     * The KVM XICS device already in use. This is the case when
> >>>> +     * rebooting XICS -> XICS
> >>>> +     */
> >>>> +    if (kernel_xics_fd != -1) {
> >>>> +        return 0;
> >>>> +    }
> >>>>  
> >>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> >>>>          error_setg(errp,
> >>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> >>>>      kvm_msi_via_irqfd_allowed = true;
> >>>>      kvm_gsi_direct_mapping = true;
> >>>>  
> >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> >>>> +    CPU_FOREACH(cs) {
> >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> >>>> +
> >>>> +        icp_kvm_connect(cpu->icp, &local_err);
> >>>> +        if (local_err) {
> >>>> +            error_propagate(errp, local_err);
> >>>> +            goto fail;
> >>>> +        }
> >>>> +        icp_set_kvm_state(cpu->icp, 1);
> >>>> +    }
> >>>> +
> >>>> +    /* Update the KVM sources */
> >>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> >>>> +
> >>>>      return 0;
> >>>>  
> >>>>  fail:
> >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >>>> index c5c2fbc3f8bc..c166eab5b210 100644
> >>>> --- a/hw/intc/xive.c
> >>>> +++ b/hw/intc/xive.c
> >>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> >>>>  
> >>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> >>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> >>>> -
> >>>> -    if (kvmppc_xive_enabled()) {
> >>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> >>>> -    }
> >>>>  }
> >>>>  
> >>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
> >>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> >>>> index ba27d9d8e972..5592eec3787b 100644
> >>>> --- a/hw/ppc/spapr_irq.c
> >>>> +++ b/hw/ppc/spapr_irq.c
> >>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> >>>>      int nr_irqs = spapr->irq->nr_irqs;
> >>>>      Error *local_err = NULL;
> >>>>  
> >>>> -    if (kvm_enabled()) {
> >>>> -        if (machine_kernel_irqchip_allowed(machine) &&
> >>>> -            !xics_kvm_init(spapr, &local_err)) {
> >>>> -            spapr->icp_type = TYPE_KVM_ICP;
> >>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> >>>> -                                          &local_err);
> >>>> -        }
> >>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> >>>> -            error_prepend(&local_err,
> >>>> -                          "kernel_irqchip requested but unavailable: ");
> >>>> -            goto error;
> >>>
> >>> I don't see anything that replaces the irqchip_required logic, which
> >>> doesn't seem right.
> >>
> >> Yes. We do loose the ability to fall back to the emulated device in case
> >> of failure. It is not impossible to do but it will require more changes
> >> to check what are the KVM capabilities before starting the machine.
> > 
> > Uh... it seems more like it's the other way around.  We'll always fall
> > back to emulated, even if we've explicitly said on the command line
> > that we don't want that.
> 
> Ah yes. The init function might be also broken. 
> 
> XICS mode is a bit more difficult to handle than XIVE because we have 
> different object type for the KVM device and the QEMU emulated device, 
> and with the 'dual' mode, we activate the device at CAS reset time.

Yeah.. we should probably fix that.

> Failures being handled at reset time, should we keep the same logic and  
> abort the machine at reset if the kernel irqchip is required ? 
> 
> But we won't be able to fall back on the QEMU emulated device if KVM 
> XICS fails and if the kernel irqchip is only allowed. It should work for 
> XIVE though.

That's fine.  If we've said that kernel irqchip is required, we
shouldn't fall back to emulation.
David Gibson Feb. 14, 2019, 3:35 a.m. UTC | #7
On Wed, Feb 13, 2019 at 11:07:49AM +0100, Greg Kurz wrote:
> On Wed, 13 Feb 2019 09:22:46 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
> 
> > On 2/13/19 2:32 AM, David Gibson wrote:
> > > On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:  
> > >> On 2/12/19 2:11 AM, David Gibson wrote:  
> > >>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:  
> > >>>> The interrupt mode is chosen by the CAS negotiation process and
> > >>>> activated after a reset to take into account the required changes in
> > >>>> the machine. This brings new constraints on how the associated KVM IRQ
> > >>>> device is initialized.
> > >>>>
> > >>>> Currently, each model takes care of the initialization of the KVM
> > >>>> device in their realize method but this is not possible anymore as the
> > >>>> initialization needs to be done globaly when the interrupt mode is
> > >>>> known, i.e. when machine is reseted. It also means that we need a way
> > >>>> to delete a KVM device when another mode is chosen.
> > >>>>
> > >>>> Also, to support migration, the QEMU objects holding the state to
> > >>>> transfer should always be available but not necessarily activated.
> > >>>>
> > >>>> The overall approach of this proposal is to initialize both interrupt
> > >>>> mode at the QEMU level and keep the IRQ number space in sync to allow
> > >>>> switching from one mode to another. For the KVM side of things, the
> > >>>> whole initialization of the KVM device, sources and presenters, is
> > >>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
> > >>>> handlers are modified accordingly to handle the init and the delete
> > >>>> sequences of the KVM device.
> > >>>>
> > >>>> As KVM is now initialized at reset, we loose the possiblity to
> > >>>> fallback to the QEMU emulated mode in case of failure and failures
> > >>>> become fatal to the machine.
> > >>>>
> > >>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> > >>>> ---
> > >>>>  hw/intc/spapr_xive.c     |  8 +---
> > >>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
> > >>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
> > >>>>  hw/intc/xive.c           |  4 --
> > >>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
> > >>>>  5 files changed, 109 insertions(+), 34 deletions(-)
> > >>>>
> > >>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
> > >>>> index 21f3c1ef0901..0661aca35900 100644
> > >>>> --- a/hw/intc/spapr_xive.c
> > >>>> +++ b/hw/intc/spapr_xive.c
> > >>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
> > >>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
> > >>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
> > >>>>  
> > >>>> -    if (kvmppc_xive_enabled()) {
> > >>>> -        kvmppc_xive_connect(xive, &local_err);
> > >>>> -        if (local_err) {
> > >>>> -            error_propagate(errp, local_err);
> > >>>> -            return;
> > >>>> -        }
> > >>>> -    } else {
> > >>>> +    if (!kvmppc_xive_enabled()) {
> > >>>>          /* TIMA initialization */
> > >>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
> > >>>>                                "xive.tima", 4ull << TM_SHIFT);
> > >>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
> > >>>> index d35814c1992e..3ebc947f2be7 100644
> > >>>> --- a/hw/intc/spapr_xive_kvm.c
> > >>>> +++ b/hw/intc/spapr_xive_kvm.c
> > >>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> > >>>>      Error *local_err = NULL;
> > >>>>      size_t esb_len;
> > >>>>      size_t tima_len;
> > >>>> +    CPUState *cs;
> > >>>> +
> > >>>> +    /*
> > >>>> +     * The KVM XIVE device already in use. This is the case when
> > >>>> +     * rebooting XIVE -> XIVE  
> > >>>
> > >>> Can this case actually occur?  Further down you appear to
> > >>> unconditionally destroy both KVM devices at reset time.  
> > >>
> > >> I guess you are right. I will check.
> > >>  
> > >>>> +     */
> > >>>> +    if (xive->fd != -1) {
> > >>>> +        return;
> > >>>> +    }
> > >>>>  
> > >>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
> > >>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
> > >>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
> > >>>>      xive->change = qemu_add_vm_change_state_handler(
> > >>>>          kvmppc_xive_change_state_handler, xive);
> > >>>>  
> > >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> > >>>> +    CPU_FOREACH(cs) {
> > >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> > >>>> +
> > >>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
> > >>>> +        if (local_err) {
> > >>>> +            error_propagate(errp, local_err);
> > >>>> +            return;
> > >>>> +        }
> > >>>> +    }
> > >>>> +
> > >>>> +    /* Update the KVM sources */
> > >>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
> > >>>> +    if (local_err) {
> > >>>> +            error_propagate(errp, local_err);
> > >>>> +            return;
> > >>>> +    }
> > >>>> +
> > >>>>      kvm_kernel_irqchip = true;
> > >>>>      kvm_msi_via_irqfd_allowed = true;
> > >>>>      kvm_gsi_direct_mapping = true;
> > >>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> > >>>> index 1d21ff217b82..bfc35d71df7f 100644
> > >>>> --- a/hw/intc/xics_kvm.c
> > >>>> +++ b/hw/intc/xics_kvm.c
> > >>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> > >>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> > >>>>  {
> > >>>>      int rc;
> > >>>> +    CPUState *cs;
> > >>>> +    Error *local_err = NULL;
> > >>>> +
> > >>>> +    /*
> > >>>> +     * The KVM XICS device already in use. This is the case when
> > >>>> +     * rebooting XICS -> XICS
> > >>>> +     */
> > >>>> +    if (kernel_xics_fd != -1) {
> > >>>> +        return 0;
> > >>>> +    }
> > >>>>  
> > >>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
> > >>>>          error_setg(errp,
> > >>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
> > >>>>      kvm_msi_via_irqfd_allowed = true;
> > >>>>      kvm_gsi_direct_mapping = true;
> > >>>>  
> > >>>> +    /* Connect the presenters to the initial VCPUs of the machine */
> > >>>> +    CPU_FOREACH(cs) {
> > >>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
> > >>>> +
> > >>>> +        icp_kvm_connect(cpu->icp, &local_err);
> > >>>> +        if (local_err) {
> > >>>> +            error_propagate(errp, local_err);
> > >>>> +            goto fail;
> > >>>> +        }
> > >>>> +        icp_set_kvm_state(cpu->icp, 1);
> > >>>> +    }
> > >>>> +
> > >>>> +    /* Update the KVM sources */
> > >>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
> > >>>> +
> > >>>>      return 0;
> > >>>>  
> > >>>>  fail:
> > >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> > >>>> index c5c2fbc3f8bc..c166eab5b210 100644
> > >>>> --- a/hw/intc/xive.c
> > >>>> +++ b/hw/intc/xive.c
> > >>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
> > >>>>  
> > >>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
> > >>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
> > >>>> -
> > >>>> -    if (kvmppc_xive_enabled()) {
> > >>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
> > >>>> -    }
> > >>>>  }
> > >>>>  
> > >>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
> > >>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> > >>>> index ba27d9d8e972..5592eec3787b 100644
> > >>>> --- a/hw/ppc/spapr_irq.c
> > >>>> +++ b/hw/ppc/spapr_irq.c
> > >>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
> > >>>>      int nr_irqs = spapr->irq->nr_irqs;
> > >>>>      Error *local_err = NULL;
> > >>>>  
> > >>>> -    if (kvm_enabled()) {
> > >>>> -        if (machine_kernel_irqchip_allowed(machine) &&
> > >>>> -            !xics_kvm_init(spapr, &local_err)) {
> > >>>> -            spapr->icp_type = TYPE_KVM_ICP;
> > >>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
> > >>>> -                                          &local_err);
> > >>>> -        }
> > >>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
> > >>>> -            error_prepend(&local_err,
> > >>>> -                          "kernel_irqchip requested but unavailable: ");
> > >>>> -            goto error;  
> > >>>
> > >>> I don't see anything that replaces the irqchip_required logic, which
> > >>> doesn't seem right.  
> > >>
> > >> Yes. We do loose the ability to fall back to the emulated device in case
> > >> of failure. It is not impossible to do but it will require more changes
> > >> to check what are the KVM capabilities before starting the machine.  
> > > 
> > > Uh... it seems more like it's the other way around.  We'll always fall
> > > back to emulated, even if we've explicitly said on the command line
> > > that we don't want that.  
> > 
> > Ah yes. The init function might be also broken. 
> > 
> > XICS mode is a bit more difficult to handle than XIVE because we have 
> > different object type for the KVM device and the QEMU emulated device, 
> 
> This is indeed a bit unfortunate, but I think there's still room for
> improvement. Let's look at the base classes:
> 
> struct ICPStateClass {
>     DeviceClass parent_class;
> 
>     DeviceRealize parent_realize;
>     DeviceReset parent_reset;
> 
>     void (*pre_save)(ICPState *icp);
>     int (*post_load)(ICPState *icp, int version_id);
>     void (*synchronize_state)(ICPState *icp);
> };
> 
> struct ICSStateClass {
>     DeviceClass parent_class;
> 
>     DeviceRealize parent_realize;
>     DeviceReset parent_reset;
> 
>     void (*pre_save)(ICSState *s);
>     int (*post_load)(ICSState *s, int version_id);
>     void (*reject)(ICSState *s, uint32_t irq);
>     void (*resend)(ICSState *s);
>     void (*eoi)(ICSState *s, uint32_t irq);
>     void (*synchronize_state)(ICSState *s);
> };
> 
> The pre_save and post_load callbacks are only used with
> the KVM device. They could be explicitely called from
> the corresponding VMStateDescription callbacks with a
> kvm_enabled() && kvm_irqchip_in_kernel() check.
> 
> Same goes for the synchronize_state callbacks, which are only
> needed for 'info pic'.
> 
> The reject, resend and eoi callbacks are only called by code that
> belongs to the QEMU emulated device. Either the RTAS/hypercalls
> or from the machine code with explicit checks like:
> 
> static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
> {
>     sPAPRMachineState *spapr = opaque;
>     MachineState *machine = MACHINE(opaque);
> 
>     if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>         ics_kvm_set_irq(spapr->ics, srcno, val);
>     } else {
>         ics_simple_set_irq(spapr->ics, srcno, val);
>     }
> }
> 
> or
> 
> static int spapr_irq_post_load_xics(sPAPRMachineState *spapr, int version_id)
> {
>     if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
>         CPUState *cs;
>         CPU_FOREACH(cs) {
>             PowerPCCPU *cpu = POWERPC_CPU(cs);
>             icp_resend(spapr_cpu_state(cpu)->icp);
>         }
>     }
>     return 0;
> }
> 
> Unless I'm missing something, the reject, resend and eoi callbacks could
> simply be removed. This would allow to unify KVM and QEMU emulation in
> the same ICP and ICS object types.
> 
> If this makes sense to you, I can have a look (already started actually ;-)

Please do.  The use of different object types was something that
seemed like a good idea at the time, but in hindsight, wasn't.  In
general different device types should represent guest-visibly
different objects, not just implementation differences.

> > and with the 'dual' mode, we activate the device at CAS reset time.
> > 
> > Failures being handled at reset time, should we keep the same logic and  
> > abort the machine at reset if the kernel irqchip is required ? 
> > 
> 
> If the user passed ic-mode=dual,kernel-irqchip=on, we should at least make
> sure KVM supports both XICS and XIVE devices during machine init. Then
> during reset if something goes wrong with KVM, it seems ok to abort.
> 
> If the user didn't pass kernel-irqchip, ie, kernel_irqchip_allowed is true
> and kernel_irqchip_required is false, the current behavior for XICS is
> to try KVM first and fallback to QEMU emulation. I guess it could be the
> same for XIVE.

Yes, I think that's the behaviour we want, on all counts.
Cédric Le Goater Feb. 14, 2019, 7:13 a.m. UTC | #8
On 2/14/19 4:35 AM, David Gibson wrote:
> On Wed, Feb 13, 2019 at 11:07:49AM +0100, Greg Kurz wrote:
>> On Wed, 13 Feb 2019 09:22:46 +0100
>> Cédric Le Goater <clg@kaod.org> wrote:
>>
>>> On 2/13/19 2:32 AM, David Gibson wrote:
>>>> On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:  
>>>>> On 2/12/19 2:11 AM, David Gibson wrote:  
>>>>>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:  
>>>>>>> The interrupt mode is chosen by the CAS negotiation process and
>>>>>>> activated after a reset to take into account the required changes in
>>>>>>> the machine. This brings new constraints on how the associated KVM IRQ
>>>>>>> device is initialized.
>>>>>>>
>>>>>>> Currently, each model takes care of the initialization of the KVM
>>>>>>> device in their realize method but this is not possible anymore as the
>>>>>>> initialization needs to be done globaly when the interrupt mode is
>>>>>>> known, i.e. when machine is reseted. It also means that we need a way
>>>>>>> to delete a KVM device when another mode is chosen.
>>>>>>>
>>>>>>> Also, to support migration, the QEMU objects holding the state to
>>>>>>> transfer should always be available but not necessarily activated.
>>>>>>>
>>>>>>> The overall approach of this proposal is to initialize both interrupt
>>>>>>> mode at the QEMU level and keep the IRQ number space in sync to allow
>>>>>>> switching from one mode to another. For the KVM side of things, the
>>>>>>> whole initialization of the KVM device, sources and presenters, is
>>>>>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>>>>>>> handlers are modified accordingly to handle the init and the delete
>>>>>>> sequences of the KVM device.
>>>>>>>
>>>>>>> As KVM is now initialized at reset, we loose the possiblity to
>>>>>>> fallback to the QEMU emulated mode in case of failure and failures
>>>>>>> become fatal to the machine.
>>>>>>>
>>>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>>>> ---
>>>>>>>  hw/intc/spapr_xive.c     |  8 +---
>>>>>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>>>>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>>>>>>  hw/intc/xive.c           |  4 --
>>>>>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>>>>>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>>>>> index 21f3c1ef0901..0661aca35900 100644
>>>>>>> --- a/hw/intc/spapr_xive.c
>>>>>>> +++ b/hw/intc/spapr_xive.c
>>>>>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>>>>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>>>>>>  
>>>>>>> -    if (kvmppc_xive_enabled()) {
>>>>>>> -        kvmppc_xive_connect(xive, &local_err);
>>>>>>> -        if (local_err) {
>>>>>>> -            error_propagate(errp, local_err);
>>>>>>> -            return;
>>>>>>> -        }
>>>>>>> -    } else {
>>>>>>> +    if (!kvmppc_xive_enabled()) {
>>>>>>>          /* TIMA initialization */
>>>>>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>>>>>>                                "xive.tima", 4ull << TM_SHIFT);
>>>>>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>>>>>> index d35814c1992e..3ebc947f2be7 100644
>>>>>>> --- a/hw/intc/spapr_xive_kvm.c
>>>>>>> +++ b/hw/intc/spapr_xive_kvm.c
>>>>>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>>>>      Error *local_err = NULL;
>>>>>>>      size_t esb_len;
>>>>>>>      size_t tima_len;
>>>>>>> +    CPUState *cs;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * The KVM XIVE device already in use. This is the case when
>>>>>>> +     * rebooting XIVE -> XIVE  
>>>>>>
>>>>>> Can this case actually occur?  Further down you appear to
>>>>>> unconditionally destroy both KVM devices at reset time.  
>>>>>
>>>>> I guess you are right. I will check.
>>>>>  
>>>>>>> +     */
>>>>>>> +    if (xive->fd != -1) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>>  
>>>>>>>      if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
>>>>>>>          error_setg(errp, "IRQ_XIVE capability must be present for KVM");
>>>>>>> @@ -800,6 +809,24 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>>>>      xive->change = qemu_add_vm_change_state_handler(
>>>>>>>          kvmppc_xive_change_state_handler, xive);
>>>>>>>  
>>>>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>>>>> +    CPU_FOREACH(cs) {
>>>>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>>>>> +
>>>>>>> +        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
>>>>>>> +        if (local_err) {
>>>>>>> +            error_propagate(errp, local_err);
>>>>>>> +            return;
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Update the KVM sources */
>>>>>>> +    kvmppc_xive_source_reset(xsrc, &local_err);
>>>>>>> +    if (local_err) {
>>>>>>> +            error_propagate(errp, local_err);
>>>>>>> +            return;
>>>>>>> +    }
>>>>>>> +
>>>>>>>      kvm_kernel_irqchip = true;
>>>>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>>>>      kvm_gsi_direct_mapping = true;
>>>>>>> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
>>>>>>> index 1d21ff217b82..bfc35d71df7f 100644
>>>>>>> --- a/hw/intc/xics_kvm.c
>>>>>>> +++ b/hw/intc/xics_kvm.c
>>>>>>> @@ -448,6 +448,16 @@ static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>>>>>>  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>>>>  {
>>>>>>>      int rc;
>>>>>>> +    CPUState *cs;
>>>>>>> +    Error *local_err = NULL;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * The KVM XICS device already in use. This is the case when
>>>>>>> +     * rebooting XICS -> XICS
>>>>>>> +     */
>>>>>>> +    if (kernel_xics_fd != -1) {
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>>  
>>>>>>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
>>>>>>>          error_setg(errp,
>>>>>>> @@ -496,6 +506,21 @@ int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
>>>>>>>      kvm_msi_via_irqfd_allowed = true;
>>>>>>>      kvm_gsi_direct_mapping = true;
>>>>>>>  
>>>>>>> +    /* Connect the presenters to the initial VCPUs of the machine */
>>>>>>> +    CPU_FOREACH(cs) {
>>>>>>> +        PowerPCCPU *cpu = POWERPC_CPU(cs);
>>>>>>> +
>>>>>>> +        icp_kvm_connect(cpu->icp, &local_err);
>>>>>>> +        if (local_err) {
>>>>>>> +            error_propagate(errp, local_err);
>>>>>>> +            goto fail;
>>>>>>> +        }
>>>>>>> +        icp_set_kvm_state(cpu->icp, 1);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Update the KVM sources */
>>>>>>> +    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
>>>>>>> +
>>>>>>>      return 0;
>>>>>>>  
>>>>>>>  fail:
>>>>>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>>>>>>> index c5c2fbc3f8bc..c166eab5b210 100644
>>>>>>> --- a/hw/intc/xive.c
>>>>>>> +++ b/hw/intc/xive.c
>>>>>>> @@ -932,10 +932,6 @@ static void xive_source_reset(void *dev)
>>>>>>>  
>>>>>>>      /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
>>>>>>>      memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
>>>>>>> -
>>>>>>> -    if (kvmppc_xive_enabled()) {
>>>>>>> -        kvmppc_xive_source_reset(xsrc, &error_fatal);
>>>>>>> -    }
>>>>>>>  }
>>>>>>>  
>>>>>>>  static void xive_source_realize(DeviceState *dev, Error **errp)
>>>>>>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>>>>>>> index ba27d9d8e972..5592eec3787b 100644
>>>>>>> --- a/hw/ppc/spapr_irq.c
>>>>>>> +++ b/hw/ppc/spapr_irq.c
>>>>>>> @@ -98,20 +98,14 @@ static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
>>>>>>>      int nr_irqs = spapr->irq->nr_irqs;
>>>>>>>      Error *local_err = NULL;
>>>>>>>  
>>>>>>> -    if (kvm_enabled()) {
>>>>>>> -        if (machine_kernel_irqchip_allowed(machine) &&
>>>>>>> -            !xics_kvm_init(spapr, &local_err)) {
>>>>>>> -            spapr->icp_type = TYPE_KVM_ICP;
>>>>>>> -            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
>>>>>>> -                                          &local_err);
>>>>>>> -        }
>>>>>>> -        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
>>>>>>> -            error_prepend(&local_err,
>>>>>>> -                          "kernel_irqchip requested but unavailable: ");
>>>>>>> -            goto error;  
>>>>>>
>>>>>> I don't see anything that replaces the irqchip_required logic, which
>>>>>> doesn't seem right.  
>>>>>
>>>>> Yes. We do loose the ability to fall back to the emulated device in case
>>>>> of failure. It is not impossible to do but it will require more changes
>>>>> to check what are the KVM capabilities before starting the machine.  
>>>>
>>>> Uh... it seems more like it's the other way around.  We'll always fall
>>>> back to emulated, even if we've explicitly said on the command line
>>>> that we don't want that.  
>>>
>>> Ah yes. The init function might be also broken. 
>>>
>>> XICS mode is a bit more difficult to handle than XIVE because we have 
>>> different object type for the KVM device and the QEMU emulated device, 
>>
>> This is indeed a bit unfortunate, but I think there's still room for
>> improvement. Let's look at the base classes:
>>
>> struct ICPStateClass {
>>     DeviceClass parent_class;
>>
>>     DeviceRealize parent_realize;
>>     DeviceReset parent_reset;
>>
>>     void (*pre_save)(ICPState *icp);
>>     int (*post_load)(ICPState *icp, int version_id);
>>     void (*synchronize_state)(ICPState *icp);
>> };
>>
>> struct ICSStateClass {
>>     DeviceClass parent_class;
>>
>>     DeviceRealize parent_realize;
>>     DeviceReset parent_reset;
>>
>>     void (*pre_save)(ICSState *s);
>>     int (*post_load)(ICSState *s, int version_id);
>>     void (*reject)(ICSState *s, uint32_t irq);
>>     void (*resend)(ICSState *s);
>>     void (*eoi)(ICSState *s, uint32_t irq);
>>     void (*synchronize_state)(ICSState *s);
>> };
>>
>> The pre_save and post_load callbacks are only used with
>> the KVM device. They could be explicitely called from
>> the corresponding VMStateDescription callbacks with a
>> kvm_enabled() && kvm_irqchip_in_kernel() check.

yes.
>> Same goes for the synchronize_state callbacks, which are only
>> needed for 'info pic'.

yes, like we do for XIVE.

>> The reject, resend and eoi callbacks are only called by code that
>> belongs to the QEMU emulated device. Either the RTAS/hypercalls
>> or from the machine code with explicit checks like:
>>
>> static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
>> {
>>     sPAPRMachineState *spapr = opaque;
>>     MachineState *machine = MACHINE(opaque);
>>
>>     if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
>>         ics_kvm_set_irq(spapr->ics, srcno, val);
>>     } else {
>>         ics_simple_set_irq(spapr->ics, srcno, val);
>>     }
>> }
>>
>> or
>>
>> static int spapr_irq_post_load_xics(sPAPRMachineState *spapr, int version_id)
>> {
>>     if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
>>         CPUState *cs;
>>         CPU_FOREACH(cs) {
>>             PowerPCCPU *cpu = POWERPC_CPU(cs);
>>             icp_resend(spapr_cpu_state(cpu)->icp);
>>         }
>>     }
>>     return 0;
>> }
>>
>> Unless I'm missing something, the reject, resend and eoi callbacks could
>> simply be removed. This would allow to unify KVM and QEMU emulation in
>> the same ICP and ICS object types.

yes.  

>> If this makes sense to you, I can have a look (already started actually ;-)
> 
> Please do.  The use of different object types was something that
> seemed like a good idea at the time, but in hindsight, wasn't.  In
> general different device types should represent guest-visibly
> different objects, not just implementation differences.

yes. The direction we took to add KVM support in XIVE proved to be 
much simpler.

>>> and with the 'dual' mode, we activate the device at CAS reset time.
>>>
>>> Failures being handled at reset time, should we keep the same logic and  
>>> abort the machine at reset if the kernel irqchip is required ? 
>>>
>>
>> If the user passed ic-mode=dual,kernel-irqchip=on, we should at least make
>> sure KVM supports both XICS and XIVE devices during machine init. Then
>> during reset if something goes wrong with KVM, it seems ok to abort.
>>
>> If the user didn't pass kernel-irqchip, ie, kernel_irqchip_allowed is true
>> and kernel_irqchip_required is false, the current behavior for XICS is
>> to try KVM first and fallback to QEMU emulation. I guess it could be the
>> same for XIVE.
> 
> Yes, I think that's the behaviour we want, on all counts.
 
Thanks,

C.
Cédric Le Goater Feb. 22, 2019, 12:36 p.m. UTC | #9
On 2/13/19 2:32 AM, David Gibson wrote:
> On Tue, Feb 12, 2019 at 08:18:19AM +0100, Cédric Le Goater wrote:
>> On 2/12/19 2:11 AM, David Gibson wrote:
>>> On Mon, Jan 07, 2019 at 07:39:46PM +0100, Cédric Le Goater wrote:
>>>> The interrupt mode is chosen by the CAS negotiation process and
>>>> activated after a reset to take into account the required changes in
>>>> the machine. This brings new constraints on how the associated KVM IRQ
>>>> device is initialized.
>>>>
>>>> Currently, each model takes care of the initialization of the KVM
>>>> device in their realize method but this is not possible anymore as the
>>>> initialization needs to be done globaly when the interrupt mode is
>>>> known, i.e. when machine is reseted. It also means that we need a way
>>>> to delete a KVM device when another mode is chosen.
>>>>
>>>> Also, to support migration, the QEMU objects holding the state to
>>>> transfer should always be available but not necessarily activated.
>>>>
>>>> The overall approach of this proposal is to initialize both interrupt
>>>> mode at the QEMU level and keep the IRQ number space in sync to allow
>>>> switching from one mode to another. For the KVM side of things, the
>>>> whole initialization of the KVM device, sources and presenters, is
>>>> grouped in a single routine. The XICS and XIVE sPAPR IRQ reset
>>>> handlers are modified accordingly to handle the init and the delete
>>>> sequences of the KVM device.
>>>>
>>>> As KVM is now initialized at reset, we loose the possiblity to
>>>> fallback to the QEMU emulated mode in case of failure and failures
>>>> become fatal to the machine.
>>>>
>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>> ---
>>>>  hw/intc/spapr_xive.c     |  8 +---
>>>>  hw/intc/spapr_xive_kvm.c | 27 ++++++++++++++
>>>>  hw/intc/xics_kvm.c       | 25 +++++++++++++
>>>>  hw/intc/xive.c           |  4 --
>>>>  hw/ppc/spapr_irq.c       | 79 ++++++++++++++++++++++++++++------------
>>>>  5 files changed, 109 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
>>>> index 21f3c1ef0901..0661aca35900 100644
>>>> --- a/hw/intc/spapr_xive.c
>>>> +++ b/hw/intc/spapr_xive.c
>>>> @@ -330,13 +330,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp)
>>>>      xive->eat = g_new0(XiveEAS, xive->nr_irqs);
>>>>      xive->endt = g_new0(XiveEND, xive->nr_ends);
>>>>  
>>>> -    if (kvmppc_xive_enabled()) {
>>>> -        kvmppc_xive_connect(xive, &local_err);
>>>> -        if (local_err) {
>>>> -            error_propagate(errp, local_err);
>>>> -            return;
>>>> -        }
>>>> -    } else {
>>>> +    if (!kvmppc_xive_enabled()) {
>>>>          /* TIMA initialization */
>>>>          memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
>>>>                                "xive.tima", 4ull << TM_SHIFT);
>>>> diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
>>>> index d35814c1992e..3ebc947f2be7 100644
>>>> --- a/hw/intc/spapr_xive_kvm.c
>>>> +++ b/hw/intc/spapr_xive_kvm.c
>>>> @@ -737,6 +737,15 @@ void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
>>>>      Error *local_err = NULL;
>>>>      size_t esb_len;
>>>>      size_t tima_len;
>>>> +    CPUState *cs;
>>>> +
>>>> +    /*
>>>> +     * The KVM XIVE device already in use. This is the case when
>>>> +     * rebooting XIVE -> XIVE
>>>
>>> Can this case actually occur?  Further down you appear to
>>> unconditionally destroy both KVM devices at reset time.
>>
>> I guess you are right. I will check.

So we have to keep it for ic-mode=xive.

The number of test combinations is exploding. We now have :

3 hypervisors              KVM, KVM nested, QEMU TCG
3 Interrupt modes          xics, xive, dual (xics+xive)
3 kernel irqchip modes     off, allow, required.


C.
diff mbox series

Patch

diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 21f3c1ef0901..0661aca35900 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -330,13 +330,7 @@  static void spapr_xive_realize(DeviceState *dev, Error **errp)
     xive->eat = g_new0(XiveEAS, xive->nr_irqs);
     xive->endt = g_new0(XiveEND, xive->nr_ends);
 
-    if (kvmppc_xive_enabled()) {
-        kvmppc_xive_connect(xive, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
-    } else {
+    if (!kvmppc_xive_enabled()) {
         /* TIMA initialization */
         memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive,
                               "xive.tima", 4ull << TM_SHIFT);
diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c
index d35814c1992e..3ebc947f2be7 100644
--- a/hw/intc/spapr_xive_kvm.c
+++ b/hw/intc/spapr_xive_kvm.c
@@ -737,6 +737,15 @@  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     Error *local_err = NULL;
     size_t esb_len;
     size_t tima_len;
+    CPUState *cs;
+
+    /*
+     * The KVM XIVE device already in use. This is the case when
+     * rebooting XIVE -> XIVE
+     */
+    if (xive->fd != -1) {
+        return;
+    }
 
     if (!kvm_enabled() || !kvmppc_has_cap_xive()) {
         error_setg(errp, "IRQ_XIVE capability must be present for KVM");
@@ -800,6 +809,24 @@  void kvmppc_xive_connect(sPAPRXive *xive, Error **errp)
     xive->change = qemu_add_vm_change_state_handler(
         kvmppc_xive_change_state_handler, xive);
 
+    /* Connect the presenters to the initial VCPUs of the machine */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        kvmppc_xive_cpu_connect(cpu->tctx, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    /* Update the KVM sources */
+    kvmppc_xive_source_reset(xsrc, &local_err);
+    if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+    }
+
     kvm_kernel_irqchip = true;
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index 1d21ff217b82..bfc35d71df7f 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -448,6 +448,16 @@  static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
 {
     int rc;
+    CPUState *cs;
+    Error *local_err = NULL;
+
+    /*
+     * The KVM XICS device already in use. This is the case when
+     * rebooting XICS -> XICS
+     */
+    if (kernel_xics_fd != -1) {
+        return 0;
+    }
 
     if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) {
         error_setg(errp,
@@ -496,6 +506,21 @@  int xics_kvm_init(sPAPRMachineState *spapr, Error **errp)
     kvm_msi_via_irqfd_allowed = true;
     kvm_gsi_direct_mapping = true;
 
+    /* Connect the presenters to the initial VCPUs of the machine */
+    CPU_FOREACH(cs) {
+        PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+        icp_kvm_connect(cpu->icp, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            goto fail;
+        }
+        icp_set_kvm_state(cpu->icp, 1);
+    }
+
+    /* Update the KVM sources */
+    ics_set_kvm_state(ICS_KVM(spapr->ics), 1);
+
     return 0;
 
 fail:
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index c5c2fbc3f8bc..c166eab5b210 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -932,10 +932,6 @@  static void xive_source_reset(void *dev)
 
     /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */
     memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs);
-
-    if (kvmppc_xive_enabled()) {
-        kvmppc_xive_source_reset(xsrc, &error_fatal);
-    }
 }
 
 static void xive_source_realize(DeviceState *dev, Error **errp)
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
index ba27d9d8e972..5592eec3787b 100644
--- a/hw/ppc/spapr_irq.c
+++ b/hw/ppc/spapr_irq.c
@@ -98,20 +98,14 @@  static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
     int nr_irqs = spapr->irq->nr_irqs;
     Error *local_err = NULL;
 
-    if (kvm_enabled()) {
-        if (machine_kernel_irqchip_allowed(machine) &&
-            !xics_kvm_init(spapr, &local_err)) {
-            spapr->icp_type = TYPE_KVM_ICP;
-            spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
-                                          &local_err);
-        }
-        if (machine_kernel_irqchip_required(machine) && !spapr->ics) {
-            error_prepend(&local_err,
-                          "kernel_irqchip requested but unavailable: ");
-            goto error;
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        spapr->icp_type = TYPE_KVM_ICP;
+        spapr->ics = spapr_ics_create(spapr, TYPE_ICS_KVM, nr_irqs,
+                                      &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
         }
-        error_free(local_err);
-        local_err = NULL;
     }
 
     if (!spapr->ics) {
@@ -119,10 +113,11 @@  static void spapr_irq_init_xics(sPAPRMachineState *spapr, Error **errp)
         spapr->icp_type = TYPE_ICP;
         spapr->ics = spapr_ics_create(spapr, TYPE_ICS_SIMPLE, nr_irqs,
                                       &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
     }
-
-error:
-    error_propagate(errp, local_err);
 }
 
 #define ICS_IRQ_FREE(ics, srcno)   \
@@ -233,7 +228,17 @@  static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val)
 
 static void spapr_irq_reset_xics(sPAPRMachineState *spapr, Error **errp)
 {
-    /* TODO: create the KVM XICS device */
+    MachineState *machine = MACHINE(spapr);
+    Error *local_err = NULL;
+
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        xics_kvm_init(spapr, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XICS connect failed: ");
+            return;
+        }
+    }
 }
 
 #define SPAPR_IRQ_XICS_NR_IRQS     0x1000
@@ -393,6 +398,7 @@  static int spapr_irq_post_load_xive(sPAPRMachineState *spapr, int version_id)
 static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
 {
     CPUState *cs;
+    Error *local_err = NULL;
 
     CPU_FOREACH(cs) {
         PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -401,6 +407,15 @@  static void spapr_irq_reset_xive(sPAPRMachineState *spapr, Error **errp)
         spapr_xive_set_tctx_os_cam(cpu->tctx);
     }
 
+    if (kvmppc_xive_enabled()) {
+        kvmppc_xive_connect(spapr->xive, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XIVE connect failed: ");
+            return;
+        }
+    }
+
     /* Activate the XIVE MMIOs */
     spapr_xive_mmio_set_enabled(spapr->xive, true);
 }
@@ -462,14 +477,8 @@  static sPAPRIrq *spapr_irq_current(sPAPRMachineState *spapr)
 
 static void spapr_irq_init_dual(sPAPRMachineState *spapr, Error **errp)
 {
-    MachineState *machine = MACHINE(spapr);
     Error *local_err = NULL;
 
-    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
-        error_setg(errp, "No KVM support for the 'dual' machine");
-        return;
-    }
-
     spapr_irq_xics.init(spapr, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
@@ -568,11 +577,16 @@  static void spapr_irq_cpu_intc_create_dual(sPAPRMachineState *spapr,
 
 static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
 {
+    MachineState *machine = MACHINE(spapr);
+
     /*
      * Force a reset of the XIVE backend after migration. The machine
      * defaults to XICS at startup.
      */
     if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) {
+        if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+            xics_kvm_disconnect(spapr, &error_fatal);
+        }
         spapr_irq_xive.reset(spapr, &error_fatal);
     }
 
@@ -581,12 +595,31 @@  static int spapr_irq_post_load_dual(sPAPRMachineState *spapr, int version_id)
 
 static void spapr_irq_reset_dual(sPAPRMachineState *spapr, Error **errp)
 {
+    MachineState *machine = MACHINE(spapr);
+    Error *local_err = NULL;
+
     /*
      * Deactivate the XIVE MMIOs. The XIVE backend will reenable them
      * if selected.
      */
     spapr_xive_mmio_set_enabled(spapr->xive, false);
 
+    /* Destroy all KVM devices */
+    if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) {
+        xics_kvm_disconnect(spapr, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XICS disconnect failed: ");
+            return;
+        }
+        kvmppc_xive_disconnect(spapr->xive, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            error_prepend(errp, "KVM XIVE disconnect failed: ");
+            return;
+        }
+    }
+
     spapr_irq_current(spapr)->reset(spapr, errp);
 }