diff mbox series

[17/25] spapr: add a sPAPRXive object to the machine

Message ID 20171123132955.1261-18-clg@kaod.org
State New
Headers show
Series spapr: Guest exploitation of the XIVE interrupt controller (POWER9) | expand

Commit Message

Cédric Le Goater Nov. 23, 2017, 1:29 p.m. UTC
The XIVE object is designed to be always available, so it is created
unconditionally on newer machines. Depending on the configuration and
the guest capabilities, the CAS negotiation process will decide which
interrupt model to use, legacy or XIVE.

The XIVE model makes use of the full range of the IRQ number space
because the IRQ numbers for the CPU IPIs are allocated in the range
below XICS_IRQ_BASE, which is unused by XICS.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 hw/ppc/spapr.c         | 34 ++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr.h |  2 ++
 2 files changed, 36 insertions(+)

Comments

David Gibson Nov. 30, 2017, 5:55 a.m. UTC | #1
On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
> The XIVE object is designed to be always available, so it is created
> unconditionally on newer machines.

There doesn't actually seem to be anything dependent on machine
version here.

> Depending on the configuration and
> the guest capabilities, the CAS negotiation process will decide which
> interrupt model to use, legacy or XIVE.
> 
> The XIVE model makes use of the full range of the IRQ number space
> because the IRQ numbers for the CPU IPIs are allocated in the range
> below XICS_IRQ_BASE, which is unused by XICS.

Ok.  And I take it 4096 is enough space for the XIVE IPIs for the
forseeable future?

> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  hw/ppc/spapr.c         | 34 ++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |  2 ++
>  2 files changed, 36 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 5d3325ca3c88..0e0107c8272c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -56,6 +56,7 @@
>  #include "hw/ppc/spapr_vio.h"
>  #include "hw/pci-host/spapr.h"
>  #include "hw/ppc/xics.h"
> +#include "hw/ppc/spapr_xive.h"
>  #include "hw/pci/msi.h"
>  
>  #include "hw/pci/pci.h"
> @@ -204,6 +205,29 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
>      }
>  }
>  
> +static sPAPRXive *spapr_xive_create(sPAPRMachineState *spapr, int nr_irqs,
> +                                    Error **errp)
> +{
> +    Error *local_err = NULL;
> +    Object *obj;
> +
> +    obj = object_new(TYPE_SPAPR_XIVE);
> +    object_property_add_child(OBJECT(spapr), "xive", obj, &error_abort);
> +    object_property_set_int(obj, nr_irqs, "nr-irqs",  &local_err);
> +    if (local_err) {
> +        goto error;
> +    }
> +    object_property_set_bool(obj, true, "realized", &local_err);
> +    if (local_err) {
> +        goto error;
> +    }
> +
> +    return SPAPR_XIVE(obj);
> +error:
> +    error_propagate(errp, local_err);
> +    return NULL;
> +}
> +
>  static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
>                                    int smt_threads)
>  {
> @@ -2360,6 +2384,16 @@ static void ppc_spapr_init(MachineState *machine)
>      /* Set up Interrupt Controller before we create the VCPUs */
>      xics_system_init(machine, XICS_IRQS_SPAPR, &error_fatal);
>  
> +    /* We don't have KVM support yet, so check for irqchip=on */
> +    if (kvm_enabled() && machine_kernel_irqchip_required(machine)) {
> +        error_report("kernel_irqchip requested. no XIVE support");

I think you want an actual exit(1) here, no?  error_report() will
print an error but keep going.

> +    } else {
> +        /* XIVE uses the full range of IRQ numbers. The CPU IPIs will
> +         * use the range below XICS_IRQ_BASE, which is unused by XICS. */
> +        spapr->xive = spapr_xive_create(spapr, XICS_IRQ_BASE + XICS_IRQS_SPAPR,
> +                                        &error_fatal);

XICS_IRQ_BASE == 4096, and XICS_IRQS_SPAPR (which we should rename at
some point) == 1024.

So we have a total irq space of 5k, which is a bit odd.  I'd be ok
with rounding it out to 8k for newer machines if that's useful.
Sparse allocations in there might make life easier for getting
consistent irq numbers without an "allocator" per se (because we can
use different regions for VIO, PCI intx, MSI, etc. etc.).

> +    }
> +
>      /* Set up containers for ibm,client-architecture-support negotiated options
>       */
>      spapr->ov5 = spapr_ovec_new();
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 9a3885593c86..90e2b0f6c678 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -14,6 +14,7 @@ struct sPAPRNVRAM;
>  typedef struct sPAPREventLogEntry sPAPREventLogEntry;
>  typedef struct sPAPREventSource sPAPREventSource;
>  typedef struct sPAPRPendingHPT sPAPRPendingHPT;
> +typedef struct sPAPRXive sPAPRXive;
>  
>  #define HPTE64_V_HPTE_DIRTY     0x0000000000000040ULL
>  #define SPAPR_ENTRY_POINT       0x100
> @@ -127,6 +128,7 @@ struct sPAPRMachineState {
>      MemoryHotplugState hotplug_memory;
>  
>      const char *icp_type;
> +    sPAPRXive  *xive;
>  };
>  
>  #define H_SUCCESS         0
Cédric Le Goater Nov. 30, 2017, 3:15 p.m. UTC | #2
On 11/30/2017 05:55 AM, David Gibson wrote:
> On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
>> The XIVE object is designed to be always available, so it is created
>> unconditionally on newer machines.
> 
> There doesn't actually seem to be anything dependent on machine
> version here.

No. I thought that was too early in the patchset. This is handled 
in the last patch with a 'xive_exploitation' bool which is set to 
false on older machines. 

But, nevertheless, the XIVE objects are always created even if not
used. Something to discuss. 

>> Depending on the configuration and
>> the guest capabilities, the CAS negotiation process will decide which
>> interrupt model to use, legacy or XIVE.
>>
>> The XIVE model makes use of the full range of the IRQ number space
>> because the IRQ numbers for the CPU IPIs are allocated in the range
>> below XICS_IRQ_BASE, which is unused by XICS.
> 
> Ok.  And I take it 4096 is enough space for the XIVE IPIs for the
> forseeable future?

The biggest real system I am aware of as 16 sockets, 192 cores, SMT8. 
That's 1536 cpus. pseries has a max_cpus of 1024. 

>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  hw/ppc/spapr.c         | 34 ++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/spapr.h |  2 ++
>>  2 files changed, 36 insertions(+)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 5d3325ca3c88..0e0107c8272c 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -56,6 +56,7 @@
>>  #include "hw/ppc/spapr_vio.h"
>>  #include "hw/pci-host/spapr.h"
>>  #include "hw/ppc/xics.h"
>> +#include "hw/ppc/spapr_xive.h"
>>  #include "hw/pci/msi.h"
>>  
>>  #include "hw/pci/pci.h"
>> @@ -204,6 +205,29 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
>>      }
>>  }
>>  
>> +static sPAPRXive *spapr_xive_create(sPAPRMachineState *spapr, int nr_irqs,
>> +                                    Error **errp)
>> +{
>> +    Error *local_err = NULL;
>> +    Object *obj;
>> +
>> +    obj = object_new(TYPE_SPAPR_XIVE);
>> +    object_property_add_child(OBJECT(spapr), "xive", obj, &error_abort);
>> +    object_property_set_int(obj, nr_irqs, "nr-irqs",  &local_err);
>> +    if (local_err) {
>> +        goto error;
>> +    }
>> +    object_property_set_bool(obj, true, "realized", &local_err);
>> +    if (local_err) {
>> +        goto error;
>> +    }
>> +
>> +    return SPAPR_XIVE(obj);
>> +error:
>> +    error_propagate(errp, local_err);
>> +    return NULL;
>> +}
>> +
>>  static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
>>                                    int smt_threads)
>>  {
>> @@ -2360,6 +2384,16 @@ static void ppc_spapr_init(MachineState *machine)
>>      /* Set up Interrupt Controller before we create the VCPUs */
>>      xics_system_init(machine, XICS_IRQS_SPAPR, &error_fatal);
>>  
>> +    /* We don't have KVM support yet, so check for irqchip=on */
>> +    if (kvm_enabled() && machine_kernel_irqchip_required(machine)) {
>> +        error_report("kernel_irqchip requested. no XIVE support");
> 
> I think you want an actual exit(1) here, no?  error_report() will
> print an error but keep going.

yes. Today, it coredumps. I am not sure why. I will add an exit().

> 
>> +    } else {
>> +        /* XIVE uses the full range of IRQ numbers. The CPU IPIs will
>> +         * use the range below XICS_IRQ_BASE, which is unused by XICS. */
>> +        spapr->xive = spapr_xive_create(spapr, XICS_IRQ_BASE + XICS_IRQS_SPAPR,
>> +                                        &error_fatal);
> 
> XICS_IRQ_BASE == 4096, and XICS_IRQS_SPAPR (which we should rename at
> some point) == 1024.
> 
> So we have a total irq space of 5k, which is a bit odd.  I'd be ok
> with rounding it out to 8k for newer machines if that's useful.
> Sparse allocations in there might make life easier for getting
> consistent irq numbers without an "allocator" per se (because we can
> use different regions for VIO, PCI intx, MSI, etc. etc.).
I will start another thread on that topic.

Thanks,

C. 

> 
>> +    }
>> +
>>      /* Set up containers for ibm,client-architecture-support negotiated options
>>       */
>>      spapr->ov5 = spapr_ovec_new();
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 9a3885593c86..90e2b0f6c678 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -14,6 +14,7 @@ struct sPAPRNVRAM;
>>  typedef struct sPAPREventLogEntry sPAPREventLogEntry;
>>  typedef struct sPAPREventSource sPAPREventSource;
>>  typedef struct sPAPRPendingHPT sPAPRPendingHPT;
>> +typedef struct sPAPRXive sPAPRXive;
>>  
>>  #define HPTE64_V_HPTE_DIRTY     0x0000000000000040ULL
>>  #define SPAPR_ENTRY_POINT       0x100
>> @@ -127,6 +128,7 @@ struct sPAPRMachineState {
>>      MemoryHotplugState hotplug_memory;
>>  
>>      const char *icp_type;
>> +    sPAPRXive  *xive;
>>  };
>>  
>>  #define H_SUCCESS         0
>
Cédric Le Goater Nov. 30, 2017, 3:38 p.m. UTC | #3
>> +    } else {
>> +        /* XIVE uses the full range of IRQ numbers. The CPU IPIs will
>> +         * use the range below XICS_IRQ_BASE, which is unused by XICS. */
>> +        spapr->xive = spapr_xive_create(spapr, XICS_IRQ_BASE + XICS_IRQS_SPAPR,
>> +                                        &error_fatal);
> 
> XICS_IRQ_BASE == 4096, and XICS_IRQS_SPAPR (which we should rename at
> some point) == 1024.

BTW, why XICS_IRQ_BASE == 4096 ? I could not find a reason for
this offset. 

> So we have a total irq space of 5k, which is a bit odd.  I'd be ok
> with rounding it out to 8k for newer machines if that's useful.

ok. and using a machine class value to maintain compatibility. That 
would be useful if we allocate more PHBs. 

> Sparse allocations in there might make life easier for getting
> consistent irq numbers without an "allocator" per se (because we can
> use different regions for VIO, PCI intx, MSI, etc. etc.).

So, do you think we should modify the IRQ allocator routines to be 
able to segment the IRQ number space and let devices specify the
range they want to use ? 

That would be useful for the PHB LSIs. The starting IRQ for the PHB
could be aligned on some value depending on the PHB index, first 
would come the LSI interrupts and then the MSIs which are allocated 
later on by the guest. We would have predictable values.

Thanks,

C.
David Gibson Dec. 1, 2017, 4:14 a.m. UTC | #4
On Thu, Nov 30, 2017 at 03:15:09PM +0000, Cédric Le Goater wrote:
> On 11/30/2017 05:55 AM, David Gibson wrote:
> > On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
> >> The XIVE object is designed to be always available, so it is created
> >> unconditionally on newer machines.
> > 
> > There doesn't actually seem to be anything dependent on machine
> > version here.
> 
> No. I thought that was too early in the patchset. This is handled 
> in the last patch with a 'xive_exploitation' bool which is set to 
> false on older machines. 
> 
> But, nevertheless, the XIVE objects are always created even if not
> used. Something to discuss.

That'll definitely break backwards migration, since the destination
won't understand the (unused but still present) xive state it
receives.  So xives can only be created on new machine types.  I'm ok
(at least tentatively) with always creating them on the newer machine
types, regardless of whether the guest ends up exploiting it or not.

> >> Depending on the configuration and
> >> the guest capabilities, the CAS negotiation process will decide which
> >> interrupt model to use, legacy or XIVE.
> >>
> >> The XIVE model makes use of the full range of the IRQ number space
> >> because the IRQ numbers for the CPU IPIs are allocated in the range
> >> below XICS_IRQ_BASE, which is unused by XICS.
> > 
> > Ok.  And I take it 4096 is enough space for the XIVE IPIs for the
> > forseeable future?
> 
> The biggest real system I am aware of as 16 sockets, 192 cores, SMT8. 
> That's 1536 cpus. pseries has a max_cpus of 1024.

Ok, so we can go to double the current system size, but not 4x.  Not
sure if that seems adequate or not.  Still it's a relatively minor
detail.
David Gibson Dec. 1, 2017, 4:17 a.m. UTC | #5
On Thu, Nov 30, 2017 at 03:38:46PM +0000, Cédric Le Goater wrote:
> >> +    } else {
> >> +        /* XIVE uses the full range of IRQ numbers. The CPU IPIs will
> >> +         * use the range below XICS_IRQ_BASE, which is unused by XICS. */
> >> +        spapr->xive = spapr_xive_create(spapr, XICS_IRQ_BASE + XICS_IRQS_SPAPR,
> >> +                                        &error_fatal);
> > 
> > XICS_IRQ_BASE == 4096, and XICS_IRQS_SPAPR (which we should rename at
> > some point) == 1024.
> 
> BTW, why XICS_IRQ_BASE == 4096 ? I could not find a reason for
> this offset.

It's basically arbitrary.  Possible I copied the value used in
practice on a PowerVM system of the time, but I don't recall for sure.

> > So we have a total irq space of 5k, which is a bit odd.  I'd be ok
> > with rounding it out to 8k for newer machines if that's useful.
> 
> ok. and using a machine class value to maintain compatibility. That 
> would be useful if we allocate more PHBs. 
> 
> > Sparse allocations in there might make life easier for getting
> > consistent irq numbers without an "allocator" per se (because we can
> > use different regions for VIO, PCI intx, MSI, etc. etc.).
> 
> So, do you think we should modify the IRQ allocator routines to be 
> able to segment the IRQ number space and let devices specify the
> range they want to use ?

No, I'm suggesting *eliminating* the IRQ allocator routines (except
for backwards compat) and having devices "just know" their irq numbers
based on their own device number and the portion of the overall irq
space they're supposed to live in.

PCI MSI is an exception, obviously, it will need some sort of runtime
allocation.

> That would be useful for the PHB LSIs. The starting IRQ for the PHB
> could be aligned on some value depending on the PHB index, first 
> would come the LSI interrupts and then the MSIs which are allocated 
> later on by the guest. We would have predictable values.
> 
> Thanks,
> 
> C. 
>
Cédric Le Goater Dec. 1, 2017, 8:10 a.m. UTC | #6
On 12/01/2017 05:14 AM, David Gibson wrote:
> On Thu, Nov 30, 2017 at 03:15:09PM +0000, Cédric Le Goater wrote:
>> On 11/30/2017 05:55 AM, David Gibson wrote:
>>> On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
>>>> The XIVE object is designed to be always available, so it is created
>>>> unconditionally on newer machines.
>>>
>>> There doesn't actually seem to be anything dependent on machine
>>> version here.
>>
>> No. I thought that was too early in the patchset. This is handled 
>> in the last patch with a 'xive_exploitation' bool which is set to 
>> false on older machines. 
>>
>> But, nevertheless, the XIVE objects are always created even if not
>> used. Something to discuss.
> 
> That'll definitely break backwards migration, since the destination
> won't understand the (unused but still present) xive state it
> receives. 

no because it's not sent. the vmstate 'needed' op of the sPAPRXive
object discards it :

    static bool vmstate_spapr_xive_needed(void *opaque)
    {
        sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
 
        return spapr->xive_exploitation;
    }

> So xives can only be created on new machine types. 

That would be better I agree. I can probably use the 'xive_exploitation'
bool to condition its creation.

> I'm ok
> (at least tentatively) with always creating them on the newer machine
> types, regardless of whether the guest ends up exploiting it or not.

OK.


>>>> Depending on the configuration and
>>>> the guest capabilities, the CAS negotiation process will decide which
>>>> interrupt model to use, legacy or XIVE.
>>>>
>>>> The XIVE model makes use of the full range of the IRQ number space
>>>> because the IRQ numbers for the CPU IPIs are allocated in the range
>>>> below XICS_IRQ_BASE, which is unused by XICS.
>>>
>>> Ok.  And I take it 4096 is enough space for the XIVE IPIs for the
>>> forseeable future?
>>
>> The biggest real system I am aware of as 16 sockets, 192 cores, SMT8. 
>> That's 1536 cpus. pseries has a max_cpus of 1024.
> 
> Ok, so we can go to double the current system size, but not 4x.  Not
> sure if that seems adequate or not.  Still it's a relatively minor
> detail.
>
David Gibson Dec. 4, 2017, 1:59 a.m. UTC | #7
On Fri, Dec 01, 2017 at 09:10:24AM +0100, Cédric Le Goater wrote:
> On 12/01/2017 05:14 AM, David Gibson wrote:
> > On Thu, Nov 30, 2017 at 03:15:09PM +0000, Cédric Le Goater wrote:
> >> On 11/30/2017 05:55 AM, David Gibson wrote:
> >>> On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
> >>>> The XIVE object is designed to be always available, so it is created
> >>>> unconditionally on newer machines.
> >>>
> >>> There doesn't actually seem to be anything dependent on machine
> >>> version here.
> >>
> >> No. I thought that was too early in the patchset. This is handled 
> >> in the last patch with a 'xive_exploitation' bool which is set to 
> >> false on older machines. 
> >>
> >> But, nevertheless, the XIVE objects are always created even if not
> >> used. Something to discuss.
> > 
> > That'll definitely break backwards migration, since the destination
> > won't understand the (unused but still present) xive state it
> > receives. 
> 
> no because it's not sent. the vmstate 'needed' op of the sPAPRXive
> object discards it :
> 
>     static bool vmstate_spapr_xive_needed(void *opaque)
>     {
>         sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>  
>         return spapr->xive_exploitation;
>     }

Ah, sorry, missed that.  Once we have negotiation we'll need to make
sure the xive_exploitation bit is sent first, of course, but I'm
pretty sure the machine state is already sent first.

> > So xives can only be created on new machine types. 
> 
> That would be better I agree. I can probably use the 'xive_exploitation'
> bool to condition its creation.

Hrm.  I'm less sure about that - I'm not sure the lifetimes line up.
But I'd like to avoid creating them on earlier machine types, even if
xive_exploitation can't do the trick.

> 
> > I'm ok
> > (at least tentatively) with always creating them on the newer machine
> > types, regardless of whether the guest ends up exploiting it or not.
> 
> OK.
> 
> 
> >>>> Depending on the configuration and
> >>>> the guest capabilities, the CAS negotiation process will decide which
> >>>> interrupt model to use, legacy or XIVE.
> >>>>
> >>>> The XIVE model makes use of the full range of the IRQ number space
> >>>> because the IRQ numbers for the CPU IPIs are allocated in the range
> >>>> below XICS_IRQ_BASE, which is unused by XICS.
> >>>
> >>> Ok.  And I take it 4096 is enough space for the XIVE IPIs for the
> >>> forseeable future?
> >>
> >> The biggest real system I am aware of as 16 sockets, 192 cores, SMT8. 
> >> That's 1536 cpus. pseries has a max_cpus of 1024.
> > 
> > Ok, so we can go to double the current system size, but not 4x.  Not
> > sure if that seems adequate or not.  Still it's a relatively minor
> > detail.
> > 
>
Cédric Le Goater Dec. 4, 2017, 8:32 a.m. UTC | #8
On 12/04/2017 02:59 AM, David Gibson wrote:
> On Fri, Dec 01, 2017 at 09:10:24AM +0100, Cédric Le Goater wrote:
>> On 12/01/2017 05:14 AM, David Gibson wrote:
>>> On Thu, Nov 30, 2017 at 03:15:09PM +0000, Cédric Le Goater wrote:
>>>> On 11/30/2017 05:55 AM, David Gibson wrote:
>>>>> On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
>>>>>> The XIVE object is designed to be always available, so it is created
>>>>>> unconditionally on newer machines.
>>>>>
>>>>> There doesn't actually seem to be anything dependent on machine
>>>>> version here.
>>>>
>>>> No. I thought that was too early in the patchset. This is handled 
>>>> in the last patch with a 'xive_exploitation' bool which is set to 
>>>> false on older machines. 
>>>>
>>>> But, nevertheless, the XIVE objects are always created even if not
>>>> used. Something to discuss.
>>>
>>> That'll definitely break backwards migration, since the destination
>>> won't understand the (unused but still present) xive state it
>>> receives. 
>>
>> no because it's not sent. the vmstate 'needed' op of the sPAPRXive
>> object discards it :
>>
>>     static bool vmstate_spapr_xive_needed(void *opaque)
>>     {
>>         sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>  
>>         return spapr->xive_exploitation;
>>     }
> 
> Ah, sorry, missed that.  Once we have negotiation we'll need to make
> sure the xive_exploitation bit is sent first, of course, but I'm
> pretty sure the machine state is already sent first.
> 
>>> So xives can only be created on new machine types. 
>>
>> That would be better I agree. I can probably use the 'xive_exploitation'
>> bool to condition its creation.
> 
> Hrm.  I'm less sure about that - I'm not sure the lifetimes line up.
> But I'd like to avoid creating them on earlier machine types, even if
> xive_exploitation can't do the trick.

Yes. I agree. I think we can work something out without introducing
too much complexity. The XIVE object is directly used by the
machine only to set/unset IRQ numbers. Otherwise, it is always 
conditioned by :

    spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)

I think adding a couple of more tests on the 'xive_exploitation'
bool should work out for older machines.

Thanks,
 
C.
David Gibson Dec. 4, 2017, 8:40 a.m. UTC | #9
On Mon, Dec 04, 2017 at 09:32:00AM +0100, Cédric Le Goater wrote:
> On 12/04/2017 02:59 AM, David Gibson wrote:
> > On Fri, Dec 01, 2017 at 09:10:24AM +0100, Cédric Le Goater wrote:
> >> On 12/01/2017 05:14 AM, David Gibson wrote:
> >>> On Thu, Nov 30, 2017 at 03:15:09PM +0000, Cédric Le Goater wrote:
> >>>> On 11/30/2017 05:55 AM, David Gibson wrote:
> >>>>> On Thu, Nov 23, 2017 at 02:29:47PM +0100, Cédric Le Goater wrote:
> >>>>>> The XIVE object is designed to be always available, so it is created
> >>>>>> unconditionally on newer machines.
> >>>>>
> >>>>> There doesn't actually seem to be anything dependent on machine
> >>>>> version here.
> >>>>
> >>>> No. I thought that was too early in the patchset. This is handled 
> >>>> in the last patch with a 'xive_exploitation' bool which is set to 
> >>>> false on older machines. 
> >>>>
> >>>> But, nevertheless, the XIVE objects are always created even if not
> >>>> used. Something to discuss.
> >>>
> >>> That'll definitely break backwards migration, since the destination
> >>> won't understand the (unused but still present) xive state it
> >>> receives. 
> >>
> >> no because it's not sent. the vmstate 'needed' op of the sPAPRXive
> >> object discards it :
> >>
> >>     static bool vmstate_spapr_xive_needed(void *opaque)
> >>     {
> >>         sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >>  
> >>         return spapr->xive_exploitation;
> >>     }
> > 
> > Ah, sorry, missed that.  Once we have negotiation we'll need to make
> > sure the xive_exploitation bit is sent first, of course, but I'm
> > pretty sure the machine state is already sent first.
> > 
> >>> So xives can only be created on new machine types. 
> >>
> >> That would be better I agree. I can probably use the 'xive_exploitation'
> >> bool to condition its creation.
> > 
> > Hrm.  I'm less sure about that - I'm not sure the lifetimes line up.
> > But I'd like to avoid creating them on earlier machine types, even if
> > xive_exploitation can't do the trick.
> 
> Yes. I agree. I think we can work something out without introducing
> too much complexity. The XIVE object is directly used by the
> machine only to set/unset IRQ numbers. Otherwise, it is always 
> conditioned by :
> 
>     spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)
> 
> I think adding a couple of more tests on the 'xive_exploitation'
> bool should work out for older machines.

Ok.  If not you can always add a "xive_possible" flag to the MachineClass.
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 5d3325ca3c88..0e0107c8272c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -56,6 +56,7 @@ 
 #include "hw/ppc/spapr_vio.h"
 #include "hw/pci-host/spapr.h"
 #include "hw/ppc/xics.h"
+#include "hw/ppc/spapr_xive.h"
 #include "hw/pci/msi.h"
 
 #include "hw/pci/pci.h"
@@ -204,6 +205,29 @@  static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
     }
 }
 
+static sPAPRXive *spapr_xive_create(sPAPRMachineState *spapr, int nr_irqs,
+                                    Error **errp)
+{
+    Error *local_err = NULL;
+    Object *obj;
+
+    obj = object_new(TYPE_SPAPR_XIVE);
+    object_property_add_child(OBJECT(spapr), "xive", obj, &error_abort);
+    object_property_set_int(obj, nr_irqs, "nr-irqs",  &local_err);
+    if (local_err) {
+        goto error;
+    }
+    object_property_set_bool(obj, true, "realized", &local_err);
+    if (local_err) {
+        goto error;
+    }
+
+    return SPAPR_XIVE(obj);
+error:
+    error_propagate(errp, local_err);
+    return NULL;
+}
+
 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                   int smt_threads)
 {
@@ -2360,6 +2384,16 @@  static void ppc_spapr_init(MachineState *machine)
     /* Set up Interrupt Controller before we create the VCPUs */
     xics_system_init(machine, XICS_IRQS_SPAPR, &error_fatal);
 
+    /* We don't have KVM support yet, so check for irqchip=on */
+    if (kvm_enabled() && machine_kernel_irqchip_required(machine)) {
+        error_report("kernel_irqchip requested. no XIVE support");
+    } else {
+        /* XIVE uses the full range of IRQ numbers. The CPU IPIs will
+         * use the range below XICS_IRQ_BASE, which is unused by XICS. */
+        spapr->xive = spapr_xive_create(spapr, XICS_IRQ_BASE + XICS_IRQS_SPAPR,
+                                        &error_fatal);
+    }
+
     /* Set up containers for ibm,client-architecture-support negotiated options
      */
     spapr->ov5 = spapr_ovec_new();
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 9a3885593c86..90e2b0f6c678 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -14,6 +14,7 @@  struct sPAPRNVRAM;
 typedef struct sPAPREventLogEntry sPAPREventLogEntry;
 typedef struct sPAPREventSource sPAPREventSource;
 typedef struct sPAPRPendingHPT sPAPRPendingHPT;
+typedef struct sPAPRXive sPAPRXive;
 
 #define HPTE64_V_HPTE_DIRTY     0x0000000000000040ULL
 #define SPAPR_ENTRY_POINT       0x100
@@ -127,6 +128,7 @@  struct sPAPRMachineState {
     MemoryHotplugState hotplug_memory;
 
     const char *icp_type;
+    sPAPRXive  *xive;
 };
 
 #define H_SUCCESS         0