diff mbox series

[v4,1/3] spapr: introduce a fixed IRQ number space

Message ID 20180706090713.15712-2-clg@kaod.org
State New
Headers show
Series spapr: introduce a fixed IRQ number space and an IRQ controller backend | expand

Commit Message

Cédric Le Goater July 6, 2018, 9:07 a.m. UTC
This proposal introduces a new IRQ number space layout using static
numbers for all devices, depending on a device index, and a bitmap
allocator for the MSI IRQ numbers which are negotiated by the guest at
runtime.

As the VIO device model does not have a device index but a "reg"
property, we introduce a formula to compute an IRQ number from a "reg"
value. It should minimize most of the collisions.

The previous layout is kept in pre-3.0 machines raising the
'legacy_irq_allocation' machine class flag.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 The new XICS layout will only be activated when a new pseries-3.1
 machine is introduced.

 include/hw/ppc/spapr.h     |  5 ++++
 include/hw/ppc/spapr_irq.h | 32 ++++++++++++++++++++++
 hw/ppc/spapr.c             | 32 +++++++++++++++++++++-
 hw/ppc/spapr_events.c      | 12 ++++++--
 hw/ppc/spapr_irq.c         | 56 ++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_pci.c         | 29 ++++++++++++++++----
 hw/ppc/spapr_vio.c         | 47 ++++++++++++++++++++++++++++----
 hw/ppc/Makefile.objs       |  2 +-
 8 files changed, 199 insertions(+), 16 deletions(-)
 create mode 100644 include/hw/ppc/spapr_irq.h
 create mode 100644 hw/ppc/spapr_irq.c

Comments

Cédric Le Goater July 6, 2018, 1:27 p.m. UTC | #1
> +/*
> + * The register property of a VIO device is defined in livirt using a
> + * base number + 0x1000 increment and in QEMU by incrementing the base
> + * register number 0x71000000.
> + *
> + * The formula below tries to compute a unique index number from the
> + * register value that will be used to define the IRQ number of the
> + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
> + *
> + * To minimize collisions, we define two distinct ranges depending on
> + * the "reg" value definition:
> + *
> + *     [0x00 - 0x7f]    user/libvirt
> + *     [0x80 - 0xff]    QEMU VIO model
> + *
> + * Collisions will be detected when the IRQ is claimed.
> + */
> +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
> +{
> +    if (reg >= SPAPR_VIO_REG_BASE) {
> +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
> +    } else {
> +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));

This is not good enough for the VIO devices with a user defined "reg".

I propose to extract bits [29-28] and [16-12] to compose a 7bit index :

	(((reg >> 28) & 0x3) << 5) | ((reg >> 12) & 0x1f)

That would give us a mapping :

	0x00000000  -> 0
	0x00001000  -> 1
		...
	0x0001F000  -> 31
	0x10000000  -> 32
	0x10001000  -> 33
		...
	0x1001F000  -> 63
	0x20000000  -> 64
	0x20001000  -> 65
		...
	0x2001F000  -> 95
	0x30000000  -> 96
	0x30001000  -> 97
		...
	0x3001F000  -> 127

C.
Greg Kurz July 6, 2018, 1:36 p.m. UTC | #2
On Fri,  6 Jul 2018 11:07:11 +0200
Cédric Le Goater <clg@kaod.org> wrote:

> This proposal introduces a new IRQ number space layout using static
> numbers for all devices, depending on a device index, and a bitmap
> allocator for the MSI IRQ numbers which are negotiated by the guest at
> runtime.
> 
> As the VIO device model does not have a device index but a "reg"
> property, we introduce a formula to compute an IRQ number from a "reg"
> value. It should minimize most of the collisions.
> 
> The previous layout is kept in pre-3.0 machines raising the
> 'legacy_irq_allocation' machine class flag.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  The new XICS layout will only be activated when a new pseries-3.1
>  machine is introduced.
> 
>  include/hw/ppc/spapr.h     |  5 ++++
>  include/hw/ppc/spapr_irq.h | 32 ++++++++++++++++++++++
>  hw/ppc/spapr.c             | 32 +++++++++++++++++++++-
>  hw/ppc/spapr_events.c      | 12 ++++++--
>  hw/ppc/spapr_irq.c         | 56 ++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_pci.c         | 29 ++++++++++++++++----
>  hw/ppc/spapr_vio.c         | 47 ++++++++++++++++++++++++++++----
>  hw/ppc/Makefile.objs       |  2 +-
>  8 files changed, 199 insertions(+), 16 deletions(-)
>  create mode 100644 include/hw/ppc/spapr_irq.h
>  create mode 100644 hw/ppc/spapr_irq.c
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 7e5de1a6fd42..73067f5ee8aa 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -8,6 +8,7 @@
>  #include "hw/ppc/spapr_drc.h"
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/ppc/spapr_ovec.h"
> +#include "hw/ppc/spapr_irq.h"
>  
>  struct VIOsPAPRBus;
>  struct sPAPRPHBState;
> @@ -101,6 +102,8 @@ struct sPAPRMachineClass {
>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>      bool pre_2_10_has_unused_icps;
> +    bool legacy_irq_allocation;
> +
>      void (*phb_placement)(sPAPRMachineState *spapr, uint32_t index,
>                            uint64_t *buid, hwaddr *pio, 
>                            hwaddr *mmio32, hwaddr *mmio64,
> @@ -167,6 +170,8 @@ struct sPAPRMachineState {
>      char *kvm_type;
>  
>      const char *icp_type;
> +    int32_t irq_map_nr;
> +    unsigned long *irq_map;
>  
>      bool cmd_line_caps[SPAPR_CAP_NUM];
>      sPAPRCapabilities def, eff, mig;
> diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h
> new file mode 100644
> index 000000000000..6f7f50548809
> --- /dev/null
> +++ b/include/hw/ppc/spapr_irq.h
> @@ -0,0 +1,32 @@
> +/*
> + * QEMU PowerPC sPAPR IRQ backend definitions
> + *
> + * Copyright (c) 2018, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#ifndef HW_SPAPR_IRQ_H
> +#define HW_SPAPR_IRQ_H
> +
> +/*
> + * IRQ range offsets per device type
> + */
> +#define SPAPR_IRQ_EPOW       0x1000  /* XICS_IRQ_BASE offset */
> +#define SPAPR_IRQ_HOTPLUG    0x1001
> +#define SPAPR_IRQ_VIO        0x1100  /* 256 VIO devices */
> +#define SPAPR_IRQ_PCI_LSI    0x1200  /* 32+ PHBs devices */
> +
> +#define SPAPR_IRQ_MSI        0x1300  /* Offset of the dynamic range covered
> +                                      * by the bitmap allocator */
> +
> +typedef struct sPAPRMachineState sPAPRMachineState;
> +
> +void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis);
> +int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
> +                        Error **errp);
> +void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num);
> +void spapr_irq_msi_reset(sPAPRMachineState *spapr);
> +
> +#endif
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3f5e1d3ec21a..7bc9a48abf04 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -189,6 +189,11 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
>      sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>      Error *local_err = NULL;
>  
> +    /* Initialize the MSI IRQ allocator. */
> +    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        spapr_irq_msi_init(spapr, XICS_IRQ_BASE + nr_irqs - SPAPR_IRQ_MSI);
> +    }
> +
>      if (kvm_enabled()) {
>          if (machine_kernel_irqchip_allowed(machine) &&
>              !xics_kvm_init(spapr, &local_err)) {
> @@ -1636,6 +1641,10 @@ static void spapr_machine_reset(void)
>          ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
>      }
>  
> +    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        spapr_irq_msi_reset(spapr);
> +    }
> +
>      qemu_devices_reset();
>  
>      /* DRC reset may cause a device to be unplugged. This will cause troubles
> @@ -1910,6 +1919,24 @@ static const VMStateDescription vmstate_spapr_patb_entry = {
>      },
>  };
>  
> +static bool spapr_irq_map_needed(void *opaque)
> +{
> +    sPAPRMachineState *spapr = opaque;
> +
> +    return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
> +}
> +
> +static const VMStateDescription vmstate_spapr_irq_map = {
> +    .name = "spapr_irq_map",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_irq_map_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BITMAP(irq_map, sPAPRMachineState, 0, irq_map_nr),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>      .name = "spapr",
>      .version_id = 3,
> @@ -1937,6 +1964,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_cap_cfpc,
>          &vmstate_spapr_cap_sbbc,
>          &vmstate_spapr_cap_ibs,
> +        &vmstate_spapr_irq_map,
>          NULL
>      }
>  };
> @@ -4067,7 +4095,9 @@ static void spapr_machine_3_0_instance_options(MachineState *machine)
>  
>  static void spapr_machine_3_0_class_options(MachineClass *mc)
>  {
> -    /* Defaults for the latest behaviour inherited from the base class */
> +    sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> +
> +    smc->legacy_irq_allocation = true;
>  }
>  
>  DEFINE_SPAPR_MACHINE(3_0, "3.0", true);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index e4f5946a2188..cab950d33446 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -709,7 +709,11 @@ void spapr_events_init(sPAPRMachineState *spapr)
>  {
>      int epow_irq;
>  
> -    epow_irq = spapr_irq_findone(spapr, &error_fatal);
> +    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        epow_irq = spapr_irq_findone(spapr, &error_fatal);
> +    } else {
> +        epow_irq = SPAPR_IRQ_EPOW;
> +    }
>  
>      spapr_irq_claim(spapr, epow_irq, false, &error_fatal);
>  
> @@ -731,7 +735,11 @@ void spapr_events_init(sPAPRMachineState *spapr)
>      if (spapr->use_hotplug_event_source) {
>          int hp_irq;
>  
> -        hp_irq = spapr_irq_findone(spapr, &error_fatal);
> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            hp_irq = spapr_irq_findone(spapr, &error_fatal);
> +        } else {
> +            hp_irq = SPAPR_IRQ_HOTPLUG;
> +        }
>  
>          spapr_irq_claim(spapr, hp_irq, false, &error_fatal);
>  
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> new file mode 100644
> index 000000000000..24e9c1d4433c
> --- /dev/null
> +++ b/hw/ppc/spapr_irq.c
> @@ -0,0 +1,56 @@
> +/*
> + * QEMU PowerPC sPAPR IRQ interface
> + *
> + * Copyright (c) 2018, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/xics.h"
> +
> +void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis)
> +{
> +    spapr->irq_map_nr = nr_msis;
> +    spapr->irq_map = bitmap_new(spapr->irq_map_nr);
> +}
> +
> +int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
> +                        Error **errp)
> +{
> +    int irq;
> +
> +    /*
> +     * The 'align_mask' parameter of bitmap_find_next_zero_area()
> +     * should be one less than a power of 2; 0 means no
> +     * alignment. Adapt the 'align' value of the former allocator
> +     * to fit the requirements of bitmap_find_next_zero_area()
> +     */
> +    align -= 1;
> +
> +    irq = bitmap_find_next_zero_area(spapr->irq_map, spapr->irq_map_nr, 0, num,
> +                                     align);
> +    if (irq == spapr->irq_map_nr) {
> +        error_setg(errp, "can't find a free %d-IRQ block", num);
> +        return -1;
> +    }
> +
> +    bitmap_set(spapr->irq_map, irq, num);
> +
> +    return irq + SPAPR_IRQ_MSI;
> +}
> +
> +void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num)
> +{
> +    bitmap_clear(spapr->irq_map, irq - SPAPR_IRQ_MSI, num);
> +}
> +
> +void spapr_irq_msi_reset(sPAPRMachineState *spapr)
> +{
> +    bitmap_clear(spapr->irq_map, 0, spapr->irq_map_nr);
> +}
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 497b896c7d24..cba5340f4bad 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -334,6 +334,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>              return;
>          }
>  
> +        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
> +        }
>          spapr_irq_free(spapr, msi->first_irq, msi->num);
>          if (msi_present(pdev)) {
>              spapr_msi_setmsg(pdev, 0, false, 0, 0);
> @@ -372,7 +375,13 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>      }
>  
>      /* Allocate MSIs */
> -    irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI, &err);
> +    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI,
> +                             &err);
> +    } else {
> +        irq = spapr_irq_msi_alloc(spapr, req_num,
> +                                  ret_intr_type == RTAS_TYPE_MSI, &err);
> +    }
>      if (err) {
>          error_reportf_err(err, "Can't allocate MSIs for device %x: ",
>                            config_addr);
> @@ -392,6 +401,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>  
>      /* Release previous MSIs */
>      if (msi) {
> +        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
> +        }
>          spapr_irq_free(spapr, msi->first_irq, msi->num);
>          g_hash_table_remove(phb->msi, &config_addr);
>      }
> @@ -1708,11 +1720,15 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>          uint32_t irq;
>          Error *local_err = NULL;
>  
> -        irq = spapr_irq_findone(spapr, &local_err);
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            error_prepend(errp, "can't allocate LSIs: ");
> -            return;
> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            irq = spapr_irq_findone(spapr, &local_err);
> +            if (local_err) {
> +                error_propagate(errp, local_err);
> +                error_prepend(errp, "can't allocate LSIs: ");
> +                return;
> +            }
> +        } else {
> +            irq = SPAPR_IRQ_PCI_LSI + sphb->index * PCI_NUM_PINS + i;
>          }
>  
>          spapr_irq_claim(spapr, irq, true, &local_err);
> @@ -2123,6 +2139,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof_ranges));
>      _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
> +    /* TODO: fine tune the total count of allocatable MSIs per PHB */
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR));
>  
>      /* Dynamic DMA window */
> diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
> index be9af71437cc..a0695200f348 100644
> --- a/hw/ppc/spapr_vio.c
> +++ b/hw/ppc/spapr_vio.c
> @@ -37,12 +37,13 @@
>  
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_vio.h"
> -#include "hw/ppc/xics.h"
>  #include "hw/ppc/fdt.h"
>  #include "trace.h"
>  
>  #include <libfdt.h>
>  
> +#define SPAPR_VIO_REG_BASE 0x71000000
> +
>  static void spapr_vio_get_irq(Object *obj, Visitor *v, const char *name,
>                                void *opaque, Error **errp)
>  {
> @@ -445,6 +446,32 @@ static void spapr_vio_busdev_reset(DeviceState *qdev)
>      }
>  }
>  
> +/*
> + * The register property of a VIO device is defined in livirt using a
> + * base number + 0x1000 increment and in QEMU by incrementing the base
> + * register number 0x71000000.
> + *
> + * The formula below tries to compute a unique index number from the
> + * register value that will be used to define the IRQ number of the
> + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
> + *
> + * To minimize collisions, we define two distinct ranges depending on
> + * the "reg" value definition:
> + *
> + *     [0x00 - 0x7f]    user/libvirt
> + *     [0x80 - 0xff]    QEMU VIO model
> + *
> + * Collisions will be detected when the IRQ is claimed.
> + */
> +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
> +{
> +    if (reg >= SPAPR_VIO_REG_BASE) {
> +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
> +    } else {
> +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));

Hmm... if you put a VIO net and two VIO vty in the domain XML, libvirt
will generate reg == 0x1000 for the VIO net and reg == 0x30001000 for the
second VIO vty... this will necessarily collide, won't it ?

With a 256 VIO devices limit, libvirt can only add 255 devices since
the nvram is created by QEMU by default (libvirt can only change its
reg using -global).

As David mentioned in another mail:

	VIO net devices start at reg=0x1000
	VIO scsi devices start at reg=0x2000
	VIO nvram devices start at reg=0x3000
	VIO vty devices start at reg=0x30000000
	    and increment by 0x1000 each type


The values for net, scsi and nvram overlap... which makes me wonder why
do we even care to have per-type base value !?! Anyway, I don't think
it's important for what you're trying to achieve.

Basically libvirt can generate regs in two distinct ranges:

- one for scsi/net/nvram:

smallest possible reg: 0x1000
largest possible reg: 0x2000 + 254 * 0x1000 = 0x100000

ie, 254 scsi devices starting at 0x2000 and 1 nvram

- one for vty 

smallest possible reg: 0x30000000
largest possible reg: 0x30000000 + 253 * 0x1000 = 0x300fd000

ie, 254 vty devices

Thinking about the bit shifting magic that is needed to convert
reg into a usable index makes my brain hurt, but I'll happily
review anything you propose :)

> +    }
> +}
> +
>  static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -485,10 +512,18 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
>      }
>  
>      if (!dev->irq) {
> -        dev->irq = spapr_irq_findone(spapr, &local_err);
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            return;
> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            dev->irq = spapr_irq_findone(spapr, &local_err);
> +            if (local_err) {
> +                error_propagate(errp, local_err);
> +                return;
> +            }
> +        } else {
> +            dev->irq = spapr_vio_reg_to_irq(dev->reg);
> +            if (dev->irq == SPAPR_IRQ_PCI_LSI) {
> +                error_setg(errp, "Too many VIO devices");
> +                return;
> +            }
>          }
>      }
>  
> @@ -557,7 +592,7 @@ VIOsPAPRBus *spapr_vio_bus_init(void)
>      /* Create bus on bridge device */
>      qbus = qbus_create(TYPE_SPAPR_VIO_BUS, dev, "spapr-vio");
>      bus = SPAPR_VIO_BUS(qbus);
> -    bus->next_reg = 0x71000000;
> +    bus->next_reg = SPAPR_VIO_REG_BASE;
>  
>      /* hcall-vio */
>      spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index bcab6323b7ed..4ab556467289 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
>  obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> -obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
> +obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
>  # IBM PowerNV
>  obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
Greg Kurz July 6, 2018, 2:09 p.m. UTC | #3
On Fri, 6 Jul 2018 15:36:24 +0200
Greg Kurz <groug@kaod.org> wrote:

[...]

> Hmm... if you put a VIO net and two VIO vty in the domain XML, libvirt
> will generate reg == 0x1000 for the VIO net and reg == 0x30001000 for the
> second VIO vty... this will necessarily collide, won't it ?
> 
> With a 256 VIO devices limit, libvirt can only add 255 devices since
> the nvram is created by QEMU by default (libvirt can only change its
> reg using -global).
> 
> As David mentioned in another mail:
> 
> 	VIO net devices start at reg=0x1000
> 	VIO scsi devices start at reg=0x2000
> 	VIO nvram devices start at reg=0x3000
> 	VIO vty devices start at reg=0x30000000
> 	    and increment by 0x1000 each type
> 
> 
> The values for net, scsi and nvram overlap... which makes me wonder why
> do we even care to have per-type base value !?! Anyway, I don't think
> it's important for what you're trying to achieve.
> 
> Basically libvirt can generate regs in two distinct ranges:
> 
> - one for scsi/net/nvram:
> 
> smallest possible reg: 0x1000
> largest possible reg: 0x2000 + 254 * 0x1000 = 0x100000
> 
> ie, 254 scsi devices starting at 0x2000 and 1 nvram
> 

Oops, it is 255 scsi devices plus 1 nvram so the largest
reg in this range is 0x101000

> - one for vty 
> 
> smallest possible reg: 0x30000000
> largest possible reg: 0x30000000 + 253 * 0x1000 = 0x300fd000
> 
> ie, 254 vty devices
> 

and 255 vty devices, ie, largest reg is 0x300fe000.

> Thinking about the bit shifting magic that is needed to convert
> reg into a usable index makes my brain hurt, but I'll happily
> review anything you propose :)
> 
> > +    }
> > +}
> > +
> >  static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
> >  {
> >      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > @@ -485,10 +512,18 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
> >      }
> >  
> >      if (!dev->irq) {
> > -        dev->irq = spapr_irq_findone(spapr, &local_err);
> > -        if (local_err) {
> > -            error_propagate(errp, local_err);
> > -            return;
> > +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> > +            dev->irq = spapr_irq_findone(spapr, &local_err);
> > +            if (local_err) {
> > +                error_propagate(errp, local_err);
> > +                return;
> > +            }
> > +        } else {
> > +            dev->irq = spapr_vio_reg_to_irq(dev->reg);
> > +            if (dev->irq == SPAPR_IRQ_PCI_LSI) {
> > +                error_setg(errp, "Too many VIO devices");
> > +                return;
> > +            }
> >          }
> >      }
> >  
> > @@ -557,7 +592,7 @@ VIOsPAPRBus *spapr_vio_bus_init(void)
> >      /* Create bus on bridge device */
> >      qbus = qbus_create(TYPE_SPAPR_VIO_BUS, dev, "spapr-vio");
> >      bus = SPAPR_VIO_BUS(qbus);
> > -    bus->next_reg = 0x71000000;
> > +    bus->next_reg = SPAPR_VIO_REG_BASE;
> >  
> >      /* hcall-vio */
> >      spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
> > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> > index bcab6323b7ed..4ab556467289 100644
> > --- a/hw/ppc/Makefile.objs
> > +++ b/hw/ppc/Makefile.objs
> > @@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
> >  obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
> >  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> >  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> > -obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
> > +obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
> >  # IBM PowerNV
> >  obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
> >  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)  
> 
>
Greg Kurz July 6, 2018, 2:16 p.m. UTC | #4
On Fri, 6 Jul 2018 15:27:17 +0200
Cédric Le Goater <clg@kaod.org> wrote:

>   > +/*
> > + * The register property of a VIO device is defined in livirt using a
> > + * base number + 0x1000 increment and in QEMU by incrementing the base
> > + * register number 0x71000000.
> > + *
> > + * The formula below tries to compute a unique index number from the
> > + * register value that will be used to define the IRQ number of the
> > + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
> > + *
> > + * To minimize collisions, we define two distinct ranges depending on
> > + * the "reg" value definition:
> > + *
> > + *     [0x00 - 0x7f]    user/libvirt
> > + *     [0x80 - 0xff]    QEMU VIO model
> > + *
> > + * Collisions will be detected when the IRQ is claimed.
> > + */
> > +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
> > +{
> > +    if (reg >= SPAPR_VIO_REG_BASE) {
> > +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
> > +    } else {
> > +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));  
> 
> This is not good enough for the VIO devices with a user defined "reg".
> 
> I propose to extract bits [29-28] and [16-12] to compose a 7bit index :
> 
> 	(((reg >> 28) & 0x3) << 5) | ((reg >> 12) & 0x1f)
> 

Looks good.

> That would give us a mapping :
> 
> 	0x00000000  -> 0
> 	0x00001000  -> 1
> 		...
> 	0x0001F000  -> 31
> 	0x10000000  -> 32
> 	0x10001000  -> 33
> 		...
> 	0x1001F000  -> 63
> 	0x20000000  -> 64
> 	0x20001000  -> 65
> 		...
> 	0x2001F000  -> 95
> 	0x30000000  -> 96
> 	0x30001000  -> 97
> 		...
> 	0x3001F000  -> 127
> 
> C.
> 
>
Cédric Le Goater July 6, 2018, 4:43 p.m. UTC | #5
On 07/06/2018 04:16 PM, Greg Kurz wrote:
> On Fri, 6 Jul 2018 15:27:17 +0200
> Cédric Le Goater <clg@kaod.org> wrote:
> 
>>   > +/*
>>> + * The register property of a VIO device is defined in livirt using a
>>> + * base number + 0x1000 increment and in QEMU by incrementing the base
>>> + * register number 0x71000000.
>>> + *
>>> + * The formula below tries to compute a unique index number from the
>>> + * register value that will be used to define the IRQ number of the
>>> + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
>>> + *
>>> + * To minimize collisions, we define two distinct ranges depending on
>>> + * the "reg" value definition:
>>> + *
>>> + *     [0x00 - 0x7f]    user/libvirt
>>> + *     [0x80 - 0xff]    QEMU VIO model
>>> + *
>>> + * Collisions will be detected when the IRQ is claimed.
>>> + */
>>> +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
>>> +{
>>> +    if (reg >= SPAPR_VIO_REG_BASE) {
>>> +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
>>> +    } else {
>>> +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));  
>>
>> This is not good enough for the VIO devices with a user defined "reg".
>>
>> I propose to extract bits [29-28] and [16-12] to compose a 7bit index :
>>
>> 	(((reg >> 28) & 0x3) << 5) | ((reg >> 12) & 0x1f)
>>
> 
> Looks good.

Thanks for the investigation you made in libvirt and the tests. 

C. 

> 
>> That would give us a mapping :
>>
>> 	0x00000000  -> 0
>> 	0x00001000  -> 1
>> 		...
>> 	0x0001F000  -> 31
>> 	0x10000000  -> 32
>> 	0x10001000  -> 33
>> 		...
>> 	0x1001F000  -> 63
>> 	0x20000000  -> 64
>> 	0x20001000  -> 65
>> 		...
>> 	0x2001F000  -> 95
>> 	0x30000000  -> 96
>> 	0x30001000  -> 97
>> 		...
>> 	0x3001F000  -> 127
>>
>> C.
>>
>>
>
David Gibson July 20, 2018, 2:18 a.m. UTC | #6
On Fri, Jul 06, 2018 at 11:07:11AM +0200, Cédric Le Goater wrote:
> This proposal introduces a new IRQ number space layout using static
> numbers for all devices, depending on a device index, and a bitmap
> allocator for the MSI IRQ numbers which are negotiated by the guest at
> runtime.
> 
> As the VIO device model does not have a device index but a "reg"
> property, we introduce a formula to compute an IRQ number from a "reg"
> value. It should minimize most of the collisions.
> 
> The previous layout is kept in pre-3.0 machines raising the
> 'legacy_irq_allocation' machine class flag.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  The new XICS layout will only be activated when a new pseries-3.1
>  machine is introduced.

Looking good.  I've noted a few nits below, but they're very minor -
I wouldn't even bother to ask for the changes if we didn't have to
wait for 3.1 anyway.


> 
>  include/hw/ppc/spapr.h     |  5 ++++
>  include/hw/ppc/spapr_irq.h | 32 ++++++++++++++++++++++
>  hw/ppc/spapr.c             | 32 +++++++++++++++++++++-
>  hw/ppc/spapr_events.c      | 12 ++++++--
>  hw/ppc/spapr_irq.c         | 56 ++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_pci.c         | 29 ++++++++++++++++----
>  hw/ppc/spapr_vio.c         | 47 ++++++++++++++++++++++++++++----
>  hw/ppc/Makefile.objs       |  2 +-
>  8 files changed, 199 insertions(+), 16 deletions(-)
>  create mode 100644 include/hw/ppc/spapr_irq.h
>  create mode 100644 hw/ppc/spapr_irq.c
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 7e5de1a6fd42..73067f5ee8aa 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -8,6 +8,7 @@
>  #include "hw/ppc/spapr_drc.h"
>  #include "hw/mem/pc-dimm.h"
>  #include "hw/ppc/spapr_ovec.h"
> +#include "hw/ppc/spapr_irq.h"
>  
>  struct VIOsPAPRBus;
>  struct sPAPRPHBState;
> @@ -101,6 +102,8 @@ struct sPAPRMachineClass {
>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>      bool pre_2_10_has_unused_icps;
> +    bool legacy_irq_allocation;
> +
>      void (*phb_placement)(sPAPRMachineState *spapr, uint32_t index,
>                            uint64_t *buid, hwaddr *pio, 
>                            hwaddr *mmio32, hwaddr *mmio64,
> @@ -167,6 +170,8 @@ struct sPAPRMachineState {
>      char *kvm_type;
>  
>      const char *icp_type;
> +    int32_t irq_map_nr;
> +    unsigned long *irq_map;
>  
>      bool cmd_line_caps[SPAPR_CAP_NUM];
>      sPAPRCapabilities def, eff, mig;
> diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h
> new file mode 100644
> index 000000000000..6f7f50548809
> --- /dev/null
> +++ b/include/hw/ppc/spapr_irq.h
> @@ -0,0 +1,32 @@
> +/*
> + * QEMU PowerPC sPAPR IRQ backend definitions
> + *
> + * Copyright (c) 2018, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#ifndef HW_SPAPR_IRQ_H
> +#define HW_SPAPR_IRQ_H
> +
> +/*
> + * IRQ range offsets per device type
> + */
> +#define SPAPR_IRQ_EPOW       0x1000  /* XICS_IRQ_BASE offset */
> +#define SPAPR_IRQ_HOTPLUG    0x1001
> +#define SPAPR_IRQ_VIO        0x1100  /* 256 VIO devices */
> +#define SPAPR_IRQ_PCI_LSI    0x1200  /* 32+ PHBs devices */
> +
> +#define SPAPR_IRQ_MSI        0x1300  /* Offset of the dynamic range covered
> +                                      * by the bitmap allocator */
> +
> +typedef struct sPAPRMachineState sPAPRMachineState;
> +
> +void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis);
> +int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
> +                        Error **errp);
> +void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num);
> +void spapr_irq_msi_reset(sPAPRMachineState *spapr);
> +
> +#endif
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3f5e1d3ec21a..7bc9a48abf04 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -189,6 +189,11 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
>      sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>      Error *local_err = NULL;
>  
> +    /* Initialize the MSI IRQ allocator. */
> +    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        spapr_irq_msi_init(spapr, XICS_IRQ_BASE + nr_irqs - SPAPR_IRQ_MSI);
> +    }
> +
>      if (kvm_enabled()) {
>          if (machine_kernel_irqchip_allowed(machine) &&
>              !xics_kvm_init(spapr, &local_err)) {
> @@ -1636,6 +1641,10 @@ static void spapr_machine_reset(void)
>          ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
>      }
>  
> +    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        spapr_irq_msi_reset(spapr);
> +    }
> +
>      qemu_devices_reset();
>  
>      /* DRC reset may cause a device to be unplugged. This will cause troubles
> @@ -1910,6 +1919,24 @@ static const VMStateDescription vmstate_spapr_patb_entry = {
>      },
>  };
>  
> +static bool spapr_irq_map_needed(void *opaque)
> +{
> +    sPAPRMachineState *spapr = opaque;
> +
> +    return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
> +}
> +
> +static const VMStateDescription vmstate_spapr_irq_map = {
> +    .name = "spapr_irq_map",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_irq_map_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BITMAP(irq_map, sPAPRMachineState, 0, irq_map_nr),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>      .name = "spapr",
>      .version_id = 3,
> @@ -1937,6 +1964,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_cap_cfpc,
>          &vmstate_spapr_cap_sbbc,
>          &vmstate_spapr_cap_ibs,
> +        &vmstate_spapr_irq_map,
>          NULL
>      }
>  };
> @@ -4067,7 +4095,9 @@ static void spapr_machine_3_0_instance_options(MachineState *machine)
>  
>  static void spapr_machine_3_0_class_options(MachineClass *mc)
>  {
> -    /* Defaults for the latest behaviour inherited from the base class */
> +    sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> +
> +    smc->legacy_irq_allocation = true;
>  }
>  
>  DEFINE_SPAPR_MACHINE(3_0, "3.0", true);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index e4f5946a2188..cab950d33446 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -709,7 +709,11 @@ void spapr_events_init(sPAPRMachineState *spapr)
>  {
>      int epow_irq;
>  
> -    epow_irq = spapr_irq_findone(spapr, &error_fatal);
> +    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        epow_irq = spapr_irq_findone(spapr, &error_fatal);
> +    } else {
> +        epow_irq = SPAPR_IRQ_EPOW;
> +    }

Nit: these can be rearranged more briefly by initializing to the fixed
value, then overriding in the legacy case.

>  
>      spapr_irq_claim(spapr, epow_irq, false, &error_fatal);
>  
> @@ -731,7 +735,11 @@ void spapr_events_init(sPAPRMachineState *spapr)
>      if (spapr->use_hotplug_event_source) {
>          int hp_irq;
>  
> -        hp_irq = spapr_irq_findone(spapr, &error_fatal);
> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            hp_irq = spapr_irq_findone(spapr, &error_fatal);
> +        } else {
> +            hp_irq = SPAPR_IRQ_HOTPLUG;
> +        }

Here too.

>          spapr_irq_claim(spapr, hp_irq, false, &error_fatal);
>  
> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
> new file mode 100644
> index 000000000000..24e9c1d4433c
> --- /dev/null
> +++ b/hw/ppc/spapr_irq.c
> @@ -0,0 +1,56 @@
> +/*
> + * QEMU PowerPC sPAPR IRQ interface
> + *
> + * Copyright (c) 2018, IBM Corporation.
> + *
> + * This code is licensed under the GPL version 2 or later. See the
> + * COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/xics.h"
> +
> +void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis)
> +{
> +    spapr->irq_map_nr = nr_msis;
> +    spapr->irq_map = bitmap_new(spapr->irq_map_nr);
> +}
> +
> +int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
> +                        Error **errp)
> +{
> +    int irq;
> +
> +    /*
> +     * The 'align_mask' parameter of bitmap_find_next_zero_area()
> +     * should be one less than a power of 2; 0 means no
> +     * alignment. Adapt the 'align' value of the former allocator
> +     * to fit the requirements of bitmap_find_next_zero_area()
> +     */
> +    align -= 1;
> +
> +    irq = bitmap_find_next_zero_area(spapr->irq_map, spapr->irq_map_nr, 0, num,
> +                                     align);
> +    if (irq == spapr->irq_map_nr) {
> +        error_setg(errp, "can't find a free %d-IRQ block", num);
> +        return -1;
> +    }
> +
> +    bitmap_set(spapr->irq_map, irq, num);
> +
> +    return irq + SPAPR_IRQ_MSI;
> +}
> +
> +void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num)
> +{
> +    bitmap_clear(spapr->irq_map, irq - SPAPR_IRQ_MSI, num);
> +}
> +
> +void spapr_irq_msi_reset(sPAPRMachineState *spapr)
> +{
> +    bitmap_clear(spapr->irq_map, 0, spapr->irq_map_nr);
> +}
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 497b896c7d24..cba5340f4bad 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -334,6 +334,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>              return;
>          }
>  
> +        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
> +        }
>          spapr_irq_free(spapr, msi->first_irq, msi->num);
>          if (msi_present(pdev)) {
>              spapr_msi_setmsg(pdev, 0, false, 0, 0);
> @@ -372,7 +375,13 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>      }
>  
>      /* Allocate MSIs */
> -    irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI, &err);
> +    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +        irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI,
> +                             &err);
> +    } else {
> +        irq = spapr_irq_msi_alloc(spapr, req_num,
> +                                  ret_intr_type == RTAS_TYPE_MSI, &err);
> +    }
>      if (err) {
>          error_reportf_err(err, "Can't allocate MSIs for device %x: ",
>                            config_addr);
> @@ -392,6 +401,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>  
>      /* Release previous MSIs */
>      if (msi) {
> +        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
> +        }
>          spapr_irq_free(spapr, msi->first_irq, msi->num);
>          g_hash_table_remove(phb->msi, &config_addr);
>      }
> @@ -1708,11 +1720,15 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>          uint32_t irq;
>          Error *local_err = NULL;
>  
> -        irq = spapr_irq_findone(spapr, &local_err);
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            error_prepend(errp, "can't allocate LSIs: ");
> -            return;
> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            irq = spapr_irq_findone(spapr, &local_err);
> +            if (local_err) {
> +                error_propagate(errp, local_err);
> +                error_prepend(errp, "can't allocate LSIs: ");
> +                return;
> +            }
> +        } else {
> +            irq = SPAPR_IRQ_PCI_LSI + sphb->index * PCI_NUM_PINS + i;

Here again, you can use the new fixed value as the default.

>          }
>  
>          spapr_irq_claim(spapr, irq, true, &local_err);
> @@ -2123,6 +2139,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof_ranges));
>      _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
> +    /* TODO: fine tune the total count of allocatable MSIs per PHB */
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR));
>  
>      /* Dynamic DMA window */
> diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
> index be9af71437cc..a0695200f348 100644
> --- a/hw/ppc/spapr_vio.c
> +++ b/hw/ppc/spapr_vio.c
> @@ -37,12 +37,13 @@
>  
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_vio.h"
> -#include "hw/ppc/xics.h"
>  #include "hw/ppc/fdt.h"
>  #include "trace.h"
>  
>  #include <libfdt.h>
>  
> +#define SPAPR_VIO_REG_BASE 0x71000000
> +
>  static void spapr_vio_get_irq(Object *obj, Visitor *v, const char *name,
>                                void *opaque, Error **errp)
>  {
> @@ -445,6 +446,32 @@ static void spapr_vio_busdev_reset(DeviceState *qdev)
>      }
>  }
>  
> +/*
> + * The register property of a VIO device is defined in livirt using a
> + * base number + 0x1000 increment and in QEMU by incrementing the base
> + * register number 0x71000000.
> + *
> + * The formula below tries to compute a unique index number from the
> + * register value that will be used to define the IRQ number of the
> + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
> + *
> + * To minimize collisions, we define two distinct ranges depending on
> + * the "reg" value definition:
> + *
> + *     [0x00 - 0x7f]    user/libvirt
> + *     [0x80 - 0xff]    QEMU VIO model
> + *
> + * Collisions will be detected when the IRQ is claimed.
> + */
> +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
> +{
> +    if (reg >= SPAPR_VIO_REG_BASE) {
> +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
> +    } else {
> +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));
> +    }
> +}
> +
>  static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -485,10 +512,18 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
>      }
>  
>      if (!dev->irq) {
> -        dev->irq = spapr_irq_findone(spapr, &local_err);
> -        if (local_err) {
> -            error_propagate(errp, local_err);
> -            return;
> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +            dev->irq = spapr_irq_findone(spapr, &local_err);
> +            if (local_err) {
> +                error_propagate(errp, local_err);
> +                return;
> +            }
> +        } else {
> +            dev->irq = spapr_vio_reg_to_irq(dev->reg);
> +            if (dev->irq == SPAPR_IRQ_PCI_LSI) {
> +                error_setg(errp, "Too many VIO devices");
> +                return;
> +            }
>          }
>      }
>  
> @@ -557,7 +592,7 @@ VIOsPAPRBus *spapr_vio_bus_init(void)
>      /* Create bus on bridge device */
>      qbus = qbus_create(TYPE_SPAPR_VIO_BUS, dev, "spapr-vio");
>      bus = SPAPR_VIO_BUS(qbus);
> -    bus->next_reg = 0x71000000;
> +    bus->next_reg = SPAPR_VIO_REG_BASE;
>  
>      /* hcall-vio */
>      spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index bcab6323b7ed..4ab556467289 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
>  obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> -obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
> +obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
>  # IBM PowerNV
>  obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
David Gibson July 20, 2018, 2:38 a.m. UTC | #7
On Fri, Jul 06, 2018 at 03:36:24PM +0200, Greg Kurz wrote:
> On Fri,  6 Jul 2018 11:07:11 +0200
> Cédric Le Goater <clg@kaod.org> wrote:
[snip]
> > +/*
> > + * The register property of a VIO device is defined in livirt using a
> > + * base number + 0x1000 increment and in QEMU by incrementing the base
> > + * register number 0x71000000.
> > + *
> > + * The formula below tries to compute a unique index number from the
> > + * register value that will be used to define the IRQ number of the
> > + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
> > + *
> > + * To minimize collisions, we define two distinct ranges depending on
> > + * the "reg" value definition:
> > + *
> > + *     [0x00 - 0x7f]    user/libvirt
> > + *     [0x80 - 0xff]    QEMU VIO model
> > + *
> > + * Collisions will be detected when the IRQ is claimed.
> > + */
> > +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
> > +{
> > +    if (reg >= SPAPR_VIO_REG_BASE) {
> > +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
> > +    } else {
> > +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));
> 
> Hmm... if you put a VIO net and two VIO vty in the domain XML, libvirt
> will generate reg == 0x1000 for the VIO net and reg == 0x30001000 for the
> second VIO vty... this will necessarily collide, won't it ?

Ah, drat, yeah, this will still need some tweaking.

> With a 256 VIO devices limit, libvirt can only add 255 devices since
> the nvram is created by QEMU by default (libvirt can only change its
> reg using -global).

Note that the 256 limit is basically arbitrary.  I don't think it
exists in the code now, but it should be way more than sufficient.

> As David mentioned in another mail:
> 
> 	VIO net devices start at reg=0x1000
> 	VIO scsi devices start at reg=0x2000
> 	VIO nvram devices start at reg=0x3000
> 	VIO vty devices start at reg=0x30000000
> 	    and increment by 0x1000 each type
> 
> 
> The values for net, scsi and nvram overlap... which makes me wonder why
> do we even care to have per-type base value !?!

We don't, really, I'm not sure why it ended up like that.

> Anyway, I don't think
> it's important for what you're trying to achieve.
> 
> Basically libvirt can generate regs in two distinct ranges:
> 
> - one for scsi/net/nvram:
> 
> smallest possible reg: 0x1000
> largest possible reg: 0x2000 + 254 * 0x1000 = 0x100000
> 
> ie, 254 scsi devices starting at 0x2000 and 1 nvram
> 
> - one for vty 
> 
> smallest possible reg: 0x30000000
> largest possible reg: 0x30000000 + 253 * 0x1000 = 0x300fd000
> 
> ie, 254 vty devices
> 
> Thinking about the bit shifting magic that is needed to convert
> reg into a usable index makes my brain hurt, but I'll happily
> review anything you propose :)

Considering just the libvirt assigned addresses how about:

	if (reg >= 0x30000000)
		irq = VIO_BASE + 240 + (reg >> 12); // up to 16 vtys
	else
		irq = VIO_BASE + (reg >> 12);       // up to 238 others

(+ some masking to enforce the limits).

I think it's ok to overlap the "qemu" and "libvirt" ranges, since
(with the exception of the nvram device) I don't see any natural way
you'd get both schemes in use at once.  If the user overrides things,
either on the qemu command line or in the libvirt XML to combine the
two schemes, then we're back to "fix your own mess by manually
allocating irqs as well".

Note that the above formula doesn't use VIO_BASE+0 itself, so I think
we can then factor in the qemu assigned devices with just a:
	irq ^= (reg & 0xff);

> 
> > +    }
> > +}
> > +
> >  static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
> >  {
> >      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > @@ -485,10 +512,18 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
> >      }
> >  
> >      if (!dev->irq) {
> > -        dev->irq = spapr_irq_findone(spapr, &local_err);
> > -        if (local_err) {
> > -            error_propagate(errp, local_err);
> > -            return;
> > +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> > +            dev->irq = spapr_irq_findone(spapr, &local_err);
> > +            if (local_err) {
> > +                error_propagate(errp, local_err);
> > +                return;
> > +            }
> > +        } else {
> > +            dev->irq = spapr_vio_reg_to_irq(dev->reg);
> > +            if (dev->irq == SPAPR_IRQ_PCI_LSI) {
> > +                error_setg(errp, "Too many VIO devices");
> > +                return;
> > +            }
> >          }
> >      }
> >  
> > @@ -557,7 +592,7 @@ VIOsPAPRBus *spapr_vio_bus_init(void)
> >      /* Create bus on bridge device */
> >      qbus = qbus_create(TYPE_SPAPR_VIO_BUS, dev, "spapr-vio");
> >      bus = SPAPR_VIO_BUS(qbus);
> > -    bus->next_reg = 0x71000000;
> > +    bus->next_reg = SPAPR_VIO_REG_BASE;
> >  
> >      /* hcall-vio */
> >      spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
> > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> > index bcab6323b7ed..4ab556467289 100644
> > --- a/hw/ppc/Makefile.objs
> > +++ b/hw/ppc/Makefile.objs
> > @@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
> >  obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
> >  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> >  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> > -obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
> > +obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
> >  # IBM PowerNV
> >  obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
> >  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>
Cédric Le Goater July 23, 2018, 3:20 p.m. UTC | #8
On 07/20/2018 04:38 AM, David Gibson wrote:
> On Fri, Jul 06, 2018 at 03:36:24PM +0200, Greg Kurz wrote:
>> On Fri,  6 Jul 2018 11:07:11 +0200
>> Cédric Le Goater <clg@kaod.org> wrote:
> [snip]
>>> +/*
>>> + * The register property of a VIO device is defined in livirt using a
>>> + * base number + 0x1000 increment and in QEMU by incrementing the base
>>> + * register number 0x71000000.
>>> + *
>>> + * The formula below tries to compute a unique index number from the
>>> + * register value that will be used to define the IRQ number of the
>>> + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
>>> + *
>>> + * To minimize collisions, we define two distinct ranges depending on
>>> + * the "reg" value definition:
>>> + *
>>> + *     [0x00 - 0x7f]    user/libvirt
>>> + *     [0x80 - 0xff]    QEMU VIO model
>>> + *
>>> + * Collisions will be detected when the IRQ is claimed.
>>> + */
>>> +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
>>> +{
>>> +    if (reg >= SPAPR_VIO_REG_BASE) {
>>> +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
>>> +    } else {
>>> +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));
>>
>> Hmm... if you put a VIO net and two VIO vty in the domain XML, libvirt
>> will generate reg == 0x1000 for the VIO net and reg == 0x30001000 for the
>> second VIO vty... this will necessarily collide, won't it ?
> 
> Ah, drat, yeah, this will still need some tweaking.
> 
>> With a 256 VIO devices limit, libvirt can only add 255 devices since
>> the nvram is created by QEMU by default (libvirt can only change its
>> reg using -global).
> 
> Note that the 256 limit is basically arbitrary.  I don't think it
> exists in the code now, but it should be way more than sufficient.
> 
>> As David mentioned in another mail:
>>
>> 	VIO net devices start at reg=0x1000
>> 	VIO scsi devices start at reg=0x2000
>> 	VIO nvram devices start at reg=0x3000
>> 	VIO vty devices start at reg=0x30000000
>> 	    and increment by 0x1000 each type
>>
>>
>> The values for net, scsi and nvram overlap... which makes me wonder why
>> do we even care to have per-type base value !?!
> 
> We don't, really, I'm not sure why it ended up like that.
> 
>> Anyway, I don't think
>> it's important for what you're trying to achieve.
>>
>> Basically libvirt can generate regs in two distinct ranges:
>>
>> - one for scsi/net/nvram:
>>
>> smallest possible reg: 0x1000
>> largest possible reg: 0x2000 + 254 * 0x1000 = 0x100000
>>
>> ie, 254 scsi devices starting at 0x2000 and 1 nvram
>>
>> - one for vty 
>>
>> smallest possible reg: 0x30000000
>> largest possible reg: 0x30000000 + 253 * 0x1000 = 0x300fd000
>>
>> ie, 254 vty devices
>>
>> Thinking about the bit shifting magic that is needed to convert
>> reg into a usable index makes my brain hurt, but I'll happily
>> review anything you propose :)
> 
> Considering just the libvirt assigned addresses how about:
> 
> 	if (reg >= 0x30000000)
> 		irq = VIO_BASE + 240 + (reg >> 12); // up to 16 vtys
> 	else
> 		irq = VIO_BASE + (reg >> 12);       // up to 238 others
> 
> (+ some masking to enforce the limits).
> 
> I think it's ok to overlap the "qemu" and "libvirt" ranges, since
> (with the exception of the nvram device) I don't see any natural way
> you'd get both schemes in use at once.  If the user overrides things,
> either on the qemu command line or in the libvirt XML to combine the
> two schemes, then we're back to "fix your own mess by manually
> allocating irqs as well".
> 
> Note that the above formula doesn't use VIO_BASE+0 itself, so I think
> we can then factor in the qemu assigned devices with just a:
> 	irq ^= (reg & 0xff);

And what about the proposal below ? May be the QEMU/libvirt split could 
be rebalanced to favor one or the other. 

Thanks,

C.


+#define SPAPR_VIO_REG_BASE 0x71000000
+
+/*
+ * The register property of a VIO device is defined in livirt using a
+ * base number + 0x1000 increment and in QEMU by incrementing the base
+ * register number 0x71000000.
+ *
+ * The formula below tries to compute a unique index number from the
+ * register value that will be used to define the IRQ number of the
+ * VIO device. A maximum of 256 (0x100) VIO devices is covered.
+ *
+ * To minimize collisions, we define two distinct ranges depending on
+ * the "reg" value definition:
+ *
+ *     [0x00 - 0x7f]    user/libvirt
+ *     [0x80 - 0xff]    QEMU VIO model
+ *
+ * For the VIO devices with a user defined "reg", we extract bits
+ * [29-28] and [16-12] to compose an index on 7 bits. It should fit
+ * all libvirt usage.
+ *
+ * Collisions will be detected when the IRQ is claimed.
+ */
+static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
+{
+    if (reg >= SPAPR_VIO_REG_BASE) {
+        return SPAPR_IRQ_VIO | 0x80 | (reg & 0x7f);
+    } else {
+        return SPAPR_IRQ_VIO | (((reg >> 28) & 0x3) << 5) |
+            ((reg >> 12) & 0x1f);
+    }
+}
+
Cédric Le Goater July 23, 2018, 4:04 p.m. UTC | #9
On 07/20/2018 04:18 AM, David Gibson wrote:
> On Fri, Jul 06, 2018 at 11:07:11AM +0200, Cédric Le Goater wrote:
>> This proposal introduces a new IRQ number space layout using static
>> numbers for all devices, depending on a device index, and a bitmap
>> allocator for the MSI IRQ numbers which are negotiated by the guest at
>> runtime.
>>
>> As the VIO device model does not have a device index but a "reg"
>> property, we introduce a formula to compute an IRQ number from a "reg"
>> value. It should minimize most of the collisions.
>>
>> The previous layout is kept in pre-3.0 machines raising the
>> 'legacy_irq_allocation' machine class flag.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  The new XICS layout will only be activated when a new pseries-3.1
>>  machine is introduced.
> 
> Looking good.  I've noted a few nits below, but they're very minor -
> I wouldn't even bother to ask for the changes if we didn't have to
> wait for 3.1 anyway.

Yes. np. I have taken into account the nits. We just need to agree
on a reg_to_irq formula now.

Thanks,

C. 


> 
> 
>>
>>  include/hw/ppc/spapr.h     |  5 ++++
>>  include/hw/ppc/spapr_irq.h | 32 ++++++++++++++++++++++
>>  hw/ppc/spapr.c             | 32 +++++++++++++++++++++-
>>  hw/ppc/spapr_events.c      | 12 ++++++--
>>  hw/ppc/spapr_irq.c         | 56 ++++++++++++++++++++++++++++++++++++++
>>  hw/ppc/spapr_pci.c         | 29 ++++++++++++++++----
>>  hw/ppc/spapr_vio.c         | 47 ++++++++++++++++++++++++++++----
>>  hw/ppc/Makefile.objs       |  2 +-
>>  8 files changed, 199 insertions(+), 16 deletions(-)
>>  create mode 100644 include/hw/ppc/spapr_irq.h
>>  create mode 100644 hw/ppc/spapr_irq.c
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 7e5de1a6fd42..73067f5ee8aa 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -8,6 +8,7 @@
>>  #include "hw/ppc/spapr_drc.h"
>>  #include "hw/mem/pc-dimm.h"
>>  #include "hw/ppc/spapr_ovec.h"
>> +#include "hw/ppc/spapr_irq.h"
>>  
>>  struct VIOsPAPRBus;
>>  struct sPAPRPHBState;
>> @@ -101,6 +102,8 @@ struct sPAPRMachineClass {
>>      bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
>>      bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
>>      bool pre_2_10_has_unused_icps;
>> +    bool legacy_irq_allocation;
>> +
>>      void (*phb_placement)(sPAPRMachineState *spapr, uint32_t index,
>>                            uint64_t *buid, hwaddr *pio, 
>>                            hwaddr *mmio32, hwaddr *mmio64,
>> @@ -167,6 +170,8 @@ struct sPAPRMachineState {
>>      char *kvm_type;
>>  
>>      const char *icp_type;
>> +    int32_t irq_map_nr;
>> +    unsigned long *irq_map;
>>  
>>      bool cmd_line_caps[SPAPR_CAP_NUM];
>>      sPAPRCapabilities def, eff, mig;
>> diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h
>> new file mode 100644
>> index 000000000000..6f7f50548809
>> --- /dev/null
>> +++ b/include/hw/ppc/spapr_irq.h
>> @@ -0,0 +1,32 @@
>> +/*
>> + * QEMU PowerPC sPAPR IRQ backend definitions
>> + *
>> + * Copyright (c) 2018, IBM Corporation.
>> + *
>> + * This code is licensed under the GPL version 2 or later. See the
>> + * COPYING file in the top-level directory.
>> + */
>> +
>> +#ifndef HW_SPAPR_IRQ_H
>> +#define HW_SPAPR_IRQ_H
>> +
>> +/*
>> + * IRQ range offsets per device type
>> + */
>> +#define SPAPR_IRQ_EPOW       0x1000  /* XICS_IRQ_BASE offset */
>> +#define SPAPR_IRQ_HOTPLUG    0x1001
>> +#define SPAPR_IRQ_VIO        0x1100  /* 256 VIO devices */
>> +#define SPAPR_IRQ_PCI_LSI    0x1200  /* 32+ PHBs devices */
>> +
>> +#define SPAPR_IRQ_MSI        0x1300  /* Offset of the dynamic range covered
>> +                                      * by the bitmap allocator */
>> +
>> +typedef struct sPAPRMachineState sPAPRMachineState;
>> +
>> +void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis);
>> +int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
>> +                        Error **errp);
>> +void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num);
>> +void spapr_irq_msi_reset(sPAPRMachineState *spapr);
>> +
>> +#endif
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 3f5e1d3ec21a..7bc9a48abf04 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -189,6 +189,11 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
>>      sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
>>      Error *local_err = NULL;
>>  
>> +    /* Initialize the MSI IRQ allocator. */
>> +    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +        spapr_irq_msi_init(spapr, XICS_IRQ_BASE + nr_irqs - SPAPR_IRQ_MSI);
>> +    }
>> +
>>      if (kvm_enabled()) {
>>          if (machine_kernel_irqchip_allowed(machine) &&
>>              !xics_kvm_init(spapr, &local_err)) {
>> @@ -1636,6 +1641,10 @@ static void spapr_machine_reset(void)
>>          ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
>>      }
>>  
>> +    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +        spapr_irq_msi_reset(spapr);
>> +    }
>> +
>>      qemu_devices_reset();
>>  
>>      /* DRC reset may cause a device to be unplugged. This will cause troubles
>> @@ -1910,6 +1919,24 @@ static const VMStateDescription vmstate_spapr_patb_entry = {
>>      },
>>  };
>>  
>> +static bool spapr_irq_map_needed(void *opaque)
>> +{
>> +    sPAPRMachineState *spapr = opaque;
>> +
>> +    return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
>> +}
>> +
>> +static const VMStateDescription vmstate_spapr_irq_map = {
>> +    .name = "spapr_irq_map",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = spapr_irq_map_needed,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_BITMAP(irq_map, sPAPRMachineState, 0, irq_map_nr),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>  static const VMStateDescription vmstate_spapr = {
>>      .name = "spapr",
>>      .version_id = 3,
>> @@ -1937,6 +1964,7 @@ static const VMStateDescription vmstate_spapr = {
>>          &vmstate_spapr_cap_cfpc,
>>          &vmstate_spapr_cap_sbbc,
>>          &vmstate_spapr_cap_ibs,
>> +        &vmstate_spapr_irq_map,
>>          NULL
>>      }
>>  };
>> @@ -4067,7 +4095,9 @@ static void spapr_machine_3_0_instance_options(MachineState *machine)
>>  
>>  static void spapr_machine_3_0_class_options(MachineClass *mc)
>>  {
>> -    /* Defaults for the latest behaviour inherited from the base class */
>> +    sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
>> +
>> +    smc->legacy_irq_allocation = true;
>>  }
>>  
>>  DEFINE_SPAPR_MACHINE(3_0, "3.0", true);
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index e4f5946a2188..cab950d33446 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -709,7 +709,11 @@ void spapr_events_init(sPAPRMachineState *spapr)
>>  {
>>      int epow_irq;
>>  
>> -    epow_irq = spapr_irq_findone(spapr, &error_fatal);
>> +    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +        epow_irq = spapr_irq_findone(spapr, &error_fatal);
>> +    } else {
>> +        epow_irq = SPAPR_IRQ_EPOW;
>> +    }
> 
> Nit: these can be rearranged more briefly by initializing to the fixed
> value, then overriding in the legacy case.
> 
>>  
>>      spapr_irq_claim(spapr, epow_irq, false, &error_fatal);
>>  
>> @@ -731,7 +735,11 @@ void spapr_events_init(sPAPRMachineState *spapr)
>>      if (spapr->use_hotplug_event_source) {
>>          int hp_irq;
>>  
>> -        hp_irq = spapr_irq_findone(spapr, &error_fatal);
>> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +            hp_irq = spapr_irq_findone(spapr, &error_fatal);
>> +        } else {
>> +            hp_irq = SPAPR_IRQ_HOTPLUG;
>> +        }
> 
> Here too.
> 
>>          spapr_irq_claim(spapr, hp_irq, false, &error_fatal);
>>  
>> diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
>> new file mode 100644
>> index 000000000000..24e9c1d4433c
>> --- /dev/null
>> +++ b/hw/ppc/spapr_irq.c
>> @@ -0,0 +1,56 @@
>> +/*
>> + * QEMU PowerPC sPAPR IRQ interface
>> + *
>> + * Copyright (c) 2018, IBM Corporation.
>> + *
>> + * This code is licensed under the GPL version 2 or later. See the
>> + * COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/log.h"
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "hw/ppc/spapr.h"
>> +#include "hw/ppc/xics.h"
>> +
>> +void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis)
>> +{
>> +    spapr->irq_map_nr = nr_msis;
>> +    spapr->irq_map = bitmap_new(spapr->irq_map_nr);
>> +}
>> +
>> +int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
>> +                        Error **errp)
>> +{
>> +    int irq;
>> +
>> +    /*
>> +     * The 'align_mask' parameter of bitmap_find_next_zero_area()
>> +     * should be one less than a power of 2; 0 means no
>> +     * alignment. Adapt the 'align' value of the former allocator
>> +     * to fit the requirements of bitmap_find_next_zero_area()
>> +     */
>> +    align -= 1;
>> +
>> +    irq = bitmap_find_next_zero_area(spapr->irq_map, spapr->irq_map_nr, 0, num,
>> +                                     align);
>> +    if (irq == spapr->irq_map_nr) {
>> +        error_setg(errp, "can't find a free %d-IRQ block", num);
>> +        return -1;
>> +    }
>> +
>> +    bitmap_set(spapr->irq_map, irq, num);
>> +
>> +    return irq + SPAPR_IRQ_MSI;
>> +}
>> +
>> +void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num)
>> +{
>> +    bitmap_clear(spapr->irq_map, irq - SPAPR_IRQ_MSI, num);
>> +}
>> +
>> +void spapr_irq_msi_reset(sPAPRMachineState *spapr)
>> +{
>> +    bitmap_clear(spapr->irq_map, 0, spapr->irq_map_nr);
>> +}
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index 497b896c7d24..cba5340f4bad 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
>> @@ -334,6 +334,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>              return;
>>          }
>>  
>> +        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
>> +        }
>>          spapr_irq_free(spapr, msi->first_irq, msi->num);
>>          if (msi_present(pdev)) {
>>              spapr_msi_setmsg(pdev, 0, false, 0, 0);
>> @@ -372,7 +375,13 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>      }
>>  
>>      /* Allocate MSIs */
>> -    irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI, &err);
>> +    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +        irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI,
>> +                             &err);
>> +    } else {
>> +        irq = spapr_irq_msi_alloc(spapr, req_num,
>> +                                  ret_intr_type == RTAS_TYPE_MSI, &err);
>> +    }
>>      if (err) {
>>          error_reportf_err(err, "Can't allocate MSIs for device %x: ",
>>                            config_addr);
>> @@ -392,6 +401,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>  
>>      /* Release previous MSIs */
>>      if (msi) {
>> +        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
>> +        }
>>          spapr_irq_free(spapr, msi->first_irq, msi->num);
>>          g_hash_table_remove(phb->msi, &config_addr);
>>      }
>> @@ -1708,11 +1720,15 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
>>          uint32_t irq;
>>          Error *local_err = NULL;
>>  
>> -        irq = spapr_irq_findone(spapr, &local_err);
>> -        if (local_err) {
>> -            error_propagate(errp, local_err);
>> -            error_prepend(errp, "can't allocate LSIs: ");
>> -            return;
>> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +            irq = spapr_irq_findone(spapr, &local_err);
>> +            if (local_err) {
>> +                error_propagate(errp, local_err);
>> +                error_prepend(errp, "can't allocate LSIs: ");
>> +                return;
>> +            }
>> +        } else {
>> +            irq = SPAPR_IRQ_PCI_LSI + sphb->index * PCI_NUM_PINS + i;
> 
> Here again, you can use the new fixed value as the default.
> 
>>          }
>>  
>>          spapr_irq_claim(spapr, irq, true, &local_err);
>> @@ -2123,6 +2139,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>      _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof_ranges));
>>      _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
>>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
>> +    /* TODO: fine tune the total count of allocatable MSIs per PHB */
>>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR));
>>  
>>      /* Dynamic DMA window */
>> diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
>> index be9af71437cc..a0695200f348 100644
>> --- a/hw/ppc/spapr_vio.c
>> +++ b/hw/ppc/spapr_vio.c
>> @@ -37,12 +37,13 @@
>>  
>>  #include "hw/ppc/spapr.h"
>>  #include "hw/ppc/spapr_vio.h"
>> -#include "hw/ppc/xics.h"
>>  #include "hw/ppc/fdt.h"
>>  #include "trace.h"
>>  
>>  #include <libfdt.h>
>>  
>> +#define SPAPR_VIO_REG_BASE 0x71000000
>> +
>>  static void spapr_vio_get_irq(Object *obj, Visitor *v, const char *name,
>>                                void *opaque, Error **errp)
>>  {
>> @@ -445,6 +446,32 @@ static void spapr_vio_busdev_reset(DeviceState *qdev)
>>      }
>>  }
>>  
>> +/*
>> + * The register property of a VIO device is defined in livirt using a
>> + * base number + 0x1000 increment and in QEMU by incrementing the base
>> + * register number 0x71000000.
>> + *
>> + * The formula below tries to compute a unique index number from the
>> + * register value that will be used to define the IRQ number of the
>> + * VIO device. A maximum of 256 (0x100) VIO devices is covered.
>> + *
>> + * To minimize collisions, we define two distinct ranges depending on
>> + * the "reg" value definition:
>> + *
>> + *     [0x00 - 0x7f]    user/libvirt
>> + *     [0x80 - 0xff]    QEMU VIO model
>> + *
>> + * Collisions will be detected when the IRQ is claimed.
>> + */
>> +static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
>> +{
>> +    if (reg >= SPAPR_VIO_REG_BASE) {
>> +        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
>> +    } else {
>> +        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));
>> +    }
>> +}
>> +
>>  static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
>>  {
>>      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> @@ -485,10 +512,18 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
>>      }
>>  
>>      if (!dev->irq) {
>> -        dev->irq = spapr_irq_findone(spapr, &local_err);
>> -        if (local_err) {
>> -            error_propagate(errp, local_err);
>> -            return;
>> +        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
>> +            dev->irq = spapr_irq_findone(spapr, &local_err);
>> +            if (local_err) {
>> +                error_propagate(errp, local_err);
>> +                return;
>> +            }
>> +        } else {
>> +            dev->irq = spapr_vio_reg_to_irq(dev->reg);
>> +            if (dev->irq == SPAPR_IRQ_PCI_LSI) {
>> +                error_setg(errp, "Too many VIO devices");
>> +                return;
>> +            }
>>          }
>>      }
>>  
>> @@ -557,7 +592,7 @@ VIOsPAPRBus *spapr_vio_bus_init(void)
>>      /* Create bus on bridge device */
>>      qbus = qbus_create(TYPE_SPAPR_VIO_BUS, dev, "spapr-vio");
>>      bus = SPAPR_VIO_BUS(qbus);
>> -    bus->next_reg = 0x71000000;
>> +    bus->next_reg = SPAPR_VIO_REG_BASE;
>>  
>>      /* hcall-vio */
>>      spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index bcab6323b7ed..4ab556467289 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -4,7 +4,7 @@ obj-y += ppc.o ppc_booke.o fdt.o
>>  obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
>>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
>> -obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
>> +obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
>>  # IBM PowerNV
>>  obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
>>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>
diff mbox series

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 7e5de1a6fd42..73067f5ee8aa 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -8,6 +8,7 @@ 
 #include "hw/ppc/spapr_drc.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/ppc/spapr_ovec.h"
+#include "hw/ppc/spapr_irq.h"
 
 struct VIOsPAPRBus;
 struct sPAPRPHBState;
@@ -101,6 +102,8 @@  struct sPAPRMachineClass {
     bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
     bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
     bool pre_2_10_has_unused_icps;
+    bool legacy_irq_allocation;
+
     void (*phb_placement)(sPAPRMachineState *spapr, uint32_t index,
                           uint64_t *buid, hwaddr *pio, 
                           hwaddr *mmio32, hwaddr *mmio64,
@@ -167,6 +170,8 @@  struct sPAPRMachineState {
     char *kvm_type;
 
     const char *icp_type;
+    int32_t irq_map_nr;
+    unsigned long *irq_map;
 
     bool cmd_line_caps[SPAPR_CAP_NUM];
     sPAPRCapabilities def, eff, mig;
diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h
new file mode 100644
index 000000000000..6f7f50548809
--- /dev/null
+++ b/include/hw/ppc/spapr_irq.h
@@ -0,0 +1,32 @@ 
+/*
+ * QEMU PowerPC sPAPR IRQ backend definitions
+ *
+ * Copyright (c) 2018, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef HW_SPAPR_IRQ_H
+#define HW_SPAPR_IRQ_H
+
+/*
+ * IRQ range offsets per device type
+ */
+#define SPAPR_IRQ_EPOW       0x1000  /* XICS_IRQ_BASE offset */
+#define SPAPR_IRQ_HOTPLUG    0x1001
+#define SPAPR_IRQ_VIO        0x1100  /* 256 VIO devices */
+#define SPAPR_IRQ_PCI_LSI    0x1200  /* 32+ PHBs devices */
+
+#define SPAPR_IRQ_MSI        0x1300  /* Offset of the dynamic range covered
+                                      * by the bitmap allocator */
+
+typedef struct sPAPRMachineState sPAPRMachineState;
+
+void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis);
+int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
+                        Error **errp);
+void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num);
+void spapr_irq_msi_reset(sPAPRMachineState *spapr);
+
+#endif
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3f5e1d3ec21a..7bc9a48abf04 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -189,6 +189,11 @@  static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
     sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
     Error *local_err = NULL;
 
+    /* Initialize the MSI IRQ allocator. */
+    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+        spapr_irq_msi_init(spapr, XICS_IRQ_BASE + nr_irqs - SPAPR_IRQ_MSI);
+    }
+
     if (kvm_enabled()) {
         if (machine_kernel_irqchip_allowed(machine) &&
             !xics_kvm_init(spapr, &local_err)) {
@@ -1636,6 +1641,10 @@  static void spapr_machine_reset(void)
         ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
     }
 
+    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+        spapr_irq_msi_reset(spapr);
+    }
+
     qemu_devices_reset();
 
     /* DRC reset may cause a device to be unplugged. This will cause troubles
@@ -1910,6 +1919,24 @@  static const VMStateDescription vmstate_spapr_patb_entry = {
     },
 };
 
+static bool spapr_irq_map_needed(void *opaque)
+{
+    sPAPRMachineState *spapr = opaque;
+
+    return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
+}
+
+static const VMStateDescription vmstate_spapr_irq_map = {
+    .name = "spapr_irq_map",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = spapr_irq_map_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_BITMAP(irq_map, sPAPRMachineState, 0, irq_map_nr),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_spapr = {
     .name = "spapr",
     .version_id = 3,
@@ -1937,6 +1964,7 @@  static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_cfpc,
         &vmstate_spapr_cap_sbbc,
         &vmstate_spapr_cap_ibs,
+        &vmstate_spapr_irq_map,
         NULL
     }
 };
@@ -4067,7 +4095,9 @@  static void spapr_machine_3_0_instance_options(MachineState *machine)
 
 static void spapr_machine_3_0_class_options(MachineClass *mc)
 {
-    /* Defaults for the latest behaviour inherited from the base class */
+    sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+
+    smc->legacy_irq_allocation = true;
 }
 
 DEFINE_SPAPR_MACHINE(3_0, "3.0", true);
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index e4f5946a2188..cab950d33446 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -709,7 +709,11 @@  void spapr_events_init(sPAPRMachineState *spapr)
 {
     int epow_irq;
 
-    epow_irq = spapr_irq_findone(spapr, &error_fatal);
+    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+        epow_irq = spapr_irq_findone(spapr, &error_fatal);
+    } else {
+        epow_irq = SPAPR_IRQ_EPOW;
+    }
 
     spapr_irq_claim(spapr, epow_irq, false, &error_fatal);
 
@@ -731,7 +735,11 @@  void spapr_events_init(sPAPRMachineState *spapr)
     if (spapr->use_hotplug_event_source) {
         int hp_irq;
 
-        hp_irq = spapr_irq_findone(spapr, &error_fatal);
+        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+            hp_irq = spapr_irq_findone(spapr, &error_fatal);
+        } else {
+            hp_irq = SPAPR_IRQ_HOTPLUG;
+        }
 
         spapr_irq_claim(spapr, hp_irq, false, &error_fatal);
 
diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c
new file mode 100644
index 000000000000..24e9c1d4433c
--- /dev/null
+++ b/hw/ppc/spapr_irq.c
@@ -0,0 +1,56 @@ 
+/*
+ * QEMU PowerPC sPAPR IRQ interface
+ *
+ * Copyright (c) 2018, IBM Corporation.
+ *
+ * This code is licensed under the GPL version 2 or later. See the
+ * COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/xics.h"
+
+void spapr_irq_msi_init(sPAPRMachineState *spapr, uint32_t nr_msis)
+{
+    spapr->irq_map_nr = nr_msis;
+    spapr->irq_map = bitmap_new(spapr->irq_map_nr);
+}
+
+int spapr_irq_msi_alloc(sPAPRMachineState *spapr, uint32_t num, bool align,
+                        Error **errp)
+{
+    int irq;
+
+    /*
+     * The 'align_mask' parameter of bitmap_find_next_zero_area()
+     * should be one less than a power of 2; 0 means no
+     * alignment. Adapt the 'align' value of the former allocator
+     * to fit the requirements of bitmap_find_next_zero_area()
+     */
+    align -= 1;
+
+    irq = bitmap_find_next_zero_area(spapr->irq_map, spapr->irq_map_nr, 0, num,
+                                     align);
+    if (irq == spapr->irq_map_nr) {
+        error_setg(errp, "can't find a free %d-IRQ block", num);
+        return -1;
+    }
+
+    bitmap_set(spapr->irq_map, irq, num);
+
+    return irq + SPAPR_IRQ_MSI;
+}
+
+void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num)
+{
+    bitmap_clear(spapr->irq_map, irq - SPAPR_IRQ_MSI, num);
+}
+
+void spapr_irq_msi_reset(sPAPRMachineState *spapr)
+{
+    bitmap_clear(spapr->irq_map, 0, spapr->irq_map_nr);
+}
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 497b896c7d24..cba5340f4bad 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -334,6 +334,9 @@  static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
             return;
         }
 
+        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
+        }
         spapr_irq_free(spapr, msi->first_irq, msi->num);
         if (msi_present(pdev)) {
             spapr_msi_setmsg(pdev, 0, false, 0, 0);
@@ -372,7 +375,13 @@  static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     }
 
     /* Allocate MSIs */
-    irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI, &err);
+    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+        irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI,
+                             &err);
+    } else {
+        irq = spapr_irq_msi_alloc(spapr, req_num,
+                                  ret_intr_type == RTAS_TYPE_MSI, &err);
+    }
     if (err) {
         error_reportf_err(err, "Can't allocate MSIs for device %x: ",
                           config_addr);
@@ -392,6 +401,9 @@  static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 
     /* Release previous MSIs */
     if (msi) {
+        if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+            spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
+        }
         spapr_irq_free(spapr, msi->first_irq, msi->num);
         g_hash_table_remove(phb->msi, &config_addr);
     }
@@ -1708,11 +1720,15 @@  static void spapr_phb_realize(DeviceState *dev, Error **errp)
         uint32_t irq;
         Error *local_err = NULL;
 
-        irq = spapr_irq_findone(spapr, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            error_prepend(errp, "can't allocate LSIs: ");
-            return;
+        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+            irq = spapr_irq_findone(spapr, &local_err);
+            if (local_err) {
+                error_propagate(errp, local_err);
+                error_prepend(errp, "can't allocate LSIs: ");
+                return;
+            }
+        } else {
+            irq = SPAPR_IRQ_PCI_LSI + sphb->index * PCI_NUM_PINS + i;
         }
 
         spapr_irq_claim(spapr, irq, true, &local_err);
@@ -2123,6 +2139,7 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
     _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof_ranges));
     _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
+    /* TODO: fine tune the total count of allocatable MSIs per PHB */
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR));
 
     /* Dynamic DMA window */
diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
index be9af71437cc..a0695200f348 100644
--- a/hw/ppc/spapr_vio.c
+++ b/hw/ppc/spapr_vio.c
@@ -37,12 +37,13 @@ 
 
 #include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_vio.h"
-#include "hw/ppc/xics.h"
 #include "hw/ppc/fdt.h"
 #include "trace.h"
 
 #include <libfdt.h>
 
+#define SPAPR_VIO_REG_BASE 0x71000000
+
 static void spapr_vio_get_irq(Object *obj, Visitor *v, const char *name,
                               void *opaque, Error **errp)
 {
@@ -445,6 +446,32 @@  static void spapr_vio_busdev_reset(DeviceState *qdev)
     }
 }
 
+/*
+ * The register property of a VIO device is defined in livirt using a
+ * base number + 0x1000 increment and in QEMU by incrementing the base
+ * register number 0x71000000.
+ *
+ * The formula below tries to compute a unique index number from the
+ * register value that will be used to define the IRQ number of the
+ * VIO device. A maximum of 256 (0x100) VIO devices is covered.
+ *
+ * To minimize collisions, we define two distinct ranges depending on
+ * the "reg" value definition:
+ *
+ *     [0x00 - 0x7f]    user/libvirt
+ *     [0x80 - 0xff]    QEMU VIO model
+ *
+ * Collisions will be detected when the IRQ is claimed.
+ */
+static inline uint32_t spapr_vio_reg_to_irq(uint32_t reg)
+{
+    if (reg >= SPAPR_VIO_REG_BASE) {
+        return SPAPR_IRQ_VIO | (reg & 0x7f) | 0x80;
+    } else {
+        return SPAPR_IRQ_VIO | ((reg & 0x7f) ^ ((reg >> 12) & 0x7f));
+    }
+}
+
 static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -485,10 +512,18 @@  static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
     }
 
     if (!dev->irq) {
-        dev->irq = spapr_irq_findone(spapr, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
+        if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
+            dev->irq = spapr_irq_findone(spapr, &local_err);
+            if (local_err) {
+                error_propagate(errp, local_err);
+                return;
+            }
+        } else {
+            dev->irq = spapr_vio_reg_to_irq(dev->reg);
+            if (dev->irq == SPAPR_IRQ_PCI_LSI) {
+                error_setg(errp, "Too many VIO devices");
+                return;
+            }
         }
     }
 
@@ -557,7 +592,7 @@  VIOsPAPRBus *spapr_vio_bus_init(void)
     /* Create bus on bridge device */
     qbus = qbus_create(TYPE_SPAPR_VIO_BUS, dev, "spapr-vio");
     bus = SPAPR_VIO_BUS(qbus);
-    bus->next_reg = 0x71000000;
+    bus->next_reg = SPAPR_VIO_REG_BASE;
 
     /* hcall-vio */
     spapr_register_hypercall(H_VIO_SIGNAL, h_vio_signal);
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index bcab6323b7ed..4ab556467289 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -4,7 +4,7 @@  obj-y += ppc.o ppc_booke.o fdt.o
 obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
-obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
+obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
 # IBM PowerNV
 obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)