Patchwork spapr-pci: rework MSI/MSIX

login
register
mail settings
Submitter Alexey Kardashevskiy
Date July 12, 2013, 7:38 a.m.
Message ID <1373614704-15581-1-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/258712/
State New
Headers show

Comments

Alexey Kardashevskiy - July 12, 2013, 7:38 a.m.
On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
hypercalls which return global IRQ numbers to a guest so it only
operates with those and never touches MSIMessage.

Therefore MSIMessage handling is completely hidden in QEMU.

Previously every sPAPR PCI host bridge implemented its own MSI window
to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
or vfio) and route them to the guest via qemu_pulse_irq().
MSIMessage used to be encoded as:
	.addr - address within the PHB MSI window;
	.data - the device index on PHB plus vector number.
The MSI MR write function translated this MSIMessage to a global IRQ
number and called qemu_pulse_irq().

However the total number of IRQs is not really big (at the moment it is
1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
seems to be enough to store an IRQ number there.

This simplifies MSI handling in sPAPR PHB. Specifically, this does:
1. remove a MSI window from a PHB;
2. add a single memory region for all MSIs to sPAPREnvironment
and spapr_pci_msi_init() to initialize it;
3. encode MSIMessage as:
    * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
    * .data as an IRQ number.
4. change IRQ allocator to align first IRQ number in a block for MSI.
MSI uses lower bits to specify the vector number so the first IRQ has to
be aligned. MSIX does not need any special allocator though.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/spapr.c              | 29 ++++++++++++++++++---
 hw/ppc/spapr_pci.c          | 61 ++++++++++++++++++++-------------------------
 include/hw/pci-host/spapr.h |  8 +++---
 include/hw/ppc/spapr.h      |  4 ++-
 4 files changed, 60 insertions(+), 42 deletions(-)
Alexey Kardashevskiy - July 23, 2013, 2:54 a.m.
Michael, could you please review the patch as it is about PCI? Thanks!


On 07/12/2013 05:38 PM, Alexey Kardashevskiy wrote:
> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
> hypercalls which return global IRQ numbers to a guest so it only
> operates with those and never touches MSIMessage.
> 
> Therefore MSIMessage handling is completely hidden in QEMU.
> 
> Previously every sPAPR PCI host bridge implemented its own MSI window
> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
> or vfio) and route them to the guest via qemu_pulse_irq().
> MSIMessage used to be encoded as:
> 	.addr - address within the PHB MSI window;
> 	.data - the device index on PHB plus vector number.
> The MSI MR write function translated this MSIMessage to a global IRQ
> number and called qemu_pulse_irq().
> 
> However the total number of IRQs is not really big (at the moment it is
> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
> seems to be enough to store an IRQ number there.
> 
> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
> 1. remove a MSI window from a PHB;
> 2. add a single memory region for all MSIs to sPAPREnvironment
> and spapr_pci_msi_init() to initialize it;
> 3. encode MSIMessage as:
>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>     * .data as an IRQ number.
> 4. change IRQ allocator to align first IRQ number in a block for MSI.
> MSI uses lower bits to specify the vector number so the first IRQ has to
> be aligned. MSIX does not need any special allocator though.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/spapr.c              | 29 ++++++++++++++++++---
>  hw/ppc/spapr_pci.c          | 61 ++++++++++++++++++++-------------------------
>  include/hw/pci-host/spapr.h |  8 +++---
>  include/hw/ppc/spapr.h      |  4 ++-
>  4 files changed, 60 insertions(+), 42 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 671bbd9..6b21191 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -88,6 +88,9 @@ int spapr_allocate_irq(int hint, bool lsi)
>  
>      if (hint) {
>          irq = hint;
> +        if (hint >= spapr->next_irq) {
> +            spapr->next_irq = hint + 1;
> +        }
>          /* FIXME: we should probably check for collisions somehow */
>      } else {
>          irq = spapr->next_irq++;
> @@ -103,22 +106,39 @@ int spapr_allocate_irq(int hint, bool lsi)
>      return irq;
>  }
>  
> -/* Allocate block of consequtive IRQs, returns a number of the first */
> -int spapr_allocate_irq_block(int num, bool lsi)
> +/*
> + * Allocate block of consequtive IRQs, returns a number of the first.
> + * If msi==true, aligns the first IRQ number to num.
> + */
> +int spapr_allocate_irq_block(int num, bool lsi, bool msi)
>  {
>      int first = -1;
> -    int i;
> +    int i, hint = 0;
> +
> +    /*
> +     * MSIMesage::data is used for storing VIRQ so
> +     * it has to be aligned to num to support multiple
> +     * MSI vectors. MSI-X is not affected by this.
> +     * The hint is used for the first IRQ, the rest should
> +     * be allocated continously.
> +     */
> +    if (msi) {
> +        assert((num == 1) || (num == 2) || (num == 4) ||
> +               (num == 8) || (num == 16) || (num == 32));
> +        hint = (spapr->next_irq + num - 1) & ~(num - 1);
> +    }
>  
>      for (i = 0; i < num; ++i) {
>          int irq;
>  
> -        irq = spapr_allocate_irq(0, lsi);
> +        irq = spapr_allocate_irq(hint, lsi);
>          if (!irq) {
>              return -1;
>          }
>  
>          if (0 == i) {
>              first = irq;
> +            hint = 0;
>          }
>  
>          /* If the above doesn't create a consecutive block then that's
> @@ -1127,6 +1147,7 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>      spapr_create_nvram(spapr);
>  
>      /* Set up PCI */
> +    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
>      spapr_pci_rtas_init();
>  
>      phb = spapr_create_phb(spapr, 0);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index dfe4d04..c8ea777 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -258,11 +258,11 @@ static int spapr_msicfg_find(sPAPRPHBState *phb, uint32_t config_addr,
>   * This is required for msi_notify()/msix_notify() which
>   * will write at the addresses via spapr_msi_write().
>   */
> -static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
> -                             bool msix, unsigned req_num)
> +static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix,
> +                             unsigned first_irq, unsigned req_num)
>  {
>      unsigned i;
> -    MSIMessage msg = { .address = addr, .data = 0 };
> +    MSIMessage msg = { .address = addr, .data = first_irq };
>  
>      if (!msix) {
>          msi_set_message(pdev, msg);
> @@ -270,8 +270,7 @@ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
>          return;
>      }
>  
> -    for (i = 0; i < req_num; ++i) {
> -        msg.address = addr | (i << 2);
> +    for (i = 0; i < req_num; ++i, ++msg.data) {
>          msix_set_message(pdev, i, msg);
>          trace_spapr_pci_msi_setup(pdev->name, i, msg.address);
>      }
> @@ -351,7 +350,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>  
>      /* There is no cached config, allocate MSIs */
>      if (!phb->msi_table[ndev].nvec) {
> -        irq = spapr_allocate_irq_block(req_num, false);
> +        irq = spapr_allocate_irq_block(req_num, false,
> +                                       ret_intr_type == RTAS_TYPE_MSI);
>          if (irq < 0) {
>              fprintf(stderr, "Cannot allocate MSIs for device#%d", ndev);
>              rtas_st(rets, 0, -1); /* Hardware error */
> @@ -363,8 +363,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>      }
>  
>      /* Setup MSI/MSIX vectors in the device (via cfgspace or MSIX BAR) */
> -    spapr_msi_setmsg(pdev, phb->msi_win_addr | (ndev << 16),
> -                     ret_intr_type == RTAS_TYPE_MSIX, req_num);
> +    spapr_msi_setmsg(pdev, spapr->msi_win_addr, ret_intr_type == RTAS_TYPE_MSIX,
> +                     phb->msi_table[ndev].irq, req_num);
>  
>      rtas_st(rets, 0, 0);
>      rtas_st(rets, 1, req_num);
> @@ -487,10 +487,7 @@ static const MemoryRegionOps spapr_io_ops = {
>  static void spapr_msi_write(void *opaque, hwaddr addr,
>                              uint64_t data, unsigned size)
>  {
> -    sPAPRPHBState *phb = opaque;
> -    int ndev = addr >> 16;
> -    int vec = ((addr & 0xFFFF) >> 2) | data;
> -    uint32_t irq = phb->msi_table[ndev].irq + vec;
> +    uint32_t irq = data;
>  
>      trace_spapr_pci_msi_write(addr, data, irq);
>  
> @@ -504,6 +501,23 @@ static const MemoryRegionOps spapr_msi_ops = {
>      .endianness = DEVICE_LITTLE_ENDIAN
>  };
>  
> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr)
> +{
> +    /*
> +     * As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
> +     * we need to allocate some memory to catch those writes coming
> +     * from msi_notify()/msix_notify().
> +     * As MSIMessage:addr is going to be the same and MSIMessage:data
> +     * is going to be a VIRQ number, 4 bytes of the MSI MR will only
> +     * be used.
> +     */
> +    spapr->msi_win_addr = addr;
> +    memory_region_init_io(&spapr->msiwindow, NULL, &spapr_msi_ops, spapr,
> +                          "msi", getpagesize());
> +    memory_region_add_subregion(get_system_memory(), spapr->msi_win_addr,
> +                                &spapr->msiwindow);
> +}
> +
>  /*
>   * PHB PCI device
>   */
> @@ -528,8 +542,7 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>          if ((sphb->buid != -1) || (sphb->dma_liobn != -1)
>              || (sphb->mem_win_addr != -1)
> -            || (sphb->io_win_addr != -1)
> -            || (sphb->msi_win_addr != -1)) {
> +            || (sphb->io_win_addr != -1)) {
>              fprintf(stderr, "Either \"index\" or other parameters must"
>                      " be specified for PAPR PHB, not both\n");
>              return -1;
> @@ -542,7 +555,6 @@ static int spapr_phb_init(SysBusDevice *s)
>              + sphb->index * SPAPR_PCI_WINDOW_SPACING;
>          sphb->mem_win_addr = windows_base + SPAPR_PCI_MMIO_WIN_OFF;
>          sphb->io_win_addr = windows_base + SPAPR_PCI_IO_WIN_OFF;
> -        sphb->msi_win_addr = windows_base + SPAPR_PCI_MSI_WIN_OFF;
>      }
>  
>      if (sphb->buid == -1) {
> @@ -565,11 +577,6 @@ static int spapr_phb_init(SysBusDevice *s)
>          return -1;
>      }
>  
> -    if (sphb->msi_win_addr == -1) {
> -        fprintf(stderr, "MSI window address not specified for PHB\n");
> -        return -1;
> -    }
> -
>      if (find_phb(spapr, sphb->buid)) {
>          fprintf(stderr, "PCI host bridges must have unique BUIDs\n");
>          return -1;
> @@ -609,18 +616,6 @@ static int spapr_phb_init(SysBusDevice *s)
>                            namebuf, SPAPR_PCI_IO_WIN_SIZE);
>      memory_region_add_subregion(get_system_memory(), sphb->io_win_addr,
>                                  &sphb->iowindow);
> -
> -    /* As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
> -     * we need to allocate some memory to catch those writes coming
> -     * from msi_notify()/msix_notify() */
> -    if (msi_supported) {
> -        sprintf(namebuf, "%s.msi", sphb->dtbusname);
> -        memory_region_init_io(&sphb->msiwindow, OBJECT(sphb), &spapr_msi_ops, sphb,
> -                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
> -        memory_region_add_subregion(get_system_memory(), sphb->msi_win_addr,
> -                                    &sphb->msiwindow);
> -    }
> -
>      /*
>       * Selecting a busname is more complex than you'd think, due to
>       * interacting constraints.  If the user has specified an id
> @@ -695,7 +690,6 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size,
>                        SPAPR_PCI_IO_WIN_SIZE),
> -    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -737,7 +731,6 @@ static const VMStateDescription vmstate_spapr_pci = {
>          VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
>          VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
>          VMSTATE_UINT64_EQUAL(io_win_size, sPAPRPHBState),
> -        VMSTATE_UINT64_EQUAL(msi_win_addr, sPAPRPHBState),
>          VMSTATE_STRUCT_ARRAY(lsi_table, sPAPRPHBState, PCI_NUM_PINS, 0,
>                               vmstate_spapr_pci_lsi, struct spapr_pci_lsi),
>          VMSTATE_STRUCT_ARRAY(msi_table, sPAPRPHBState, SPAPR_MSIX_MAX_DEVS, 0,
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 93f9511..970b4a9 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -43,8 +43,7 @@ typedef struct sPAPRPHBState {
>  
>      MemoryRegion memspace, iospace;
>      hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
> -    hwaddr msi_win_addr;
> -    MemoryRegion memwindow, iowindow, msiwindow;
> +    MemoryRegion memwindow, iowindow;
>  
>      uint32_t dma_liobn;
>      uint64_t dma_window_start;
> @@ -73,7 +72,8 @@ typedef struct sPAPRPHBState {
>  #define SPAPR_PCI_MMIO_WIN_SIZE      0x20000000
>  #define SPAPR_PCI_IO_WIN_OFF         0x80000000
>  #define SPAPR_PCI_IO_WIN_SIZE        0x10000
> -#define SPAPR_PCI_MSI_WIN_OFF        0x90000000
> +
> +#define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
>  
>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
>  
> @@ -88,6 +88,8 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                            uint32_t xics_phandle,
>                            void *fdt);
>  
> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr);
> +
>  void spapr_pci_rtas_init(void);
>  
>  #endif /* __HW_SPAPR_PCI_H__ */
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 7aa9a3d..aed02e1 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -14,6 +14,8 @@ struct icp_state;
>  typedef struct sPAPREnvironment {
>      struct VIOsPAPRBus *vio_bus;
>      QLIST_HEAD(, sPAPRPHBState) phbs;
> +    hwaddr msi_win_addr;
> +    MemoryRegion msiwindow;
>      struct sPAPRNVRAM *nvram;
>      struct icp_state *icp;
>  
> @@ -303,7 +305,7 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>                               target_ulong *args);
>  
>  int spapr_allocate_irq(int hint, bool lsi);
> -int spapr_allocate_irq_block(int num, bool lsi);
> +int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>  
>  static inline int spapr_allocate_msi(int hint)
>  {
>
Alexey Kardashevskiy - July 30, 2013, 2:56 a.m.
Ping, anyone?


On 07/23/2013 12:54 PM, Alexey Kardashevskiy wrote:
> Michael, could you please review the patch as it is about PCI? Thanks!
> 
> 
> On 07/12/2013 05:38 PM, Alexey Kardashevskiy wrote:
>> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
>> hypercalls which return global IRQ numbers to a guest so it only
>> operates with those and never touches MSIMessage.
>>
>> Therefore MSIMessage handling is completely hidden in QEMU.
>>
>> Previously every sPAPR PCI host bridge implemented its own MSI window
>> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
>> or vfio) and route them to the guest via qemu_pulse_irq().
>> MSIMessage used to be encoded as:
>> 	.addr - address within the PHB MSI window;
>> 	.data - the device index on PHB plus vector number.
>> The MSI MR write function translated this MSIMessage to a global IRQ
>> number and called qemu_pulse_irq().
>>
>> However the total number of IRQs is not really big (at the moment it is
>> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
>> seems to be enough to store an IRQ number there.
>>
>> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
>> 1. remove a MSI window from a PHB;
>> 2. add a single memory region for all MSIs to sPAPREnvironment
>> and spapr_pci_msi_init() to initialize it;
>> 3. encode MSIMessage as:
>>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>>     * .data as an IRQ number.
>> 4. change IRQ allocator to align first IRQ number in a block for MSI.
>> MSI uses lower bits to specify the vector number so the first IRQ has to
>> be aligned. MSIX does not need any special allocator though.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/ppc/spapr.c              | 29 ++++++++++++++++++---
>>  hw/ppc/spapr_pci.c          | 61 ++++++++++++++++++++-------------------------
>>  include/hw/pci-host/spapr.h |  8 +++---
>>  include/hw/ppc/spapr.h      |  4 ++-
>>  4 files changed, 60 insertions(+), 42 deletions(-)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 671bbd9..6b21191 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -88,6 +88,9 @@ int spapr_allocate_irq(int hint, bool lsi)
>>  
>>      if (hint) {
>>          irq = hint;
>> +        if (hint >= spapr->next_irq) {
>> +            spapr->next_irq = hint + 1;
>> +        }
>>          /* FIXME: we should probably check for collisions somehow */
>>      } else {
>>          irq = spapr->next_irq++;
>> @@ -103,22 +106,39 @@ int spapr_allocate_irq(int hint, bool lsi)
>>      return irq;
>>  }
>>  
>> -/* Allocate block of consequtive IRQs, returns a number of the first */
>> -int spapr_allocate_irq_block(int num, bool lsi)
>> +/*
>> + * Allocate block of consequtive IRQs, returns a number of the first.
>> + * If msi==true, aligns the first IRQ number to num.
>> + */
>> +int spapr_allocate_irq_block(int num, bool lsi, bool msi)
>>  {
>>      int first = -1;
>> -    int i;
>> +    int i, hint = 0;
>> +
>> +    /*
>> +     * MSIMesage::data is used for storing VIRQ so
>> +     * it has to be aligned to num to support multiple
>> +     * MSI vectors. MSI-X is not affected by this.
>> +     * The hint is used for the first IRQ, the rest should
>> +     * be allocated continously.
>> +     */
>> +    if (msi) {
>> +        assert((num == 1) || (num == 2) || (num == 4) ||
>> +               (num == 8) || (num == 16) || (num == 32));
>> +        hint = (spapr->next_irq + num - 1) & ~(num - 1);
>> +    }
>>  
>>      for (i = 0; i < num; ++i) {
>>          int irq;
>>  
>> -        irq = spapr_allocate_irq(0, lsi);
>> +        irq = spapr_allocate_irq(hint, lsi);
>>          if (!irq) {
>>              return -1;
>>          }
>>  
>>          if (0 == i) {
>>              first = irq;
>> +            hint = 0;
>>          }
>>  
>>          /* If the above doesn't create a consecutive block then that's
>> @@ -1127,6 +1147,7 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>>      spapr_create_nvram(spapr);
>>  
>>      /* Set up PCI */
>> +    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
>>      spapr_pci_rtas_init();
>>  
>>      phb = spapr_create_phb(spapr, 0);
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index dfe4d04..c8ea777 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
>> @@ -258,11 +258,11 @@ static int spapr_msicfg_find(sPAPRPHBState *phb, uint32_t config_addr,
>>   * This is required for msi_notify()/msix_notify() which
>>   * will write at the addresses via spapr_msi_write().
>>   */
>> -static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
>> -                             bool msix, unsigned req_num)
>> +static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix,
>> +                             unsigned first_irq, unsigned req_num)
>>  {
>>      unsigned i;
>> -    MSIMessage msg = { .address = addr, .data = 0 };
>> +    MSIMessage msg = { .address = addr, .data = first_irq };
>>  
>>      if (!msix) {
>>          msi_set_message(pdev, msg);
>> @@ -270,8 +270,7 @@ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
>>          return;
>>      }
>>  
>> -    for (i = 0; i < req_num; ++i) {
>> -        msg.address = addr | (i << 2);
>> +    for (i = 0; i < req_num; ++i, ++msg.data) {
>>          msix_set_message(pdev, i, msg);
>>          trace_spapr_pci_msi_setup(pdev->name, i, msg.address);
>>      }
>> @@ -351,7 +350,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>>  
>>      /* There is no cached config, allocate MSIs */
>>      if (!phb->msi_table[ndev].nvec) {
>> -        irq = spapr_allocate_irq_block(req_num, false);
>> +        irq = spapr_allocate_irq_block(req_num, false,
>> +                                       ret_intr_type == RTAS_TYPE_MSI);
>>          if (irq < 0) {
>>              fprintf(stderr, "Cannot allocate MSIs for device#%d", ndev);
>>              rtas_st(rets, 0, -1); /* Hardware error */
>> @@ -363,8 +363,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>>      }
>>  
>>      /* Setup MSI/MSIX vectors in the device (via cfgspace or MSIX BAR) */
>> -    spapr_msi_setmsg(pdev, phb->msi_win_addr | (ndev << 16),
>> -                     ret_intr_type == RTAS_TYPE_MSIX, req_num);
>> +    spapr_msi_setmsg(pdev, spapr->msi_win_addr, ret_intr_type == RTAS_TYPE_MSIX,
>> +                     phb->msi_table[ndev].irq, req_num);
>>  
>>      rtas_st(rets, 0, 0);
>>      rtas_st(rets, 1, req_num);
>> @@ -487,10 +487,7 @@ static const MemoryRegionOps spapr_io_ops = {
>>  static void spapr_msi_write(void *opaque, hwaddr addr,
>>                              uint64_t data, unsigned size)
>>  {
>> -    sPAPRPHBState *phb = opaque;
>> -    int ndev = addr >> 16;
>> -    int vec = ((addr & 0xFFFF) >> 2) | data;
>> -    uint32_t irq = phb->msi_table[ndev].irq + vec;
>> +    uint32_t irq = data;
>>  
>>      trace_spapr_pci_msi_write(addr, data, irq);
>>  
>> @@ -504,6 +501,23 @@ static const MemoryRegionOps spapr_msi_ops = {
>>      .endianness = DEVICE_LITTLE_ENDIAN
>>  };
>>  
>> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr)
>> +{
>> +    /*
>> +     * As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
>> +     * we need to allocate some memory to catch those writes coming
>> +     * from msi_notify()/msix_notify().
>> +     * As MSIMessage:addr is going to be the same and MSIMessage:data
>> +     * is going to be a VIRQ number, 4 bytes of the MSI MR will only
>> +     * be used.
>> +     */
>> +    spapr->msi_win_addr = addr;
>> +    memory_region_init_io(&spapr->msiwindow, NULL, &spapr_msi_ops, spapr,
>> +                          "msi", getpagesize());
>> +    memory_region_add_subregion(get_system_memory(), spapr->msi_win_addr,
>> +                                &spapr->msiwindow);
>> +}
>> +
>>  /*
>>   * PHB PCI device
>>   */
>> @@ -528,8 +542,7 @@ static int spapr_phb_init(SysBusDevice *s)
>>  
>>          if ((sphb->buid != -1) || (sphb->dma_liobn != -1)
>>              || (sphb->mem_win_addr != -1)
>> -            || (sphb->io_win_addr != -1)
>> -            || (sphb->msi_win_addr != -1)) {
>> +            || (sphb->io_win_addr != -1)) {
>>              fprintf(stderr, "Either \"index\" or other parameters must"
>>                      " be specified for PAPR PHB, not both\n");
>>              return -1;
>> @@ -542,7 +555,6 @@ static int spapr_phb_init(SysBusDevice *s)
>>              + sphb->index * SPAPR_PCI_WINDOW_SPACING;
>>          sphb->mem_win_addr = windows_base + SPAPR_PCI_MMIO_WIN_OFF;
>>          sphb->io_win_addr = windows_base + SPAPR_PCI_IO_WIN_OFF;
>> -        sphb->msi_win_addr = windows_base + SPAPR_PCI_MSI_WIN_OFF;
>>      }
>>  
>>      if (sphb->buid == -1) {
>> @@ -565,11 +577,6 @@ static int spapr_phb_init(SysBusDevice *s)
>>          return -1;
>>      }
>>  
>> -    if (sphb->msi_win_addr == -1) {
>> -        fprintf(stderr, "MSI window address not specified for PHB\n");
>> -        return -1;
>> -    }
>> -
>>      if (find_phb(spapr, sphb->buid)) {
>>          fprintf(stderr, "PCI host bridges must have unique BUIDs\n");
>>          return -1;
>> @@ -609,18 +616,6 @@ static int spapr_phb_init(SysBusDevice *s)
>>                            namebuf, SPAPR_PCI_IO_WIN_SIZE);
>>      memory_region_add_subregion(get_system_memory(), sphb->io_win_addr,
>>                                  &sphb->iowindow);
>> -
>> -    /* As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
>> -     * we need to allocate some memory to catch those writes coming
>> -     * from msi_notify()/msix_notify() */
>> -    if (msi_supported) {
>> -        sprintf(namebuf, "%s.msi", sphb->dtbusname);
>> -        memory_region_init_io(&sphb->msiwindow, OBJECT(sphb), &spapr_msi_ops, sphb,
>> -                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
>> -        memory_region_add_subregion(get_system_memory(), sphb->msi_win_addr,
>> -                                    &sphb->msiwindow);
>> -    }
>> -
>>      /*
>>       * Selecting a busname is more complex than you'd think, due to
>>       * interacting constraints.  If the user has specified an id
>> @@ -695,7 +690,6 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size,
>>                        SPAPR_PCI_IO_WIN_SIZE),
>> -    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, -1),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> @@ -737,7 +731,6 @@ static const VMStateDescription vmstate_spapr_pci = {
>>          VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
>>          VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
>>          VMSTATE_UINT64_EQUAL(io_win_size, sPAPRPHBState),
>> -        VMSTATE_UINT64_EQUAL(msi_win_addr, sPAPRPHBState),
>>          VMSTATE_STRUCT_ARRAY(lsi_table, sPAPRPHBState, PCI_NUM_PINS, 0,
>>                               vmstate_spapr_pci_lsi, struct spapr_pci_lsi),
>>          VMSTATE_STRUCT_ARRAY(msi_table, sPAPRPHBState, SPAPR_MSIX_MAX_DEVS, 0,
>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
>> index 93f9511..970b4a9 100644
>> --- a/include/hw/pci-host/spapr.h
>> +++ b/include/hw/pci-host/spapr.h
>> @@ -43,8 +43,7 @@ typedef struct sPAPRPHBState {
>>  
>>      MemoryRegion memspace, iospace;
>>      hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
>> -    hwaddr msi_win_addr;
>> -    MemoryRegion memwindow, iowindow, msiwindow;
>> +    MemoryRegion memwindow, iowindow;
>>  
>>      uint32_t dma_liobn;
>>      uint64_t dma_window_start;
>> @@ -73,7 +72,8 @@ typedef struct sPAPRPHBState {
>>  #define SPAPR_PCI_MMIO_WIN_SIZE      0x20000000
>>  #define SPAPR_PCI_IO_WIN_OFF         0x80000000
>>  #define SPAPR_PCI_IO_WIN_SIZE        0x10000
>> -#define SPAPR_PCI_MSI_WIN_OFF        0x90000000
>> +
>> +#define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
>>  
>>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
>>  
>> @@ -88,6 +88,8 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>>                            uint32_t xics_phandle,
>>                            void *fdt);
>>  
>> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr);
>> +
>>  void spapr_pci_rtas_init(void);
>>  
>>  #endif /* __HW_SPAPR_PCI_H__ */
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 7aa9a3d..aed02e1 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -14,6 +14,8 @@ struct icp_state;
>>  typedef struct sPAPREnvironment {
>>      struct VIOsPAPRBus *vio_bus;
>>      QLIST_HEAD(, sPAPRPHBState) phbs;
>> +    hwaddr msi_win_addr;
>> +    MemoryRegion msiwindow;
>>      struct sPAPRNVRAM *nvram;
>>      struct icp_state *icp;
>>  
>> @@ -303,7 +305,7 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>                               target_ulong *args);
>>  
>>  int spapr_allocate_irq(int hint, bool lsi);
>> -int spapr_allocate_irq_block(int num, bool lsi);
>> +int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>>  
>>  static inline int spapr_allocate_msi(int hint)
>>  {
>>
> 
>
Anthony Liguori - July 31, 2013, 6:02 p.m.
Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
> hypercalls which return global IRQ numbers to a guest so it only
> operates with those and never touches MSIMessage.
>
> Therefore MSIMessage handling is completely hidden in QEMU.
>
> Previously every sPAPR PCI host bridge implemented its own MSI window
> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
> or vfio) and route them to the guest via qemu_pulse_irq().
> MSIMessage used to be encoded as:
> 	.addr - address within the PHB MSI window;
> 	.data - the device index on PHB plus vector number.
> The MSI MR write function translated this MSIMessage to a global IRQ
> number and called qemu_pulse_irq().
>
> However the total number of IRQs is not really big (at the moment it is
> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
> seems to be enough to store an IRQ number there.
>
> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
> 1. remove a MSI window from a PHB;
> 2. add a single memory region for all MSIs to sPAPREnvironment
> and spapr_pci_msi_init() to initialize it;
> 3. encode MSIMessage as:
>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>     * .data as an IRQ number.
> 4. change IRQ allocator to align first IRQ number in a block for MSI.
> MSI uses lower bits to specify the vector number so the first IRQ has to
> be aligned. MSIX does not need any special allocator though.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>

Does this actually fix any bug or is this just refactoring?  If it's the
later, it'll have to wait until after the 1.7 window opens up.

Regards,

Anthony Liguori

> ---
>  hw/ppc/spapr.c              | 29 ++++++++++++++++++---
>  hw/ppc/spapr_pci.c          | 61 ++++++++++++++++++++-------------------------
>  include/hw/pci-host/spapr.h |  8 +++---
>  include/hw/ppc/spapr.h      |  4 ++-
>  4 files changed, 60 insertions(+), 42 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 671bbd9..6b21191 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -88,6 +88,9 @@ int spapr_allocate_irq(int hint, bool lsi)
>  
>      if (hint) {
>          irq = hint;
> +        if (hint >= spapr->next_irq) {
> +            spapr->next_irq = hint + 1;
> +        }
>          /* FIXME: we should probably check for collisions somehow */
>      } else {
>          irq = spapr->next_irq++;
> @@ -103,22 +106,39 @@ int spapr_allocate_irq(int hint, bool lsi)
>      return irq;
>  }
>  
> -/* Allocate block of consequtive IRQs, returns a number of the first */
> -int spapr_allocate_irq_block(int num, bool lsi)
> +/*
> + * Allocate block of consequtive IRQs, returns a number of the first.
> + * If msi==true, aligns the first IRQ number to num.
> + */
> +int spapr_allocate_irq_block(int num, bool lsi, bool msi)
>  {
>      int first = -1;
> -    int i;
> +    int i, hint = 0;
> +
> +    /*
> +     * MSIMesage::data is used for storing VIRQ so
> +     * it has to be aligned to num to support multiple
> +     * MSI vectors. MSI-X is not affected by this.
> +     * The hint is used for the first IRQ, the rest should
> +     * be allocated continously.
> +     */
> +    if (msi) {
> +        assert((num == 1) || (num == 2) || (num == 4) ||
> +               (num == 8) || (num == 16) || (num == 32));
> +        hint = (spapr->next_irq + num - 1) & ~(num - 1);
> +    }
>  
>      for (i = 0; i < num; ++i) {
>          int irq;
>  
> -        irq = spapr_allocate_irq(0, lsi);
> +        irq = spapr_allocate_irq(hint, lsi);
>          if (!irq) {
>              return -1;
>          }
>  
>          if (0 == i) {
>              first = irq;
> +            hint = 0;
>          }
>  
>          /* If the above doesn't create a consecutive block then that's
> @@ -1127,6 +1147,7 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>      spapr_create_nvram(spapr);
>  
>      /* Set up PCI */
> +    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
>      spapr_pci_rtas_init();
>  
>      phb = spapr_create_phb(spapr, 0);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index dfe4d04..c8ea777 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -258,11 +258,11 @@ static int spapr_msicfg_find(sPAPRPHBState *phb, uint32_t config_addr,
>   * This is required for msi_notify()/msix_notify() which
>   * will write at the addresses via spapr_msi_write().
>   */
> -static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
> -                             bool msix, unsigned req_num)
> +static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix,
> +                             unsigned first_irq, unsigned req_num)
>  {
>      unsigned i;
> -    MSIMessage msg = { .address = addr, .data = 0 };
> +    MSIMessage msg = { .address = addr, .data = first_irq };
>  
>      if (!msix) {
>          msi_set_message(pdev, msg);
> @@ -270,8 +270,7 @@ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
>          return;
>      }
>  
> -    for (i = 0; i < req_num; ++i) {
> -        msg.address = addr | (i << 2);
> +    for (i = 0; i < req_num; ++i, ++msg.data) {
>          msix_set_message(pdev, i, msg);
>          trace_spapr_pci_msi_setup(pdev->name, i, msg.address);
>      }
> @@ -351,7 +350,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>  
>      /* There is no cached config, allocate MSIs */
>      if (!phb->msi_table[ndev].nvec) {
> -        irq = spapr_allocate_irq_block(req_num, false);
> +        irq = spapr_allocate_irq_block(req_num, false,
> +                                       ret_intr_type == RTAS_TYPE_MSI);
>          if (irq < 0) {
>              fprintf(stderr, "Cannot allocate MSIs for device#%d", ndev);
>              rtas_st(rets, 0, -1); /* Hardware error */
> @@ -363,8 +363,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>      }
>  
>      /* Setup MSI/MSIX vectors in the device (via cfgspace or MSIX BAR) */
> -    spapr_msi_setmsg(pdev, phb->msi_win_addr | (ndev << 16),
> -                     ret_intr_type == RTAS_TYPE_MSIX, req_num);
> +    spapr_msi_setmsg(pdev, spapr->msi_win_addr, ret_intr_type == RTAS_TYPE_MSIX,
> +                     phb->msi_table[ndev].irq, req_num);
>  
>      rtas_st(rets, 0, 0);
>      rtas_st(rets, 1, req_num);
> @@ -487,10 +487,7 @@ static const MemoryRegionOps spapr_io_ops = {
>  static void spapr_msi_write(void *opaque, hwaddr addr,
>                              uint64_t data, unsigned size)
>  {
> -    sPAPRPHBState *phb = opaque;
> -    int ndev = addr >> 16;
> -    int vec = ((addr & 0xFFFF) >> 2) | data;
> -    uint32_t irq = phb->msi_table[ndev].irq + vec;
> +    uint32_t irq = data;
>  
>      trace_spapr_pci_msi_write(addr, data, irq);
>  
> @@ -504,6 +501,23 @@ static const MemoryRegionOps spapr_msi_ops = {
>      .endianness = DEVICE_LITTLE_ENDIAN
>  };
>  
> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr)
> +{
> +    /*
> +     * As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
> +     * we need to allocate some memory to catch those writes coming
> +     * from msi_notify()/msix_notify().
> +     * As MSIMessage:addr is going to be the same and MSIMessage:data
> +     * is going to be a VIRQ number, 4 bytes of the MSI MR will only
> +     * be used.
> +     */
> +    spapr->msi_win_addr = addr;
> +    memory_region_init_io(&spapr->msiwindow, NULL, &spapr_msi_ops, spapr,
> +                          "msi", getpagesize());
> +    memory_region_add_subregion(get_system_memory(), spapr->msi_win_addr,
> +                                &spapr->msiwindow);
> +}
> +
>  /*
>   * PHB PCI device
>   */
> @@ -528,8 +542,7 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>          if ((sphb->buid != -1) || (sphb->dma_liobn != -1)
>              || (sphb->mem_win_addr != -1)
> -            || (sphb->io_win_addr != -1)
> -            || (sphb->msi_win_addr != -1)) {
> +            || (sphb->io_win_addr != -1)) {
>              fprintf(stderr, "Either \"index\" or other parameters must"
>                      " be specified for PAPR PHB, not both\n");
>              return -1;
> @@ -542,7 +555,6 @@ static int spapr_phb_init(SysBusDevice *s)
>              + sphb->index * SPAPR_PCI_WINDOW_SPACING;
>          sphb->mem_win_addr = windows_base + SPAPR_PCI_MMIO_WIN_OFF;
>          sphb->io_win_addr = windows_base + SPAPR_PCI_IO_WIN_OFF;
> -        sphb->msi_win_addr = windows_base + SPAPR_PCI_MSI_WIN_OFF;
>      }
>  
>      if (sphb->buid == -1) {
> @@ -565,11 +577,6 @@ static int spapr_phb_init(SysBusDevice *s)
>          return -1;
>      }
>  
> -    if (sphb->msi_win_addr == -1) {
> -        fprintf(stderr, "MSI window address not specified for PHB\n");
> -        return -1;
> -    }
> -
>      if (find_phb(spapr, sphb->buid)) {
>          fprintf(stderr, "PCI host bridges must have unique BUIDs\n");
>          return -1;
> @@ -609,18 +616,6 @@ static int spapr_phb_init(SysBusDevice *s)
>                            namebuf, SPAPR_PCI_IO_WIN_SIZE);
>      memory_region_add_subregion(get_system_memory(), sphb->io_win_addr,
>                                  &sphb->iowindow);
> -
> -    /* As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
> -     * we need to allocate some memory to catch those writes coming
> -     * from msi_notify()/msix_notify() */
> -    if (msi_supported) {
> -        sprintf(namebuf, "%s.msi", sphb->dtbusname);
> -        memory_region_init_io(&sphb->msiwindow, OBJECT(sphb), &spapr_msi_ops, sphb,
> -                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
> -        memory_region_add_subregion(get_system_memory(), sphb->msi_win_addr,
> -                                    &sphb->msiwindow);
> -    }
> -
>      /*
>       * Selecting a busname is more complex than you'd think, due to
>       * interacting constraints.  If the user has specified an id
> @@ -695,7 +690,6 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size,
>                        SPAPR_PCI_IO_WIN_SIZE),
> -    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -737,7 +731,6 @@ static const VMStateDescription vmstate_spapr_pci = {
>          VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
>          VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
>          VMSTATE_UINT64_EQUAL(io_win_size, sPAPRPHBState),
> -        VMSTATE_UINT64_EQUAL(msi_win_addr, sPAPRPHBState),
>          VMSTATE_STRUCT_ARRAY(lsi_table, sPAPRPHBState, PCI_NUM_PINS, 0,
>                               vmstate_spapr_pci_lsi, struct spapr_pci_lsi),
>          VMSTATE_STRUCT_ARRAY(msi_table, sPAPRPHBState, SPAPR_MSIX_MAX_DEVS, 0,
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 93f9511..970b4a9 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -43,8 +43,7 @@ typedef struct sPAPRPHBState {
>  
>      MemoryRegion memspace, iospace;
>      hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
> -    hwaddr msi_win_addr;
> -    MemoryRegion memwindow, iowindow, msiwindow;
> +    MemoryRegion memwindow, iowindow;
>  
>      uint32_t dma_liobn;
>      uint64_t dma_window_start;
> @@ -73,7 +72,8 @@ typedef struct sPAPRPHBState {
>  #define SPAPR_PCI_MMIO_WIN_SIZE      0x20000000
>  #define SPAPR_PCI_IO_WIN_OFF         0x80000000
>  #define SPAPR_PCI_IO_WIN_SIZE        0x10000
> -#define SPAPR_PCI_MSI_WIN_OFF        0x90000000
> +
> +#define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
>  
>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
>  
> @@ -88,6 +88,8 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                            uint32_t xics_phandle,
>                            void *fdt);
>  
> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr);
> +
>  void spapr_pci_rtas_init(void);
>  
>  #endif /* __HW_SPAPR_PCI_H__ */
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 7aa9a3d..aed02e1 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -14,6 +14,8 @@ struct icp_state;
>  typedef struct sPAPREnvironment {
>      struct VIOsPAPRBus *vio_bus;
>      QLIST_HEAD(, sPAPRPHBState) phbs;
> +    hwaddr msi_win_addr;
> +    MemoryRegion msiwindow;
>      struct sPAPRNVRAM *nvram;
>      struct icp_state *icp;
>  
> @@ -303,7 +305,7 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>                               target_ulong *args);
>  
>  int spapr_allocate_irq(int hint, bool lsi);
> -int spapr_allocate_irq_block(int num, bool lsi);
> +int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>  
>  static inline int spapr_allocate_msi(int hint)
>  {
> -- 
> 1.8.3.2
Alexey Kardashevskiy - July 31, 2013, 10:56 p.m.
On 08/01/2013 04:02 AM, Anthony Liguori wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> 
>> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
>> hypercalls which return global IRQ numbers to a guest so it only
>> operates with those and never touches MSIMessage.
>>
>> Therefore MSIMessage handling is completely hidden in QEMU.
>>
>> Previously every sPAPR PCI host bridge implemented its own MSI window
>> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
>> or vfio) and route them to the guest via qemu_pulse_irq().
>> MSIMessage used to be encoded as:
>> 	.addr - address within the PHB MSI window;
>> 	.data - the device index on PHB plus vector number.
>> The MSI MR write function translated this MSIMessage to a global IRQ
>> number and called qemu_pulse_irq().
>>
>> However the total number of IRQs is not really big (at the moment it is
>> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
>> seems to be enough to store an IRQ number there.
>>
>> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
>> 1. remove a MSI window from a PHB;
>> 2. add a single memory region for all MSIs to sPAPREnvironment
>> and spapr_pci_msi_init() to initialize it;
>> 3. encode MSIMessage as:
>>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>>     * .data as an IRQ number.
>> 4. change IRQ allocator to align first IRQ number in a block for MSI.
>> MSI uses lower bits to specify the vector number so the first IRQ has to
>> be aligned. MSIX does not need any special allocator though.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
> 
> Does this actually fix any bug or is this just refactoring?  If it's the
> later, it'll have to wait until after the 1.7 window opens up.


This is refactoring which should make IRQFD enablement on spapr easier.
Anthony Liguori - July 31, 2013, 11:22 p.m.
Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> On 08/01/2013 04:02 AM, Anthony Liguori wrote:
>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> 
>>> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
>>> hypercalls which return global IRQ numbers to a guest so it only
>>> operates with those and never touches MSIMessage.
>>>
>>> Therefore MSIMessage handling is completely hidden in QEMU.
>>>
>>> Previously every sPAPR PCI host bridge implemented its own MSI window
>>> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
>>> or vfio) and route them to the guest via qemu_pulse_irq().
>>> MSIMessage used to be encoded as:
>>> 	.addr - address within the PHB MSI window;
>>> 	.data - the device index on PHB plus vector number.
>>> The MSI MR write function translated this MSIMessage to a global IRQ
>>> number and called qemu_pulse_irq().
>>>
>>> However the total number of IRQs is not really big (at the moment it is
>>> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
>>> seems to be enough to store an IRQ number there.
>>>
>>> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
>>> 1. remove a MSI window from a PHB;
>>> 2. add a single memory region for all MSIs to sPAPREnvironment
>>> and spapr_pci_msi_init() to initialize it;
>>> 3. encode MSIMessage as:
>>>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>>>     * .data as an IRQ number.
>>> 4. change IRQ allocator to align first IRQ number in a block for MSI.
>>> MSI uses lower bits to specify the vector number so the first IRQ has to
>>> be aligned. MSIX does not need any special allocator though.
>>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> 
>> Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
>> 
>> Does this actually fix any bug or is this just refactoring?  If it's the
>> later, it'll have to wait until after the 1.7 window opens up.
>
>
> This is refactoring which should make IRQFD enablement on spapr
> easier.

Okay, I'll apply it after hard freeze then.

Regards,

Anthony Liguori

>
>
>
> -- 
> Alexey
Michael S. Tsirkin - Aug. 1, 2013, 9:01 a.m.
On Fri, Jul 12, 2013 at 05:38:24PM +1000, Alexey Kardashevskiy wrote:
> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
> hypercalls which return global IRQ numbers to a guest so it only
> operates with those and never touches MSIMessage.
> 
> Therefore MSIMessage handling is completely hidden in QEMU.
> 
> Previously every sPAPR PCI host bridge implemented its own MSI window
> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
> or vfio) and route them to the guest via qemu_pulse_irq().
> MSIMessage used to be encoded as:
> 	.addr - address within the PHB MSI window;
> 	.data - the device index on PHB plus vector number.
> The MSI MR write function translated this MSIMessage to a global IRQ
> number and called qemu_pulse_irq().
> 
> However the total number of IRQs is not really big (at the moment it is
> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
> seems to be enough to store an IRQ number there.
> 
> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
> 1. remove a MSI window from a PHB;
> 2. add a single memory region for all MSIs to sPAPREnvironment
> and spapr_pci_msi_init() to initialize it;
> 3. encode MSIMessage as:
>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>     * .data as an IRQ number.
> 4. change IRQ allocator to align first IRQ number in a block for MSI.
> MSI uses lower bits to specify the vector number so the first IRQ has to
> be aligned. MSIX does not need any special allocator though.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>


Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
>  hw/ppc/spapr.c              | 29 ++++++++++++++++++---
>  hw/ppc/spapr_pci.c          | 61 ++++++++++++++++++++-------------------------
>  include/hw/pci-host/spapr.h |  8 +++---
>  include/hw/ppc/spapr.h      |  4 ++-
>  4 files changed, 60 insertions(+), 42 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 671bbd9..6b21191 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -88,6 +88,9 @@ int spapr_allocate_irq(int hint, bool lsi)
>  
>      if (hint) {
>          irq = hint;
> +        if (hint >= spapr->next_irq) {
> +            spapr->next_irq = hint + 1;
> +        }
>          /* FIXME: we should probably check for collisions somehow */
>      } else {
>          irq = spapr->next_irq++;
> @@ -103,22 +106,39 @@ int spapr_allocate_irq(int hint, bool lsi)
>      return irq;
>  }
>  
> -/* Allocate block of consequtive IRQs, returns a number of the first */
> -int spapr_allocate_irq_block(int num, bool lsi)
> +/*
> + * Allocate block of consequtive IRQs, returns a number of the first.
> + * If msi==true, aligns the first IRQ number to num.
> + */
> +int spapr_allocate_irq_block(int num, bool lsi, bool msi)
>  {
>      int first = -1;
> -    int i;
> +    int i, hint = 0;
> +
> +    /*
> +     * MSIMesage::data is used for storing VIRQ so
> +     * it has to be aligned to num to support multiple
> +     * MSI vectors. MSI-X is not affected by this.
> +     * The hint is used for the first IRQ, the rest should
> +     * be allocated continously.
> +     */
> +    if (msi) {
> +        assert((num == 1) || (num == 2) || (num == 4) ||
> +               (num == 8) || (num == 16) || (num == 32));
> +        hint = (spapr->next_irq + num - 1) & ~(num - 1);
> +    }
>  
>      for (i = 0; i < num; ++i) {
>          int irq;
>  
> -        irq = spapr_allocate_irq(0, lsi);
> +        irq = spapr_allocate_irq(hint, lsi);
>          if (!irq) {
>              return -1;
>          }
>  
>          if (0 == i) {
>              first = irq;
> +            hint = 0;
>          }
>  
>          /* If the above doesn't create a consecutive block then that's
> @@ -1127,6 +1147,7 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
>      spapr_create_nvram(spapr);
>  
>      /* Set up PCI */
> +    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
>      spapr_pci_rtas_init();
>  
>      phb = spapr_create_phb(spapr, 0);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index dfe4d04..c8ea777 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -258,11 +258,11 @@ static int spapr_msicfg_find(sPAPRPHBState *phb, uint32_t config_addr,
>   * This is required for msi_notify()/msix_notify() which
>   * will write at the addresses via spapr_msi_write().
>   */
> -static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
> -                             bool msix, unsigned req_num)
> +static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix,
> +                             unsigned first_irq, unsigned req_num)
>  {
>      unsigned i;
> -    MSIMessage msg = { .address = addr, .data = 0 };
> +    MSIMessage msg = { .address = addr, .data = first_irq };
>  
>      if (!msix) {
>          msi_set_message(pdev, msg);
> @@ -270,8 +270,7 @@ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
>          return;
>      }
>  
> -    for (i = 0; i < req_num; ++i) {
> -        msg.address = addr | (i << 2);
> +    for (i = 0; i < req_num; ++i, ++msg.data) {
>          msix_set_message(pdev, i, msg);
>          trace_spapr_pci_msi_setup(pdev->name, i, msg.address);
>      }
> @@ -351,7 +350,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>  
>      /* There is no cached config, allocate MSIs */
>      if (!phb->msi_table[ndev].nvec) {
> -        irq = spapr_allocate_irq_block(req_num, false);
> +        irq = spapr_allocate_irq_block(req_num, false,
> +                                       ret_intr_type == RTAS_TYPE_MSI);
>          if (irq < 0) {
>              fprintf(stderr, "Cannot allocate MSIs for device#%d", ndev);
>              rtas_st(rets, 0, -1); /* Hardware error */
> @@ -363,8 +363,8 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>      }
>  
>      /* Setup MSI/MSIX vectors in the device (via cfgspace or MSIX BAR) */
> -    spapr_msi_setmsg(pdev, phb->msi_win_addr | (ndev << 16),
> -                     ret_intr_type == RTAS_TYPE_MSIX, req_num);
> +    spapr_msi_setmsg(pdev, spapr->msi_win_addr, ret_intr_type == RTAS_TYPE_MSIX,
> +                     phb->msi_table[ndev].irq, req_num);
>  
>      rtas_st(rets, 0, 0);
>      rtas_st(rets, 1, req_num);
> @@ -487,10 +487,7 @@ static const MemoryRegionOps spapr_io_ops = {
>  static void spapr_msi_write(void *opaque, hwaddr addr,
>                              uint64_t data, unsigned size)
>  {
> -    sPAPRPHBState *phb = opaque;
> -    int ndev = addr >> 16;
> -    int vec = ((addr & 0xFFFF) >> 2) | data;
> -    uint32_t irq = phb->msi_table[ndev].irq + vec;
> +    uint32_t irq = data;
>  
>      trace_spapr_pci_msi_write(addr, data, irq);
>  
> @@ -504,6 +501,23 @@ static const MemoryRegionOps spapr_msi_ops = {
>      .endianness = DEVICE_LITTLE_ENDIAN
>  };
>  
> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr)
> +{
> +    /*
> +     * As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
> +     * we need to allocate some memory to catch those writes coming
> +     * from msi_notify()/msix_notify().
> +     * As MSIMessage:addr is going to be the same and MSIMessage:data
> +     * is going to be a VIRQ number, 4 bytes of the MSI MR will only
> +     * be used.
> +     */
> +    spapr->msi_win_addr = addr;
> +    memory_region_init_io(&spapr->msiwindow, NULL, &spapr_msi_ops, spapr,
> +                          "msi", getpagesize());
> +    memory_region_add_subregion(get_system_memory(), spapr->msi_win_addr,
> +                                &spapr->msiwindow);
> +}
> +
>  /*
>   * PHB PCI device
>   */
> @@ -528,8 +542,7 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>          if ((sphb->buid != -1) || (sphb->dma_liobn != -1)
>              || (sphb->mem_win_addr != -1)
> -            || (sphb->io_win_addr != -1)
> -            || (sphb->msi_win_addr != -1)) {
> +            || (sphb->io_win_addr != -1)) {
>              fprintf(stderr, "Either \"index\" or other parameters must"
>                      " be specified for PAPR PHB, not both\n");
>              return -1;
> @@ -542,7 +555,6 @@ static int spapr_phb_init(SysBusDevice *s)
>              + sphb->index * SPAPR_PCI_WINDOW_SPACING;
>          sphb->mem_win_addr = windows_base + SPAPR_PCI_MMIO_WIN_OFF;
>          sphb->io_win_addr = windows_base + SPAPR_PCI_IO_WIN_OFF;
> -        sphb->msi_win_addr = windows_base + SPAPR_PCI_MSI_WIN_OFF;
>      }
>  
>      if (sphb->buid == -1) {
> @@ -565,11 +577,6 @@ static int spapr_phb_init(SysBusDevice *s)
>          return -1;
>      }
>  
> -    if (sphb->msi_win_addr == -1) {
> -        fprintf(stderr, "MSI window address not specified for PHB\n");
> -        return -1;
> -    }
> -
>      if (find_phb(spapr, sphb->buid)) {
>          fprintf(stderr, "PCI host bridges must have unique BUIDs\n");
>          return -1;
> @@ -609,18 +616,6 @@ static int spapr_phb_init(SysBusDevice *s)
>                            namebuf, SPAPR_PCI_IO_WIN_SIZE);
>      memory_region_add_subregion(get_system_memory(), sphb->io_win_addr,
>                                  &sphb->iowindow);
> -
> -    /* As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
> -     * we need to allocate some memory to catch those writes coming
> -     * from msi_notify()/msix_notify() */
> -    if (msi_supported) {
> -        sprintf(namebuf, "%s.msi", sphb->dtbusname);
> -        memory_region_init_io(&sphb->msiwindow, OBJECT(sphb), &spapr_msi_ops, sphb,
> -                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
> -        memory_region_add_subregion(get_system_memory(), sphb->msi_win_addr,
> -                                    &sphb->msiwindow);
> -    }
> -
>      /*
>       * Selecting a busname is more complex than you'd think, due to
>       * interacting constraints.  If the user has specified an id
> @@ -695,7 +690,6 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size,
>                        SPAPR_PCI_IO_WIN_SIZE),
> -    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, -1),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -737,7 +731,6 @@ static const VMStateDescription vmstate_spapr_pci = {
>          VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
>          VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
>          VMSTATE_UINT64_EQUAL(io_win_size, sPAPRPHBState),
> -        VMSTATE_UINT64_EQUAL(msi_win_addr, sPAPRPHBState),
>          VMSTATE_STRUCT_ARRAY(lsi_table, sPAPRPHBState, PCI_NUM_PINS, 0,
>                               vmstate_spapr_pci_lsi, struct spapr_pci_lsi),
>          VMSTATE_STRUCT_ARRAY(msi_table, sPAPRPHBState, SPAPR_MSIX_MAX_DEVS, 0,
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 93f9511..970b4a9 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -43,8 +43,7 @@ typedef struct sPAPRPHBState {
>  
>      MemoryRegion memspace, iospace;
>      hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
> -    hwaddr msi_win_addr;
> -    MemoryRegion memwindow, iowindow, msiwindow;
> +    MemoryRegion memwindow, iowindow;
>  
>      uint32_t dma_liobn;
>      uint64_t dma_window_start;
> @@ -73,7 +72,8 @@ typedef struct sPAPRPHBState {
>  #define SPAPR_PCI_MMIO_WIN_SIZE      0x20000000
>  #define SPAPR_PCI_IO_WIN_OFF         0x80000000
>  #define SPAPR_PCI_IO_WIN_SIZE        0x10000
> -#define SPAPR_PCI_MSI_WIN_OFF        0x90000000
> +
> +#define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
>  
>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
>  
> @@ -88,6 +88,8 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                            uint32_t xics_phandle,
>                            void *fdt);
>  
> +void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr);
> +
>  void spapr_pci_rtas_init(void);
>  
>  #endif /* __HW_SPAPR_PCI_H__ */
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 7aa9a3d..aed02e1 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -14,6 +14,8 @@ struct icp_state;
>  typedef struct sPAPREnvironment {
>      struct VIOsPAPRBus *vio_bus;
>      QLIST_HEAD(, sPAPRPHBState) phbs;
> +    hwaddr msi_win_addr;
> +    MemoryRegion msiwindow;
>      struct sPAPRNVRAM *nvram;
>      struct icp_state *icp;
>  
> @@ -303,7 +305,7 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>                               target_ulong *args);
>  
>  int spapr_allocate_irq(int hint, bool lsi);
> -int spapr_allocate_irq_block(int num, bool lsi);
> +int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>  
>  static inline int spapr_allocate_msi(int hint)
>  {
> -- 
> 1.8.3.2
>
Alexey Kardashevskiy - Aug. 23, 2013, 3:10 a.m.
On 08/01/2013 09:22 AM, Anthony Liguori wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> 
>> On 08/01/2013 04:02 AM, Anthony Liguori wrote:
>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>
>>>> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
>>>> hypercalls which return global IRQ numbers to a guest so it only
>>>> operates with those and never touches MSIMessage.
>>>>
>>>> Therefore MSIMessage handling is completely hidden in QEMU.
>>>>
>>>> Previously every sPAPR PCI host bridge implemented its own MSI window
>>>> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
>>>> or vfio) and route them to the guest via qemu_pulse_irq().
>>>> MSIMessage used to be encoded as:
>>>> 	.addr - address within the PHB MSI window;
>>>> 	.data - the device index on PHB plus vector number.
>>>> The MSI MR write function translated this MSIMessage to a global IRQ
>>>> number and called qemu_pulse_irq().
>>>>
>>>> However the total number of IRQs is not really big (at the moment it is
>>>> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
>>>> seems to be enough to store an IRQ number there.
>>>>
>>>> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
>>>> 1. remove a MSI window from a PHB;
>>>> 2. add a single memory region for all MSIs to sPAPREnvironment
>>>> and spapr_pci_msi_init() to initialize it;
>>>> 3. encode MSIMessage as:
>>>>     * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>>>>     * .data as an IRQ number.
>>>> 4. change IRQ allocator to align first IRQ number in a block for MSI.
>>>> MSI uses lower bits to specify the vector number so the first IRQ has to
>>>> be aligned. MSIX does not need any special allocator though.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>
>>> Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
>>>
>>> Does this actually fix any bug or is this just refactoring?  If it's the
>>> later, it'll have to wait until after the 1.7 window opens up.
>>
>>
>> This is refactoring which should make IRQFD enablement on spapr
>> easier.
> 
> Okay, I'll apply it after hard freeze then.


Anyone, ping?
Alexander Graf - Aug. 25, 2013, 6:31 p.m.
On 12.07.2013, at 08:38, Alexey Kardashevskiy wrote:

> On the sPAPR platform a guest allocates MSI/MSIX vectors via RTAS
> hypercalls which return global IRQ numbers to a guest so it only
> operates with those and never touches MSIMessage.
> 
> Therefore MSIMessage handling is completely hidden in QEMU.
> 
> Previously every sPAPR PCI host bridge implemented its own MSI window
> to catch msi_notify()/msix_notify() calls from QEMU devices (virtio-pci
> or vfio) and route them to the guest via qemu_pulse_irq().
> MSIMessage used to be encoded as:
> 	.addr - address within the PHB MSI window;
> 	.data - the device index on PHB plus vector number.
> The MSI MR write function translated this MSIMessage to a global IRQ
> number and called qemu_pulse_irq().
> 
> However the total number of IRQs is not really big (at the moment it is
> 1024 IRQs starting from 4096) and even 16bit data field of MSIMessage
> seems to be enough to store an IRQ number there.
> 
> This simplifies MSI handling in sPAPR PHB. Specifically, this does:
> 1. remove a MSI window from a PHB;
> 2. add a single memory region for all MSIs to sPAPREnvironment
> and spapr_pci_msi_init() to initialize it;
> 3. encode MSIMessage as:
>    * .addr - a fixed address of SPAPR_PCI_MSI_WINDOW==0x40000000000ULL;
>    * .data as an IRQ number.
> 4. change IRQ allocator to align first IRQ number in a block for MSI.
> MSI uses lower bits to specify the vector number so the first IRQ has to
> be aligned. MSIX does not need any special allocator though.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Thanks, applied to ppc-next.


Alex

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 671bbd9..6b21191 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -88,6 +88,9 @@  int spapr_allocate_irq(int hint, bool lsi)
 
     if (hint) {
         irq = hint;
+        if (hint >= spapr->next_irq) {
+            spapr->next_irq = hint + 1;
+        }
         /* FIXME: we should probably check for collisions somehow */
     } else {
         irq = spapr->next_irq++;
@@ -103,22 +106,39 @@  int spapr_allocate_irq(int hint, bool lsi)
     return irq;
 }
 
-/* Allocate block of consequtive IRQs, returns a number of the first */
-int spapr_allocate_irq_block(int num, bool lsi)
+/*
+ * Allocate block of consequtive IRQs, returns a number of the first.
+ * If msi==true, aligns the first IRQ number to num.
+ */
+int spapr_allocate_irq_block(int num, bool lsi, bool msi)
 {
     int first = -1;
-    int i;
+    int i, hint = 0;
+
+    /*
+     * MSIMesage::data is used for storing VIRQ so
+     * it has to be aligned to num to support multiple
+     * MSI vectors. MSI-X is not affected by this.
+     * The hint is used for the first IRQ, the rest should
+     * be allocated continously.
+     */
+    if (msi) {
+        assert((num == 1) || (num == 2) || (num == 4) ||
+               (num == 8) || (num == 16) || (num == 32));
+        hint = (spapr->next_irq + num - 1) & ~(num - 1);
+    }
 
     for (i = 0; i < num; ++i) {
         int irq;
 
-        irq = spapr_allocate_irq(0, lsi);
+        irq = spapr_allocate_irq(hint, lsi);
         if (!irq) {
             return -1;
         }
 
         if (0 == i) {
             first = irq;
+            hint = 0;
         }
 
         /* If the above doesn't create a consecutive block then that's
@@ -1127,6 +1147,7 @@  static void ppc_spapr_init(QEMUMachineInitArgs *args)
     spapr_create_nvram(spapr);
 
     /* Set up PCI */
+    spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
     spapr_pci_rtas_init();
 
     phb = spapr_create_phb(spapr, 0);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index dfe4d04..c8ea777 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -258,11 +258,11 @@  static int spapr_msicfg_find(sPAPRPHBState *phb, uint32_t config_addr,
  * This is required for msi_notify()/msix_notify() which
  * will write at the addresses via spapr_msi_write().
  */
-static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
-                             bool msix, unsigned req_num)
+static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix,
+                             unsigned first_irq, unsigned req_num)
 {
     unsigned i;
-    MSIMessage msg = { .address = addr, .data = 0 };
+    MSIMessage msg = { .address = addr, .data = first_irq };
 
     if (!msix) {
         msi_set_message(pdev, msg);
@@ -270,8 +270,7 @@  static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr,
         return;
     }
 
-    for (i = 0; i < req_num; ++i) {
-        msg.address = addr | (i << 2);
+    for (i = 0; i < req_num; ++i, ++msg.data) {
         msix_set_message(pdev, i, msg);
         trace_spapr_pci_msi_setup(pdev->name, i, msg.address);
     }
@@ -351,7 +350,8 @@  static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
 
     /* There is no cached config, allocate MSIs */
     if (!phb->msi_table[ndev].nvec) {
-        irq = spapr_allocate_irq_block(req_num, false);
+        irq = spapr_allocate_irq_block(req_num, false,
+                                       ret_intr_type == RTAS_TYPE_MSI);
         if (irq < 0) {
             fprintf(stderr, "Cannot allocate MSIs for device#%d", ndev);
             rtas_st(rets, 0, -1); /* Hardware error */
@@ -363,8 +363,8 @@  static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     }
 
     /* Setup MSI/MSIX vectors in the device (via cfgspace or MSIX BAR) */
-    spapr_msi_setmsg(pdev, phb->msi_win_addr | (ndev << 16),
-                     ret_intr_type == RTAS_TYPE_MSIX, req_num);
+    spapr_msi_setmsg(pdev, spapr->msi_win_addr, ret_intr_type == RTAS_TYPE_MSIX,
+                     phb->msi_table[ndev].irq, req_num);
 
     rtas_st(rets, 0, 0);
     rtas_st(rets, 1, req_num);
@@ -487,10 +487,7 @@  static const MemoryRegionOps spapr_io_ops = {
 static void spapr_msi_write(void *opaque, hwaddr addr,
                             uint64_t data, unsigned size)
 {
-    sPAPRPHBState *phb = opaque;
-    int ndev = addr >> 16;
-    int vec = ((addr & 0xFFFF) >> 2) | data;
-    uint32_t irq = phb->msi_table[ndev].irq + vec;
+    uint32_t irq = data;
 
     trace_spapr_pci_msi_write(addr, data, irq);
 
@@ -504,6 +501,23 @@  static const MemoryRegionOps spapr_msi_ops = {
     .endianness = DEVICE_LITTLE_ENDIAN
 };
 
+void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr)
+{
+    /*
+     * As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
+     * we need to allocate some memory to catch those writes coming
+     * from msi_notify()/msix_notify().
+     * As MSIMessage:addr is going to be the same and MSIMessage:data
+     * is going to be a VIRQ number, 4 bytes of the MSI MR will only
+     * be used.
+     */
+    spapr->msi_win_addr = addr;
+    memory_region_init_io(&spapr->msiwindow, NULL, &spapr_msi_ops, spapr,
+                          "msi", getpagesize());
+    memory_region_add_subregion(get_system_memory(), spapr->msi_win_addr,
+                                &spapr->msiwindow);
+}
+
 /*
  * PHB PCI device
  */
@@ -528,8 +542,7 @@  static int spapr_phb_init(SysBusDevice *s)
 
         if ((sphb->buid != -1) || (sphb->dma_liobn != -1)
             || (sphb->mem_win_addr != -1)
-            || (sphb->io_win_addr != -1)
-            || (sphb->msi_win_addr != -1)) {
+            || (sphb->io_win_addr != -1)) {
             fprintf(stderr, "Either \"index\" or other parameters must"
                     " be specified for PAPR PHB, not both\n");
             return -1;
@@ -542,7 +555,6 @@  static int spapr_phb_init(SysBusDevice *s)
             + sphb->index * SPAPR_PCI_WINDOW_SPACING;
         sphb->mem_win_addr = windows_base + SPAPR_PCI_MMIO_WIN_OFF;
         sphb->io_win_addr = windows_base + SPAPR_PCI_IO_WIN_OFF;
-        sphb->msi_win_addr = windows_base + SPAPR_PCI_MSI_WIN_OFF;
     }
 
     if (sphb->buid == -1) {
@@ -565,11 +577,6 @@  static int spapr_phb_init(SysBusDevice *s)
         return -1;
     }
 
-    if (sphb->msi_win_addr == -1) {
-        fprintf(stderr, "MSI window address not specified for PHB\n");
-        return -1;
-    }
-
     if (find_phb(spapr, sphb->buid)) {
         fprintf(stderr, "PCI host bridges must have unique BUIDs\n");
         return -1;
@@ -609,18 +616,6 @@  static int spapr_phb_init(SysBusDevice *s)
                           namebuf, SPAPR_PCI_IO_WIN_SIZE);
     memory_region_add_subregion(get_system_memory(), sphb->io_win_addr,
                                 &sphb->iowindow);
-
-    /* As MSI/MSIX interrupts trigger by writing at MSI/MSIX vectors,
-     * we need to allocate some memory to catch those writes coming
-     * from msi_notify()/msix_notify() */
-    if (msi_supported) {
-        sprintf(namebuf, "%s.msi", sphb->dtbusname);
-        memory_region_init_io(&sphb->msiwindow, OBJECT(sphb), &spapr_msi_ops, sphb,
-                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
-        memory_region_add_subregion(get_system_memory(), sphb->msi_win_addr,
-                                    &sphb->msiwindow);
-    }
-
     /*
      * Selecting a busname is more complex than you'd think, due to
      * interacting constraints.  If the user has specified an id
@@ -695,7 +690,6 @@  static Property spapr_phb_properties[] = {
     DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
     DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size,
                       SPAPR_PCI_IO_WIN_SIZE),
-    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, -1),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -737,7 +731,6 @@  static const VMStateDescription vmstate_spapr_pci = {
         VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
         VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
         VMSTATE_UINT64_EQUAL(io_win_size, sPAPRPHBState),
-        VMSTATE_UINT64_EQUAL(msi_win_addr, sPAPRPHBState),
         VMSTATE_STRUCT_ARRAY(lsi_table, sPAPRPHBState, PCI_NUM_PINS, 0,
                              vmstate_spapr_pci_lsi, struct spapr_pci_lsi),
         VMSTATE_STRUCT_ARRAY(msi_table, sPAPRPHBState, SPAPR_MSIX_MAX_DEVS, 0,
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 93f9511..970b4a9 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -43,8 +43,7 @@  typedef struct sPAPRPHBState {
 
     MemoryRegion memspace, iospace;
     hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
-    hwaddr msi_win_addr;
-    MemoryRegion memwindow, iowindow, msiwindow;
+    MemoryRegion memwindow, iowindow;
 
     uint32_t dma_liobn;
     uint64_t dma_window_start;
@@ -73,7 +72,8 @@  typedef struct sPAPRPHBState {
 #define SPAPR_PCI_MMIO_WIN_SIZE      0x20000000
 #define SPAPR_PCI_IO_WIN_OFF         0x80000000
 #define SPAPR_PCI_IO_WIN_SIZE        0x10000
-#define SPAPR_PCI_MSI_WIN_OFF        0x90000000
+
+#define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL
 
 #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
 
@@ -88,6 +88,8 @@  int spapr_populate_pci_dt(sPAPRPHBState *phb,
                           uint32_t xics_phandle,
                           void *fdt);
 
+void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr);
+
 void spapr_pci_rtas_init(void);
 
 #endif /* __HW_SPAPR_PCI_H__ */
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 7aa9a3d..aed02e1 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -14,6 +14,8 @@  struct icp_state;
 typedef struct sPAPREnvironment {
     struct VIOsPAPRBus *vio_bus;
     QLIST_HEAD(, sPAPRPHBState) phbs;
+    hwaddr msi_win_addr;
+    MemoryRegion msiwindow;
     struct sPAPRNVRAM *nvram;
     struct icp_state *icp;
 
@@ -303,7 +305,7 @@  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
                              target_ulong *args);
 
 int spapr_allocate_irq(int hint, bool lsi);
-int spapr_allocate_irq_block(int num, bool lsi);
+int spapr_allocate_irq_block(int num, bool lsi, bool msi);
 
 static inline int spapr_allocate_msi(int hint)
 {