Patchwork [3/3] pseries pci: added MSI/MSIX support

login
register
mail settings
Submitter Alexey Kardashevskiy
Date June 14, 2012, 4:34 a.m.
Message ID <4FD969D3.1050700@ozlabs.ru>
Download mbox | patch
Permalink /patch/164798/
State New
Headers show

Comments

Alexey Kardashevskiy - June 14, 2012, 4:34 a.m.
virtio-pci expects the guest to set up MSI message address and data, and
to do other initialization such as a vector number negotiation.
It also notifies the guest via writing an MSI message to a previously set
address.

This patch includes:

1. RTAS call "ibm,change-msi" which sets up number of MSI vectors per
a device. Note that this call may configure and return lesser number of
vectors than requested.

2. RTAS call "ibm,query-interrupt-source-number" which translates MSI
vector to interrupt controller (XICS) IRQ number.

3. A "config_space_address to msi_table" map to provide IRQ resolving from
config-address as MSI RTAS calls take a PCI config space address as
an identifier.

4. A MSIX memory region is added to catch msi_notify()/msix_notiry()
from virtio-pci and pass them to the guest via qemu_irq_pulse().

This patch depends on the "msi/msix: added functions to API to set up
message address and data" patch.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/spapr.c     |    9 ++-
 hw/spapr_pci.c |  266 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 hw/spapr_pci.h |   13 +++-
 trace-events   |    9 ++
 4 files changed, 284 insertions(+), 13 deletions(-)

+spapr_pci_rtas_ibm_query_interrupt_source_number(unsigned ioa, unsigned intr) "queries for #%u, IRQ%u"
+spapr_pci_msi_write(uint64_t addr, uint64_t data, uint32_t dt_irq) "@%"PRIx64"<=%"PRIx64" IRQ %u"
+spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ %u"
+
Alexander Graf - June 27, 2012, 6:15 p.m.
On 14.06.2012, at 06:34, Alexey Kardashevskiy wrote:

> virtio-pci expects the guest to set up MSI message address and data, and
> to do other initialization such as a vector number negotiation.
> It also notifies the guest via writing an MSI message to a previously set
> address.
> 
> This patch includes:
> 
> 1. RTAS call "ibm,change-msi" which sets up number of MSI vectors per
> a device. Note that this call may configure and return lesser number of
> vectors than requested.
> 
> 2. RTAS call "ibm,query-interrupt-source-number" which translates MSI
> vector to interrupt controller (XICS) IRQ number.
> 
> 3. A "config_space_address to msi_table" map to provide IRQ resolving from
> config-address as MSI RTAS calls take a PCI config space address as
> an identifier.
> 
> 4. A MSIX memory region is added to catch msi_notify()/msix_notiry()
> from virtio-pci and pass them to the guest via qemu_irq_pulse().
> 
> This patch depends on the "msi/msix: added functions to API to set up
> message address and data" patch.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Quite a big patch. I'd really prefer if you split out logically separate parts into easier to review chunks.

Either way, for this to get applied, it definitely need an ack from Ben and Michael. I know too little about the actual details of MSI, the QEMU MSI framework and the MSI PAPR bits.

> ---
> hw/spapr.c     |    9 ++-
> hw/spapr_pci.c |  266 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> hw/spapr_pci.h |   13 +++-
> trace-events   |    9 ++
> 4 files changed, 284 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/spapr.c b/hw/spapr.c
> index ef6ffcb..35ad075 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -43,6 +43,7 @@
> #include "hw/spapr_vio.h"
> #include "hw/spapr_pci.h"
> #include "hw/xics.h"
> +#include "hw/msi.h"
> 
> #include "kvm.h"
> #include "kvm_ppc.h"
> @@ -82,6 +83,7 @@
> #define SPAPR_PCI_MEM_WIN_ADDR  (0x10000000000ULL + 0xA0000000)
> #define SPAPR_PCI_MEM_WIN_SIZE  0x20000000
> #define SPAPR_PCI_IO_WIN_ADDR   (0x10000000000ULL + 0x80000000)
> +#define SPAPR_PCI_MSI_WIN_ADDR  (0x10000000000ULL + 0x90000000)
> 
> #define PHANDLE_XICP            0x00001111
> 
> @@ -116,7 +118,7 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num,
> /* Allocate block of consequtive IRQs, returns a number of the first */
> int spapr_allocate_irq_block(uint32_t num, enum xics_irq_type type)
> {
> -    int i, ret;
> +    int i, ret = 0;

Huh?

>     uint32_t irq = -1;
> 
>     for (i = 0; i < num; ++i) {
> @@ -690,6 +692,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>     long pteg_shift = 17;
>     char *filename;
> 
> +    msi_supported = true;
> +
>     spapr = g_malloc0(sizeof(*spapr));
>     QLIST_INIT(&spapr->phbs);
> 
> @@ -804,7 +808,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>     spapr_create_phb(spapr, "pci", SPAPR_PCI_BUID,
>                      SPAPR_PCI_MEM_WIN_ADDR,
>                      SPAPR_PCI_MEM_WIN_SIZE,
> -                     SPAPR_PCI_IO_WIN_ADDR);
> +                     SPAPR_PCI_IO_WIN_ADDR,
> +                     SPAPR_PCI_MSI_WIN_ADDR);
> 
>     for (i = 0; i < nb_nics; i++) {
>         NICInfo *nd = &nd_table[i];
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 93017cd..21fbc50 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -24,31 +24,46 @@
>  */
> #include "hw.h"
> #include "pci.h"
> +#include "msix.h"
> +#include "msi.h"
> #include "pci_host.h"
> #include "hw/spapr.h"
> #include "hw/spapr_pci.h"
> #include "exec-memory.h"
> #include <libfdt.h>
> +#include "trace.h"
> 
> #include "hw/pci_internals.h"
> 
> -static PCIDevice *find_dev(sPAPREnvironment *spapr,
> -                           uint64_t buid, uint32_t config_addr)
> +static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
> {
> -    DeviceState *qdev;
> -    int devfn = (config_addr >> 8) & 0xFF;
>     sPAPRPHBState *phb;
> 
>     QLIST_FOREACH(phb, &spapr->phbs, list) {
>         if (phb->buid != buid) {
>             continue;
>         }
> +        return phb;
> +    }
> 
> -        QTAILQ_FOREACH(qdev, &phb->host_state.bus->qbus.children, sibling) {
> -            PCIDevice *dev = (PCIDevice *)qdev;
> -            if (dev->devfn == devfn) {
> -                return dev;
> -            }
> +    return NULL;
> +}
> +
> +static PCIDevice *find_dev(sPAPREnvironment *spapr, uint64_t buid,
> +                           uint32_t config_addr)
> +{
> +    sPAPRPHBState *phb = find_phb(spapr, buid);
> +    DeviceState *qdev;
> +    int devfn = (config_addr >> 8) & 0xFF;
> +
> +    if (!phb) {
> +        return NULL;
> +    }
> +
> +    QTAILQ_FOREACH(qdev, &phb->host_state.bus->qbus.children, sibling) {
> +        PCIDevice *dev = (PCIDevice *)qdev;
> +        if (dev->devfn == devfn) {
> +            return dev;
>         }
>     }
> 
> @@ -138,6 +153,220 @@ static void rtas_write_pci_config(sPAPREnvironment *spapr,
>     rtas_st(rets, 0, 0);
> }
> 
> +/*
> + * Initializes req_num vectors for a device.
> + * The code assumes that MSI/MSIX is enabled in the config space
> + * as a result of msix_init() or msi_init().
> + */
> +static int spapr_pci_config_msi(sPAPRPHBState *ph, int ndev,
> +                                PCIDevice *pdev, bool msix, unsigned req_num)
> +{
> +    unsigned i;
> +    int irq;
> +    uint64_t msi_address;
> +    uint32_t config_addr = pdev->devfn << 8;
> +
> +    /* Disabling - nothing to do */
> +    if (0 == req_num) {

Please don't do this. It makes the code almost unreadable. If you need to mentally read the 0 in the first place, use !x. Otherwise always do if (x == const).

> +        return 0;
> +    }
> +
> +    /* Enabling! */
> +    if (ph->msi_table[ndev].nvec && (req_num != ph->msi_table[ndev].nvec)) {
> +        /* Unexpected behaviour */
> +        fprintf(stderr, "Cannot reuse cached MSI config for device#%u", ndev);
> +        return -1;
> +    }
> +
> +    if (0 == ph->msi_table[ndev].nvec) {
> +        irq = spapr_allocate_irq_block(req_num, XICS_MSI);
> +        if (irq < 0) {
> +            return -1;
> +        }
> +        ph->msi_table[ndev].dt_irq = irq;
> +        ph->msi_table[ndev].nvec = req_num;
> +        ph->msi_table[ndev].config_addr = config_addr;
> +    }
> +
> +    if (msix) {
> +        for (i = 0; i < req_num; ++i) {
> +            msi_address = ph->msi_win_addr + ((ndev << 16) | i << 2);
> +            msix_set_address_data(pdev, i, msi_address, 0);
> +            trace_spapr_pci_msi_setup(pdev->name, i, msi_address);
> +        }
> +    } else {
> +        msi_address = ph->msi_win_addr + (ndev << 16);
> +        msi_set_address_data(pdev, msi_address, 0);
> +        trace_spapr_pci_msi_setup(pdev->name, 0, msi_address);
> +    }
> +
> +    return req_num;
> +}
> +
> +/*
> + * The handler handles both MSI and MSIX.
> + * For MSI-X, the vector number is encoded as a part of the address
> + * and data is set to 0.
> + * For MSI, the vector takes least bits in data and not encoded in
> + * address.
> + */
> +static void spapr_msi_write(void *opaque, target_phys_addr_t addr,
> +                            uint64_t data, unsigned size)
> +{
> +    sPAPRPHBState *ephb = opaque;
> +    int n = addr >> 16;
> +    int vec = ((addr & 0xFFFF) >> 2) | data;
> +    uint32_t dt_irq = ephb->msi_table[n].dt_irq + vec;
> +
> +    trace_spapr_pci_msi_write(addr, data, dt_irq);
> +
> +    qemu_irq_pulse(xics_assign_irq(spapr->icp, dt_irq, XICS_MSI));
> +}
> +
> +static const MemoryRegionOps spapr_msi_ops = {
> +    .read = NULL,
> +    .write = spapr_msi_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN
> +};
> +
> +static void rtas_ibm_change_msi(sPAPREnvironment *spapr,
> +                                uint32_t token, uint32_t nargs,
> +                                target_ulong args, uint32_t nret,
> +                                target_ulong rets)
> +{
> +    uint32_t config_addr = rtas_ld(args, 0);
> +    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +#define RTAS_QUERY_FN           0
> +#define RTAS_CHANGE_FN          1
> +#define RTAS_RESET_FN           2
> +#define RTAS_CHANGE_MSI_FN      3
> +#define RTAS_CHANGE_MSIX_FN     4

Generic RTAS defines? Doesn't belong in the middle of the code.

> +    unsigned int func = rtas_ld(args, 3);
> +    unsigned int req_num = rtas_ld(args, 4); /* 0 == remove all */
> +    unsigned int seq_num = rtas_ld(args, 5);
> +    int assigned_num = -1;
> +    enum { MSI = 1, MSIX = 2 } ret_intr_type = MSI;

O_o. No.

> +    int i;
> +    sPAPRPHBState *phb = NULL;
> +    PCIDevice *pdev = NULL;
> +
> +    switch (func) {
> +    case RTAS_CHANGE_MSI_FN:
> +    case RTAS_CHANGE_FN:
> +        ret_intr_type = MSI;
> +        break;
> +    case RTAS_CHANGE_MSIX_FN:
> +        ret_intr_type = MSIX;
> +        break;
> +    default:
> +        fprintf(stderr, "rtas_ibm_change_msi(%u) is not implemented\n", func);
> +        rtas_st(rets, 0, -3); /* Parameter error */
> +        return;
> +    }
> +
> +    /* Fins sPAPRPHBState */
> +    phb = find_phb(spapr, buid);
> +    if (phb) {
> +        pdev = find_dev(spapr, buid, config_addr);
> +    }
> +    if (!phb || !pdev) {
> +        rtas_st(rets, 0, -3); /* Parameter error */
> +        return;
> +    }
> +
> +    if (0 == req_num) {
> +        /* Calculate a device number in the map to remove */
> +        for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) {
> +            if (phb->msi_table[i].nvec &&
> +                    (phb->msi_table[i].config_addr == config_addr)) {
> +                trace_spapr_pci_msi("Releasing MSI config for device#", i);
> +                break;

I thought it's "release all"?

> +            }
> +        }
> +        if (i >= SPAPR_MSIX_MAX_DEVS) {
> +            fprintf(stderr, "MSI has not been enabled for this device!\n");

This looks more like debug output than anything.

> +            rtas_st(rets, 0, -1); /* Hardware error */
> +            return;
> +        }
> +    } else {
> +        /* Find a device number in the map to add */
> +        for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) {
> +            if (!phb->msi_table[i].nvec) {
> +                /* as we never free entries, it means no more left in cache,
> +                   so we use new empty entry */
> +                trace_spapr_pci_msi("Allocating new MSI for device#", i);
> +                break;
> +            }
> +            if (phb->msi_table[i].config_addr != config_addr) {
> +                continue;
> +            }
> +            /* Even if req_num is greater than nvec, it is expected
> +               that the host will allocate (nvec) vectors again */
> +            trace_spapr_pci_msi("Reuse cached MSI config for device#", i);
> +            break;
> +        }
> +        if (i >= SPAPR_MSIX_MAX_DEVS) {
> +            fprintf(stderr, "No free entry for a new MSI device\n");
> +            rtas_st(rets, 0, -1); /* Hardware error */
> +            return;
> +        }
> +    }
> +
> +    assigned_num = spapr_pci_config_msi(phb, i, pdev,
> +                                        ret_intr_type == MSIX, req_num);
> +
> +    if (assigned_num < 0) {
> +        fprintf(stderr, "rtas_ibm_change_msi(%u) failed\n", func);
> +        rtas_st(rets, 0, -3); /* Parameter error */
> +        return;
> +    }
> +    trace_spapr_pci_rtas_ibm_change_msi(func, req_num, assigned_num);
> +
> +    rtas_st(rets, 0, 0);
> +    rtas_st(rets, 1, assigned_num);
> +    rtas_st(rets, 2, ++seq_num);
> +    rtas_st(rets, 3, ret_intr_type);
> +}
> +
> +static void rtas_ibm_query_interrupt_source_number(sPAPREnvironment *spapr,
> +                                                   uint32_t token,
> +                                                   uint32_t nargs,
> +                                                   target_ulong args,
> +                                                   uint32_t nret,
> +                                                   target_ulong rets)
> +{
> +    uint32_t config_addr = rtas_ld(args, 0);
> +    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    unsigned int i, intr_src_num = -1, ioa_intr_num = rtas_ld(args, 3);
> +    sPAPRPHBState *phb = NULL;
> +
> +    /* Fins sPAPRPHBState */
> +    phb = find_phb(spapr, buid);
> +    if (!phb) {
> +        rtas_st(rets, 0, -3); /* Parameter error */
> +        return;
> +    }
> +
> +    /* Find device descriptor and start IRQ */
> +    for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) {
> +        if (phb->msi_table[i].config_addr == config_addr) {
> +            intr_src_num = phb->msi_table[i].dt_irq + ioa_intr_num;
> +            break;
> +        }
> +    }
> +    /* Device has not been configured, return error */
> +    if (-1 == intr_src_num) {
> +        rtas_st(rets, 0, -1); /* Hardware error */
> +        return;
> +    }
> +    trace_spapr_pci_rtas_ibm_query_interrupt_source_number(ioa_intr_num,
> +                                                           intr_src_num);
> +
> +    rtas_st(rets, 0, 0);
> +    rtas_st(rets, 1, intr_src_num);
> +    rtas_st(rets, 2, 1);/* 0 == level; 1 == edge */
> +}
> +
> static int pci_swizzle(int slot, int pin)
> {
>     return (slot + pin) % PCI_NUM_PINS;
> @@ -162,6 +391,8 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>      */
>     sPAPRPHBState *phb = opaque;
> 
> +    trace_spapr_pci_lsi_set(phb->busname, irq_num,
> +                            phb->lsi_table[irq_num].dt_irq);
>     qemu_set_irq(xics_assign_irq(spapr->icp,
>                                  phb->lsi_table[irq_num].dt_irq, XICS_LSI),
>                  level);
> @@ -255,6 +486,14 @@ static int spapr_phb_init(SysBusDevice *s)
>     memory_region_add_subregion(get_system_memory(), phb->io_win_addr,
>                                 &phb->iowindow);
> 
> +    if (msi_supported) {
> +        sprintf(namebuf, "%s.msi", phb->dtbusname);
> +        memory_region_init_io(&phb->msiwindow, &spapr_msi_ops, phb,
> +                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
> +        memory_region_add_subregion(get_system_memory(), phb->msi_win_addr,
> +                                    &phb->msiwindow);
> +    }
> +
>     bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                            phb->busname ? phb->busname : phb->dtbusname,
>                            pci_spapr_set_irq, pci_spapr_map_irq, phb,
> @@ -286,6 +525,7 @@ static Property spapr_phb_properties[] = {
>     DEFINE_PROP_HEX64("mem_win_size", sPAPRPHBState, mem_win_size, 0x20000000),
>     DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>     DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
> +    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>     DEFINE_PROP_END_OF_LIST(),
> };
> 
> @@ -301,6 +541,11 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
>     spapr_rtas_register("write-pci-config", rtas_write_pci_config);
>     spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config);
>     spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config);
> +    if (msi_supported) {
> +        spapr_rtas_register("ibm,query-interrupt-source-number",
> +                            rtas_ibm_query_interrupt_source_number);
> +        spapr_rtas_register("ibm,change-msi", rtas_ibm_change_msi);
> +    }
> }
> 
> static TypeInfo spapr_phb_info = {
> @@ -313,7 +558,7 @@ static TypeInfo spapr_phb_info = {
> void spapr_create_phb(sPAPREnvironment *spapr,
>                       const char *busname, uint64_t buid,
>                       uint64_t mem_win_addr, uint64_t mem_win_size,
> -                      uint64_t io_win_addr)
> +                      uint64_t io_win_addr, uint64_t msi_win_addr)
> {
>     DeviceState *dev;
> 
> @@ -326,6 +571,7 @@ void spapr_create_phb(sPAPREnvironment *spapr,
>     qdev_prop_set_uint64(dev, "mem_win_addr", mem_win_addr);
>     qdev_prop_set_uint64(dev, "mem_win_size", mem_win_size);
>     qdev_prop_set_uint64(dev, "io_win_addr", io_win_addr);
> +    qdev_prop_set_uint64(dev, "msi_win_addr", msi_win_addr);
> 
>     qdev_init_nofail(dev);
> }
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 11c3ee1..bb9a5ea 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -27,6 +27,8 @@
> #include "hw/pci_host.h"
> #include "hw/xics.h"
> 
> +#define SPAPR_MSIX_MAX_DEVS 32
> +
> typedef struct sPAPRPHBState {
>     PCIHostState host_state;
> 
> @@ -37,12 +39,21 @@ typedef struct sPAPRPHBState {
>     MemoryRegion memspace, iospace;
>     target_phys_addr_t mem_win_addr, mem_win_size, io_win_addr, io_win_size;
>     MemoryRegion memwindow, iowindow;
> +    target_phys_addr_t msi_win_addr;
> +    MemoryRegion msiwindow;
> +
>     DMAContext *dma;
> 
>     struct {
>         uint32_t dt_irq;
>     } lsi_table[PCI_NUM_PINS];
> 
> +    struct {
> +        uint32_t config_addr;
> +        uint32_t dt_irq; /* first irq, dt_irq!=0 means "used" */
> +        int nvec;
> +    } msi_table[SPAPR_MSIX_MAX_DEVS];
> +
>     QLIST_ENTRY(sPAPRPHBState) list;
> } sPAPRPHBState;
> 
> @@ -52,7 +63,7 @@ typedef struct sPAPRPHBState {
> void spapr_create_phb(sPAPREnvironment *spapr,
>                       const char *busname, uint64_t buid,
>                       uint64_t mem_win_addr, uint64_t mem_win_size,
> -                      uint64_t io_win_addr);
> +                      uint64_t io_win_addr, uint64_t msi_win_addr);
> 
> int spapr_populate_pci_dt(sPAPRPHBState *phb,
>                           uint32_t xics_phandle,
> diff --git a/trace-events b/trace-events
> index 70f059d..8d94053 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -788,3 +788,12 @@ qxl_render_blit_guest_primary_initialized(void) ""
> qxl_render_blit(int32_t stride, int32_t left, int32_t right, int32_t top, int32_t bottom)
> "stride=%d [%d, %d, %d, %d]"
> qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, int32_t bytes_pp,
> int32_t bits_pp) "%dx%d, stride %d, bpp %d, depth %d"
> qxl_render_update_area_done(void *cookie) "%p"
> +
> +# hw/spapr_pci.c
> +spapr_pci_msi(const char *msg, uint32_t n) "%s%u"
> +spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u,
> addr=%"PRIx64
> +spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req, int assigned) "func %u, requested %u,
> assigned %d"
> +spapr_pci_rtas_ibm_query_interrupt_source_number(unsigned ioa, unsigned intr) "queries for #%u, IRQ%u"
> +spapr_pci_msi_write(uint64_t addr, uint64_t data, uint32_t dt_irq) "@%"PRIx64"<=%"PRIx64" IRQ %u"
> +spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ %u"
> +
> -- 
> 1.7.7.3
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/hw/spapr.c b/hw/spapr.c
index ef6ffcb..35ad075 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -43,6 +43,7 @@ 
 #include "hw/spapr_vio.h"
 #include "hw/spapr_pci.h"
 #include "hw/xics.h"
+#include "hw/msi.h"

 #include "kvm.h"
 #include "kvm_ppc.h"
@@ -82,6 +83,7 @@ 
 #define SPAPR_PCI_MEM_WIN_ADDR  (0x10000000000ULL + 0xA0000000)
 #define SPAPR_PCI_MEM_WIN_SIZE  0x20000000
 #define SPAPR_PCI_IO_WIN_ADDR   (0x10000000000ULL + 0x80000000)
+#define SPAPR_PCI_MSI_WIN_ADDR  (0x10000000000ULL + 0x90000000)

 #define PHANDLE_XICP            0x00001111

@@ -116,7 +118,7 @@  qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num,
 /* Allocate block of consequtive IRQs, returns a number of the first */
 int spapr_allocate_irq_block(uint32_t num, enum xics_irq_type type)
 {
-    int i, ret;
+    int i, ret = 0;
     uint32_t irq = -1;

     for (i = 0; i < num; ++i) {
@@ -690,6 +692,8 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     long pteg_shift = 17;
     char *filename;

+    msi_supported = true;
+
     spapr = g_malloc0(sizeof(*spapr));
     QLIST_INIT(&spapr->phbs);

@@ -804,7 +808,8 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     spapr_create_phb(spapr, "pci", SPAPR_PCI_BUID,
                      SPAPR_PCI_MEM_WIN_ADDR,
                      SPAPR_PCI_MEM_WIN_SIZE,
-                     SPAPR_PCI_IO_WIN_ADDR);
+                     SPAPR_PCI_IO_WIN_ADDR,
+                     SPAPR_PCI_MSI_WIN_ADDR);

     for (i = 0; i < nb_nics; i++) {
         NICInfo *nd = &nd_table[i];
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 93017cd..21fbc50 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -24,31 +24,46 @@ 
  */
 #include "hw.h"
 #include "pci.h"
+#include "msix.h"
+#include "msi.h"
 #include "pci_host.h"
 #include "hw/spapr.h"
 #include "hw/spapr_pci.h"
 #include "exec-memory.h"
 #include <libfdt.h>
+#include "trace.h"

 #include "hw/pci_internals.h"

-static PCIDevice *find_dev(sPAPREnvironment *spapr,
-                           uint64_t buid, uint32_t config_addr)
+static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid)
 {
-    DeviceState *qdev;
-    int devfn = (config_addr >> 8) & 0xFF;
     sPAPRPHBState *phb;

     QLIST_FOREACH(phb, &spapr->phbs, list) {
         if (phb->buid != buid) {
             continue;
         }
+        return phb;
+    }

-        QTAILQ_FOREACH(qdev, &phb->host_state.bus->qbus.children, sibling) {
-            PCIDevice *dev = (PCIDevice *)qdev;
-            if (dev->devfn == devfn) {
-                return dev;
-            }
+    return NULL;
+}
+
+static PCIDevice *find_dev(sPAPREnvironment *spapr, uint64_t buid,
+                           uint32_t config_addr)
+{
+    sPAPRPHBState *phb = find_phb(spapr, buid);
+    DeviceState *qdev;
+    int devfn = (config_addr >> 8) & 0xFF;
+
+    if (!phb) {
+        return NULL;
+    }
+
+    QTAILQ_FOREACH(qdev, &phb->host_state.bus->qbus.children, sibling) {
+        PCIDevice *dev = (PCIDevice *)qdev;
+        if (dev->devfn == devfn) {
+            return dev;
         }
     }

@@ -138,6 +153,220 @@  static void rtas_write_pci_config(sPAPREnvironment *spapr,
     rtas_st(rets, 0, 0);
 }

+/*
+ * Initializes req_num vectors for a device.
+ * The code assumes that MSI/MSIX is enabled in the config space
+ * as a result of msix_init() or msi_init().
+ */
+static int spapr_pci_config_msi(sPAPRPHBState *ph, int ndev,
+                                PCIDevice *pdev, bool msix, unsigned req_num)
+{
+    unsigned i;
+    int irq;
+    uint64_t msi_address;
+    uint32_t config_addr = pdev->devfn << 8;
+
+    /* Disabling - nothing to do */
+    if (0 == req_num) {
+        return 0;
+    }
+
+    /* Enabling! */
+    if (ph->msi_table[ndev].nvec && (req_num != ph->msi_table[ndev].nvec)) {
+        /* Unexpected behaviour */
+        fprintf(stderr, "Cannot reuse cached MSI config for device#%u", ndev);
+        return -1;
+    }
+
+    if (0 == ph->msi_table[ndev].nvec) {
+        irq = spapr_allocate_irq_block(req_num, XICS_MSI);
+        if (irq < 0) {
+            return -1;
+        }
+        ph->msi_table[ndev].dt_irq = irq;
+        ph->msi_table[ndev].nvec = req_num;
+        ph->msi_table[ndev].config_addr = config_addr;
+    }
+
+    if (msix) {
+        for (i = 0; i < req_num; ++i) {
+            msi_address = ph->msi_win_addr + ((ndev << 16) | i << 2);
+            msix_set_address_data(pdev, i, msi_address, 0);
+            trace_spapr_pci_msi_setup(pdev->name, i, msi_address);
+        }
+    } else {
+        msi_address = ph->msi_win_addr + (ndev << 16);
+        msi_set_address_data(pdev, msi_address, 0);
+        trace_spapr_pci_msi_setup(pdev->name, 0, msi_address);
+    }
+
+    return req_num;
+}
+
+/*
+ * The handler handles both MSI and MSIX.
+ * For MSI-X, the vector number is encoded as a part of the address
+ * and data is set to 0.
+ * For MSI, the vector takes least bits in data and not encoded in
+ * address.
+ */
+static void spapr_msi_write(void *opaque, target_phys_addr_t addr,
+                            uint64_t data, unsigned size)
+{
+    sPAPRPHBState *ephb = opaque;
+    int n = addr >> 16;
+    int vec = ((addr & 0xFFFF) >> 2) | data;
+    uint32_t dt_irq = ephb->msi_table[n].dt_irq + vec;
+
+    trace_spapr_pci_msi_write(addr, data, dt_irq);
+
+    qemu_irq_pulse(xics_assign_irq(spapr->icp, dt_irq, XICS_MSI));
+}
+
+static const MemoryRegionOps spapr_msi_ops = {
+    .read = NULL,
+    .write = spapr_msi_write,
+    .endianness = DEVICE_LITTLE_ENDIAN
+};
+
+static void rtas_ibm_change_msi(sPAPREnvironment *spapr,
+                                uint32_t token, uint32_t nargs,
+                                target_ulong args, uint32_t nret,
+                                target_ulong rets)
+{
+    uint32_t config_addr = rtas_ld(args, 0);
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+#define RTAS_QUERY_FN           0
+#define RTAS_CHANGE_FN          1
+#define RTAS_RESET_FN           2
+#define RTAS_CHANGE_MSI_FN      3
+#define RTAS_CHANGE_MSIX_FN     4
+    unsigned int func = rtas_ld(args, 3);
+    unsigned int req_num = rtas_ld(args, 4); /* 0 == remove all */
+    unsigned int seq_num = rtas_ld(args, 5);
+    int assigned_num = -1;
+    enum { MSI = 1, MSIX = 2 } ret_intr_type = MSI;
+    int i;
+    sPAPRPHBState *phb = NULL;
+    PCIDevice *pdev = NULL;
+
+    switch (func) {
+    case RTAS_CHANGE_MSI_FN:
+    case RTAS_CHANGE_FN:
+        ret_intr_type = MSI;
+        break;
+    case RTAS_CHANGE_MSIX_FN:
+        ret_intr_type = MSIX;
+        break;
+    default:
+        fprintf(stderr, "rtas_ibm_change_msi(%u) is not implemented\n", func);
+        rtas_st(rets, 0, -3); /* Parameter error */
+        return;
+    }
+
+    /* Fins sPAPRPHBState */
+    phb = find_phb(spapr, buid);
+    if (phb) {
+        pdev = find_dev(spapr, buid, config_addr);
+    }
+    if (!phb || !pdev) {
+        rtas_st(rets, 0, -3); /* Parameter error */
+        return;
+    }
+
+    if (0 == req_num) {
+        /* Calculate a device number in the map to remove */
+        for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) {
+            if (phb->msi_table[i].nvec &&
+                    (phb->msi_table[i].config_addr == config_addr)) {
+                trace_spapr_pci_msi("Releasing MSI config for device#", i);
+                break;
+            }
+        }
+        if (i >= SPAPR_MSIX_MAX_DEVS) {
+            fprintf(stderr, "MSI has not been enabled for this device!\n");
+            rtas_st(rets, 0, -1); /* Hardware error */
+            return;
+        }
+    } else {
+        /* Find a device number in the map to add */
+        for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) {
+            if (!phb->msi_table[i].nvec) {
+                /* as we never free entries, it means no more left in cache,
+                   so we use new empty entry */
+                trace_spapr_pci_msi("Allocating new MSI for device#", i);
+                break;
+            }
+            if (phb->msi_table[i].config_addr != config_addr) {
+                continue;
+            }
+            /* Even if req_num is greater than nvec, it is expected
+               that the host will allocate (nvec) vectors again */
+            trace_spapr_pci_msi("Reuse cached MSI config for device#", i);
+            break;
+        }
+        if (i >= SPAPR_MSIX_MAX_DEVS) {
+            fprintf(stderr, "No free entry for a new MSI device\n");
+            rtas_st(rets, 0, -1); /* Hardware error */
+            return;
+        }
+    }
+
+    assigned_num = spapr_pci_config_msi(phb, i, pdev,
+                                        ret_intr_type == MSIX, req_num);
+
+    if (assigned_num < 0) {
+        fprintf(stderr, "rtas_ibm_change_msi(%u) failed\n", func);
+        rtas_st(rets, 0, -3); /* Parameter error */
+        return;
+    }
+    trace_spapr_pci_rtas_ibm_change_msi(func, req_num, assigned_num);
+
+    rtas_st(rets, 0, 0);
+    rtas_st(rets, 1, assigned_num);
+    rtas_st(rets, 2, ++seq_num);
+    rtas_st(rets, 3, ret_intr_type);
+}
+
+static void rtas_ibm_query_interrupt_source_number(sPAPREnvironment *spapr,
+                                                   uint32_t token,
+                                                   uint32_t nargs,
+                                                   target_ulong args,
+                                                   uint32_t nret,
+                                                   target_ulong rets)
+{
+    uint32_t config_addr = rtas_ld(args, 0);
+    uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    unsigned int i, intr_src_num = -1, ioa_intr_num = rtas_ld(args, 3);
+    sPAPRPHBState *phb = NULL;
+
+    /* Fins sPAPRPHBState */
+    phb = find_phb(spapr, buid);
+    if (!phb) {
+        rtas_st(rets, 0, -3); /* Parameter error */
+        return;
+    }
+
+    /* Find device descriptor and start IRQ */
+    for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) {
+        if (phb->msi_table[i].config_addr == config_addr) {
+            intr_src_num = phb->msi_table[i].dt_irq + ioa_intr_num;
+            break;
+        }
+    }
+    /* Device has not been configured, return error */
+    if (-1 == intr_src_num) {
+        rtas_st(rets, 0, -1); /* Hardware error */
+        return;
+    }
+    trace_spapr_pci_rtas_ibm_query_interrupt_source_number(ioa_intr_num,
+                                                           intr_src_num);
+
+    rtas_st(rets, 0, 0);
+    rtas_st(rets, 1, intr_src_num);
+    rtas_st(rets, 2, 1);/* 0 == level; 1 == edge */
+}
+
 static int pci_swizzle(int slot, int pin)
 {
     return (slot + pin) % PCI_NUM_PINS;
@@ -162,6 +391,8 @@  static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
      */
     sPAPRPHBState *phb = opaque;

+    trace_spapr_pci_lsi_set(phb->busname, irq_num,
+                            phb->lsi_table[irq_num].dt_irq);
     qemu_set_irq(xics_assign_irq(spapr->icp,
                                  phb->lsi_table[irq_num].dt_irq, XICS_LSI),
                  level);
@@ -255,6 +486,14 @@  static int spapr_phb_init(SysBusDevice *s)
     memory_region_add_subregion(get_system_memory(), phb->io_win_addr,
                                 &phb->iowindow);

+    if (msi_supported) {
+        sprintf(namebuf, "%s.msi", phb->dtbusname);
+        memory_region_init_io(&phb->msiwindow, &spapr_msi_ops, phb,
+                              namebuf, SPAPR_MSIX_MAX_DEVS * 0x10000);
+        memory_region_add_subregion(get_system_memory(), phb->msi_win_addr,
+                                    &phb->msiwindow);
+    }
+
     bus = pci_register_bus(&phb->host_state.busdev.qdev,
                            phb->busname ? phb->busname : phb->dtbusname,
                            pci_spapr_set_irq, pci_spapr_map_irq, phb,
@@ -286,6 +525,7 @@  static Property spapr_phb_properties[] = {
     DEFINE_PROP_HEX64("mem_win_size", sPAPRPHBState, mem_win_size, 0x20000000),
     DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
     DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
+    DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
     DEFINE_PROP_END_OF_LIST(),
 };

@@ -301,6 +541,11 @@  static void spapr_phb_class_init(ObjectClass *klass, void *data)
     spapr_rtas_register("write-pci-config", rtas_write_pci_config);
     spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config);
     spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config);
+    if (msi_supported) {
+        spapr_rtas_register("ibm,query-interrupt-source-number",
+                            rtas_ibm_query_interrupt_source_number);
+        spapr_rtas_register("ibm,change-msi", rtas_ibm_change_msi);
+    }
 }

 static TypeInfo spapr_phb_info = {
@@ -313,7 +558,7 @@  static TypeInfo spapr_phb_info = {
 void spapr_create_phb(sPAPREnvironment *spapr,
                       const char *busname, uint64_t buid,
                       uint64_t mem_win_addr, uint64_t mem_win_size,
-                      uint64_t io_win_addr)
+                      uint64_t io_win_addr, uint64_t msi_win_addr)
 {
     DeviceState *dev;

@@ -326,6 +571,7 @@  void spapr_create_phb(sPAPREnvironment *spapr,
     qdev_prop_set_uint64(dev, "mem_win_addr", mem_win_addr);
     qdev_prop_set_uint64(dev, "mem_win_size", mem_win_size);
     qdev_prop_set_uint64(dev, "io_win_addr", io_win_addr);
+    qdev_prop_set_uint64(dev, "msi_win_addr", msi_win_addr);

     qdev_init_nofail(dev);
 }
diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
index 11c3ee1..bb9a5ea 100644
--- a/hw/spapr_pci.h
+++ b/hw/spapr_pci.h
@@ -27,6 +27,8 @@ 
 #include "hw/pci_host.h"
 #include "hw/xics.h"

+#define SPAPR_MSIX_MAX_DEVS 32
+
 typedef struct sPAPRPHBState {
     PCIHostState host_state;

@@ -37,12 +39,21 @@  typedef struct sPAPRPHBState {
     MemoryRegion memspace, iospace;
     target_phys_addr_t mem_win_addr, mem_win_size, io_win_addr, io_win_size;
     MemoryRegion memwindow, iowindow;
+    target_phys_addr_t msi_win_addr;
+    MemoryRegion msiwindow;
+
     DMAContext *dma;

     struct {
         uint32_t dt_irq;
     } lsi_table[PCI_NUM_PINS];

+    struct {
+        uint32_t config_addr;
+        uint32_t dt_irq; /* first irq, dt_irq!=0 means "used" */
+        int nvec;
+    } msi_table[SPAPR_MSIX_MAX_DEVS];
+
     QLIST_ENTRY(sPAPRPHBState) list;
 } sPAPRPHBState;

@@ -52,7 +63,7 @@  typedef struct sPAPRPHBState {
 void spapr_create_phb(sPAPREnvironment *spapr,
                       const char *busname, uint64_t buid,
                       uint64_t mem_win_addr, uint64_t mem_win_size,
-                      uint64_t io_win_addr);
+                      uint64_t io_win_addr, uint64_t msi_win_addr);

 int spapr_populate_pci_dt(sPAPRPHBState *phb,
                           uint32_t xics_phandle,
diff --git a/trace-events b/trace-events
index 70f059d..8d94053 100644
--- a/trace-events
+++ b/trace-events
@@ -788,3 +788,12 @@  qxl_render_blit_guest_primary_initialized(void) ""
 qxl_render_blit(int32_t stride, int32_t left, int32_t right, int32_t top, int32_t bottom)
"stride=%d [%d, %d, %d, %d]"
 qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, int32_t bytes_pp,
int32_t bits_pp) "%dx%d, stride %d, bpp %d, depth %d"
 qxl_render_update_area_done(void *cookie) "%p"
+
+# hw/spapr_pci.c
+spapr_pci_msi(const char *msg, uint32_t n) "%s%u"
+spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u,
addr=%"PRIx64
+spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req, int assigned) "func %u, requested %u,
assigned %d"