
RFC kvm irqfd: add directly mapped MSI IRQ support

Message ID 1371737338-25148-1-git-send-email-aik@ozlabs.ru
State New

Commit Message

Alexey Kardashevskiy June 20, 2013, 2:08 p.m. UTC
At the moment QEMU creates a route for every MSI IRQ.

Now we are about to add IRQFD support on the PPC64 pSeries platform.
pSeries already has an in-kernel emulated interrupt controller with
8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
mapping as part of the PAPR requirements for MSI/MSI-X guests.
Specifically, the pSeries guest does not touch MSIMessages at
all; instead it uses the rtas_ibm_change_msi and
rtas_ibm_query_interrupt_source RTAS calls to do the mapping.

Therefore we do not really need more routing than we already have.
The patch introduces the infrastructure to enable direct IRQ mapping.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

---

The patch is raw and ugly indeed; I made it only to demonstrate
the idea and see whether it has a right to live or not.

For some reason which I do not really understand (limited GSI numbers?),
the existing code always adds routing, and I do not see why we would need it.

Thanks!
---
 hw/misc/vfio.c           |   11 +++++++++--
 hw/pci/pci.c             |   13 +++++++++++++
 hw/ppc/spapr_pci.c       |   13 +++++++++++++
 hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
 include/hw/pci/pci.h     |    4 ++++
 include/hw/pci/pci_bus.h |    1 +
 6 files changed, 60 insertions(+), 8 deletions(-)

Comments

Michael S. Tsirkin June 20, 2013, 3:38 p.m. UTC | #1
On Fri, Jun 21, 2013 at 12:08:58AM +1000, Alexey Kardashevskiy wrote:
> At the moment QEMU creates a route for every MSI IRQ.
> 
> Now we are about to add IRQFD support on the PPC64 pSeries platform.
> pSeries already has an in-kernel emulated interrupt controller with
> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
> mapping as part of the PAPR requirements for MSI/MSI-X guests.
> Specifically, the pSeries guest does not touch MSIMessages at
> all; instead it uses the rtas_ibm_change_msi and
> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
> 
> Therefore we do not really need more routing than we already have.
> The patch introduces the infrastructure to enable direct IRQ mapping.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> ---
> 
> The patch is raw and ugly indeed; I made it only to demonstrate
> the idea and see whether it has a right to live or not.
> 
> For some reason which I do not really understand (limited GSI numbers?),
> the existing code always adds routing, and I do not see why we would need it.
> 
> Thanks!
> ---
>  hw/misc/vfio.c           |   11 +++++++++--
>  hw/pci/pci.c             |   13 +++++++++++++
>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
>  include/hw/pci/pci.h     |    4 ++++
>  include/hw/pci/pci_bus.h |    1 +
>  6 files changed, 60 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index 14aac04..2d9eef7 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>       * Attempt to enable route through KVM irqchip,
>       * default to userspace handling if unavailable.
>       */
> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> +
> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
> +    if (vector->virq < 0) {
> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> +    }
>      if (vector->virq < 0 ||
>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>                                         vector->virq) < 0) {
> @@ -807,7 +811,10 @@ retry:
>           * Attempt to enable route through KVM irqchip,
>           * default to userspace handling if unavailable.
>           */
> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
> +        if (vector->virq < 0) {
> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        }
>          if (vector->virq < 0 ||
>              kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>                                             vector->virq) < 0) {
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index a976e46..a9875e9 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>      dev->intx_routing_notifier = notifier;
>  }
>  
> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
> +{
> +    bus->map_msi = map_msi_fn;
> +}
> +
> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
> +{
> +    if (bus->map_msi) {
> +        return bus->map_msi(bus, msg);
> +    }
> +    return -1;
> +}
> +
>  /*
>   * PCI-to-PCI bridge specification
>   * 9.1: Interrupt routing. Table 9-1
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 80408c9..9ef9a29 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
>  }
>  
> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
> +{
> +    DeviceState *par = bus->qbus.parent;
> +    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
> +    unsigned long addr = msg.address - sphb->msi_win_addr;
> +    int ndev = addr >> 16;
> +    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
> +    uint32_t irq = sphb->msi_table[ndev].irq + vec;

This array seems to be SPAPR_MSIX_MAX_DEVS in size, no?
Won't this overflow if ndev is large?
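
A minimal sketch of such a bounds check, assuming SPAPR_MSIX_MAX_DEVS is
indeed the size of msi_table and reusing the -1 "no mapping" convention
of pci_bus_map_msi():

static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
{
    sPAPRPHBState *sphb = (sPAPRPHBState *) bus->qbus.parent;
    unsigned long addr = msg.address - sphb->msi_win_addr;
    int ndev = addr >> 16;
    int vec = ((addr & 0xFFFF) >> 2) | msg.data;

    /* Refuse device indexes beyond the table instead of reading past it */
    if (ndev < 0 || ndev >= SPAPR_MSIX_MAX_DEVS) {
        return -1; /* callers then fall back to kvm_irqchip_add_msi_route() */
    }
    return sphb->msi_table[ndev].irq + vec;
}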

> +
> +    return (int)irq;
> +}
> +
>  static const MemoryRegionOps spapr_msi_ops = {
>      /* There is no .read as the read result is undefined by PCI spec */
>      .read = NULL,
> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s)
>  
>          sphb->lsi_table[i].irq = irq;
>      }
> +    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
>  
>      return 0;
>  }
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index d309416..587f53e 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d)
>      return proxy->host_features;
>  }
>  
> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
> +
>  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>                                          unsigned int queue_no,
>                                          unsigned int vector,
> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>      int ret;
>  
>      if (irqfd->users == 0) {
> -        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> +        if (ret < 0) {
> +            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        }
>          if (ret < 0) {
>              return ret;
>          }
> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
>      VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
>      EventNotifier *n = virtio_queue_get_guest_notifier(vq);
>      VirtIOIRQFD *irqfd;
> -    int ret = 0;
> +    int ret = 0, tmp;
>  
>      if (proxy->vector_irqfd) {
>          irqfd = &proxy->vector_irqfd[vector];
> -        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> -            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> -            if (ret < 0) {
> -                return ret;
> +
> +        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> +        if (tmp >= 0) {
> +            if (irqfd->virq != tmp) {
> +                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
> +                        irqfd->virq, tmp);
> +            }
> +        } else {
> +            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> +                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> +                if (ret < 0) {
> +                    return ret;
> +                }
>              }
>          }
>      }
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 8797802..632739a 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
>  typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
>  typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
>  typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
>  
>  typedef enum {
>      PCI_HOTPLUG_DISABLED,
> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
>  void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>                                            PCIINTxRoutingNotifier notifier);
> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
> +
>  void pci_device_reset(PCIDevice *dev);
>  void pci_bus_reset(PCIBus *bus);
>  
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 66762f6..81efd2b 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -16,6 +16,7 @@ struct PCIBus {
>      pci_set_irq_fn set_irq;
>      pci_map_irq_fn map_irq;
>      pci_route_irq_fn route_intx_to_irq;
> +    pci_map_msi_fn map_msi;
>      pci_hotplug_fn hotplug;
>      DeviceState *hotplug_qdev;
>      void *irq_opaque;
> -- 
> 1.7.10.4
Anthony Liguori June 20, 2013, 4:37 p.m. UTC | #2
Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> At the moment QEMU creates a route for every MSI IRQ.
>
> Now we are about to add IRQFD support on the PPC64 pSeries platform.
> pSeries already has an in-kernel emulated interrupt controller with
> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
> mapping as part of the PAPR requirements for MSI/MSI-X guests.
> Specifically, the pSeries guest does not touch MSIMessages at
> all; instead it uses the rtas_ibm_change_msi and
> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
>
> Therefore we do not really need more routing than we already have.
> The patch introduces the infrastructure to enable direct IRQ mapping.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> ---
>
> The patch is raw and ugly indeed, I made it only to demonstrate
> the idea and see if it has right to live or not.
>
> For some reason which I do not really understand (limited GSI numbers?)
> the existing code always adds routing and I do not see why we would need it.
>
> Thanks!
> ---
>  hw/misc/vfio.c           |   11 +++++++++--
>  hw/pci/pci.c             |   13 +++++++++++++
>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
>  include/hw/pci/pci.h     |    4 ++++
>  include/hw/pci/pci_bus.h |    1 +
>  6 files changed, 60 insertions(+), 8 deletions(-)
>
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index 14aac04..2d9eef7 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>       * Attempt to enable route through KVM irqchip,
>       * default to userspace handling if unavailable.
>       */
> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> +
> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
> +    if (vector->virq < 0) {
> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> +    }
>      if (vector->virq < 0 ||
>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>                                         vector->virq) < 0) {
> @@ -807,7 +811,10 @@ retry:
>           * Attempt to enable route through KVM irqchip,
>           * default to userspace handling if unavailable.
>           */
> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
> +        if (vector->virq < 0) {
> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        }

I don't understand why you're adding a PCI-level hook versus just having
a kvmppc-specific hook in the kvm_irqchip_add_msi_route function.

Regards,

Anthony Liguori

>          if (vector->virq < 0 ||
>              kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>                                             vector->virq) < 0) {
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index a976e46..a9875e9 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>      dev->intx_routing_notifier = notifier;
>  }
>  
> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
> +{
> +    bus->map_msi = map_msi_fn;
> +}
> +
> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
> +{
> +    if (bus->map_msi) {
> +        return bus->map_msi(bus, msg);
> +    }
> +    return -1;
> +}
> +
>  /*
>   * PCI-to-PCI bridge specification
>   * 9.1: Interrupt routing. Table 9-1
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 80408c9..9ef9a29 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
>  }
>  
> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
> +{
> +    DeviceState *par = bus->qbus.parent;
> +    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
> +    unsigned long addr = msg.address - sphb->msi_win_addr;
> +    int ndev = addr >> 16;
> +    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
> +    uint32_t irq = sphb->msi_table[ndev].irq + vec;
> +
> +    return (int)irq;
> +}
> +
>  static const MemoryRegionOps spapr_msi_ops = {
>      /* There is no .read as the read result is undefined by PCI spec */
>      .read = NULL,
> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s)
>  
>          sphb->lsi_table[i].irq = irq;
>      }
> +    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
>  
>      return 0;
>  }
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index d309416..587f53e 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d)
>      return proxy->host_features;
>  }
>  
> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
> +
>  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>                                          unsigned int queue_no,
>                                          unsigned int vector,
> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>      int ret;
>  
>      if (irqfd->users == 0) {
> -        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> +        if (ret < 0) {
> +            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        }
>          if (ret < 0) {
>              return ret;
>          }
> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
>      VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
>      EventNotifier *n = virtio_queue_get_guest_notifier(vq);
>      VirtIOIRQFD *irqfd;
> -    int ret = 0;
> +    int ret = 0, tmp;
>  
>      if (proxy->vector_irqfd) {
>          irqfd = &proxy->vector_irqfd[vector];
> -        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> -            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> -            if (ret < 0) {
> -                return ret;
> +
> +        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> +        if (tmp >= 0) {
> +            if (irqfd->virq != tmp) {
> +                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
> +                        irqfd->virq, tmp);
> +            }
> +        } else {
> +            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> +                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> +                if (ret < 0) {
> +                    return ret;
> +                }
>              }
>          }
>      }
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 8797802..632739a 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
>  typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
>  typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
>  typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
>  
>  typedef enum {
>      PCI_HOTPLUG_DISABLED,
> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
>  void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>                                            PCIINTxRoutingNotifier notifier);
> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
> +
>  void pci_device_reset(PCIDevice *dev);
>  void pci_bus_reset(PCIBus *bus);
>  
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 66762f6..81efd2b 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -16,6 +16,7 @@ struct PCIBus {
>      pci_set_irq_fn set_irq;
>      pci_map_irq_fn map_irq;
>      pci_route_irq_fn route_intx_to_irq;
> +    pci_map_msi_fn map_msi;
>      pci_hotplug_fn hotplug;
>      DeviceState *hotplug_qdev;
>      void *irq_opaque;
> -- 
> 1.7.10.4
Alex Williamson June 20, 2013, 4:51 p.m. UTC | #3
On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote:
> At the moment QEMU creates a route for every MSI IRQ.
> 
> Now we are about to add IRQFD support on the PPC64 pSeries platform.
> pSeries already has an in-kernel emulated interrupt controller with
> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
> mapping as part of the PAPR requirements for MSI/MSI-X guests.
> Specifically, the pSeries guest does not touch MSIMessages at
> all; instead it uses the rtas_ibm_change_msi and
> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
> 
> Therefore we do not really need more routing than we already have.
> The patch introduces the infrastructure to enable direct IRQ mapping.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> ---
> 
> The patch is raw and ugly indeed; I made it only to demonstrate
> the idea and see whether it has a right to live or not.
> 
> For some reason which I do not really understand (limited GSI numbers?),
> the existing code always adds routing, and I do not see why we would need it.

It's an IOAPIC: a pin gets toggled from the device and an MSI message
gets written to the CPU.  So the route allocates and programs the
pin->MSI mapping, then we tell it what notifier triggers that pin.

On x86 the MSI vector doesn't encode any information about the device
sending the MSI; here you seem to be able to figure out the device and
vector space number from the address.  Then your pin-to-MSI mapping is
effectively fixed.  So why isn't this just your
kvm_irqchip_add_msi_route function?  On pSeries it's a lookup, on x86
it's an allocate-and-program.  What does kvm_irqchip_add_msi_route do on
pSeries today?  Thanks,

Alex
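
For comparison, a sketch of what folding the pSeries lookup into that
entry point could look like, reusing the decoding from spapr_msi_get_irq()
above; the spapr_phb_find_by_msi_window() helper is hypothetical:

/* Hypothetical pSeries flavour: the message already identifies the
 * device and the vector, so the "route" is a lookup, not an allocation. */
int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
{
    sPAPRPHBState *sphb = spapr_phb_find_by_msi_window(msg.address);
    unsigned long addr;

    if (!sphb) {
        return -1;
    }
    addr = msg.address - sphb->msi_win_addr;
    return sphb->msi_table[addr >> 16].irq
           + (((addr & 0xFFFF) >> 2) | msg.data);
}

With that, the vfio and virtio irqfd paths would keep calling
kvm_irqchip_add_msi_route() unmodified and no PCI-level hook would be
needed.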

> ---
>  hw/misc/vfio.c           |   11 +++++++++--
>  hw/pci/pci.c             |   13 +++++++++++++
>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
>  include/hw/pci/pci.h     |    4 ++++
>  include/hw/pci/pci_bus.h |    1 +
>  6 files changed, 60 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> index 14aac04..2d9eef7 100644
> --- a/hw/misc/vfio.c
> +++ b/hw/misc/vfio.c
> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>       * Attempt to enable route through KVM irqchip,
>       * default to userspace handling if unavailable.
>       */
> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> +
> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
> +    if (vector->virq < 0) {
> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> +    }
>      if (vector->virq < 0 ||
>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>                                         vector->virq) < 0) {
> @@ -807,7 +811,10 @@ retry:
>           * Attempt to enable route through KVM irqchip,
>           * default to userspace handling if unavailable.
>           */
> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
> +        if (vector->virq < 0) {
> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        }
>          if (vector->virq < 0 ||
>              kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>                                             vector->virq) < 0) {
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index a976e46..a9875e9 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>      dev->intx_routing_notifier = notifier;
>  }
>  
> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
> +{
> +    bus->map_msi = map_msi_fn;
> +}
> +
> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
> +{
> +    if (bus->map_msi) {
> +        return bus->map_msi(bus, msg);
> +    }
> +    return -1;
> +}
> +
>  /*
>   * PCI-to-PCI bridge specification
>   * 9.1: Interrupt routing. Table 9-1
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 80408c9..9ef9a29 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
>  }
>  
> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
> +{
> +    DeviceState *par = bus->qbus.parent;
> +    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
> +    unsigned long addr = msg.address - sphb->msi_win_addr;
> +    int ndev = addr >> 16;
> +    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
> +    uint32_t irq = sphb->msi_table[ndev].irq + vec;
> +
> +    return (int)irq;
> +}
> +
>  static const MemoryRegionOps spapr_msi_ops = {
>      /* There is no .read as the read result is undefined by PCI spec */
>      .read = NULL,
> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s)
>  
>          sphb->lsi_table[i].irq = irq;
>      }
> +    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
>  
>      return 0;
>  }
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index d309416..587f53e 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d)
>      return proxy->host_features;
>  }
>  
> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
> +
>  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>                                          unsigned int queue_no,
>                                          unsigned int vector,
> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>      int ret;
>  
>      if (irqfd->users == 0) {
> -        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> +        if (ret < 0) {
> +            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> +        }
>          if (ret < 0) {
>              return ret;
>          }
> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
>      VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
>      EventNotifier *n = virtio_queue_get_guest_notifier(vq);
>      VirtIOIRQFD *irqfd;
> -    int ret = 0;
> +    int ret = 0, tmp;
>  
>      if (proxy->vector_irqfd) {
>          irqfd = &proxy->vector_irqfd[vector];
> -        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> -            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> -            if (ret < 0) {
> -                return ret;
> +
> +        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> +        if (tmp >= 0) {
> +            if (irqfd->virq != tmp) {
> +                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
> +                        irqfd->virq, tmp);
> +            }
> +        } else {
> +            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> +                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> +                if (ret < 0) {
> +                    return ret;
> +                }
>              }
>          }
>      }
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 8797802..632739a 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
>  typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
>  typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
>  typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
>  
>  typedef enum {
>      PCI_HOTPLUG_DISABLED,
> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
>  void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>                                            PCIINTxRoutingNotifier notifier);
> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
> +
>  void pci_device_reset(PCIDevice *dev);
>  void pci_bus_reset(PCIBus *bus);
>  
> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> index 66762f6..81efd2b 100644
> --- a/include/hw/pci/pci_bus.h
> +++ b/include/hw/pci/pci_bus.h
> @@ -16,6 +16,7 @@ struct PCIBus {
>      pci_set_irq_fn set_irq;
>      pci_map_irq_fn map_irq;
>      pci_route_irq_fn route_intx_to_irq;
> +    pci_map_msi_fn map_msi;
>      pci_hotplug_fn hotplug;
>      DeviceState *hotplug_qdev;
>      void *irq_opaque;
Alexey Kardashevskiy June 20, 2013, 11:51 p.m. UTC | #4
On 06/21/2013 02:37 AM, Anthony Liguori wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> 
>> At the moment QEMU creates a route for every MSI IRQ.
>>
>> Now we are about to add IRQFD support on the PPC64 pSeries platform.
>> pSeries already has an in-kernel emulated interrupt controller with
>> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
>> mapping as part of the PAPR requirements for MSI/MSI-X guests.
>> Specifically, the pSeries guest does not touch MSIMessages at
>> all; instead it uses the rtas_ibm_change_msi and
>> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
>>
>> Therefore we do not really need more routing than we already have.
>> The patch introduces the infrastructure to enable direct IRQ mapping.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>
>> ---
>>
>> The patch is raw and ugly indeed; I made it only to demonstrate
>> the idea and see whether it has a right to live or not.
>>
>> For some reason which I do not really understand (limited GSI numbers?),
>> the existing code always adds routing, and I do not see why we would need it.
>>
>> Thanks!
>> ---
>>  hw/misc/vfio.c           |   11 +++++++++--
>>  hw/pci/pci.c             |   13 +++++++++++++
>>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
>>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
>>  include/hw/pci/pci.h     |    4 ++++
>>  include/hw/pci/pci_bus.h |    1 +
>>  6 files changed, 60 insertions(+), 8 deletions(-)
>>
>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
>> index 14aac04..2d9eef7 100644
>> --- a/hw/misc/vfio.c
>> +++ b/hw/misc/vfio.c
>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>       * Attempt to enable route through KVM irqchip,
>>       * default to userspace handling if unavailable.
>>       */
>> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
>> +
>> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
>> +    if (vector->virq < 0) {
>> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
>> +    }
>>      if (vector->virq < 0 ||
>>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>>                                         vector->virq) < 0) {
>> @@ -807,7 +811,10 @@ retry:
>>           * Attempt to enable route through KVM irqchip,
>>           * default to userspace handling if unavailable.
>>           */
>> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
>> +        if (vector->virq < 0) {
>> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        }
> 
> I don't understand why you're adding a PCI-level hook versus just having
> a kvmppc-specific hook in the kvm_irqchip_add_msi_route function.

Me neither :) I am just asking. The mapping code already exists in the
sPAPR PCI host bridge and it is not going anywhere else.

And kvm_irqchip_add_msi_route does not have any link to a device or a bus,
so I'd have to walk through all the PHBs in the system, see whether a PHB's
MSI window is the one from the MSIMessage, and convert the MSIMessage to a
virq. Pretty easy and quick, but still a dirty hack; would that be better?
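
A sketch of that walk, reusing the phbs list on sPAPREnvironment; the
window-size bound (SPAPR_MSI_WINDOW_SIZE here) is made up for the example:

static sPAPRPHBState *spapr_phb_find_by_msi_window(hwaddr address)
{
    sPAPRPHBState *sphb;

    QLIST_FOREACH(sphb, &spapr->phbs, list) {
        if (address >= sphb->msi_win_addr &&
            address < sphb->msi_win_addr + SPAPR_MSI_WINDOW_SIZE) {
            return sphb;
        }
    }
    return NULL; /* not a write into any PHB's MSI window */
}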
Alexey Kardashevskiy June 21, 2013, 1:56 a.m. UTC | #5
On 06/21/2013 02:51 AM, Alex Williamson wrote:
> On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote:
>> At the moment QEMU creates a route for every MSI IRQ.
>>
>> Now we are about to add IRQFD support on the PPC64 pSeries platform.
>> pSeries already has an in-kernel emulated interrupt controller with
>> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
>> mapping as part of the PAPR requirements for MSI/MSI-X guests.
>> Specifically, the pSeries guest does not touch MSIMessages at
>> all; instead it uses the rtas_ibm_change_msi and
>> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
>>
>> Therefore we do not really need more routing than we already have.
>> The patch introduces the infrastructure to enable direct IRQ mapping.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>
>> ---
>>
>> The patch is raw and ugly indeed; I made it only to demonstrate
>> the idea and see whether it has a right to live or not.
>>
>> For some reason which I do not really understand (limited GSI numbers?),
>> the existing code always adds routing, and I do not see why we would need it.
> 
> It's an IOAPIC: a pin gets toggled from the device and an MSI message
> gets written to the CPU.  So the route allocates and programs the
> pin->MSI mapping, then we tell it what notifier triggers that pin.
> 
> On x86 the MSI vector doesn't encode any information about the device
> sending the MSI; here you seem to be able to figure out the device and
> vector space number from the address.  Then your pin-to-MSI mapping is
> effectively fixed.  So why isn't this just your
> kvm_irqchip_add_msi_route function?  On pSeries it's a lookup, on x86
> it's an allocate-and-program.  What does kvm_irqchip_add_msi_route do on
> pSeries today?  Thanks,


As we have just started implementing this thing, I commented it out for a
start. Once called, it destroys the direct mapping in the host kernel and
everything stops working, as routing is not implemented (yet? ever?).

My point here is that the MSIMessage-to-IRQ translation is made on a PCI
domain, as the PAPR (ppc64 server) spec says. The guest never uses
MSIMessage; it is all in QEMU. The guest dynamically allocates MSI IRQs
and it is up to the hypervisor (QEMU) to take care of the actual
MSIMessage for the device.

And the only reason for us to use MSIMessage in QEMU is to support
msi_notify()/msix_notify() in places like vfio_msi_interrupt(). I added
an MSI window for that a long time ago, and we do not need it that much
as we already have an irq number in vfio_msi_interrupt(), etc.
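
For reference, the userspace fallback being described boils down to this
chain (simplified; in particular msi_notify() ends in a plain memory
write which traps into the PHB's MSI window):

/*
 *  vfio_msi_interrupt()                           eventfd fires in QEMU
 *    -> msi_notify(pdev, nr)                      composes the MSIMessage
 *      -> stl_le_phys(msg.address, msg.data)      writes into the MSI window
 *        -> spapr_msi_write()                     decodes ndev/vec into an irq
 *          -> qemu_irq_pulse(xics_get_qirq(...))  raises it on XICS
 */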



> 
> Alex
> 
>> ---
>>  hw/misc/vfio.c           |   11 +++++++++--
>>  hw/pci/pci.c             |   13 +++++++++++++
>>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
>>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
>>  include/hw/pci/pci.h     |    4 ++++
>>  include/hw/pci/pci_bus.h |    1 +
>>  6 files changed, 60 insertions(+), 8 deletions(-)
>>
>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
>> index 14aac04..2d9eef7 100644
>> --- a/hw/misc/vfio.c
>> +++ b/hw/misc/vfio.c
>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>       * Attempt to enable route through KVM irqchip,
>>       * default to userspace handling if unavailable.
>>       */
>> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
>> +
>> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
>> +    if (vector->virq < 0) {
>> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
>> +    }
>>      if (vector->virq < 0 ||
>>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>>                                         vector->virq) < 0) {
>> @@ -807,7 +811,10 @@ retry:
>>           * Attempt to enable route through KVM irqchip,
>>           * default to userspace handling if unavailable.
>>           */
>> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
>> +        if (vector->virq < 0) {
>> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        }
>>          if (vector->virq < 0 ||
>>              kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>>                                             vector->virq) < 0) {
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index a976e46..a9875e9 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>>      dev->intx_routing_notifier = notifier;
>>  }
>>  
>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
>> +{
>> +    bus->map_msi = map_msi_fn;
>> +}
>> +
>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
>> +{
>> +    if (bus->map_msi) {
>> +        return bus->map_msi(bus, msg);
>> +    }
>> +    return -1;
>> +}
>> +
>>  /*
>>   * PCI-to-PCI bridge specification
>>   * 9.1: Interrupt routing. Table 9-1
>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>> index 80408c9..9ef9a29 100644
>> --- a/hw/ppc/spapr_pci.c
>> +++ b/hw/ppc/spapr_pci.c
>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
>>      qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
>>  }
>>  
>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
>> +{
>> +    DeviceState *par = bus->qbus.parent;
>> +    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
>> +    unsigned long addr = msg.address - sphb->msi_win_addr;
>> +    int ndev = addr >> 16;
>> +    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
>> +    uint32_t irq = sphb->msi_table[ndev].irq + vec;
>> +
>> +    return (int)irq;
>> +}
>> +
>>  static const MemoryRegionOps spapr_msi_ops = {
>>      /* There is no .read as the read result is undefined by PCI spec */
>>      .read = NULL,
>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s)
>>  
>>          sphb->lsi_table[i].irq = irq;
>>      }
>> +    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
>>  
>>      return 0;
>>  }
>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
>> index d309416..587f53e 100644
>> --- a/hw/virtio/virtio-pci.c
>> +++ b/hw/virtio/virtio-pci.c
>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d)
>>      return proxy->host_features;
>>  }
>>  
>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
>> +
>>  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>>                                          unsigned int queue_no,
>>                                          unsigned int vector,
>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>>      int ret;
>>  
>>      if (irqfd->users == 0) {
>> -        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
>> +        if (ret < 0) {
>> +            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
>> +        }
>>          if (ret < 0) {
>>              return ret;
>>          }
>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
>>      VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
>>      EventNotifier *n = virtio_queue_get_guest_notifier(vq);
>>      VirtIOIRQFD *irqfd;
>> -    int ret = 0;
>> +    int ret = 0, tmp;
>>  
>>      if (proxy->vector_irqfd) {
>>          irqfd = &proxy->vector_irqfd[vector];
>> -        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
>> -            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
>> -            if (ret < 0) {
>> -                return ret;
>> +
>> +        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
>> +        if (tmp >= 0) {
>> +            if (irqfd->virq != tmp) {
>> +                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
>> +                        irqfd->virq, tmp);
>> +            }
>> +        } else {
>> +            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
>> +                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
>> +                if (ret < 0) {
>> +                    return ret;
>> +                }
>>              }
>>          }
>>      }
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 8797802..632739a 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
>>  typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
>>  typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
>>  typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
>>  
>>  typedef enum {
>>      PCI_HOTPLUG_DISABLED,
>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
>>  void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
>>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>>                                            PCIINTxRoutingNotifier notifier);
>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
>> +
>>  void pci_device_reset(PCIDevice *dev);
>>  void pci_bus_reset(PCIBus *bus);
>>  
>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>> index 66762f6..81efd2b 100644
>> --- a/include/hw/pci/pci_bus.h
>> +++ b/include/hw/pci/pci_bus.h
>> @@ -16,6 +16,7 @@ struct PCIBus {
>>      pci_set_irq_fn set_irq;
>>      pci_map_irq_fn map_irq;
>>      pci_route_irq_fn route_intx_to_irq;
>> +    pci_map_msi_fn map_msi;
>>      pci_hotplug_fn hotplug;
>>      DeviceState *hotplug_qdev;
>>      void *irq_opaque;
> 
> 
>
Alex Williamson June 21, 2013, 2:34 a.m. UTC | #6
On Fri, 2013-06-21 at 11:56 +1000, Alexey Kardashevskiy wrote:
> On 06/21/2013 02:51 AM, Alex Williamson wrote:
> > On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote:
> >> At the moment QEMU creates a route for every MSI IRQ.
> >>
> >> Now we are about to add IRQFD support on the PPC64 pSeries platform.
> >> pSeries already has an in-kernel emulated interrupt controller with
> >> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
> >> mapping as part of the PAPR requirements for MSI/MSI-X guests.
> >> Specifically, the pSeries guest does not touch MSIMessages at
> >> all; instead it uses the rtas_ibm_change_msi and
> >> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
> >>
> >> Therefore we do not really need more routing than we already have.
> >> The patch introduces the infrastructure to enable direct IRQ mapping.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>
> >> ---
> >>
> >> The patch is raw and ugly indeed; I made it only to demonstrate
> >> the idea and see whether it has a right to live or not.
> >>
> >> For some reason which I do not really understand (limited GSI numbers?),
> >> the existing code always adds routing, and I do not see why we would need it.
> > 
> > It's an IOAPIC: a pin gets toggled from the device and an MSI message
> > gets written to the CPU.  So the route allocates and programs the
> > pin->MSI mapping, then we tell it what notifier triggers that pin.
> 
> > On x86 the MSI vector doesn't encode any information about the device
> > sending the MSI; here you seem to be able to figure out the device and
> > vector space number from the address.  Then your pin-to-MSI mapping is
> > effectively fixed.  So why isn't this just your
> > kvm_irqchip_add_msi_route function?  On pSeries it's a lookup, on x86
> > it's an allocate-and-program.  What does kvm_irqchip_add_msi_route do on
> > pSeries today?  Thanks,
> 
> 
> As we have just started implementing this thing, I commented it out for a
> start. Once called, it destroys the direct mapping in the host kernel and
> everything stops working, as routing is not implemented (yet? ever?).

Yay, it's broken, you can rewrite it ;)

> My point here is that the MSIMessage-to-IRQ translation is made on a PCI
> domain, as the PAPR (ppc64 server) spec says. The guest never uses
> MSIMessage; it is all in QEMU. The guest dynamically allocates MSI IRQs
> and it is up to the hypervisor (QEMU) to take care of the actual
> MSIMessage for the device.

MSIMessage is what the guest has programmed for the address/data fields;
it's not just a QEMU invention.  From the guest perspective, the device
writes msg.data to msg.address to signal the CPU for the interrupt.

> And the only reason for us to use MSIMessage in QEMU is to support
> msi_notify()/msix_notify() in places like vfio_msi_interrupt(). I added
> an MSI window for that a long time ago, and we do not need it that much
> as we already have an irq number in vfio_msi_interrupt(), etc.

It seems like you just have another layer of indirection via your
msi_table.  For x86 there's a layer of indirection via the virq virtual
IOAPIC pin.  Seems similar.  Thanks,

Alex
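
Schematically, the two indirections being compared (a rough summary, not
code from either tree):

/*
 * x86:     MSIMessage --kvm_irqchip_add_msi_route()--> virq
 *              (allocate a GSI, program it via KVM_SET_GSI_ROUTING)
 * pSeries: MSIMessage --spapr_msi_get_irq()--> XICS irq
 *              (fixed lookup through the PHB's msi_table)
 */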

> >> ---
> >>  hw/misc/vfio.c           |   11 +++++++++--
> >>  hw/pci/pci.c             |   13 +++++++++++++
> >>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
> >>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
> >>  include/hw/pci/pci.h     |    4 ++++
> >>  include/hw/pci/pci_bus.h |    1 +
> >>  6 files changed, 60 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
> >> index 14aac04..2d9eef7 100644
> >> --- a/hw/misc/vfio.c
> >> +++ b/hw/misc/vfio.c
> >> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
> >>       * Attempt to enable route through KVM irqchip,
> >>       * default to userspace handling if unavailable.
> >>       */
> >> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> >> +
> >> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
> >> +    if (vector->virq < 0) {
> >> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
> >> +    }
> >>      if (vector->virq < 0 ||
> >>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
> >>                                         vector->virq) < 0) {
> >> @@ -807,7 +811,10 @@ retry:
> >>           * Attempt to enable route through KVM irqchip,
> >>           * default to userspace handling if unavailable.
> >>           */
> >> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> >> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
> >> +        if (vector->virq < 0) {
> >> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> >> +        }
> >>          if (vector->virq < 0 ||
> >>              kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
> >>                                             vector->virq) < 0) {
> >> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >> index a976e46..a9875e9 100644
> >> --- a/hw/pci/pci.c
> >> +++ b/hw/pci/pci.c
> >> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
> >>      dev->intx_routing_notifier = notifier;
> >>  }
> >>  
> >> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
> >> +{
> >> +    bus->map_msi = map_msi_fn;
> >> +}
> >> +
> >> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
> >> +{
> >> +    if (bus->map_msi) {
> >> +        return bus->map_msi(bus, msg);
> >> +    }
> >> +    return -1;
> >> +}
> >> +
> >>  /*
> >>   * PCI-to-PCI bridge specification
> >>   * 9.1: Interrupt routing. Table 9-1
> >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> >> index 80408c9..9ef9a29 100644
> >> --- a/hw/ppc/spapr_pci.c
> >> +++ b/hw/ppc/spapr_pci.c
> >> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
> >>      qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
> >>  }
> >>  
> >> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
> >> +{
> >> +    DeviceState *par = bus->qbus.parent;
> >> +    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
> >> +    unsigned long addr = msg.address - sphb->msi_win_addr;
> >> +    int ndev = addr >> 16;
> >> +    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
> >> +    uint32_t irq = sphb->msi_table[ndev].irq + vec;
> >> +
> >> +    return (int)irq;
> >> +}
> >> +
> >>  static const MemoryRegionOps spapr_msi_ops = {
> >>      /* There is no .read as the read result is undefined by PCI spec */
> >>      .read = NULL,
> >> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s)
> >>  
> >>          sphb->lsi_table[i].irq = irq;
> >>      }
> >> +    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
> >>  
> >>      return 0;
> >>  }
> >> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> >> index d309416..587f53e 100644
> >> --- a/hw/virtio/virtio-pci.c
> >> +++ b/hw/virtio/virtio-pci.c
> >> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d)
> >>      return proxy->host_features;
> >>  }
> >>  
> >> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
> >> +
> >>  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
> >>                                          unsigned int queue_no,
> >>                                          unsigned int vector,
> >> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
> >>      int ret;
> >>  
> >>      if (irqfd->users == 0) {
> >> -        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> >> +        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> >> +        if (ret < 0) {
> >> +            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
> >> +        }
> >>          if (ret < 0) {
> >>              return ret;
> >>          }
> >> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
> >>      VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
> >>      EventNotifier *n = virtio_queue_get_guest_notifier(vq);
> >>      VirtIOIRQFD *irqfd;
> >> -    int ret = 0;
> >> +    int ret = 0, tmp;
> >>  
> >>      if (proxy->vector_irqfd) {
> >>          irqfd = &proxy->vector_irqfd[vector];
> >> -        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> >> -            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> >> -            if (ret < 0) {
> >> -                return ret;
> >> +
> >> +        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
> >> +        if (tmp >= 0) {
> >> +            if (irqfd->virq != tmp) {
> >> +                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
> >> +                        irqfd->virq, tmp);
> >> +            }
> >> +        } else {
> >> +            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
> >> +                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
> >> +                if (ret < 0) {
> >> +                    return ret;
> >> +                }
> >>              }
> >>          }
> >>      }
> >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >> index 8797802..632739a 100644
> >> --- a/include/hw/pci/pci.h
> >> +++ b/include/hw/pci/pci.h
> >> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
> >>  typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
> >>  typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
> >>  typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
> >> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
> >>  
> >>  typedef enum {
> >>      PCI_HOTPLUG_DISABLED,
> >> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
> >>  void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
> >>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
> >>                                            PCIINTxRoutingNotifier notifier);
> >> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
> >> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
> >> +
> >>  void pci_device_reset(PCIDevice *dev);
> >>  void pci_bus_reset(PCIBus *bus);
> >>  
> >> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
> >> index 66762f6..81efd2b 100644
> >> --- a/include/hw/pci/pci_bus.h
> >> +++ b/include/hw/pci/pci_bus.h
> >> @@ -16,6 +16,7 @@ struct PCIBus {
> >>      pci_set_irq_fn set_irq;
> >>      pci_map_irq_fn map_irq;
> >>      pci_route_irq_fn route_intx_to_irq;
> >> +    pci_map_msi_fn map_msi;
> >>      pci_hotplug_fn hotplug;
> >>      DeviceState *hotplug_qdev;
> >>      void *irq_opaque;
> > 
> > 
> > 
> 
>
Alexey Kardashevskiy June 21, 2013, 2:49 a.m. UTC | #7
On 06/21/2013 12:34 PM, Alex Williamson wrote:
> On Fri, 2013-06-21 at 11:56 +1000, Alexey Kardashevskiy wrote:
>> On 06/21/2013 02:51 AM, Alex Williamson wrote:
>>> On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote:
>>>> At the moment QEMU creates a route for every MSI IRQ.
>>>>
>>>> Now we are about to add IRQFD support on the PPC64 pSeries platform.
>>>> pSeries already has an in-kernel emulated interrupt controller with
>>>> 8192 IRQs. Also, the pSeries PHB already supports MSIMessage-to-IRQ
>>>> mapping as part of the PAPR requirements for MSI/MSI-X guests.
>>>> Specifically, the pSeries guest does not touch MSIMessages at
>>>> all; instead it uses the rtas_ibm_change_msi and
>>>> rtas_ibm_query_interrupt_source RTAS calls to do the mapping.
>>>>
>>>> Therefore we do not really need more routing than we already have.
>>>> The patch introduces the infrastructure to enable direct IRQ mapping.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>
>>>> ---
>>>>
>>>> The patch is raw and ugly indeed; I made it only to demonstrate
>>>> the idea and see whether it has a right to live or not.
>>>>
>>>> For some reason which I do not really understand (limited GSI numbers?),
>>>> the existing code always adds routing, and I do not see why we would need it.
>>>
>>> It's an IOAPIC: a pin gets toggled from the device and an MSI message
>>> gets written to the CPU.  So the route allocates and programs the
>>> pin->MSI mapping, then we tell it what notifier triggers that pin.
>>
>>> On x86 the MSI vector doesn't encode any information about the device
>>> sending the MSI; here you seem to be able to figure out the device and
>>> vector space number from the address.  Then your pin-to-MSI mapping is
>>> effectively fixed.  So why isn't this just your
>>> kvm_irqchip_add_msi_route function?  On pSeries it's a lookup, on x86
>>> it's an allocate-and-program.  What does kvm_irqchip_add_msi_route do on
>>> pSeries today?  Thanks,
>>
>>
>> As we have just started implementing this thing, I commented it out for a
>> start. Once called, it destroys the direct mapping in the host kernel and
>> everything stops working, as routing is not implemented (yet? ever?).
> 
> Yay, it's broken, you can rewrite it ;)


There is nothing to rewrite; my understanding is that it is just not
written yet and Paul would prefer not to do that :)


>> My point here is that the MSIMessage-to-IRQ translation is made on a PCI
>> domain, as the PAPR (ppc64 server) spec says. The guest never uses
>> MSIMessage; it is all in QEMU. The guest dynamically allocates MSI IRQs
>> and it is up to the hypervisor (QEMU) to take care of the actual
>> MSIMessage for the device.
> 
> MSIMessage is what the guest has programmed for the address/data fields;
> it's not just a QEMU invention.  From the guest perspective, the device
> writes msg.data to msg.address to signal the CPU for the interrupt.


Our guests never program an MSIMessage. Hypercalls are used instead.


>> And the only reason for us to use MSIMessage in QEMU is to support
>> msi_notify()/msix_notify() in places like vfio_msi_interrupt(). I added
>> an MSI window for that a long time ago, and we do not need it that much
>> as we already have an irq number in vfio_msi_interrupt(), etc.
> 
> It seems like you just have another layer of indirection via your
> msi_table.  For x86 there's a layer of indirection via the virq virtual
> IOAPIC pin.  Seems similar.  Thanks,


I do not follow you, sorry. For x86, is it the MSI routing table which is
updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of
code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?



> 
> Alex
> 
>>>> ---
>>>>  hw/misc/vfio.c           |   11 +++++++++--
>>>>  hw/pci/pci.c             |   13 +++++++++++++
>>>>  hw/ppc/spapr_pci.c       |   13 +++++++++++++
>>>>  hw/virtio/virtio-pci.c   |   26 ++++++++++++++++++++------
>>>>  include/hw/pci/pci.h     |    4 ++++
>>>>  include/hw/pci/pci_bus.h |    1 +
>>>>  6 files changed, 60 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
>>>> index 14aac04..2d9eef7 100644
>>>> --- a/hw/misc/vfio.c
>>>> +++ b/hw/misc/vfio.c
>>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>>>       * Attempt to enable route through KVM irqchip,
>>>>       * default to userspace handling if unavailable.
>>>>       */
>>>> -    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
>>>> +
>>>> +    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
>>>> +    if (vector->virq < 0) {
>>>> +        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
>>>> +    }
>>>>      if (vector->virq < 0 ||
>>>>          kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>>>>                                         vector->virq) < 0) {
>>>> @@ -807,7 +811,10 @@ retry:
>>>>           * Attempt to enable route through KVM irqchip,
>>>>           * default to userspace handling if unavailable.
>>>>           */
>>>> -        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>> +        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
>>>> +        if (vector->virq < 0) {
>>>> +            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>> +        }
>>>>          if (vector->virq < 0 ||
>>>>              kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
>>>>                                             vector->virq) < 0) {
>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>> index a976e46..a9875e9 100644
>>>> --- a/hw/pci/pci.c
>>>> +++ b/hw/pci/pci.c
>>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>>>>      dev->intx_routing_notifier = notifier;
>>>>  }
>>>>  
>>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
>>>> +{
>>>> +    bus->map_msi = map_msi_fn;
>>>> +}
>>>> +
>>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
>>>> +{
>>>> +    if (bus->map_msi) {
>>>> +        return bus->map_msi(bus, msg);
>>>> +    }
>>>> +    return -1;
>>>> +}
>>>> +
>>>>  /*
>>>>   * PCI-to-PCI bridge specification
>>>>   * 9.1: Interrupt routing. Table 9-1
>>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>>>> index 80408c9..9ef9a29 100644
>>>> --- a/hw/ppc/spapr_pci.c
>>>> +++ b/hw/ppc/spapr_pci.c
>>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
>>>>      qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
>>>>  }
>>>>  
>>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
>>>> +{
>>>> +    DeviceState *par = bus->qbus.parent;
>>>> +    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
>>>> +    unsigned long addr = msg.address - sphb->msi_win_addr;
>>>> +    int ndev = addr >> 16;
>>>> +    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
>>>> +    uint32_t irq = sphb->msi_table[ndev].irq + vec;
>>>> +
>>>> +    return (int)irq;
>>>> +}
>>>> +
>>>>  static const MemoryRegionOps spapr_msi_ops = {
>>>>      /* There is no .read as the read result is undefined by PCI spec */
>>>>      .read = NULL,
>>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s)
>>>>  
>>>>          sphb->lsi_table[i].irq = irq;
>>>>      }
>>>> +    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
>>>>  
>>>>      return 0;
>>>>  }
>>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
>>>> index d309416..587f53e 100644
>>>> --- a/hw/virtio/virtio-pci.c
>>>> +++ b/hw/virtio/virtio-pci.c
>>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d)
>>>>      return proxy->host_features;
>>>>  }
>>>>  
>>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
>>>> +
>>>>  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>>>>                                          unsigned int queue_no,
>>>>                                          unsigned int vector,
>>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>>>>      int ret;
>>>>  
>>>>      if (irqfd->users == 0) {
>>>> -        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>> +        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
>>>> +        if (ret < 0) {
>>>> +            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>> +        }
>>>>          if (ret < 0) {
>>>>              return ret;
>>>>          }
>>>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
>>>>      VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
>>>>      EventNotifier *n = virtio_queue_get_guest_notifier(vq);
>>>>      VirtIOIRQFD *irqfd;
>>>> -    int ret = 0;
>>>> +    int ret = 0, tmp;
>>>>  
>>>>      if (proxy->vector_irqfd) {
>>>>          irqfd = &proxy->vector_irqfd[vector];
>>>> -        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
>>>> -            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
>>>> -            if (ret < 0) {
>>>> -                return ret;
>>>> +
>>>> +        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
>>>> +        if (tmp >= 0) {
>>>> +            if (irqfd->virq != tmp) {
>>>> +                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
>>>> +                        irqfd->virq, tmp);
>>>> +            }
>>>> +        } else {
>>>> +            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
>>>> +                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
>>>> +                if (ret < 0) {
>>>> +                    return ret;
>>>> +                }
>>>>              }
>>>>          }
>>>>      }
>>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>>>> index 8797802..632739a 100644
>>>> --- a/include/hw/pci/pci.h
>>>> +++ b/include/hw/pci/pci.h
>>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
>>>>  typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
>>>>  typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
>>>>  typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
>>>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
>>>>  
>>>>  typedef enum {
>>>>      PCI_HOTPLUG_DISABLED,
>>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
>>>>  void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
>>>>  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>>>>                                            PCIINTxRoutingNotifier notifier);
>>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
>>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
>>>> +
>>>>  void pci_device_reset(PCIDevice *dev);
>>>>  void pci_bus_reset(PCIBus *bus);
>>>>  
>>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
>>>> index 66762f6..81efd2b 100644
>>>> --- a/include/hw/pci/pci_bus.h
>>>> +++ b/include/hw/pci/pci_bus.h
>>>> @@ -16,6 +16,7 @@ struct PCIBus {
>>>>      pci_set_irq_fn set_irq;
>>>>      pci_map_irq_fn map_irq;
>>>>      pci_route_irq_fn route_intx_to_irq;
>>>> +    pci_map_msi_fn map_msi;
>>>>      pci_hotplug_fn hotplug;
>>>>      DeviceState *hotplug_qdev;
>>>>      void *irq_opaque;
Alex Williamson June 21, 2013, 4:46 a.m. UTC | #8
On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
> On 06/21/2013 12:34 PM, Alex Williamson wrote:
> > On Fri, 2013-06-21 at 11:56 +1000, Alexey Kardashevskiy wrote:
> >> On 06/21/2013 02:51 AM, Alex Williamson wrote:
> >>> On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote:
> >>>> At the moment QEMU creates a route for every MSI IRQ.
> >>>>
> >>>> Now we are about to add IRQFD support on PPC64-pseries platform.
> >>>> pSeries already has in-kernel emulated interrupt controller with
> >>>> 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ
> >>>> mapping as a part of PAPR requirements for MSI/MSIX guests.
> >>>> Specifically, the pSeries guest does not touch MSIMessage's at
> >>>> all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source
> >>>> rtas calls to do the mapping.
> >>>>
> >>>> Therefore we do not really need more routing than we got already.
> >>>> The patch introduces the infrastructure to enable direct IRQ mapping.
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>
> >>>> ---
> >>>>
> >>>> The patch is raw and ugly indeed, I made it only to demonstrate
> >>>> the idea and see if it has right to live or not.
> >>>>
> >>>> For some reason which I do not really understand (limited GSI numbers?)
> >>>> the existing code always adds routing and I do not see why we would need it.
> >>>
> >>> It's an IOAPIC, a pin gets toggled from the device and an MSI message
> >>> gets written to the CPU.  So the route allocates and programs the
> >>> pin->MSI, then we tell it what notifier triggers that pin.
> >>
> >>> On x86 the MSI vector doesn't encode any information about the device
> >>> sending the MSI, here you seem to be able to figure out the device and
> >>> vector space number from the address.  Then your pin to MSI is
> >>> effectively fixed.  So why isn't this just your
> >>> kvm_irqchip_add_msi_route function?  On pSeries it's a lookup, on x86
> >>> it's a allocate and program.
> >>>  What does kvm_irqchip_add_msi_route do on
> >>> pSeries today?  Thanks,
> >>
> >>
> >> As we have just started implementing this thing, I commented it out
> >> for starters. Once called, it destroys the direct mapping in the host
> >> kernel and everything stops working, as routing is not implemented
> >> (yet? ever?).
> > 
> > Yay, it's broken, you can rewrite it ;)
> 
> 
> There is nothing to rewrite; my understanding is that it is just not
> written yet, and Paul would prefer not to do that :)
> 
> 
> >> My point here is that MSIMessage to irq translation is done per PCI
> >> domain, as the PAPR (ppc64 server) spec says. The guest never uses
> >> MSIMessage; it is all in QEMU. The guest dynamically allocates MSI IRQs
> >> and it is up to the hypervisor (QEMU) to take care of the actual
> >> MSIMessage for the device.
> > 
> > MSIMessage is what the guest has programmed for the address/data fields,
> > it's not just a QEMU invention.  From the guest perspective, the device
> > writes msg.data to msg.address to signal the CPU for the interrupt.
> 
> 
> Our guests never program an MSIMessage at all; hypercalls are used instead.

Of course POWER has a hypercall for that, but that's just abstracting
the physical device, which does actually write msg.data to msg.address
on the bus.

> >> And the only reason for us to use MSIMessage in QEMU is to support
> >> msi_notify()/msix_notify() in places like vfio_msi_interrupt(). I added
> >> an MSI window for that a long time ago, which we do not really need
> >> since we already have an irq number in vfio_msi_interrupt(), etc.
> > 
> > It seems like you just have another layer of indirection via your
> > msi_table.  For x86 there's a layer of indirection via the virq virtual
> > IOAPIC pin.  Seems similar.  Thanks,
> 
> 
> I do not follow you, sorry. For x86, is it the MSI routing table which is
> updated via KVM_SET_GSI_ROUTING in KVM? And when there is no KVM, what piece
> of code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?

vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)

This writes directly to the interrupt block on the vCPU.  With KVM, the
in-kernel APIC does the same write, where the pin to MSIMessage is set up
by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.

Do I understand that on POWER the MSI from the device is intercepted at
the PHB and converted to an IRQ that's triggered by some means other
than a MSI write?  So to correctly model the hardware, vfio should do a
msi_notify() that does a stl_le_phys that terminates at this IRQ
remapper thing and in turn toggles a qemu_irq.  MSIMessage is only
extraneous data if you want to skip over hardware blocks.
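
(For illustration, a minimal sketch in C of that terminating block as it
could look on sPAPR, modelled on the spapr_msi_write() handler and the
decode from the quoted patch; the "spapr" machine-state global and the
exact field names are taken on trust from the patch context:)

static void spapr_msi_write(void *opaque, hwaddr addr,
                            uint64_t data, unsigned size)
{
    sPAPRPHBState *sphb = opaque;
    /* Same decode as spapr_msi_get_irq() in the patch. */
    int ndev = addr >> 16;
    int vec = ((addr & 0xFFFF) >> 2) | data;
    uint32_t irq = sphb->msi_table[ndev].irq + vec;

    /* The MSI write terminates here: pulse the fabric interrupt. */
    qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
}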

Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
that it can be implemented on POWER without this pci_bus_map_msi
interface that seems very unique to POWER.  Thanks,

Alex

Benjamin Herrenschmidt June 21, 2013, 5:12 a.m. UTC | #9
On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote:
> Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
> that it can be implemented on POWER without this pci_bus_map_msi
> interface that seems very unique to POWER.  Thanks,

You mean unique to all non-x86 ? :-)

I believe almost everybody eventually turns MSIs into "normal"
interrupts...

More often than not, the logic to do so is in the PCI Host Bridge.

The whole concept of passing the message address/data across the
user/kernel interface is an x86 crackpottery, but then so is the entire
remapping/routing layer ... :-)

Ben.
Alex Williamson June 21, 2013, 6:03 a.m. UTC | #10
On Fri, 2013-06-21 at 15:12 +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote:
> > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
> > that it can be implemented on POWER without this pci_bus_map_msi
> > interface that seems very unique to POWER.  Thanks,
> 
> You mean unique to all non-x86 ? :-)
> 
> I believe almost everybody eventually turns MSIs into "normal"
> interrupts...
> 
> More often than not, the logic to do so is in the PCI Host Bridge.
> 
> The whole concept of passing the message address/data across the
> user/kernel interface is an x86 crackpottery, but then so is the entire
> remapping/routing layer ... :-)

Regardless, this is exactly what kvm_irqchip_add_msi_route does.  It
says, here's an MSIMessage, give me an IRQ that sends that.  In the x86
case, that means pick a free IRQ and program it to send that MSIMessage
when we hit the irqfd.  In the case of POWER, it means look up which IRQ
gets fired by that MSIMessage and return it.  In a non-accelerated QEMU
case I'd think msi_notify() would write the MSIMessage to this IRQ
remapper device and let it toggle the next qemu_irq down the line.  If
we ever add an IOMMU-based IRQ remapper to the x86 model, we'd need
something similar.  Thanks,

Alex
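
(The split described above is effectively what the patch open-codes at
every call site; a sketch of it folded into one helper, with the helper
name invented here:)

/* Try the fixed lookup first (POWER-style); fall back to allocating
 * and programming a route (x86-style). */
static int msi_virq_for_bus(PCIBus *bus, MSIMessage msg)
{
    int virq = pci_bus_map_msi(bus, msg);     /* POWER: pure lookup */

    if (virq < 0) {
        virq = kvm_irqchip_add_msi_route(kvm_state, msg);   /* x86 */
    }
    return virq;
}
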
Benjamin Herrenschmidt June 21, 2013, 6:12 a.m. UTC | #11
On Fri, 2013-06-21 at 00:03 -0600, Alex Williamson wrote:
> On Fri, 2013-06-21 at 15:12 +1000, Benjamin Herrenschmidt wrote:
> > On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote:
> > > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
> > > that it can be implemented on POWER without this pci_bus_map_msi
> > > interface that seems very unique to POWER.  Thanks,
> > 
> > You mean unique to all non-x86 ? :-)
> > 
> > I believe almost everybody eventually turns MSIs into "normal"
> > interrupts...
> > 
> > More often than not, the logic to do so is in the PCI Host Bridge.
> > 
> > The whole concept of passing the message address/data across the
> > user/kernel interface is an x86 crackpottery, but then so is the entire
> > remapping/routing layer ... :-)
> 
> Regardless, this is exactly what kvm_irqchip_add_msi_route does.  It
> says, here's an MSIMessage, give me an IRQ that sends that. 

Yes, and in our case, what happens is that the guest says "I want
an MSI", we pick an IRQ, and we make up a message for it :-) The
actual message address/data we use is a complete invention that only
exists within qemu. So here we need to basically turn it back into an
IRQ, which we might be able to do by ... just making the message (or
part of the address) be the IRQ number or something like that.
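
(A hedged sketch of that idea; MSIMessage is QEMU's existing struct,
while the window constant and both helpers below are invented purely
for illustration:)

/* If QEMU invents the message anyway, let the data field carry the
 * IRQ number so that the reverse mapping becomes trivial. */
static MSIMessage spapr_irq_to_msi(uint32_t irq)
{
    MSIMessage msg = { .address = SPAPR_MSI_WINDOW, .data = irq };
    return msg;
}

static int spapr_msi_to_irq(MSIMessage msg)
{
    return msg.data;    /* the message data *is* the IRQ number */
}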

>  In the x86
> case, that means pick a free IRQ and program it to send that MSIMessage
> when we hit the irqfd.  In the case of POWER, it means look up which IRQ
> gets fired by that MSIMessage and return it.  In a non-accelerated QEMU
> case I'd think msi_notify() would write the MSIMessage to this IRQ
> remapper device and let it toggle the next qemu_irq down the line.  If
> we ever add an IOMMU based IRQ remapper to the x86 model, we'd need
> something similar.  Thanks,

Ben.
Alexey Kardashevskiy June 21, 2013, 6:40 a.m. UTC | #12
On 06/21/2013 04:12 PM, Benjamin Herrenschmidt wrote:
> On Fri, 2013-06-21 at 00:03 -0600, Alex Williamson wrote:
>> On Fri, 2013-06-21 at 15:12 +1000, Benjamin Herrenschmidt wrote:
>>> On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote:
>>>> Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
>>>> that it can be implemented on POWER without this pci_bus_map_msi
>>>> interface that seems very unique to POWER.  Thanks,
>>>
>>> You mean unique to all non-x86 ? :-)
>>>
>>> I believe almost everybody eventually turns MSIs into "normal"
>>> interrupts...
>>>
>>> More often than not, the logic to do so is in the PCI Host Bridge.
>>>
>>> The whole concept of passing the message address/data across the
>>> user/kernel interface is an x86 crackpottery, but then so is the entire
>>> remapping/routing layer ... :-)
>>
>> Regardless, this is exactly what kvm_irqchip_add_msi_route does.  It
>> says, here's an MSIMessage, give me an IRQ that sends that. 
> 
> Yes, and in our case, what happens is that the guest says "I want
> an MSI", we pick an IRQ, and we make up a message for it :-) The
> actual message address/data we use is a complete invention that only
> exists within qemu. So here we need to basically turn it back into an
> IRQ, which we might be able to do by ... just making the message (or
> part of the address) be the IRQ number or something like that.
> 
>>  In the x86
>> case, that means pick a free IRQ and program it to send that MSIMessage
>> when we hit the irqfd.  In the case of POWER, it means look up which IRQ
>> gets fired by that MSIMessage and return it.  In a non-accelerated QEMU
>> case I'd think msi_notify() would write the MSIMessage to this IRQ
>> remapper device and let it toggle the next qemu_irq down the line.  If
>> we ever add an IOMMU based IRQ remapper to the x86 model, we'd need
>> something similar.  Thanks,


Yes, I am changing the IRQ to MSI message conversion (and some other
things, such as replacing the per-PHB MSI windows with a single one for
the whole system), and I am going to post something tonight.
Michael S. Tsirkin June 23, 2013, 2:07 p.m. UTC | #13
On Fri, Jun 21, 2013 at 09:51:20AM +1000, Alexey Kardashevskiy wrote:
> And kvm_irqchip_add_msi_route does not have any link to a device or a bus,
> so I'll have to walk through all PHBs in the system, see if the PHB's MSI
> window is the one from the MSIMessage, and convert the MSIMessage to a
> virq. Pretty easy and quick, but still a dirty hack; would that be better?

Yes I think that's fine really.

Basically, devices all speak MSIMessage, as they should -
this is what the PCI spec says.
On all normal systems guests also speak MSIMessage, so
the API which uses these makes sense for kvm.
Except that the ppc architecture, in its wisdom, decided to hide this
in firmware.  QEMU does not have firmware, so the translation
has to be maintained in QEMU powerpc code.
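
(For concreteness, a sketch of the PHB walk Alexey describes; apart from
spapr_msi_get_irq() and msi_win_addr, which come from the patch, the
list head, window size constant, bus accessor, and helper name are all
assumptions:)

static int spapr_msi_to_virq(sPAPREnvironment *spapr, MSIMessage msg)
{
    sPAPRPHBState *sphb;

    QLIST_FOREACH(sphb, &spapr->phbs, list) {
        /* Does the write fall inside this PHB's MSI window? */
        if (msg.address >= sphb->msi_win_addr &&
            msg.address < sphb->msi_win_addr + SPAPR_MSI_WINDOW_SIZE) {
            return spapr_msi_get_irq(sphb->host_state.bus, msg);
        }
    }
    return -1;    /* no PHB claims this address, so no direct mapping */
}
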
Anthony Liguori June 23, 2013, 3:02 p.m. UTC | #14
On Sun, Jun 23, 2013 at 9:07 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Fri, Jun 21, 2013 at 09:51:20AM +1000, Alexey Kardashevskiy wrote:
>> And kvm_irqchip_add_msi_route does not have any link to a device or a bus,
>> so I'll have to walk through all PHBs in the system, see if the PHB's MSI
>> window is the one from the MSIMessage, and convert the MSIMessage to a
>> virq. Pretty easy and quick, but still a dirty hack; would that be better?
>
> Yes I think that's fine really.
>
> Basically, devices all speak MSIMessage, as they should -
> this is what the PCI spec says.
> On all normal systems guests also speak MSIMessage, so
> the API which uses these makes sense for kvm.

There is no difference between Power and x86 in how they handle MSI.
MSI is not a concept in the APIC or processor.  MSI interrupts are
handled by the PHB and delivered through the I/O APIC using the
information in the address/data.

I've always assumed that the kernel knows about MSI just because PCI
passthrough needs it for whatever reason.  It's not part of the APIC
complex and it is weird to have MSI decoding information in the
kernel.

> Except that the ppc architecture, in its wisdom, decided to hide this
> in firmware.  QEMU does not have firmware, so the translation
> has to be maintained in QEMU powerpc code.

To be clear: instead of writing to an address with a load/store, Power
has a hypercall to do this.  The hypercall is hidden behind a firmware
abstraction layer (RTAS), but it's still just a hypercall.

It uses a hypercall simply because Power was designed to not require
MMIO trapping.  It's much like Xen PV in that regard.

It doesn't make a lot of sense to have Power do MSI decoding in
userspace while x86 does it in the kernel.  I assume the kernel
doesn't have enough information to do the translation on Power?

Regards,

Anthony Liguori

> --
> MST
>
Anthony Liguori June 23, 2013, 3:06 p.m. UTC | #15
On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
<alex.williamson@redhat.com> wrote:
> On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
>> On 06/21/2013 12:34 PM, Alex Williamson wrote:
>>
>>
>> I do not follow you, sorry. For x86, is it the MSI routing table which is
>> updated via KVM_SET_GSI_ROUTING in KVM? And when there is no KVM, what piece
>> of code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
>
> vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
>
> This writes directly to the interrupt block on the vCPU.  With KVM, the
> in-kernel APIC does the same write, where the pin to MSIMessage is set up
> by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.

What is this "interrupt block on the vCPU" you speak of?  I reviewed
the SDM and see nothing in the APIC protocol or the brief description
of MSI as a PCI concept that would indicate anything except that the
PHB handles MSI writes and feeds them to the I/O APIC.

In fact, the wikipedia article on MSI has:

"A common misconception with Message Signaled Interrupts is that they
allow the device to send data to a processor as part of the interrupt.
The data that is sent as part of the write is used by the chipset to
determine which interrupt to trigger on which processor; it is not
available for the device to communicate additional information to the
interrupt handler."

> Do I understand that on POWER the MSI from the device is intercepted at
> the PHB and converted to an IRQ that's triggered by some means other
> than a MSI write?

This is exactly the same thing that happens on x86, no?  Can you point
me to something in the SDM that says otherwise?

Regards,

Anthony Liguori

>  So to correctly model the hardware, vfio should do a
> msi_notify() that does a stl_le_phys that terminates at this IRQ
> remapper thing and in turn toggles a qemu_irq.  MSIMessage is only
> extraneous data if you want to skip over hardware blocks.
>
> Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
> that it can be implemented on POWER without this pci_bus_map_msi
> interface that seems very unique to POWER.  Thanks,
>
> Alex
>
Benjamin Herrenschmidt June 23, 2013, 9:36 p.m. UTC | #16
On Sun, 2013-06-23 at 17:07 +0300, Michael S. Tsirkin wrote:
> Yes I think that's fine really.
> 
> Basically, devices all speak MSIMessage, as they should -
> this is what the PCI spec says.
> On all normal systems guests also speak MSIMessage, so
> the API which uses these makes sense for kvm.
> Except that the ppc architecture, in its wisdom, decided to hide this
> in firmware.  QEMU does not have firmware, so the translation
> has to be maintained in QEMU powerpc code.

It is still not enough. There is nothing that says that an MSI
address/message is unique. On pseries (among others I'm sure), this is a
per-host-bridge concept since the MSIs are decoded by the PHB and turned
into standard fabric interrupts there.

So at least the bus needs to be passed as well.

Cheers,
Ben.
Benjamin Herrenschmidt June 23, 2013, 9:39 p.m. UTC | #17
On Sun, 2013-06-23 at 10:02 -0500, Anthony Liguori wrote:
> It doesn't make a lot of sense to have Power do MSI decoding in
> userspace while x86 does it in the kernel.  I assume the kernel
> doesn't have enough information to do the translation on Power?

Well, it would need the host bridge which it doesn't have and it
would need a concept of mapping that to interrupt numbers which
we don't want in the kernel.

We can keep the kernel side a LOT simpler (completely avoiding
the whole route bloatware) by sticking to our "simple" direct map kernel
stuff and doing the mapping from address/data to interrupts in qemu, so
we'll stick to that.

Cheers,
Ben.
Anthony Liguori June 23, 2013, 9:58 p.m. UTC | #18
On Sun, Jun 23, 2013 at 4:39 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Sun, 2013-06-23 at 10:02 -0500, Anthony Liguori wrote:
>> It doesn't make a lot of sense to have Power do MSI decoding in
>> userspace while x86 does it in the kernel.  I assume the kernel
>> doesn't have enough information to do the translation on Power?
>
> Well, it would need the host bridge which it doesn't have and it
> would need a concept of mapping that to interrupt numbers which
> we don't want in the kernel.
>
> We can keep the kernel side a LOT simpler (completely avoiding
> the whole route bloatware) by sticking to our "simple" direct map kernel
> stuff and doing the mapping from address/data to interrupts in qemu, so
> we'll stick to that.

Yeah, but none of this is Power specific...  so we can do the same for x86, no?

I'm still trying to wrap my head around why we need MSI knowledge at
all in the kernel for x86.  I presume it's to fast-path irqfd when
doing vhost?

Regards,

Anthony Liguori

>
> Cheers,
> Ben.
>
>
Alex Williamson June 24, 2013, 4:44 a.m. UTC | #19
On Sun, 2013-06-23 at 10:06 -0500, Anthony Liguori wrote:
> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
> <alex.williamson@redhat.com> wrote:
> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
> >> On 06/21/2013 12:34 PM, Alex Williamson wrote:
> >>
> >>
> >> I do not follow you, sorry. For x86, is it the MSI routing table which is
> >> updated via KVM_SET_GSI_ROUTING in KVM? And when there is no KVM, what piece
> >> of code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
> >
> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
> >
> > This writes directly to the interrupt block on the vCPU.  With KVM, the
> > in-kernel APIC does the same write, where the pin to MSIMessage is set up
> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.
> 
> What is this "interrupt block on the vCPU" you speak of?  I reviewed
> the SDM and see nothing in the APIC protocol or the brief description
> of MSI as a PCI concept that would indicate anything except that the
> PHB handles MSI writes and feeds them to the I/O APIC.

In all likelihood I'm recalling ia64 details and trying to apply them to
x86.  Does the MSIMessage not actually get written to the LAPIC on the
CPU?  Thanks,

Alex

> In fact, the wikipedia article on MSI has:
> 
> "A common misconception with Message Signaled Interrupts is that they
> allow the device to send data to a processor as part of the interrupt.
> The data that is sent as part of the write is used by the chipset to
> determine which interrupt to trigger on which processor; it is not
> available for the device to communicate additional information to the
> interrupt handler."
> 
> > Do I understand that on POWER the MSI from the device is intercepted at
> > the PHB and converted to an IRQ that's triggered by some means other
> > than a MSI write?
> 
> This is exactly the same thing that happens on x86, no?  Can you point
> me to something in the SDM that says otherwise?
> 
> Regards,
> 
> Anthony Liguori
> 
> >  So to correctly model the hardware, vfio should do a
> > msi_notify() that does a stl_le_phys that terminates at this IRQ
> > remapper thing and in turn toggles a qemu_irq.  MSIMessage is only
> > extraneous data if you want to skip over hardware blocks.
> >
> > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
> > that it can be implemented on POWER without this pci_bus_map_msi
> > interface that seems very unique to POWER.  Thanks,
> >
> > Alex
> >
Alex Williamson June 24, 2013, 4:46 a.m. UTC | #20
On Sun, 2013-06-23 at 16:58 -0500, Anthony Liguori wrote:
> On Sun, Jun 23, 2013 at 4:39 PM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Sun, 2013-06-23 at 10:02 -0500, Anthony Liguori wrote:
> >> It doesn't make a lot of sense to have Power do MSI decoding in
> >> userspace while x86 does it in the kernel.  I assume the kernel
> >> doesn't have enough information to do the translation on Power?
> >
> > Well, it would need the host bridge which it doesn't have and it
> > would need a concept of mapping that to interrupt numbers which
> > we don't want in the kernel.
> >
> > We can keep the kernel side a LOT simpler (completely avoiding
> > the whole route bloatware) by sticking to our "simple" direct map kernel
> > stuff and doing the mapping from address/data to interrupts in qemu, so
> > we'll stick to that.
> 
> Yeah, but none of this is Power specific...  so we can do the same for x86, no?
> 
> I'm still trying to wrap my head around why we need MSI knowledge at
> all in the kernel for x86.  I presume it's to fast-path irqfd when
> doing vhost?

Or device assignment.  Any paths where we want to inject an MSI
interrupt without going through userspace.  Thanks,

Alex
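
(A sketch of that fast path, mirroring the calls visible in the patch;
msi_virq_for_bus() is the helper sketched earlier in this thread, and
the type names are read off the vfio hunks rather than verified:)

/* Resolve the vector's message to a virq (lookup or allocation), then
 * wire its eventfd straight to that virq inside KVM, so the interrupt
 * is injected without bouncing through userspace. */
static int vfio_wire_vector_irqfd(VFIODevice *vdev, VFIOMSIVector *vector,
                                  MSIMessage msg)
{
    vector->virq = msi_virq_for_bus(vdev->pdev.bus, msg);
    if (vector->virq < 0) {
        return -1;              /* fall back to userspace handling */
    }
    return kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                          vector->virq);
}
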
Gleb Natapov June 24, 2013, 7:13 a.m. UTC | #21
On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote:
> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
> <alex.williamson@redhat.com> wrote:
> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
> >> On 06/21/2013 12:34 PM, Alex Williamson wrote:
> >>
> >>
> >> I do not follow you, sorry. For x86, is it the MSI routing table which is
> >> updated via KVM_SET_GSI_ROUTING in KVM? And when there is no KVM, what piece
> >> of code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
> >
> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
> >
> > This writes directly to the interrupt block on the vCPU.  With KVM, the
> > in-kernel APIC does the same write, where the pin to MSIMessage is set up
> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.
> 
> What is this "interrupt block on the vCPU" you speak of?  I reviewed
The FEE00000H address as seen from the PCI bus is a special address range
(see 10.11.1 in the SDM). Any write by a PCI device to that address range
is interpreted as an MSI. We do not model this correctly in QEMU yet,
since all devices, including vcpus, see exactly the same memory map.
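
(For reference, a minimal decode of such a write using the SDM fields
Gleb cites; purely illustrative:)

/* Per Intel SDM 10.11: the destination APIC ID sits in address bits
 * 19:12 and the vector in data bits 7:0 of a write to 0xFEExxxxx. */
static void decode_x86_msi(uint64_t address, uint32_t data)
{
    uint8_t dest_id = (address >> 12) & 0xff;    /* target APIC ID   */
    uint8_t vector = data & 0xff;                /* interrupt vector */

    fprintf(stderr, "MSI -> APIC 0x%x, vector 0x%x\n", dest_id, vector);
}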

> the SDM and see nothing in the APIC protocol or the brief description
> of MSI as a PCI concept that would indicate anything except that the
> PHB handles MSI writes and feeds them to the I/O APIC.
> 
I/O APIC? Did you mean the APIC? Even that would probably be incorrect;
I'd say it translates the data into an APIC bus message. And with interrupt
remapping there is even more magic happening between the MSI and the APIC bus.

> In fact, the wikipedia article on MSI has:
> 
> "A common misconception with Message Signaled Interrupts is that they
> allow the device to send data to a processor as part of the interrupt.
> The data that is sent as part of the write is used by the chipset to
> determine which interrupt to trigger on which processor; it is not
> available for the device to communicate additional information to the
> interrupt handler."
> 
Not sure who claimed otherwise.

> > Do I understand that on POWER the MSI from the device is intercepted at
> > the PHB and converted to an IRQ that's triggered by some means other
> > than a MSI write?
> 
> This is exactly the same thing that happens on x86, no?  Can you point
> me to something in the SDM that says otherwise?
> 
> Regards,
> 
> Anthony Liguori
> 
> >  So to correctly model the hardware, vfio should do a
> > msi_notify() that does a stl_le_phys that terminates at this IRQ
> > remapper thing and in turn toggles a qemu_irq.  MSIMessage is only
> > extraneous data if you want to skip over hardware blocks.
> >
> > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
> > that it can be implemented on POWER without this pci_bus_map_msi
> > interface that seems very unique to POWER.  Thanks,
> >
> > Alex
> >

--
			Gleb.
Michael S. Tsirkin June 24, 2013, 12:10 p.m. UTC | #22
On Mon, Jun 24, 2013 at 07:36:25AM +1000, Benjamin Herrenschmidt wrote:
> On Sun, 2013-06-23 at 17:07 +0300, Michael S. Tsirkin wrote:
> > Yes I think that's fine really.
> > 
> > Basically devices all speak MSIMessage as they should -
> > this is what the PCI spec says.        
> > On all normal systems guests also speak MSIMessage so
> > the API which uses these makes sense for kvm.
> > Except the ppc architecture in its wisdom decided to hide this
> > in firmware.  QEMU does not have firmware, so the translation
> > has to be maintained in QEMU powerpc code.
> 
> It is still not enough. There is nothing that says that an MSI
> address/message is unique. On pseries (among others I'm sure), this is a
> per-host-bridge concept since the MSIs are decoded by the PHB and turned
> into standard fabric interrupts there.
> 
> So at least the bus needs to be passed as well.
> 
> Cheers,
> Ben.

If passing the bus (or DMA context, etc.)
turns out to actually be helpful, why not.
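For concreteness, the per-PHB decode the patch installs for spapr makes
Ben's point visible; this is an annotated restatement of spapr_msi_get_irq
from the diff at the bottom of this page (the comments are editorial, the
logic is the patch's). Every input to the translation except the MSIMessage
itself is per-PHB state, which is why the bus has to be known:

static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
{
    /* Recover the PHB that owns this bus; the decode below is
     * meaningless without it. */
    sPAPRPHBState *sphb = (sPAPRPHBState *) bus->qbus.parent;

    /* Offset into this PHB's MSI window; the same msg.address can fall
     * into a different window (or none at all) on another PHB. */
    unsigned long addr = msg.address - sphb->msi_win_addr;

    int ndev = addr >> 16;                       /* 64K window slice per device */
    int vec = ((addr & 0xFFFF) >> 2) | msg.data; /* vector within that device */

    /* Per-PHB table: base IRQ allocated to the device, plus the vector. */
    return (int)(sphb->msi_table[ndev].irq + vec);
}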
Anthony Liguori June 24, 2013, 12:24 p.m. UTC | #23
Alex Williamson <alex.williamson@redhat.com> writes:

> On Sun, 2013-06-23 at 16:58 -0500, Anthony Liguori wrote:
>> Yeah, but none of this is Power specific...  so we can do the same for x86, no?
>> 
>> I'm still trying to wrap my head around why we need MSI knowledge at
>> all in the kernel for x86.  I presume it's to fast-path irqfd when
>> doing vhost?
>
> Or device assignment.  Any paths where we want to inject an MSI
> interrupt without going through userspace.  Thanks,

With VFIO, we use irqfd; if we program the irqfd as the underlying IRQ
that the MSI would route to, isn't that sufficient?

That's more or less what Ben is proposing for Power...

Regards,

Anthony Liguori

>
> Alex
Anthony Liguori June 24, 2013, 12:25 p.m. UTC | #24
Alex Williamson <alex.williamson@redhat.com> writes:

> On Sun, 2013-06-23 at 10:06 -0500, Anthony Liguori wrote:
>> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
>> <alex.williamson@redhat.com> wrote:
>> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
>> >> On 06/21/2013 12:34 PM, Alex Williamson wrote:
>> >>
>> >>
>> >> I do not follow you, sorry. For x86, is it the MSI routing table which is
>> >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of
>> >> code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
>> >
>> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
>> >
>> > This writes directly to the interrupt block on the vCPU.  With KVM, the
>> > in-kernel APIC does the same write, where the pin to MSIMessage is set up
>> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.
>> 
>> What is this "interrupt block on the vCPU" you speak of?  I reviewed
>> the SDM and see nothing in the APIC protocol or the brief description
>> of MSI as a PCI concept that would indicate anything except that the
>> PHB handles MSI writes and feeds them to the I/O APIC.
>
> In all likelihood I'm recalling ia64 details and trying to apply them to
> x86.  Does the MSIMessage not actually get written to the LAPIC on the
> CPU?  Thanks,

There definitely isn't an APIC message for MSI specifically.  I think
the only question is whether the PHB sits on the APIC bus and can
generate an APIC message directly or whether it has a private interface
to the IO APIC to do it.

I suspect that there are systems that do either.  But the important
point is that MSI writes are interpreted by the PHB either way.

Regards,

Anthony Liguori

>
> Alex
>
>> In fact, the wikipedia article on MSI has:
>> 
>> "A common misconception with Message Signaled Interrupts is that they
>> allow the device to send data to a processor as part of the interrupt.
>> The data that is sent as part of the write is used by the chipset to
>> determine which interrupt to trigger on which processor; it is not
>> available for the device to communicate additional information to the
>> interrupt handler."
>> 
>> > Do I understand that on POWER the MSI from the device is intercepted at
>> > the PHB and converted to an IRQ that's triggered by some means other
>> > than a MSI write?
>> 
>> This is exactly the same thing that happens on x86, no?  Can you point
>> me to something in the SDM that says otherwise?
>> 
>> Regards,
>> 
>> Anthony Liguori
>> 
>> >  So to correctly model the hardware, vfio should do a
>> > msi_notify() that does a stl_le_phys that terminates at this IRQ
>> > remapper thing and in turn toggles a qemu_irq.  MSIMessage is only
>> > extraneous data if you want to skip over hardware blocks.
>> >
>> > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so
>> > that it can be implemented on POWER without this pci_bus_map_msi
>> > interface that seems very unique to POWER.  Thanks,
>> >
>> > Alex
>> >
Anthony Liguori June 24, 2013, 12:32 p.m. UTC | #25
Gleb Natapov <gleb@redhat.com> writes:

> On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote:
>> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
>> <alex.williamson@redhat.com> wrote:
>> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
>> >> On 06/21/2013 12:34 PM, Alex Williamson wrote:
>> >>
>> >>
>> >> I do not follow you, sorry. For x86, is it the MSI routing table which is
>> >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of
>> >> code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
>> >
>> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
>> >
>> > This writes directly to the interrupt block on the vCPU.  With KVM, the
>> > in-kernel APIC does the same write, where the pin to MSIMessage is set up
>> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.
>> 
>> What is this "interrupt block on the vCPU" you speak of?  I reviewed
> The FEE00000H address as seen from the PCI bus is a special address range (see
> 10.11.1 in the SDM).

Ack.

> Any write by a PCI device to that address range is
> interpreted as MSI. We do not model this correctly in QEMU yet since
> all devices, including vcpus, see exactly the same memory map.

This should be a per-device mapping, yes.  But I'm not sure that VCPUs
should even see anything.  I don't think a VCPU can generate an MSI
interrupt by writing to this location.

>> the SDM and see nothing in the APIC protocol or the brief description
>> of MSI as a PCI concept that would indicate anything except that the
>> PHB handles MSI writes and feeds them to the I/O APIC.
>> 
> I/O APIC? Did you mean the APIC? But even that will probably be incorrect.
> I'd say it translates the data to an APIC bus message. And with interrupt
> remapping there is more magic happening between MSI and the APIC bus.

I think the wording in the SDM allows either.

>> In fact, the wikipedia article on MSI has:
>> 
>> "A common misconception with Message Signaled Interrupts is that they
>> allow the device to send data to a processor as part of the interrupt.
>> The data that is sent as part of the write is used by the chipset to
>> determine which interrupt to trigger on which processor; it is not
>> available for the device to communicate additional information to the
>> interrupt handler."
>> 
> Not sure who claimed otherwise.

So to summarize:

1) MSI writes are intercepted by the PHB, which generates an appropriate
   IRQ.

2) The PHB has a tuple of (src device, address, data) plus whatever
   information it maintains to do the translation.

3) On Power, we can have multiple PHBs.

4) The kernel interface assumes a single flat table mapping (address,
   data) to interrupts.  We try to keep that table up-to-date in QEMU.

5) The reason the kernel has MSI info at all is to allow for IRQFDs to
   generate MSI interrupts.

Is there anything that prevents us from using IRQFDs corresponding to
the target of an MSI mapping and getting rid of the MSI info in the kernel?

It seems like the only sane way to actually support (2) and (3).

Regards,

Anthony Liguori
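A minimal sketch of the pattern the patch repeats at each MSI setup site
(vfio and virtio-pci), factored into one helper; pci_bus_map_msi and
kvm_irqchip_add_msi_route are the functions the patch uses, while the
helper name itself is hypothetical:

static int msi_virq_for(PCIBus *bus, MSIMessage msg)
{
    /* Direct map first: on spapr the PHB already knows the IRQ. */
    int virq = pci_bus_map_msi(bus, msg);

    if (virq < 0) {
        /* No bus hook: allocate a flat-table route, as x86 does today. */
        virq = kvm_irqchip_add_msi_route(kvm_state, msg);
    }
    return virq;  /* still < 0: fall back to userspace handling */
}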
Alexander Graf June 24, 2013, 12:37 p.m. UTC | #26
On 24.06.2013, at 14:32, Anthony Liguori wrote:

> Gleb Natapov <gleb@redhat.com> writes:
> 
>> On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote:
>>> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
>>> <alex.williamson@redhat.com> wrote:
>>>> On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
>>>>> On 06/21/2013 12:34 PM, Alex Williamson wrote:
>>>>> 
>>>>> 
>>>>> I do not follow you, sorry. For x86, is it the MSI routing table which is
>>>>> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of
>>>>> code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
>>>> 
>>>> vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
>>>> 
>>>> This writes directly to the interrupt block on the vCPU.  With KVM, the
>>>> in-kernel APIC does the same write, where the pin to MSIMessage is set up
>>>> by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.
>>> 
>>> What is this "interrupt block on the vCPU" you speak of?  I reviewed
>> The FEE00000H address as seen from the PCI bus is a special address range (see
>> 10.11.1 in the SDM).
> 
> Ack.
> 
>> Any write by a PCI device to that address range is
>> interpreted as MSI. We do not model this correctly in QEMU yet since
>> all devices, including vcpus, see exactly the same memory map.
> 
> This should be a per-device mapping, yes.  But I'm not sure that VCPUs
> should even see anything.  I don't think a VCPU can generate an MSI
> interrupt by writing to this location.
> 
>>> the SDM and see nothing in the APIC protocol or the brief description
>>> of MSI as a PCI concept that would indicate anything except that the
>>> PHB handles MSI writes and feeds them to the I/O APIC.
>>> 
>> I/O APIC? Did you mean the APIC? But even that will probably be incorrect.
>> I'd say it translates the data to an APIC bus message. And with interrupt
>> remapping there is more magic happening between MSI and the APIC bus.
> 
> I think the wording in the SDM allows either.
> 
>>> In fact, the wikipedia article on MSI has:
>>> 
>>> "A common misconception with Message Signaled Interrupts is that they
>>> allow the device to send data to a processor as part of the interrupt.
>>> The data that is sent as part of the write is used by the chipset to
>>> determine which interrupt to trigger on which processor; it is not
>>> available for the device to communicate additional information to the
>>> interrupt handler."
>>> 
>> Not sure who claimed otherwise.
> 
> So to summarize:
> 
> 1) MSI writes are intercepted by the PHB, which generates an appropriate
>   IRQ.
> 
> 2) The PHB has a tuple of (src device, address, data) plus whatever
>   information it maintains to do the translation.
> 
> 3) On Power, we can have multiple PHBs.
> 
> 4) The kernel interface assumes a single flat table mapping (address,
>   data) to interrupts.  We try to keep that table up-to-date in QEMU.
> 
> 5) The reason the kernel has MSI info at all is to allow for IRQFDs to
>   generate MSI interrupts.
> 
> Is there anything that prevents us from using IRQFDs corresponding to
> the target of an MSI mapping and getting rid of the MSI info in the kernel?

What would that interface look like? An MSI does not arrive at an I/O APIC pin, so we can't use the existing "give me an irqfd for this pin" command.


Alex
Gleb Natapov June 24, 2013, 12:39 p.m. UTC | #27
On Mon, Jun 24, 2013 at 07:24:03AM -0500, Anthony Liguori wrote:
> Alex Williamson <alex.williamson@redhat.com> writes:
> 
> > On Sun, 2013-06-23 at 16:58 -0500, Anthony Liguori wrote:
> >> Yeah, but none of this is Power specific...  so we can do the same for x86, no?
> >> 
> >> I'm still trying to wrap my head around why we need MSI knowledge at
> >> all in the kernel for x86.  I presume it's to fast-path irqfd when
> >> doing vhost?
> >
> > Or device assignment.  Any paths where we want to inject an MSI
> > interrupt without going through userspace.  Thanks,
> 
> With VFIO, we use irqfd; if we program the irqfd as the underlying IRQ
> that the MSI would route to, isn't that sufficient?
> 
Not sure I understand what you mean by IRQ here; there is no "IRQ pin"
on an x86 CPU with the APIC architecture (there is ExtINT, but it is not
relevant here). If by IRQ you mean "the information needed to inject an
interrupt through the APIC", then this is how things work now.

> That's more or less what Ben is proposing for Power...
> 
> Regards,
> 
> Anthony Liguori
> 
> >
> > Alex

--
			Gleb.
Gleb Natapov June 24, 2013, 1:06 p.m. UTC | #28
On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote:
> >> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson
> >> <alex.williamson@redhat.com> wrote:
> >> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote:
> >> >> On 06/21/2013 12:34 PM, Alex Williamson wrote:
> >> >>
> >> >>
> >> >> I do not follow you, sorry. For x86, is it the MSI routing table which is
> >> >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of
> >> >> code responds to msi_notify() in qemu-x86 and does qemu_irq_pulse()?
> >> >
> >> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data)
> >> >
> >> > This writes directly to the interrupt block on the vCPU.  With KVM, the
> >> > in-kernel APIC does the same write, where the pin to MSIMessage is set up
> >> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd.
> >> 
> >> What is this "interrupt block on the vCPU" you speak of?  I reviewed
> > The FEE00000H address as seen from the PCI bus is a special address range (see
> > 10.11.1 in the SDM).
> 
> Ack.
> 
> > Any write by a PCI device to that address range is
> > interpreted as MSI. We do not model this correctly in QEMU yet since
> > all devices, including vcpus, see exactly the same memory map.
> 
> This should be a per-device mapping, yes.  But I'm not sure that VCPUs
> should even see anything.  I don't think a VCPU can generate an MSI
> interrupt by writing to this location.
> 
No, and the lower 4k of this space is where the APIC is mapped as seen from the CPU.

> >> the SDM and see nothing in the APIC protocol or the brief description
> >> of MSI as a PCI concept that would indicate anything except that the
> >> PHB handles MSI writes and feeds them to the I/O APIC.
> >> 
> > I/O APIC? Did you mean the APIC? But even that will probably be incorrect.
> > I'd say it translates the data to an APIC bus message. And with interrupt
> > remapping there is more magic happening between MSI and the APIC bus.
> 
> I think the wording in the SDM allows either.
> 
The SDM says nothing about it, but since we are guessing anyway, my last guess
would be the I/O APIC.  The I/O APIC has a well defined role: it detects level/edge
interrupts on its input pins and sends a preconfigured APIC message to the APIC
bus if one is detected. In fact it's even mapped at a different address,
0FEC00000H. Of course, since the I/O APIC and the logic that maps MSI writes
to APIC bus messages are probably on the same chip anyway, you can call
this logic a part of the I/O APIC, but that is stretching it too much IMO :)

> >> In fact, the wikipedia article on MSI has:
> >> 
> >> "A common misconception with Message Signaled Interrupts is that they
> >> allow the device to send data to a processor as part of the interrupt.
> >> The data that is sent as part of the write is used by the chipset to
> >> determine which interrupt to trigger on which processor; it is not
> >> available for the device to communicate additional information to the
> >> interrupt handler."
> >> 
> > Not sure who claimed otherwise.
> 
> So to summarize:
> 
> 1) MSI writes are intercepted by the PHB, which generates an appropriate
>    IRQ.
> 
> 2) The PHB has a tuple of (src device, address, data) plus whatever
>    information it maintains to do the translation.
> 
> 3) On Power, we can have multiple PHBs.
> 
Looks like that means we need to put more information into the kernel,
not less.

> 4) The kernel interface assumes a single flat table mapping (address,
>    data) to interrupts.  We try to keep that table up-to-date in QEMU.
> 
> 5) The reason the kernel has MSI info at all is to allow for IRQFDs to
>    generate MSI interrupts.
> 
> Is there anything that prevents us from using IRQFDs corresponding to
> the target of an MSI mapping and getting rid of the MSI info in the kernel?
> 
Again, you assume that x86 has some pin that an MSI triggers. This is not
the case; address/data is the minimum needed to inject an interrupt
there (short of moving the APIC into userspace, since that is where the
"translation" happens).

> It seems like the only sane way to actually support (2) and (3).
> 
> Regards,
> 
> Anthony Liguori

--
			Gleb.
Anthony Liguori June 24, 2013, 1:34 p.m. UTC | #29
Gleb Natapov <gleb@redhat.com> writes:

> On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote:
>> Gleb Natapov <gleb@redhat.com> writes:
>> 
>> This should be a per-device mapping, yes.  But I'm not sure that VCPUs
>> should even see anything.  I don't think a VCPU can generate an MSI
>> interrupt by writing to this location.
>> 
> No, and the lower 4k of this space is where the APIC is mapped as seen from the CPU.
>> Is there anything that prevents us from using IRQFDs corresponding to
>> the target of an MSI mapping and getting rid of the MSI info in the kernel?
>> 
> Again, you assume that x86 has some pin that an MSI triggers. This is not
> the case; address/data is the minimum needed to inject an interrupt
> there (short of moving the APIC into userspace, since that is where the
> "translation" happens).

An APIC message contains:

1) Destination Mode
2) Delivery mode
3) Level
4) Trigger mode
5) Vector
6) Destination

Which is more or less what the MSI addr/data pair encodes.
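A minimal sketch of that encoding (field layout per section 10.11 of the
SDM; the struct and function names here are illustrative, not QEMU or
kernel code):

#include <stdint.h>

typedef struct {
    uint8_t dest;           /* address[19:12]: destination ID           */
    int     dest_mode;      /* address[2]: 0 = physical, 1 = logical    */
    uint8_t vector;         /* data[7:0]                                */
    int     delivery_mode;  /* data[10:8]: fixed, lowest-prio, NMI, ... */
    int     level;          /* data[14]: assert/deassert (level only)   */
    int     trigger_mode;   /* data[15]: 0 = edge, 1 = level            */
} ApicMessage;

static ApicMessage apic_message_from_msi(uint64_t addr, uint32_t data)
{
    ApicMessage m = {
        .dest          = (addr >> 12) & 0xff,
        .dest_mode     = (addr >> 2) & 1,
        .vector        = data & 0xff,
        .delivery_mode = (data >> 8) & 0x7,
        .level         = (data >> 14) & 1,
        .trigger_mode  = (data >> 15) & 1,
    };
    return m;
}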

But we can certainly have a userspace interface to inject such a message
into the LAPICs.  In fact, this is more or less what KVM_SIGNAL_MSI is
doing, except that it's called MSI and encodes things in an addr/data pair.

Such an interface would also allow for a QEMU implementation of an IO
APIC while still having the in-kernel LAPIC.

It would also allow QEMU to do per-device MSI decoding.

Isn't this more or less what Avi's previous proposal was around changing
the APIC interfaces to userspace?

Regards,

Anthony Liguori

>
>> It seems like the only sane way to actually support (2) and (3).
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> --
> 			Gleb.
Michael S. Tsirkin June 24, 2013, 1:41 p.m. UTC | #30
On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote:
> >> Gleb Natapov <gleb@redhat.com> writes:
> >> 
> >> This should be a per-device mapping, yes.  But I'm not sure that VCPUs
> >> should even see anything.  I don't think a VCPU can generate an MSI
> >> interrupt by writing to this location.
> >> 
> > No, and the lower 4k of this space is where the APIC is mapped as seen from the CPU.
> >> Is there anything that prevents us from using IRQFDs corresponding to
> >> the target of an MSI mapping and getting rid of the MSI info in the kernel?
> >> 
> > Again, you assume that x86 has some pin that an MSI triggers. This is not
> > the case; address/data is the minimum needed to inject an interrupt
> > there (short of moving the APIC into userspace, since that is where the
> > "translation" happens).
> 
> An APIC message contains:
> 
> 1) Destination Mode
> 2) Delivery mode
> 3) Level
> 4) Trigger mode
> 5) Vector
> 6) Destination
> 
> Which is more or less what the MSI addr/data pair encodes.
> 
> But we can certainly have a userspace interface to inject such a message
> into the LAPICs.  In fact, this is more or less what KVM_SIGNAL_MSI is
> doing, except that it's called MSI and encodes things in an addr/data pair.
> 
> Such an interface would also allow for a QEMU implementation of an IO
> APIC while still having the in-kernel LAPIC.
> 
> It would also allow QEMU to do per-device MSI decoding.
> 
> Isn't this more or less what Avi's previous proposal was around changing
> the APIC interfaces to userspace?
> 
> Regards,
> 
> Anthony Liguori

While that's not very elegant, I think we can use the existing
interface for this: just encode things in a fake
"msi message" in the format that kernel expects.

> >
> >> It seems like the only sane way to actually support (2) and (3).
> >> 
> >> Regards,
> >> 
> >> Anthony Liguori
> >
> > --
> > 			Gleb.
Anthony Liguori June 24, 2013, 2:31 p.m. UTC | #31
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote:
>> Gleb Natapov <gleb@redhat.com> writes:
>> 
>> Isn't this more or less what Avi's previous proposal was around changing
>> the APIC interfaces to userspace?
>> 
>> Regards,
>> 
>> Anthony Liguori
>
> While that's not very elegant, I think we can use the existing
> interface for this: just encode things in a fake
> "msi message" in the format that kernel expects.

This is, in fact, exactly what we do today.  The MSI interfaces aren't
for MSI.  They are for sending messages to the APIC bus.  What we should
do is:

#define KVM_SIGNAL_LAPIC	KVM_SIGNAL_MSI
#define KVM_IRQ_ROUTING_LAPIC	KVM_IRQ_ROUTING_MSI

And switch to using the new #defines in QEMU.  That would make it more
obvious where we need to refactor things.  We currently hard code
routing via a local APIC with MSI.  We need to change this into a bus
specific function that can choose the right interface.

I think Power is fine with just doing all routing through the irqchip
interface.  We may need other routing interfaces for other architectures
though.

Regards,

Anthony Liguori

>
>> >
>> >> It seems like the only sane way to actually support (2) and (3).
>> >> 
>> >> Regards,
>> >> 
>> >> Anthony Liguori
>> >
>> > --
>> > 			Gleb.
Alexander Graf June 24, 2013, 2:34 p.m. UTC | #32
On 24.06.2013, at 16:31, Anthony Liguori wrote:

> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
>> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote:
>>> Gleb Natapov <gleb@redhat.com> writes:
>>> 
>>> Isn't this more or less what Avi's previous proposal was around changing
>>> the APIC interfaces to userspace?
>>> 
>>> Regards,
>>> 
>>> Anthony Liguori
>> 
>> While that's not very elegant, I think we can use the existing
>> interface for this: just encode things in a fake
>> "msi message" in the format that kernel expects.
> 
> This is, in fact, exactly what we do today.  The MSI interfaces aren't
> for MSI.  They are for sending messages to the APIC bus.  What we should
> do is:
> 
> #define KVM_SIGNAL_LAPIC	KVM_SIGNAL_MSI
> #define KVM_IRQ_ROUTING_LAPIC	KVM_IRQ_ROUTING_MSI
> 
> And switch to using the new #defines in QEMU.  That would make it more
> obvious where we need to refactor things.  We currently hard code
> routing via a local APIC with MSI.  We need to change this into a bus
> specific function that can choose the right interface.
> 
> I think Power is fine with just doing all routing through the irqchip
> interface.  We may need other routing interfaces for other architectures
> though.

I'm not quite sure what problem exactly you're trying to solve here. The main users of MSI injection through irqfd are in the kernel and definitely want a fast path to inject those without any involvement of QEMU at all.


Alex
Anthony Liguori June 24, 2013, 3:17 p.m. UTC | #33
Alexander Graf <agraf@suse.de> writes:

> On 24.06.2013, at 16:31, Anthony Liguori wrote:
>
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>>> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote:
>>>> Gleb Natapov <gleb@redhat.com> writes:
>>>> 
>>>> Isn't this more or less what Avi's previous proposal was around changing
>>>> the APIC interfaces to userspace?
>>>> 
>>>> Regards,
>>>> 
>>>> Anthony Liguori
>>> 
>>> While that's not very elegant, I think we can use the existing
>>> interface for this: just encode things in a fake
>>> "msi message" in the format that kernel expects.
>> 
>> This is, in fact, exactly what we do today.  The MSI interfaces aren't
>> for MSI.  They are for sending messages to the APIC bus.  What we should
>> do is:
>> 
>> #define KVM_SIGNAL_LAPIC	KVM_SIGNAL_MSI
>> #define KVM_IRQ_ROUTING_LAPIC	KVM_IRQ_ROUTING_MSI
>> 
>> And switch to using the new #defines in QEMU.  That would make it more
>> obvious where we need to refactor things.  We currently hard code
>> routing via a local APIC with MSI.  We need to change this into a bus
>> specific function that can choose the right interface.
>> 
>> I think Power is fine with just doing all routing through the irqchip
>> interface.  We may need other routing interfaces for other architectures
>> though.
>
> I'm not quite sure what problem exactly you're trying to solve
> here. The main users of MSI injection through irqfd are in the kernel and
> definitely want a fast path to inject those without any involvement of
> QEMU at all.

Internally the communication between the IO APIC and LAPICs uses a
protocol that does not map to a simple "pin numbering".

Because we implement the IO APIC in the kernel, the kernel interface can
be pin numbering more or less.

But MSI bypasses the IO APIC (potentially at least), which means that we
need a userspace interface for sending APIC messages.

Unfortunately, when this was added to support MSI, this was called "MSI"
but it is most certainly not.

At the very least, you also need the bus state + the source device to do
proper MSI translation.  We get around this by assigning global MSI
addresses and only having a single PHB.

If you make those assumptions, then the APIC protocol looks a lot like
an MSI address/data pair.

Regards,

Anthony Liguori

>
>
> Alex
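One possible shape of the signature change suggested earlier in the thread
(a device parameter on kvm_irqchip_add_msi_route, so each architecture can
translate the source device plus address/data however its hardware does);
this is a sketch of the idea, not the actual QEMU API:

/* The PHB can be recovered from the source device inside the
 * per-architecture implementation, so no pci_bus_map_msi hook is
 * needed at the call sites. */
int kvm_irqchip_add_msi_route(KVMState *s, PCIDevice *dev, MSIMessage msg);

/* A caller such as vfio_msix_vector_do_use() would then pass the device: */
vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, &vdev->pdev, *msg) : -1;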
Gleb Natapov June 24, 2013, 4:35 p.m. UTC | #34
On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote:
> >> Gleb Natapov <gleb@redhat.com> writes:
> >> 
> >> This should be a per-device mapping, yes.  But I'm not sure that VCPUs
> >> should even see anything.  I don't think a VCPU can generate an MSI
> >> interrupt by writing to this location.
> >> 
> > No, and the lower 4k of this space is where the APIC is mapped as seen from the CPU.
> >> Is there anything that prevents us from using IRQFDs corresponding to
> >> the target of an MSI mapping and getting rid of the MSI info in the kernel?
> >> 
> > Again, you assume that x86 has some pin that an MSI triggers. This is not
> > the case; address/data is the minimum needed to inject an interrupt
> > there (short of moving the APIC into userspace, since that is where the
> > "translation" happens).
> 
> An APIC message contains:
> 
> 1) Destination Mode
> 2) Delivery mode
> 3) Level
> 4) Trigger mode
> 5) Vector
> 6) Destination
> 
> Which is more or less what the MSI addr/data pair encodes.
> 
Not if interrupt remapping is in use.

> But we can certainly have a userspace interface to inject such a message
> into the LAPICs.  In fact, this is more or less what KVM_SIGNAL_MSI is
> doing, except that it's called MSI and encodes things in an addr/data pair.
> 
Good that it does; otherwise it would have been broken when
interrupt remapping was implemented.

> Such an interface would also allow for a QEMU implementation of an IO
> APIC while still having the in-kernel LAPIC.
> 
Yes.

> It would also allow QEMU to do per-device MSI decoding.
> 
Why can't it be done now with the existing interfaces?

> Isn't this more or less what Avi's previous proposal was around changing
> the APIC interfaces to userspace?
> 
Avi was actually against adding KVM_SIGNAL_MSI initially, since it
duplicated the functionality we already had. I don't remember how Jan
managed to persuade him in the end :)

--
			Gleb.
Gleb Natapov June 24, 2013, 4:48 p.m. UTC | #35
On Mon, Jun 24, 2013 at 10:17:41AM -0500, Anthony Liguori wrote:
> Alexander Graf <agraf@suse.de> writes:
> 
> > On 24.06.2013, at 16:31, Anthony Liguori wrote:
> >
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >>> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote:
> >>>> Gleb Natapov <gleb@redhat.com> writes:
> >>>> 
> >>>> Isn't this more or less what Avi's previous proposal was around changing
> >>>> the APIC interfaces to userspace?
> >>>> 
> >>>> Regards,
> >>>> 
> >>>> Anthony Liguori
> >>> 
> >>> While that's not very elegant, I think we can use the existing
> >>> interface for this: just encode things in a fake
> >>> "msi message" in the format that kernel expects.
> >> 
> >> This is, in fact, exactly what we do today.  The MSI interfaces aren't
> >> for MSI.  They are for sending messages to the APIC bus.  What we should
> >> do is:
> >> 
> >> #define KVM_SIGNAL_LAPIC	KVM_SIGNAL_MSI
> >> #define KVM_IRQ_ROUTING_LAPIC	KVM_IRQ_ROUTING_MSI
> >> 
> >> And switch to using the new #defines in QEMU.  That would make it more
> >> obvious where we need to refactor things.  We currently hard code
> >> routing via a local APIC with MSI.  We need to change this into a bus
> >> specific function that can choose the right interface.
> >> 
> >> I think Power is fine with just doing all routing through the irqchip
> >> interface.  We may need other routing interfaces for other architectures
> >> though.
> >
> > I'm not quite sure what problem exactly you're trying to solve
> > here. The main users of MSI injection through irqfd are in the kernel and
> > definitely want a fast path to inject those without any involvement of
> > QEMU at all.
> 
> Internally the communication between the IO APIC and LAPICs uses a
> protocol that does not map to a simple "pin numbering".
> 
> Because we implement the IO APIC in the kernel, the kernel interface can
> be pin numbering more or less.
> 
> But MSI bypasses the IO APIC (potentially at least), which means that we
> need a userspace interface for sending APIC messages.
> 
> Unfortunately, when this was added to support MSI, this was called "MSI"
> but it is most certainly not.
Why not? MSI is when a PCI device writes something somewhere. This is
what the interface provides. Interpreting MSI data/address as an APIC
message is x86 specific and shouldn't even be compiled on other arches.

> 
> At the very least, you also need the bus state + the source device to do
> proper MSI translation.  We get around this by assigning global MSI
> addresses and only having a single PHB.
> 
It looks to me like you are arguing that each architecture needs to have
its own logic to deliver MSI interrupts, and that for some arches the
current interface does not provide enough data. I am not arguing with that.

> If you make those assumptions, then the APIC protocol looks a lot like
> an MSI address/data pair.
> 

--
			Gleb.
diff mbox

Patch

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 14aac04..2d9eef7 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -639,7 +639,11 @@  static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
      * Attempt to enable route through KVM irqchip,
      * default to userspace handling if unavailable.
      */
-    vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
+
+    vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1;
+    if (vector->virq < 0) {
+        vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
+    }
     if (vector->virq < 0 ||
         kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                        vector->virq) < 0) {
@@ -807,7 +811,10 @@  retry:
          * Attempt to enable route through KVM irqchip,
          * default to userspace handling if unavailable.
          */
-        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+        vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg);
+        if (vector->virq < 0) {
+            vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+        }
         if (vector->virq < 0 ||
             kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
                                            vector->virq) < 0) {
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index a976e46..a9875e9 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1254,6 +1254,19 @@  void pci_device_set_intx_routing_notifier(PCIDevice *dev,
     dev->intx_routing_notifier = notifier;
 }
 
+void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn)
+{
+    bus->map_msi = map_msi_fn;
+}
+
+int pci_bus_map_msi(PCIBus *bus, MSIMessage msg)
+{
+    if (bus->map_msi) {
+        return bus->map_msi(bus, msg);
+    }
+    return -1;
+}
+
 /*
  * PCI-to-PCI bridge specification
  * 9.1: Interrupt routing. Table 9-1
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 80408c9..9ef9a29 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -500,6 +500,18 @@  static void spapr_msi_write(void *opaque, hwaddr addr,
     qemu_irq_pulse(xics_get_qirq(spapr->icp, irq));
 }
 
+static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg)
+{
+    DeviceState *par = bus->qbus.parent;
+    sPAPRPHBState *sphb = (sPAPRPHBState *) par;
+    unsigned long addr = msg.address - sphb->msi_win_addr;
+    int ndev = addr >> 16;
+    int vec = ((addr & 0xFFFF) >> 2) | msg.data;
+    uint32_t irq = sphb->msi_table[ndev].irq + vec;
+
+    return (int)irq;
+}
+
 static const MemoryRegionOps spapr_msi_ops = {
     /* There is no .read as the read result is undefined by PCI spec */
     .read = NULL,
@@ -664,6 +676,7 @@  static int _spapr_phb_init(SysBusDevice *s)
 
         sphb->lsi_table[i].irq = irq;
     }
+    pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq);
 
     return 0;
 }
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index d309416..587f53e 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -472,6 +472,8 @@  static unsigned virtio_pci_get_features(DeviceState *d)
     return proxy->host_features;
 }
 
+extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg);
+
 static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
                                         unsigned int queue_no,
                                         unsigned int vector,
@@ -481,7 +483,10 @@  static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
     int ret;
 
     if (irqfd->users == 0) {
-        ret = kvm_irqchip_add_msi_route(kvm_state, msg);
+        ret = pci_bus_map_msi(proxy->pci_dev.bus, msg);
+        if (ret < 0) {
+            ret = kvm_irqchip_add_msi_route(kvm_state, msg);
+        }
         if (ret < 0) {
             return ret;
         }
@@ -609,14 +614,23 @@  static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
     VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no);
     EventNotifier *n = virtio_queue_get_guest_notifier(vq);
     VirtIOIRQFD *irqfd;
-    int ret = 0;
+    int ret = 0, tmp;
 
     if (proxy->vector_irqfd) {
         irqfd = &proxy->vector_irqfd[vector];
-        if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
-            ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
-            if (ret < 0) {
-                return ret;
+
+        tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg);
+        if (tmp >= 0) {
+            if (irqfd->virq != tmp) {
+                fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n",
+                        irqfd->virq, tmp);
+            }
+        } else {
+            if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
+                ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
+                if (ret < 0) {
+                    return ret;
+                }
             }
         }
     }
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 8797802..632739a 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -332,6 +332,7 @@  MemoryRegion *pci_address_space_io(PCIDevice *dev);
 typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
 typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
 typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
+typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg);
 
 typedef enum {
     PCI_HOTPLUG_DISABLED,
@@ -375,6 +376,9 @@  bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new);
 void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
 void pci_device_set_intx_routing_notifier(PCIDevice *dev,
                                           PCIINTxRoutingNotifier notifier);
+void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn);
+int pci_bus_map_msi(PCIBus *bus, MSIMessage msg);
+
 void pci_device_reset(PCIDevice *dev);
 void pci_bus_reset(PCIBus *bus);
 
diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index 66762f6..81efd2b 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -16,6 +16,7 @@  struct PCIBus {
     pci_set_irq_fn set_irq;
     pci_map_irq_fn map_irq;
     pci_route_irq_fn route_intx_to_irq;
+    pci_map_msi_fn map_msi;
     pci_hotplug_fn hotplug;
     DeviceState *hotplug_qdev;
     void *irq_opaque;