Message ID | 1371737338-25148-1-git-send-email-aik@ozlabs.ru |
---|---|
State | New |
Headers | show |
On Fri, Jun 21, 2013 at 12:08:58AM +1000, Alexey Kardashevskiy wrote: > At the moment QEMU creates a route for every MSI IRQ. > > Now we are about to add IRQFD support on PPC64-pseries platform. > pSeries already has in-kernel emulated interrupt controller with > 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ > mapping as a part of PAPR requirements for MSI/MSIX guests. > Specifically, the pSeries guest does not touch MSIMessage's at > all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source > rtas calls to do the mapping. > > Therefore we do not really need more routing than we got already. > The patch introduces the infrastructure to enable direct IRQ mapping. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > --- > > The patch is raw and ugly indeed, I made it only to demonstrate > the idea and see if it has right to live or not. > > For some reason which I do not really understand (limited GSI numbers?) > the existing code always adds routing and I do not see why we would need it. > > Thanks! > --- > hw/misc/vfio.c | 11 +++++++++-- > hw/pci/pci.c | 13 +++++++++++++ > hw/ppc/spapr_pci.c | 13 +++++++++++++ > hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > include/hw/pci/pci.h | 4 ++++ > include/hw/pci/pci_bus.h | 1 + > 6 files changed, 60 insertions(+), 8 deletions(-) > > diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > index 14aac04..2d9eef7 100644 > --- a/hw/misc/vfio.c > +++ b/hw/misc/vfio.c > @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > * Attempt to enable route through KVM irqchip, > * default to userspace handling if unavailable. > */ > - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > + > + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > + if (vector->virq < 0) { > + vector->virq = msg ? 
kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > + } > if (vector->virq < 0 || > kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > vector->virq) < 0) { > @@ -807,7 +811,10 @@ retry: > * Attempt to enable route through KVM irqchip, > * default to userspace handling if unavailable. > */ > - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > + if (vector->virq < 0) { > + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + } > if (vector->virq < 0 || > kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > vector->virq) < 0) { > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index a976e46..a9875e9 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > dev->intx_routing_notifier = notifier; > } > > +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > +{ > + bus->map_msi = map_msi_fn; > +} > + > +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > +{ > + if (bus->map_msi) { > + return bus->map_msi(bus, msg); > + } > + return -1; > +} > + > /* > * PCI-to-PCI bridge specification > * 9.1: Interrupt routing. Table 9-1 > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 80408c9..9ef9a29 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > } > > +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > +{ > + DeviceState *par = bus->qbus.parent; > + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > + unsigned long addr = msg.address - sphb->msi_win_addr; > + int ndev = addr >> 16; > + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > + uint32_t irq = sphb->msi_table[ndev].irq + vec; This array seems to be SPAPR_MSIX_MAX_DEVS in size, no? Won't this overflow if ndev is large? 
> + > + return (int)irq; > +} > + > static const MemoryRegionOps spapr_msi_ops = { > /* There is no .read as the read result is undefined by PCI spec */ > .read = NULL, > @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > > sphb->lsi_table[i].irq = irq; > } > + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > > return 0; > } > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > index d309416..587f53e 100644 > --- a/hw/virtio/virtio-pci.c > +++ b/hw/virtio/virtio-pci.c > @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > return proxy->host_features; > } > > +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > + > static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > unsigned int queue_no, > unsigned int vector, > @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > int ret; > > if (irqfd->users == 0) { > - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > + if (ret < 0) { > + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > + } > if (ret < 0) { > return ret; > } > @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > EventNotifier *n = virtio_queue_get_guest_notifier(vq); > VirtIOIRQFD *irqfd; > - int ret = 0; > + int ret = 0, tmp; > > if (proxy->vector_irqfd) { > irqfd = &proxy->vector_irqfd[vector]; > - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > - if (ret < 0) { > - return ret; > + > + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > + if (tmp >= 0) { > + if (irqfd->virq != tmp) { > + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > + irqfd->virq, tmp); > + } > + } else { > + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > + ret = 
kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > + if (ret < 0) { > + return ret; > + } > } > } > } > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > index 8797802..632739a 100644 > --- a/include/hw/pci/pci.h > +++ b/include/hw/pci/pci.h > @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > > typedef enum { > PCI_HOTPLUG_DISABLED, > @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > void pci_device_set_intx_routing_notifier(PCIDevice *dev, > PCIINTxRoutingNotifier notifier); > +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > + > void pci_device_reset(PCIDevice *dev); > void pci_bus_reset(PCIBus *bus); > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > index 66762f6..81efd2b 100644 > --- a/include/hw/pci/pci_bus.h > +++ b/include/hw/pci/pci_bus.h > @@ -16,6 +16,7 @@ struct PCIBus { > pci_set_irq_fn set_irq; > pci_map_irq_fn map_irq; > pci_route_irq_fn route_intx_to_irq; > + pci_map_msi_fn map_msi; > pci_hotplug_fn hotplug; > DeviceState *hotplug_qdev; > void *irq_opaque; > -- > 1.7.10.4
Alexey Kardashevskiy <aik@ozlabs.ru> writes: > At the moment QEMU creates a route for every MSI IRQ. > > Now we are about to add IRQFD support on PPC64-pseries platform. > pSeries already has in-kernel emulated interrupt controller with > 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ > mapping as a part of PAPR requirements for MSI/MSIX guests. > Specifically, the pSeries guest does not touch MSIMessage's at > all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source > rtas calls to do the mapping. > > Therefore we do not really need more routing than we got already. > The patch introduces the infrastructure to enable direct IRQ mapping. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > --- > > The patch is raw and ugly indeed, I made it only to demonstrate > the idea and see if it has right to live or not. > > For some reason which I do not really understand (limited GSI numbers?) > the existing code always adds routing and I do not see why we would need it. > > Thanks! > --- > hw/misc/vfio.c | 11 +++++++++-- > hw/pci/pci.c | 13 +++++++++++++ > hw/ppc/spapr_pci.c | 13 +++++++++++++ > hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > include/hw/pci/pci.h | 4 ++++ > include/hw/pci/pci_bus.h | 1 + > 6 files changed, 60 insertions(+), 8 deletions(-) > > diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > index 14aac04..2d9eef7 100644 > --- a/hw/misc/vfio.c > +++ b/hw/misc/vfio.c > @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > * Attempt to enable route through KVM irqchip, > * default to userspace handling if unavailable. > */ > - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > + > + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > + if (vector->virq < 0) { > + vector->virq = msg ? 
kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > + } > if (vector->virq < 0 || > kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > vector->virq) < 0) { > @@ -807,7 +811,10 @@ retry: > * Attempt to enable route through KVM irqchip, > * default to userspace handling if unavailable. > */ > - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > + if (vector->virq < 0) { > + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + } I don't understand why you're adding a pci level hook verses just having a kvmppc specific hook in the kvm_irqchip_add_msi_route function.. Regards, Anthony Liguori > if (vector->virq < 0 || > kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > vector->virq) < 0) { > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index a976e46..a9875e9 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > dev->intx_routing_notifier = notifier; > } > > +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > +{ > + bus->map_msi = map_msi_fn; > +} > + > +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > +{ > + if (bus->map_msi) { > + return bus->map_msi(bus, msg); > + } > + return -1; > +} > + > /* > * PCI-to-PCI bridge specification > * 9.1: Interrupt routing. 
Table 9-1 > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 80408c9..9ef9a29 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > } > > +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > +{ > + DeviceState *par = bus->qbus.parent; > + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > + unsigned long addr = msg.address - sphb->msi_win_addr; > + int ndev = addr >> 16; > + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > + uint32_t irq = sphb->msi_table[ndev].irq + vec; > + > + return (int)irq; > +} > + > static const MemoryRegionOps spapr_msi_ops = { > /* There is no .read as the read result is undefined by PCI spec */ > .read = NULL, > @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > > sphb->lsi_table[i].irq = irq; > } > + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > > return 0; > } > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > index d309416..587f53e 100644 > --- a/hw/virtio/virtio-pci.c > +++ b/hw/virtio/virtio-pci.c > @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > return proxy->host_features; > } > > +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > + > static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > unsigned int queue_no, > unsigned int vector, > @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > int ret; > > if (irqfd->users == 0) { > - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > + if (ret < 0) { > + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > + } > if (ret < 0) { > return ret; > } > @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > EventNotifier *n = virtio_queue_get_guest_notifier(vq); > VirtIOIRQFD *irqfd; > - int 
ret = 0; > + int ret = 0, tmp; > > if (proxy->vector_irqfd) { > irqfd = &proxy->vector_irqfd[vector]; > - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > - if (ret < 0) { > - return ret; > + > + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > + if (tmp >= 0) { > + if (irqfd->virq != tmp) { > + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > + irqfd->virq, tmp); > + } > + } else { > + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > + if (ret < 0) { > + return ret; > + } > } > } > } > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > index 8797802..632739a 100644 > --- a/include/hw/pci/pci.h > +++ b/include/hw/pci/pci.h > @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > > typedef enum { > PCI_HOTPLUG_DISABLED, > @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > void pci_device_set_intx_routing_notifier(PCIDevice *dev, > PCIINTxRoutingNotifier notifier); > +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > + > void pci_device_reset(PCIDevice *dev); > void pci_bus_reset(PCIBus *bus); > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > index 66762f6..81efd2b 100644 > --- a/include/hw/pci/pci_bus.h > +++ b/include/hw/pci/pci_bus.h > @@ -16,6 +16,7 @@ struct PCIBus { > pci_set_irq_fn set_irq; > pci_map_irq_fn map_irq; > pci_route_irq_fn route_intx_to_irq; > + pci_map_msi_fn 
map_msi; > pci_hotplug_fn hotplug; > DeviceState *hotplug_qdev; > void *irq_opaque; > -- > 1.7.10.4
On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote: > At the moment QEMU creates a route for every MSI IRQ. > > Now we are about to add IRQFD support on PPC64-pseries platform. > pSeries already has in-kernel emulated interrupt controller with > 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ > mapping as a part of PAPR requirements for MSI/MSIX guests. > Specifically, the pSeries guest does not touch MSIMessage's at > all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source > rtas calls to do the mapping. > > Therefore we do not really need more routing than we got already. > The patch introduces the infrastructure to enable direct IRQ mapping. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > > --- > > The patch is raw and ugly indeed, I made it only to demonstrate > the idea and see if it has right to live or not. > > For some reason which I do not really understand (limited GSI numbers?) > the existing code always adds routing and I do not see why we would need it. It's an IOAPIC, a pin gets toggled from the device and an MSI message gets written to the CPU. So the route allocates and programs the pin->MSI, then we tell it what notifier triggers that pin. On x86 the MSI vector doesn't encode any information about the device sending the MSI, here you seem to be able to figure out the device and vector space number from the address. Then your pin to MSI is effectively fixed. So why isn't this just your kvm_irqchip_add_msi_route function? On pSeries it's a lookup, on x86 it's an allocate and program. What does kvm_irqchip_add_msi_route do on pSeries today? 
Thanks, Alex > --- > hw/misc/vfio.c | 11 +++++++++-- > hw/pci/pci.c | 13 +++++++++++++ > hw/ppc/spapr_pci.c | 13 +++++++++++++ > hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > include/hw/pci/pci.h | 4 ++++ > include/hw/pci/pci_bus.h | 1 + > 6 files changed, 60 insertions(+), 8 deletions(-) > > diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > index 14aac04..2d9eef7 100644 > --- a/hw/misc/vfio.c > +++ b/hw/misc/vfio.c > @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > * Attempt to enable route through KVM irqchip, > * default to userspace handling if unavailable. > */ > - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > + > + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > + if (vector->virq < 0) { > + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > + } > if (vector->virq < 0 || > kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > vector->virq) < 0) { > @@ -807,7 +811,10 @@ retry: > * Attempt to enable route through KVM irqchip, > * default to userspace handling if unavailable. 
> */ > - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > + if (vector->virq < 0) { > + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > + } > if (vector->virq < 0 || > kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > vector->virq) < 0) { > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index a976e46..a9875e9 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > dev->intx_routing_notifier = notifier; > } > > +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > +{ > + bus->map_msi = map_msi_fn; > +} > + > +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > +{ > + if (bus->map_msi) { > + return bus->map_msi(bus, msg); > + } > + return -1; > +} > + > /* > * PCI-to-PCI bridge specification > * 9.1: Interrupt routing. Table 9-1 > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 80408c9..9ef9a29 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > } > > +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > +{ > + DeviceState *par = bus->qbus.parent; > + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > + unsigned long addr = msg.address - sphb->msi_win_addr; > + int ndev = addr >> 16; > + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > + uint32_t irq = sphb->msi_table[ndev].irq + vec; > + > + return (int)irq; > +} > + > static const MemoryRegionOps spapr_msi_ops = { > /* There is no .read as the read result is undefined by PCI spec */ > .read = NULL, > @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > > sphb->lsi_table[i].irq = irq; > } > + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > > return 0; > } > diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > index d309416..587f53e 100644 > --- 
a/hw/virtio/virtio-pci.c > +++ b/hw/virtio/virtio-pci.c > @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > return proxy->host_features; > } > > +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > + > static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > unsigned int queue_no, > unsigned int vector, > @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > int ret; > > if (irqfd->users == 0) { > - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > + if (ret < 0) { > + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > + } > if (ret < 0) { > return ret; > } > @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > EventNotifier *n = virtio_queue_get_guest_notifier(vq); > VirtIOIRQFD *irqfd; > - int ret = 0; > + int ret = 0, tmp; > > if (proxy->vector_irqfd) { > irqfd = &proxy->vector_irqfd[vector]; > - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > - if (ret < 0) { > - return ret; > + > + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > + if (tmp >= 0) { > + if (irqfd->virq != tmp) { > + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > + irqfd->virq, tmp); > + } > + } else { > + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > + if (ret < 0) { > + return ret; > + } > } > } > } > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > index 8797802..632739a 100644 > --- a/include/hw/pci/pci.h > +++ b/include/hw/pci/pci.h > @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > typedef 
PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > > typedef enum { > PCI_HOTPLUG_DISABLED, > @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > void pci_device_set_intx_routing_notifier(PCIDevice *dev, > PCIINTxRoutingNotifier notifier); > +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > + > void pci_device_reset(PCIDevice *dev); > void pci_bus_reset(PCIBus *bus); > > diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > index 66762f6..81efd2b 100644 > --- a/include/hw/pci/pci_bus.h > +++ b/include/hw/pci/pci_bus.h > @@ -16,6 +16,7 @@ struct PCIBus { > pci_set_irq_fn set_irq; > pci_map_irq_fn map_irq; > pci_route_irq_fn route_intx_to_irq; > + pci_map_msi_fn map_msi; > pci_hotplug_fn hotplug; > DeviceState *hotplug_qdev; > void *irq_opaque;
On 06/21/2013 02:37 AM, Anthony Liguori wrote: > Alexey Kardashevskiy <aik@ozlabs.ru> writes: > >> At the moment QEMU creates a route for every MSI IRQ. >> >> Now we are about to add IRQFD support on PPC64-pseries platform. >> pSeries already has in-kernel emulated interrupt controller with >> 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ >> mapping as a part of PAPR requirements for MSI/MSIX guests. >> Specifically, the pSeries guest does not touch MSIMessage's at >> all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source >> rtas calls to do the mapping. >> >> Therefore we do not really need more routing than we got already. >> The patch introduces the infrastructure to enable direct IRQ mapping. >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >> >> --- >> >> The patch is raw and ugly indeed, I made it only to demonstrate >> the idea and see if it has right to live or not. >> >> For some reason which I do not really understand (limited GSI numbers?) >> the existing code always adds routing and I do not see why we would need it. >> >> Thanks! >> --- >> hw/misc/vfio.c | 11 +++++++++-- >> hw/pci/pci.c | 13 +++++++++++++ >> hw/ppc/spapr_pci.c | 13 +++++++++++++ >> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ >> include/hw/pci/pci.h | 4 ++++ >> include/hw/pci/pci_bus.h | 1 + >> 6 files changed, 60 insertions(+), 8 deletions(-) >> >> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c >> index 14aac04..2d9eef7 100644 >> --- a/hw/misc/vfio.c >> +++ b/hw/misc/vfio.c >> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, >> * Attempt to enable route through KVM irqchip, >> * default to userspace handling if unavailable. >> */ >> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> + >> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; >> + if (vector->virq < 0) { >> + vector->virq = msg ? 
kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> + } >> if (vector->virq < 0 || >> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> vector->virq) < 0) { >> @@ -807,7 +811,10 @@ retry: >> * Attempt to enable route through KVM irqchip, >> * default to userspace handling if unavailable. >> */ >> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); >> + if (vector->virq < 0) { >> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> + } > > I don't understand why you're adding a pci level hook verses just having > a kvmppc specific hook in the kvm_irqchip_add_msi_route function.. Me neither :) I am just asking. The existing mapping code already exists in sPAPR PCI host bridge and it is not going anywhere else. And kvm_irqchip_add_msi_route does not have any link to a device or a bus so I'll have to walk through all PHBs in system and see if PHB's MSI window is the one from MSIMessage and convert MSIMessage to virq. Pretty easy and quick but still dirty hack, would it be better?
On 06/21/2013 02:51 AM, Alex Williamson wrote: > On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote: >> At the moment QEMU creates a route for every MSI IRQ. >> >> Now we are about to add IRQFD support on PPC64-pseries platform. >> pSeries already has in-kernel emulated interrupt controller with >> 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ >> mapping as a part of PAPR requirements for MSI/MSIX guests. >> Specifically, the pSeries guest does not touch MSIMessage's at >> all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source >> rtas calls to do the mapping. >> >> Therefore we do not really need more routing than we got already. >> The patch introduces the infrastructure to enable direct IRQ mapping. >> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >> >> --- >> >> The patch is raw and ugly indeed, I made it only to demonstrate >> the idea and see if it has right to live or not. >> >> For some reason which I do not really understand (limited GSI numbers?) >> the existing code always adds routing and I do not see why we would need it. > > It's an IOAPIC, a pin gets toggled from the device and an MSI message > gets written to the CPU. So the route allocates and programs the > pin->MSI, then we tell it what notifier triggers that pin. > On x86 the MSI vector doesn't encode any information about the device > sending the MSI, here you seem to be able to figure out the device and > vector space number from the address. Then your pin to MSI is > effectively fixed. So why isn't this just your > kvm_irqchip_add_msi_route function? On pSeries it's a lookup, on x86 > it's a allocate and program. > What does kvm_irqchip_add_msi_route do on > pSeries today? Thanks, As we just started implementing this thing, I commented it out for the starter. Once called, it destroys direct mapping in the host kernel and everything stops working as routing is not implemented (yet? ever?). 
My point here is that MSIMessage to irq translation is made on a PCI domain as PAPR (ppc64 server) spec says. The guest never uses MSIMessage, it is all in QEMU, the guest dynamically allocates MSI IRQs and it is up to a hypervisor (QEMU) to take care of actual MSIMessage for the device. And the only reason to use MSIMessage in QEMU for us is to support msi_notify()/msix_notify() in places like vfio_msi_interrupt(), I have added a MSI window for that long time ago which we do not need as much as we already have an irq number in vfio_msi_interrupt(), etc. > > Alex > >> --- >> hw/misc/vfio.c | 11 +++++++++-- >> hw/pci/pci.c | 13 +++++++++++++ >> hw/ppc/spapr_pci.c | 13 +++++++++++++ >> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ >> include/hw/pci/pci.h | 4 ++++ >> include/hw/pci/pci_bus.h | 1 + >> 6 files changed, 60 insertions(+), 8 deletions(-) >> >> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c >> index 14aac04..2d9eef7 100644 >> --- a/hw/misc/vfio.c >> +++ b/hw/misc/vfio.c >> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, >> * Attempt to enable route through KVM irqchip, >> * default to userspace handling if unavailable. >> */ >> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> + >> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; >> + if (vector->virq < 0) { >> + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> + } >> if (vector->virq < 0 || >> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> vector->virq) < 0) { >> @@ -807,7 +811,10 @@ retry: >> * Attempt to enable route through KVM irqchip, >> * default to userspace handling if unavailable. 
>> */ >> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); >> + if (vector->virq < 0) { >> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> + } >> if (vector->virq < 0 || >> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> vector->virq) < 0) { >> diff --git a/hw/pci/pci.c b/hw/pci/pci.c >> index a976e46..a9875e9 100644 >> --- a/hw/pci/pci.c >> +++ b/hw/pci/pci.c >> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, >> dev->intx_routing_notifier = notifier; >> } >> >> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) >> +{ >> + bus->map_msi = map_msi_fn; >> +} >> + >> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) >> +{ >> + if (bus->map_msi) { >> + return bus->map_msi(bus, msg); >> + } >> + return -1; >> +} >> + >> /* >> * PCI-to-PCI bridge specification >> * 9.1: Interrupt routing. Table 9-1 >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c >> index 80408c9..9ef9a29 100644 >> --- a/hw/ppc/spapr_pci.c >> +++ b/hw/ppc/spapr_pci.c >> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, >> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); >> } >> >> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) >> +{ >> + DeviceState *par = bus->qbus.parent; >> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; >> + unsigned long addr = msg.address - sphb->msi_win_addr; >> + int ndev = addr >> 16; >> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; >> + uint32_t irq = sphb->msi_table[ndev].irq + vec; >> + >> + return (int)irq; >> +} >> + >> static const MemoryRegionOps spapr_msi_ops = { >> /* There is no .read as the read result is undefined by PCI spec */ >> .read = NULL, >> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) >> >> sphb->lsi_table[i].irq = irq; >> } >> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); >> >> return 0; >> } >> diff --git a/hw/virtio/virtio-pci.c 
b/hw/virtio/virtio-pci.c >> index d309416..587f53e 100644 >> --- a/hw/virtio/virtio-pci.c >> +++ b/hw/virtio/virtio-pci.c >> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) >> return proxy->host_features; >> } >> >> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); >> + >> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >> unsigned int queue_no, >> unsigned int vector, >> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >> int ret; >> >> if (irqfd->users == 0) { >> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); >> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); >> + if (ret < 0) { >> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); >> + } >> if (ret < 0) { >> return ret; >> } >> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, >> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); >> EventNotifier *n = virtio_queue_get_guest_notifier(vq); >> VirtIOIRQFD *irqfd; >> - int ret = 0; >> + int ret = 0, tmp; >> >> if (proxy->vector_irqfd) { >> irqfd = &proxy->vector_irqfd[vector]; >> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >> - if (ret < 0) { >> - return ret; >> + >> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); >> + if (tmp >= 0) { >> + if (irqfd->virq != tmp) { >> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", >> + irqfd->virq, tmp); >> + } >> + } else { >> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >> + if (ret < 0) { >> + return ret; >> + } >> } >> } >> } >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h >> index 8797802..632739a 100644 >> --- a/include/hw/pci/pci.h >> +++ b/include/hw/pci/pci.h >> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); >> typedef void 
(*pci_set_irq_fn)(void *opaque, int irq_num, int level); >> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); >> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); >> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); >> >> typedef enum { >> PCI_HOTPLUG_DISABLED, >> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); >> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); >> void pci_device_set_intx_routing_notifier(PCIDevice *dev, >> PCIINTxRoutingNotifier notifier); >> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); >> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); >> + >> void pci_device_reset(PCIDevice *dev); >> void pci_bus_reset(PCIBus *bus); >> >> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h >> index 66762f6..81efd2b 100644 >> --- a/include/hw/pci/pci_bus.h >> +++ b/include/hw/pci/pci_bus.h >> @@ -16,6 +16,7 @@ struct PCIBus { >> pci_set_irq_fn set_irq; >> pci_map_irq_fn map_irq; >> pci_route_irq_fn route_intx_to_irq; >> + pci_map_msi_fn map_msi; >> pci_hotplug_fn hotplug; >> DeviceState *hotplug_qdev; >> void *irq_opaque; > > >
On Fri, 2013-06-21 at 11:56 +1000, Alexey Kardashevskiy wrote: > On 06/21/2013 02:51 AM, Alex Williamson wrote: > > On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote: > >> At the moment QEMU creates a route for every MSI IRQ. > >> > >> Now we are about to add IRQFD support on PPC64-pseries platform. > >> pSeries already has in-kernel emulated interrupt controller with > >> 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ > >> mapping as a part of PAPR requirements for MSI/MSIX guests. > >> Specifically, the pSeries guest does not touch MSIMessage's at > >> all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source > >> rtas calls to do the mapping. > >> > >> Therefore we do not really need more routing than we got already. > >> The patch introduces the infrastructure to enable direct IRQ mapping. > >> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >> > >> --- > >> > >> The patch is raw and ugly indeed, I made it only to demonstrate > >> the idea and see if it has right to live or not. > >> > >> For some reason which I do not really understand (limited GSI numbers?) > >> the existing code always adds routing and I do not see why we would need it. > > > > It's an IOAPIC, a pin gets toggled from the device and an MSI message > > gets written to the CPU. So the route allocates and programs the > > pin->MSI, then we tell it what notifier triggers that pin. > > > On x86 the MSI vector doesn't encode any information about the device > > sending the MSI, here you seem to be able to figure out the device and > > vector space number from the address. Then your pin to MSI is > > effectively fixed. So why isn't this just your > > kvm_irqchip_add_msi_route function? On pSeries it's a lookup, on x86 > > it's a allocate and program. > > What does kvm_irqchip_add_msi_route do on > > pSeries today? Thanks, > > > As we just started implementing this thing, I commented it out for the > starter. 
Once called, it destroys direct mapping in the host kernel and > everything stops working as routing is not implemented (yet? ever?). Yay, it's broken, you can rewrite it ;) > My point here is that MSIMessage to irq translation is made on a PCI domain > as PAPR (ppc64 server) spec says. The guest never uses MSIMessage, it is > all in QEMU, the guest dynamically allocates MSI IRQs and it is up to a > hypeviser (QEMU) to take care of actual MSIMessage for the device. MSIMessage is what the guest has programmed for the address/data fields, it's not just a QEMU invention. From the guest perspective, the device writes msg.data to msg.address to signal the CPU for the interrupt. > And the only reason to use MSIMessage in QEMU for us is to support > msi_notify()/msix_notify() in places like vfio_msi_interrupt(), I have > added a MSI window for that long time ago which we do not need as much as > we already have an irq number in vfio_msi_interrupt(), etc. It seems like you just have another layer of indirection via your msi_table. For x86 there's a layer of indirection via the virq virtual IOAPIC pin. Seems similar. Thanks, Alex > >> --- > >> hw/misc/vfio.c | 11 +++++++++-- > >> hw/pci/pci.c | 13 +++++++++++++ > >> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > >> include/hw/pci/pci.h | 4 ++++ > >> include/hw/pci/pci_bus.h | 1 + > >> 6 files changed, 60 insertions(+), 8 deletions(-) > >> > >> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > >> index 14aac04..2d9eef7 100644 > >> --- a/hw/misc/vfio.c > >> +++ b/hw/misc/vfio.c > >> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > >> * Attempt to enable route through KVM irqchip, > >> * default to userspace handling if unavailable. > >> */ > >> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >> + > >> + vector->virq = msg ? 
pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > >> + if (vector->virq < 0) { > >> + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >> + } > >> if (vector->virq < 0 || > >> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >> vector->virq) < 0) { > >> @@ -807,7 +811,10 @@ retry: > >> * Attempt to enable route through KVM irqchip, > >> * default to userspace handling if unavailable. > >> */ > >> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > >> + if (vector->virq < 0) { > >> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >> + } > >> if (vector->virq < 0 || > >> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >> vector->virq) < 0) { > >> diff --git a/hw/pci/pci.c b/hw/pci/pci.c > >> index a976e46..a9875e9 100644 > >> --- a/hw/pci/pci.c > >> +++ b/hw/pci/pci.c > >> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >> dev->intx_routing_notifier = notifier; > >> } > >> > >> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > >> +{ > >> + bus->map_msi = map_msi_fn; > >> +} > >> + > >> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > >> +{ > >> + if (bus->map_msi) { > >> + return bus->map_msi(bus, msg); > >> + } > >> + return -1; > >> +} > >> + > >> /* > >> * PCI-to-PCI bridge specification > >> * 9.1: Interrupt routing. 
Table 9-1 > >> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >> index 80408c9..9ef9a29 100644 > >> --- a/hw/ppc/spapr_pci.c > >> +++ b/hw/ppc/spapr_pci.c > >> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > >> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > >> } > >> > >> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > >> +{ > >> + DeviceState *par = bus->qbus.parent; > >> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > >> + unsigned long addr = msg.address - sphb->msi_win_addr; > >> + int ndev = addr >> 16; > >> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > >> + uint32_t irq = sphb->msi_table[ndev].irq + vec; > >> + > >> + return (int)irq; > >> +} > >> + > >> static const MemoryRegionOps spapr_msi_ops = { > >> /* There is no .read as the read result is undefined by PCI spec */ > >> .read = NULL, > >> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > >> > >> sphb->lsi_table[i].irq = irq; > >> } > >> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > >> > >> return 0; > >> } > >> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > >> index d309416..587f53e 100644 > >> --- a/hw/virtio/virtio-pci.c > >> +++ b/hw/virtio/virtio-pci.c > >> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > >> return proxy->host_features; > >> } > >> > >> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > >> + > >> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >> unsigned int queue_no, > >> unsigned int vector, > >> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >> int ret; > >> > >> if (irqfd->users == 0) { > >> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >> + if (ret < 0) { > >> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >> + } > >> if (ret < 0) { > >> return ret; > >> } > >> @@ -609,14 +614,23 @@ static int 
virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > >> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > >> EventNotifier *n = virtio_queue_get_guest_notifier(vq); > >> VirtIOIRQFD *irqfd; > >> - int ret = 0; > >> + int ret = 0, tmp; > >> > >> if (proxy->vector_irqfd) { > >> irqfd = &proxy->vector_irqfd[vector]; > >> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >> - if (ret < 0) { > >> - return ret; > >> + > >> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >> + if (tmp >= 0) { > >> + if (irqfd->virq != tmp) { > >> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > >> + irqfd->virq, tmp); > >> + } > >> + } else { > >> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >> + if (ret < 0) { > >> + return ret; > >> + } > >> } > >> } > >> } > >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > >> index 8797802..632739a 100644 > >> --- a/include/hw/pci/pci.h > >> +++ b/include/hw/pci/pci.h > >> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > >> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > >> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > >> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > >> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > >> > >> typedef enum { > >> PCI_HOTPLUG_DISABLED, > >> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > >> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > >> void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >> PCIINTxRoutingNotifier notifier); > >> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > >> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > >> + > >> void pci_device_reset(PCIDevice *dev); > >> void 
pci_bus_reset(PCIBus *bus); > >> > >> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > >> index 66762f6..81efd2b 100644 > >> --- a/include/hw/pci/pci_bus.h > >> +++ b/include/hw/pci/pci_bus.h > >> @@ -16,6 +16,7 @@ struct PCIBus { > >> pci_set_irq_fn set_irq; > >> pci_map_irq_fn map_irq; > >> pci_route_irq_fn route_intx_to_irq; > >> + pci_map_msi_fn map_msi; > >> pci_hotplug_fn hotplug; > >> DeviceState *hotplug_qdev; > >> void *irq_opaque; > > > > > > > >
On 06/21/2013 12:34 PM, Alex Williamson wrote: > On Fri, 2013-06-21 at 11:56 +1000, Alexey Kardashevskiy wrote: >> On 06/21/2013 02:51 AM, Alex Williamson wrote: >>> On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote: >>>> At the moment QEMU creates a route for every MSI IRQ. >>>> >>>> Now we are about to add IRQFD support on PPC64-pseries platform. >>>> pSeries already has in-kernel emulated interrupt controller with >>>> 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ >>>> mapping as a part of PAPR requirements for MSI/MSIX guests. >>>> Specifically, the pSeries guest does not touch MSIMessage's at >>>> all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source >>>> rtas calls to do the mapping. >>>> >>>> Therefore we do not really need more routing than we got already. >>>> The patch introduces the infrastructure to enable direct IRQ mapping. >>>> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> >>>> >>>> --- >>>> >>>> The patch is raw and ugly indeed, I made it only to demonstrate >>>> the idea and see if it has right to live or not. >>>> >>>> For some reason which I do not really understand (limited GSI numbers?) >>>> the existing code always adds routing and I do not see why we would need it. >>> >>> It's an IOAPIC, a pin gets toggled from the device and an MSI message >>> gets written to the CPU. So the route allocates and programs the >>> pin->MSI, then we tell it what notifier triggers that pin. >> >>> On x86 the MSI vector doesn't encode any information about the device >>> sending the MSI, here you seem to be able to figure out the device and >>> vector space number from the address. Then your pin to MSI is >>> effectively fixed. So why isn't this just your >>> kvm_irqchip_add_msi_route function? On pSeries it's a lookup, on x86 >>> it's a allocate and program. >>> What does kvm_irqchip_add_msi_route do on >>> pSeries today? 
Thanks, >> >> >> As we just started implementing this thing, I commented it out for the >> starter. Once called, it destroys direct mapping in the host kernel and >> everything stops working as routing is not implemented (yet? ever?). > > Yay, it's broken, you can rewrite it ;) There is nothing to rewrite, my understanding is that it is just not written yet and Paul would like not do that :) >> My point here is that MSIMessage to irq translation is made on a PCI domain >> as PAPR (ppc64 server) spec says. The guest never uses MSIMessage, it is >> all in QEMU, the guest dynamically allocates MSI IRQs and it is up to a >> hypeviser (QEMU) to take care of actual MSIMessage for the device. > > MSIMessage is what the guest has programmed for the address/data fields, > it's not just a QEMU invention. From the guest perspective, the device > writes msg.data to msg.address to signal the CPU for the interrupt. Our guests do never program MSIMessage. Hypercalls are used instead. >> And the only reason to use MSIMessage in QEMU for us is to support >> msi_notify()/msix_notify() in places like vfio_msi_interrupt(), I have >> added a MSI window for that long time ago which we do not need as much as >> we already have an irq number in vfio_msi_interrupt(), etc. > > It seems like you just have another layer of indirection via your > msi_table. For x86 there's a layer of indirection via the virq virtual > IOAPIC pin. Seems similar. Thanks, Do not follow you, sorry. For x86, is it that MSI routing table which is updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? 
> > Alex > >>>> --- >>>> hw/misc/vfio.c | 11 +++++++++-- >>>> hw/pci/pci.c | 13 +++++++++++++ >>>> hw/ppc/spapr_pci.c | 13 +++++++++++++ >>>> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ >>>> include/hw/pci/pci.h | 4 ++++ >>>> include/hw/pci/pci_bus.h | 1 + >>>> 6 files changed, 60 insertions(+), 8 deletions(-) >>>> >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c >>>> index 14aac04..2d9eef7 100644 >>>> --- a/hw/misc/vfio.c >>>> +++ b/hw/misc/vfio.c >>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, >>>> * Attempt to enable route through KVM irqchip, >>>> * default to userspace handling if unavailable. >>>> */ >>>> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >>>> + >>>> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; >>>> + if (vector->virq < 0) { >>>> + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >>>> + } >>>> if (vector->virq < 0 || >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >>>> vector->virq) < 0) { >>>> @@ -807,7 +811,10 @@ retry: >>>> * Attempt to enable route through KVM irqchip, >>>> * default to userspace handling if unavailable. 
>>>> */ >>>> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >>>> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); >>>> + if (vector->virq < 0) { >>>> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >>>> + } >>>> if (vector->virq < 0 || >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >>>> vector->virq) < 0) { >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c >>>> index a976e46..a9875e9 100644 >>>> --- a/hw/pci/pci.c >>>> +++ b/hw/pci/pci.c >>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, >>>> dev->intx_routing_notifier = notifier; >>>> } >>>> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) >>>> +{ >>>> + bus->map_msi = map_msi_fn; >>>> +} >>>> + >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) >>>> +{ >>>> + if (bus->map_msi) { >>>> + return bus->map_msi(bus, msg); >>>> + } >>>> + return -1; >>>> +} >>>> + >>>> /* >>>> * PCI-to-PCI bridge specification >>>> * 9.1: Interrupt routing. 
Table 9-1 >>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c >>>> index 80408c9..9ef9a29 100644 >>>> --- a/hw/ppc/spapr_pci.c >>>> +++ b/hw/ppc/spapr_pci.c >>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, >>>> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); >>>> } >>>> >>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) >>>> +{ >>>> + DeviceState *par = bus->qbus.parent; >>>> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; >>>> + unsigned long addr = msg.address - sphb->msi_win_addr; >>>> + int ndev = addr >> 16; >>>> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; >>>> + uint32_t irq = sphb->msi_table[ndev].irq + vec; >>>> + >>>> + return (int)irq; >>>> +} >>>> + >>>> static const MemoryRegionOps spapr_msi_ops = { >>>> /* There is no .read as the read result is undefined by PCI spec */ >>>> .read = NULL, >>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) >>>> >>>> sphb->lsi_table[i].irq = irq; >>>> } >>>> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); >>>> >>>> return 0; >>>> } >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c >>>> index d309416..587f53e 100644 >>>> --- a/hw/virtio/virtio-pci.c >>>> +++ b/hw/virtio/virtio-pci.c >>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) >>>> return proxy->host_features; >>>> } >>>> >>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); >>>> + >>>> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >>>> unsigned int queue_no, >>>> unsigned int vector, >>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >>>> int ret; >>>> >>>> if (irqfd->users == 0) { >>>> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); >>>> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); >>>> + if (ret < 0) { >>>> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); >>>> + } >>>> if (ret < 0) { >>>> return ret; >>>> } >>>> @@ -609,14 +614,23 @@ static int 
virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, >>>> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); >>>> EventNotifier *n = virtio_queue_get_guest_notifier(vq); >>>> VirtIOIRQFD *irqfd; >>>> - int ret = 0; >>>> + int ret = 0, tmp; >>>> >>>> if (proxy->vector_irqfd) { >>>> irqfd = &proxy->vector_irqfd[vector]; >>>> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >>>> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >>>> - if (ret < 0) { >>>> - return ret; >>>> + >>>> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); >>>> + if (tmp >= 0) { >>>> + if (irqfd->virq != tmp) { >>>> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", >>>> + irqfd->virq, tmp); >>>> + } >>>> + } else { >>>> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >>>> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >>>> + if (ret < 0) { >>>> + return ret; >>>> + } >>>> } >>>> } >>>> } >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h >>>> index 8797802..632739a 100644 >>>> --- a/include/hw/pci/pci.h >>>> +++ b/include/hw/pci/pci.h >>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); >>>> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); >>>> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); >>>> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); >>>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); >>>> >>>> typedef enum { >>>> PCI_HOTPLUG_DISABLED, >>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); >>>> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); >>>> void pci_device_set_intx_routing_notifier(PCIDevice *dev, >>>> PCIINTxRoutingNotifier notifier); >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); >>>> + >>>> void pci_device_reset(PCIDevice *dev); >>>> void 
pci_bus_reset(PCIBus *bus); >>>> >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h >>>> index 66762f6..81efd2b 100644 >>>> --- a/include/hw/pci/pci_bus.h >>>> +++ b/include/hw/pci/pci_bus.h >>>> @@ -16,6 +16,7 @@ struct PCIBus { >>>> pci_set_irq_fn set_irq; >>>> pci_map_irq_fn map_irq; >>>> pci_route_irq_fn route_intx_to_irq; >>>> + pci_map_msi_fn map_msi; >>>> pci_hotplug_fn hotplug; >>>> DeviceState *hotplug_qdev; >>>> void *irq_opaque; >>> >>> >>> >> >> > > >
On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: > On 06/21/2013 12:34 PM, Alex Williamson wrote: > > On Fri, 2013-06-21 at 11:56 +1000, Alexey Kardashevskiy wrote: > >> On 06/21/2013 02:51 AM, Alex Williamson wrote: > >>> On Fri, 2013-06-21 at 00:08 +1000, Alexey Kardashevskiy wrote: > >>>> At the moment QEMU creates a route for every MSI IRQ. > >>>> > >>>> Now we are about to add IRQFD support on PPC64-pseries platform. > >>>> pSeries already has in-kernel emulated interrupt controller with > >>>> 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ > >>>> mapping as a part of PAPR requirements for MSI/MSIX guests. > >>>> Specifically, the pSeries guest does not touch MSIMessage's at > >>>> all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source > >>>> rtas calls to do the mapping. > >>>> > >>>> Therefore we do not really need more routing than we got already. > >>>> The patch introduces the infrastructure to enable direct IRQ mapping. > >>>> > >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > >>>> > >>>> --- > >>>> > >>>> The patch is raw and ugly indeed, I made it only to demonstrate > >>>> the idea and see if it has right to live or not. > >>>> > >>>> For some reason which I do not really understand (limited GSI numbers?) > >>>> the existing code always adds routing and I do not see why we would need it. > >>> > >>> It's an IOAPIC, a pin gets toggled from the device and an MSI message > >>> gets written to the CPU. So the route allocates and programs the > >>> pin->MSI, then we tell it what notifier triggers that pin. > >> > >>> On x86 the MSI vector doesn't encode any information about the device > >>> sending the MSI, here you seem to be able to figure out the device and > >>> vector space number from the address. Then your pin to MSI is > >>> effectively fixed. So why isn't this just your > >>> kvm_irqchip_add_msi_route function? On pSeries it's a lookup, on x86 > >>> it's a allocate and program. 
> >>> What does kvm_irqchip_add_msi_route do on > >>> pSeries today? Thanks, > >> > >> > >> As we just started implementing this thing, I commented it out for the > >> starter. Once called, it destroys direct mapping in the host kernel and > >> everything stops working as routing is not implemented (yet? ever?). > > > > Yay, it's broken, you can rewrite it ;) > > > There is nothing to rewrite, my understanding is that it is just not > written yet and Paul would like not do that :) > > > >> My point here is that MSIMessage to irq translation is made on a PCI domain > >> as PAPR (ppc64 server) spec says. The guest never uses MSIMessage, it is > >> all in QEMU, the guest dynamically allocates MSI IRQs and it is up to a > >> hypeviser (QEMU) to take care of actual MSIMessage for the device. > > > > MSIMessage is what the guest has programmed for the address/data fields, > > it's not just a QEMU invention. From the guest perspective, the device > > writes msg.data to msg.address to signal the CPU for the interrupt. > > > Our guests do never program MSIMessage. Hypercalls are used instead. Of course POWER has a hypercall for that, but that's just abstracting the physical device, which does actually write msg.data to msg.address on the bus. > >> And the only reason to use MSIMessage in QEMU for us is to support > >> msi_notify()/msix_notify() in places like vfio_msi_interrupt(), I have > >> added a MSI window for that long time ago which we do not need as much as > >> we already have an irq number in vfio_msi_interrupt(), etc. > > > > It seems like you just have another layer of indirection via your > > msi_table. For x86 there's a layer of indirection via the virq virtual > > IOAPIC pin. Seems similar. Thanks, > > > Do not follow you, sorry. For x86, is it that MSI routing table which is > updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of > code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? 
vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) This writes directly to the interrupt block on the vCPU. With KVM, the in-kernel APIC does the same write, where the pin to MSIMessage is setup by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. Do I understand that on POWER the MSI from the device is intercepted at the PHB and converted to an IRQ that's triggered by some means other than a MSI write? So to correctly model the hardware, vfio should do a msi_notify() that does a stl_le_phys that terminates at this IRQ remapper thing and in turn toggles a qemu_irq. MSIMessage is only extraneous data if you want to skip over hardware blocks. Maybe you could add a device parameter to kvm_irqchip_add_msi_route so that it can be implemented on POWER without this pci_bus_map_msi interface that seems very unique to POWER. Thanks, Alex > >>>> --- > >>>> hw/misc/vfio.c | 11 +++++++++-- > >>>> hw/pci/pci.c | 13 +++++++++++++ > >>>> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >>>> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > >>>> include/hw/pci/pci.h | 4 ++++ > >>>> include/hw/pci/pci_bus.h | 1 + > >>>> 6 files changed, 60 insertions(+), 8 deletions(-) > >>>> > >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > >>>> index 14aac04..2d9eef7 100644 > >>>> --- a/hw/misc/vfio.c > >>>> +++ b/hw/misc/vfio.c > >>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > >>>> * Attempt to enable route through KVM irqchip, > >>>> * default to userspace handling if unavailable. > >>>> */ > >>>> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >>>> + > >>>> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > >>>> + if (vector->virq < 0) { > >>>> + vector->virq = msg ? 
kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >>>> + } > >>>> if (vector->virq < 0 || > >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >>>> vector->virq) < 0) { > >>>> @@ -807,7 +811,10 @@ retry: > >>>> * Attempt to enable route through KVM irqchip, > >>>> * default to userspace handling if unavailable. > >>>> */ > >>>> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >>>> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > >>>> + if (vector->virq < 0) { > >>>> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >>>> + } > >>>> if (vector->virq < 0 || > >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >>>> vector->virq) < 0) { > >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c > >>>> index a976e46..a9875e9 100644 > >>>> --- a/hw/pci/pci.c > >>>> +++ b/hw/pci/pci.c > >>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >>>> dev->intx_routing_notifier = notifier; > >>>> } > >>>> > >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > >>>> +{ > >>>> + bus->map_msi = map_msi_fn; > >>>> +} > >>>> + > >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > >>>> +{ > >>>> + if (bus->map_msi) { > >>>> + return bus->map_msi(bus, msg); > >>>> + } > >>>> + return -1; > >>>> +} > >>>> + > >>>> /* > >>>> * PCI-to-PCI bridge specification > >>>> * 9.1: Interrupt routing. 
Table 9-1 > >>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >>>> index 80408c9..9ef9a29 100644 > >>>> --- a/hw/ppc/spapr_pci.c > >>>> +++ b/hw/ppc/spapr_pci.c > >>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > >>>> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > >>>> } > >>>> > >>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > >>>> +{ > >>>> + DeviceState *par = bus->qbus.parent; > >>>> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > >>>> + unsigned long addr = msg.address - sphb->msi_win_addr; > >>>> + int ndev = addr >> 16; > >>>> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > >>>> + uint32_t irq = sphb->msi_table[ndev].irq + vec; > >>>> + > >>>> + return (int)irq; > >>>> +} > >>>> + > >>>> static const MemoryRegionOps spapr_msi_ops = { > >>>> /* There is no .read as the read result is undefined by PCI spec */ > >>>> .read = NULL, > >>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > >>>> > >>>> sphb->lsi_table[i].irq = irq; > >>>> } > >>>> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > >>>> > >>>> return 0; > >>>> } > >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > >>>> index d309416..587f53e 100644 > >>>> --- a/hw/virtio/virtio-pci.c > >>>> +++ b/hw/virtio/virtio-pci.c > >>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > >>>> return proxy->host_features; > >>>> } > >>>> > >>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > >>>> + > >>>> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >>>> unsigned int queue_no, > >>>> unsigned int vector, > >>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >>>> int ret; > >>>> > >>>> if (irqfd->users == 0) { > >>>> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >>>> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >>>> + if (ret < 0) { > >>>> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >>>> + } > 
>>>> if (ret < 0) { > >>>> return ret; > >>>> } > >>>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > >>>> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > >>>> EventNotifier *n = virtio_queue_get_guest_notifier(vq); > >>>> VirtIOIRQFD *irqfd; > >>>> - int ret = 0; > >>>> + int ret = 0, tmp; > >>>> > >>>> if (proxy->vector_irqfd) { > >>>> irqfd = &proxy->vector_irqfd[vector]; > >>>> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >>>> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >>>> - if (ret < 0) { > >>>> - return ret; > >>>> + > >>>> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >>>> + if (tmp >= 0) { > >>>> + if (irqfd->virq != tmp) { > >>>> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > >>>> + irqfd->virq, tmp); > >>>> + } > >>>> + } else { > >>>> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >>>> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >>>> + if (ret < 0) { > >>>> + return ret; > >>>> + } > >>>> } > >>>> } > >>>> } > >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > >>>> index 8797802..632739a 100644 > >>>> --- a/include/hw/pci/pci.h > >>>> +++ b/include/hw/pci/pci.h > >>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > >>>> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > >>>> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > >>>> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > >>>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > >>>> > >>>> typedef enum { > >>>> PCI_HOTPLUG_DISABLED, > >>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > >>>> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > >>>> void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >>>> PCIINTxRoutingNotifier notifier); > >>>> +void 
pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > >>>> + > >>>> void pci_device_reset(PCIDevice *dev); > >>>> void pci_bus_reset(PCIBus *bus); > >>>> > >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > >>>> index 66762f6..81efd2b 100644 > >>>> --- a/include/hw/pci/pci_bus.h > >>>> +++ b/include/hw/pci/pci_bus.h > >>>> @@ -16,6 +16,7 @@ struct PCIBus { > >>>> pci_set_irq_fn set_irq; > >>>> pci_map_irq_fn map_irq; > >>>> pci_route_irq_fn route_intx_to_irq; > >>>> + pci_map_msi_fn map_msi; > >>>> pci_hotplug_fn hotplug; > >>>> DeviceState *hotplug_qdev; > >>>> void *irq_opaque; > >>> > >>> > >>> > >> > >> > > > > > > > >
On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote: > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so > that it can be implemented on POWER without this pci_bus_map_msi > interface that seems very unique to POWER. Thanks, You mean unique to all non-x86 ? :-) I believe almost everybody eventually turns MSIs into "normal" interrupts... More often than not, the logic to do so is in the PCI Host Bridge. The whole concept of passing the message address/data across the user/kernel interface is an x86 crackpotery but as is the entire remapping/routing layer so ... :-) Ben.
On Fri, 2013-06-21 at 15:12 +1000, Benjamin Herrenschmidt wrote: > On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote: > > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so > > that it can be implemented on POWER without this pci_bus_map_msi > > interface that seems very unique to POWER. Thanks, > > You mean unique to all non-x86 ? :-) > > I believe almost everybody eventually turn MSIs into "normal" > interrupts... > > Most often than not, the logic to do so is in the PCI Host Bridge. > > The whole concept of passing the message address/data accross the > user/kernel interface is an x86 crackpotery but as is the entire > remapping/routing layer so ... :-) Regardless, this is exactly what kvm_irqchip_add_msi_route does. It says, here's an MSIMessage, give me an IRQ that sends that. In the x86 case, that means pick a free IRQ and program it to send that MSIMessage when we hit the irqfd. In the case of POWER it means lookup which IRQ gets fired by that MSIMessage and return it. In a non-accelerated QEMU case I'd think msi_notify() would write the MSIMessage to this IRQ remapper device and let it toggle the next qemu_irq down the line. If we ever add an IOMMU based IRQ remapper to the x86 model, we'd need something similar. Thanks, Alex
On Fri, 2013-06-21 at 00:03 -0600, Alex Williamson wrote: > On Fri, 2013-06-21 at 15:12 +1000, Benjamin Herrenschmidt wrote: > > On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote: > > > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so > > > that it can be implemented on POWER without this pci_bus_map_msi > > > interface that seems very unique to POWER. Thanks, > > > > You mean unique to all non-x86 ? :-) > > > > I believe almost everybody eventually turn MSIs into "normal" > > interrupts... > > > > Most often than not, the logic to do so is in the PCI Host Bridge. > > > > The whole concept of passing the message address/data accross the > > user/kernel interface is an x86 crackpotery but as is the entire > > remapping/routing layer so ... :-) > > Regardless, this is exactly what kvm_irqchip_add_msi_route does. It > says, here's an MSIMessage, give me an IRQ that sends that. Yes, and in our case, what happens is that the guest said to use "I want an MSI", we picked up an IRQ, and made up a message for it :-) The actual message address/data we use is a complete invention that only exists within qemu. So here we need to basically turn it back into an IRQ, which we might be able to do by ... just making the message (or part of the address) be the IRQ number or something like that. > In the x86 > case, that means pick a free IRQ and program it to send that MSIMessage > when we hit the irqfd. In the case of POWER it means lookup which IRQ > gets fired by that MSIMessage and return it. In a non-accelerated QEMU > case I'd think msi_notify() would write the MSIMessage to this IRQ > remapper device and let it toggle the next qemu_irq down the line. If > we ever add an IOMMU based IRQ remapper to the x86 model, we'd need > something similar. Thanks, Ben.
On 06/21/2013 04:12 PM, Benjamin Herrenschmidt wrote: > On Fri, 2013-06-21 at 00:03 -0600, Alex Williamson wrote: >> On Fri, 2013-06-21 at 15:12 +1000, Benjamin Herrenschmidt wrote: >>> On Thu, 2013-06-20 at 22:46 -0600, Alex Williamson wrote: >>>> Maybe you could add a device parameter to kvm_irqchip_add_msi_route so >>>> that it can be implemented on POWER without this pci_bus_map_msi >>>> interface that seems very unique to POWER. Thanks, >>> >>> You mean unique to all non-x86 ? :-) >>> >>> I believe almost everybody eventually turn MSIs into "normal" >>> interrupts... >>> >>> Most often than not, the logic to do so is in the PCI Host Bridge. >>> >>> The whole concept of passing the message address/data accross the >>> user/kernel interface is an x86 crackpotery but as is the entire >>> remapping/routing layer so ... :-) >> >> Regardless, this is exactly what kvm_irqchip_add_msi_route does. It >> says, here's an MSIMessage, give me an IRQ that sends that. > > Yes, and in our case, what happens is that the guest said to use "I want > an MSI", we picked up an IRQ, and made up a message for it :-) The > actual message address/data we use is a complete invention that only > exists within qemu. So here we need to basically turn it back into an > IRQ, which we might be able to do by ... just making the message (or > part of the address) be the IRQ number or something like that. > >> In the x86 >> case, that means pick a free IRQ and program it to send that MSIMessage >> when we hit the irqfd. In the case of POWER it means lookup which IRQ >> gets fired by that MSIMessage and return it. In a non-accelerated QEMU >> case I'd think msi_notify() would write the MSIMessage to this IRQ >> remapper device and let it toggle the next qemu_irq down the line. If >> we ever add an IOMMU based IRQ remapper to the x86 model, we'd need >> something similar. 
Thanks, Yes, I am changing IRQ to MSI message conversion (and some other things, such as replacing per-PHB MSI windows with one for the whole system) and I am going to post something tonight.
On Fri, Jun 21, 2013 at 09:51:20AM +1000, Alexey Kardashevskiy wrote: > And kvm_irqchip_add_msi_route does not have any link to a device or a bus > so I'll have to walk through all PHBs in system and see if PHB's MSI window > is the one from MSIMessage and convert MSIMessage to virq. Pretty easy and > quick but still dirty hack, would it be better? Yes I think that's fine really. Basically devices all speak MSIMessage as they should - this is what the PCI spec says. On all normal systems guests also speak MSIMessage so the API which uses these makes sense for kvm. Except, ppc architecture in its wisdom decided to hide this in firmware. QEMU does not have firmware so the translation has to be maintained in QEMU powerpc code.
On Sun, Jun 23, 2013 at 9:07 AM, Michael S. Tsirkin <mst@redhat.com> wrote: > On Fri, Jun 21, 2013 at 09:51:20AM +1000, Alexey Kardashevskiy wrote: >> And kvm_irqchip_add_msi_route does not have any link to a device or a bus >> so I'll have to walk through all PHBs in system and see if PHB's MSI window >> is the one from MSIMessage and convert MSIMessage to virq. Pretty easy and >> quick but still dirty hack, would it be better? > > Yes I think that's fine really. > > Basically devices all speak MSIMessage as they should - > this is what the PCI spec says. > On all normal systems guests also speak MSIMessage so > the API which uses these makes sense for kvm. There is no difference between Power and x86 in how they handle MSI. MSI is not a concept in the APIC or processor. MSI interrupts are handled by the PHB and delivered through the I/O APIC using the information in the address/data. I've always assumed that the kernel knows about MSI just because PCI passthrough needs it for whatever reason. It's not part of the APIC complex and it is weird to have MSI decoding information in the kernel. > Except, ppc architecture in its wisdom decided to hide this > in firmware. QEMU does not have firmware so the translation > has to be maintained in QEMU powerpc code. To be clear: instead of writing to an address with a load/store, Power has a hypercall to do this. The hypercall is hidden behind a firmware abstract layer (RTAS) but it's still just a hypercall. It uses a hypercall simply because Power was designed to not require MMIO trapping. It's much like Xen PV in that regard. It doesn't make a lot of sense to have Power do MSI decoding in userspace while x86 does it in the kernel. I assume the kernel doesn't have enough information to do the translation on Power? Regards, Anthony Liguori > -- > MST >
On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson <alex.williamson@redhat.com> wrote: > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: >> On 06/21/2013 12:34 PM, Alex Williamson wrote: >> >> >> Do not follow you, sorry. For x86, is it that MSI routing table which is >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of >> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? > > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) > > This writes directly to the interrupt block on the vCPU. With KVM, the > in-kernel APIC does the same write, where the pin to MSIMessage is setup > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. What is this "interrupt block on the vCPU" you speak of? I reviewed the SDM and see nothing in the APIC protocol or the brief description of MSI as a PCI concept that would indicate anything except that the PHB handles MSI writes and feeds them to the I/O APIC. In fact, the wikipedia article on MSI has: "A common misconception with Message Signaled Interrupts is that they allow the device to send data to a processor as part of the interrupt. The data that is sent as part of the write is used by the chipset to determine which interrupt to trigger on which processor; it is not available for the device to communicate additional information to the interrupt handler." > Do I understand that on POWER the MSI from the device is intercepted at > the PHB and converted to an IRQ that's triggered by some means other > than a MSI write? This is exactly the same thing that happens on x86, no? Can you point me to something in the SDM that says otherwise? Regards, Anthony Liguori > So to correctly model the hardware, vfio should do a > msi_notify() that does a stl_le_phys that terminates at this IRQ > remapper thing and in turn toggles a qemu_irq. MSIMessage is only > extraneous data if you want to skip over hardware blocks. 
> > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so > that it can be implemented on POWER without this pci_bus_map_msi > interface that seems very unique to POWER. Thanks, > > Alex > >> >>>> --- >> >>>> hw/misc/vfio.c | 11 +++++++++-- >> >>>> hw/pci/pci.c | 13 +++++++++++++ >> >>>> hw/ppc/spapr_pci.c | 13 +++++++++++++ >> >>>> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ >> >>>> include/hw/pci/pci.h | 4 ++++ >> >>>> include/hw/pci/pci_bus.h | 1 + >> >>>> 6 files changed, 60 insertions(+), 8 deletions(-) >> >>>> >> >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c >> >>>> index 14aac04..2d9eef7 100644 >> >>>> --- a/hw/misc/vfio.c >> >>>> +++ b/hw/misc/vfio.c >> >>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, >> >>>> * Attempt to enable route through KVM irqchip, >> >>>> * default to userspace handling if unavailable. >> >>>> */ >> >>>> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> >>>> + >> >>>> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; >> >>>> + if (vector->virq < 0) { >> >>>> + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> >>>> + } >> >>>> if (vector->virq < 0 || >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> >>>> vector->virq) < 0) { >> >>>> @@ -807,7 +811,10 @@ retry: >> >>>> * Attempt to enable route through KVM irqchip, >> >>>> * default to userspace handling if unavailable. 
>> >>>> */ >> >>>> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> >>>> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); >> >>>> + if (vector->virq < 0) { >> >>>> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> >>>> + } >> >>>> if (vector->virq < 0 || >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> >>>> vector->virq) < 0) { >> >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c >> >>>> index a976e46..a9875e9 100644 >> >>>> --- a/hw/pci/pci.c >> >>>> +++ b/hw/pci/pci.c >> >>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, >> >>>> dev->intx_routing_notifier = notifier; >> >>>> } >> >>>> >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) >> >>>> +{ >> >>>> + bus->map_msi = map_msi_fn; >> >>>> +} >> >>>> + >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) >> >>>> +{ >> >>>> + if (bus->map_msi) { >> >>>> + return bus->map_msi(bus, msg); >> >>>> + } >> >>>> + return -1; >> >>>> +} >> >>>> + >> >>>> /* >> >>>> * PCI-to-PCI bridge specification >> >>>> * 9.1: Interrupt routing. 
Table 9-1 >> >>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c >> >>>> index 80408c9..9ef9a29 100644 >> >>>> --- a/hw/ppc/spapr_pci.c >> >>>> +++ b/hw/ppc/spapr_pci.c >> >>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, >> >>>> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); >> >>>> } >> >>>> >> >>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) >> >>>> +{ >> >>>> + DeviceState *par = bus->qbus.parent; >> >>>> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; >> >>>> + unsigned long addr = msg.address - sphb->msi_win_addr; >> >>>> + int ndev = addr >> 16; >> >>>> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; >> >>>> + uint32_t irq = sphb->msi_table[ndev].irq + vec; >> >>>> + >> >>>> + return (int)irq; >> >>>> +} >> >>>> + >> >>>> static const MemoryRegionOps spapr_msi_ops = { >> >>>> /* There is no .read as the read result is undefined by PCI spec */ >> >>>> .read = NULL, >> >>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) >> >>>> >> >>>> sphb->lsi_table[i].irq = irq; >> >>>> } >> >>>> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); >> >>>> >> >>>> return 0; >> >>>> } >> >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c >> >>>> index d309416..587f53e 100644 >> >>>> --- a/hw/virtio/virtio-pci.c >> >>>> +++ b/hw/virtio/virtio-pci.c >> >>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) >> >>>> return proxy->host_features; >> >>>> } >> >>>> >> >>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); >> >>>> + >> >>>> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >> >>>> unsigned int queue_no, >> >>>> unsigned int vector, >> >>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >> >>>> int ret; >> >>>> >> >>>> if (irqfd->users == 0) { >> >>>> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); >> >>>> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); >> >>>> + if (ret < 0) { >> >>>> + ret = 
kvm_irqchip_add_msi_route(kvm_state, msg); >> >>>> + } >> >>>> if (ret < 0) { >> >>>> return ret; >> >>>> } >> >>>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, >> >>>> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); >> >>>> EventNotifier *n = virtio_queue_get_guest_notifier(vq); >> >>>> VirtIOIRQFD *irqfd; >> >>>> - int ret = 0; >> >>>> + int ret = 0, tmp; >> >>>> >> >>>> if (proxy->vector_irqfd) { >> >>>> irqfd = &proxy->vector_irqfd[vector]; >> >>>> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >> >>>> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >> >>>> - if (ret < 0) { >> >>>> - return ret; >> >>>> + >> >>>> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); >> >>>> + if (tmp >= 0) { >> >>>> + if (irqfd->virq != tmp) { >> >>>> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", >> >>>> + irqfd->virq, tmp); >> >>>> + } >> >>>> + } else { >> >>>> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >> >>>> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >> >>>> + if (ret < 0) { >> >>>> + return ret; >> >>>> + } >> >>>> } >> >>>> } >> >>>> } >> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h >> >>>> index 8797802..632739a 100644 >> >>>> --- a/include/hw/pci/pci.h >> >>>> +++ b/include/hw/pci/pci.h >> >>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); >> >>>> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); >> >>>> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); >> >>>> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); >> >>>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); >> >>>> >> >>>> typedef enum { >> >>>> PCI_HOTPLUG_DISABLED, >> >>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); >> >>>> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); >> >>>> void 
pci_device_set_intx_routing_notifier(PCIDevice *dev, >> >>>> PCIINTxRoutingNotifier notifier); >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); >> >>>> + >> >>>> void pci_device_reset(PCIDevice *dev); >> >>>> void pci_bus_reset(PCIBus *bus); >> >>>> >> >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h >> >>>> index 66762f6..81efd2b 100644 >> >>>> --- a/include/hw/pci/pci_bus.h >> >>>> +++ b/include/hw/pci/pci_bus.h >> >>>> @@ -16,6 +16,7 @@ struct PCIBus { >> >>>> pci_set_irq_fn set_irq; >> >>>> pci_map_irq_fn map_irq; >> >>>> pci_route_irq_fn route_intx_to_irq; >> >>>> + pci_map_msi_fn map_msi; >> >>>> pci_hotplug_fn hotplug; >> >>>> DeviceState *hotplug_qdev; >> >>>> void *irq_opaque; >> >>> >> >>> >> >>> >> >> >> >> >> > >> > >> > >> >> > > > >
On Sun, 2013-06-23 at 17:07 +0300, Michael S. Tsirkin wrote: > Yes I think that's fine really. > > Basically devices all speak MSIMessage as they should - > this is what the PCI spec says. > On all normal systems guests also speak MSIMessage so > the API which uses these makes sense for kvm. > Except, ppc architecture in its wisdom decided to hide this > in firmware. QEMU does not have firmware so the translation > has to be maintained in QEMU powerpc code. It is still not enough. There is nothing that says that an MSI address/message is unique. On pseries (among others I'm sure), this is a per-host-bridge concept since the MSIs are decoded by the PHB and turned into standard fabric interrupts there. So at least the bus needs to be passed as well. Cheers, Ben.
On Sun, 2013-06-23 at 10:02 -0500, Anthony Liguori wrote: > It doesn't make a lot of sense to have Power do MSI decoding in > userspace while x86 does it in the kernel. I assume the kernel > doesn't have enough information to do the translation on Power? Well, it would need the host bridge which it doesn't have and it would need a concept of mapping that to interrupt numbers which we don't want in the kernel. We can keep the kernel side a LOT simpler (completely avoiding the whole route bloatware) by sticking to our "simple" direct map kernel stuff and doing the mapping from address/data to interrupts in qemu, so we'll stick to that. Cheers, Ben.
On Sun, Jun 23, 2013 at 4:39 PM, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote: > On Sun, 2013-06-23 at 10:02 -0500, Anthony Liguori wrote: >> It doesn't make a lot of sense to have Power do MSI decoding in >> userspace while x86 does it in the kernel. I assume the kernel >> doesn't have enough information to do the translation on Power? > > Well, it would need the host bridge which it doesn't have and it > would need a concept of mapping that to interrupt numbers which > we don't want in the kernel. > > We can keep the kernel side a LOT simpler (completely avoiding > the whole route bloatware) by sticking to our "simple" direct map kernel > stuff and doing the mapping from address/data to interrupts in qemu, so > we'll stick to that. Yeah, but none of this is Power specific... so we can do the same for x86, no? I'm still trying to wrap my head around why we need MSI knowledge at all in the kernel for x86. I presume it's to fast-path irqfd when doing vhost? Regards, Anthony Liguori > > Cheers, > Ben. > >
On Sun, 2013-06-23 at 10:06 -0500, Anthony Liguori wrote: > On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson > <alex.williamson@redhat.com> wrote: > > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: > >> On 06/21/2013 12:34 PM, Alex Williamson wrote: > >> > >> > >> Do not follow you, sorry. For x86, is it that MSI routing table which is > >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of > >> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? > > > > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) > > > > This writes directly to the interrupt block on the vCPU. With KVM, the > > in-kernel APIC does the same write, where the pin to MSIMessage is setup > > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. > > What is this "interrupt block on the vCPU" you speak of? I reviewed > the SDM and see nothing in the APIC protocol or the brief description > of MSI as a PCI concept that would indicate anything except that the > PHB handles MSI writes and feeds them to the I/O APIC. In all likelihood I'm recalling ia64 details and trying to apply them to x86. Does the MSIMessage not actually get written to the LAPIC on the CPU? Thanks, Alex > In fact, the wikipedia article on MSI has: > > "A common misconception with Message Signaled Interrupts is that they > allow the device to send data to a processor as part of the interrupt. > The data that is sent as part of the write is used by the chipset to > determine which interrupt to trigger on which processor; it is not > available for the device to communicate additional information to the > interrupt handler." > > > Do I understand that on POWER the MSI from the device is intercepted at > > the PHB and converted to an IRQ that's triggered by some means other > > than a MSI write? > > This is exactly the same thing that happens on x86, no? Can you point > me to something in the SDM that says otherwise? 
> > Regards, > > Anthony Liguori > > > So to correctly model the hardware, vfio should do a > > msi_notify() that does a stl_le_phys that terminates at this IRQ > > remapper thing and in turn toggles a qemu_irq. MSIMessage is only > > extraneous data if you want to skip over hardware blocks. > > > > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so > > that it can be implemented on POWER without this pci_bus_map_msi > > interface that seems very unique to POWER. Thanks, > > > > Alex > > > >> >>>> --- > >> >>>> hw/misc/vfio.c | 11 +++++++++-- > >> >>>> hw/pci/pci.c | 13 +++++++++++++ > >> >>>> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >> >>>> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > >> >>>> include/hw/pci/pci.h | 4 ++++ > >> >>>> include/hw/pci/pci_bus.h | 1 + > >> >>>> 6 files changed, 60 insertions(+), 8 deletions(-) > >> >>>> > >> >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > >> >>>> index 14aac04..2d9eef7 100644 > >> >>>> --- a/hw/misc/vfio.c > >> >>>> +++ b/hw/misc/vfio.c > >> >>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > >> >>>> * Attempt to enable route through KVM irqchip, > >> >>>> * default to userspace handling if unavailable. > >> >>>> */ > >> >>>> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >> >>>> + > >> >>>> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > >> >>>> + if (vector->virq < 0) { > >> >>>> + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >> >>>> + } > >> >>>> if (vector->virq < 0 || > >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >> >>>> vector->virq) < 0) { > >> >>>> @@ -807,7 +811,10 @@ retry: > >> >>>> * Attempt to enable route through KVM irqchip, > >> >>>> * default to userspace handling if unavailable. 
> >> >>>> */ > >> >>>> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >> >>>> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > >> >>>> + if (vector->virq < 0) { > >> >>>> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >> >>>> + } > >> >>>> if (vector->virq < 0 || > >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >> >>>> vector->virq) < 0) { > >> >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c > >> >>>> index a976e46..a9875e9 100644 > >> >>>> --- a/hw/pci/pci.c > >> >>>> +++ b/hw/pci/pci.c > >> >>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >> >>>> dev->intx_routing_notifier = notifier; > >> >>>> } > >> >>>> > >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > >> >>>> +{ > >> >>>> + bus->map_msi = map_msi_fn; > >> >>>> +} > >> >>>> + > >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > >> >>>> +{ > >> >>>> + if (bus->map_msi) { > >> >>>> + return bus->map_msi(bus, msg); > >> >>>> + } > >> >>>> + return -1; > >> >>>> +} > >> >>>> + > >> >>>> /* > >> >>>> * PCI-to-PCI bridge specification > >> >>>> * 9.1: Interrupt routing. 
Table 9-1 > >> >>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >> >>>> index 80408c9..9ef9a29 100644 > >> >>>> --- a/hw/ppc/spapr_pci.c > >> >>>> +++ b/hw/ppc/spapr_pci.c > >> >>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > >> >>>> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > >> >>>> } > >> >>>> > >> >>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > >> >>>> +{ > >> >>>> + DeviceState *par = bus->qbus.parent; > >> >>>> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > >> >>>> + unsigned long addr = msg.address - sphb->msi_win_addr; > >> >>>> + int ndev = addr >> 16; > >> >>>> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > >> >>>> + uint32_t irq = sphb->msi_table[ndev].irq + vec; > >> >>>> + > >> >>>> + return (int)irq; > >> >>>> +} > >> >>>> + > >> >>>> static const MemoryRegionOps spapr_msi_ops = { > >> >>>> /* There is no .read as the read result is undefined by PCI spec */ > >> >>>> .read = NULL, > >> >>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > >> >>>> > >> >>>> sphb->lsi_table[i].irq = irq; > >> >>>> } > >> >>>> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > >> >>>> > >> >>>> return 0; > >> >>>> } > >> >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > >> >>>> index d309416..587f53e 100644 > >> >>>> --- a/hw/virtio/virtio-pci.c > >> >>>> +++ b/hw/virtio/virtio-pci.c > >> >>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > >> >>>> return proxy->host_features; > >> >>>> } > >> >>>> > >> >>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > >> >>>> + > >> >>>> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >> >>>> unsigned int queue_no, > >> >>>> unsigned int vector, > >> >>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >> >>>> int ret; > >> >>>> > >> >>>> if (irqfd->users == 0) { > >> >>>> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >> 
>>>> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >> >>>> + if (ret < 0) { > >> >>>> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >> >>>> + } > >> >>>> if (ret < 0) { > >> >>>> return ret; > >> >>>> } > >> >>>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > >> >>>> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > >> >>>> EventNotifier *n = virtio_queue_get_guest_notifier(vq); > >> >>>> VirtIOIRQFD *irqfd; > >> >>>> - int ret = 0; > >> >>>> + int ret = 0, tmp; > >> >>>> > >> >>>> if (proxy->vector_irqfd) { > >> >>>> irqfd = &proxy->vector_irqfd[vector]; > >> >>>> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >> >>>> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >> >>>> - if (ret < 0) { > >> >>>> - return ret; > >> >>>> + > >> >>>> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >> >>>> + if (tmp >= 0) { > >> >>>> + if (irqfd->virq != tmp) { > >> >>>> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > >> >>>> + irqfd->virq, tmp); > >> >>>> + } > >> >>>> + } else { > >> >>>> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >> >>>> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >> >>>> + if (ret < 0) { > >> >>>> + return ret; > >> >>>> + } > >> >>>> } > >> >>>> } > >> >>>> } > >> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > >> >>>> index 8797802..632739a 100644 > >> >>>> --- a/include/hw/pci/pci.h > >> >>>> +++ b/include/hw/pci/pci.h > >> >>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > >> >>>> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > >> >>>> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > >> >>>> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > >> >>>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > >> >>>> > >> >>>> typedef enum { > >> >>>> 
PCI_HOTPLUG_DISABLED, > >> >>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > >> >>>> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > >> >>>> void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >> >>>> PCIINTxRoutingNotifier notifier); > >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > >> >>>> + > >> >>>> void pci_device_reset(PCIDevice *dev); > >> >>>> void pci_bus_reset(PCIBus *bus); > >> >>>> > >> >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > >> >>>> index 66762f6..81efd2b 100644 > >> >>>> --- a/include/hw/pci/pci_bus.h > >> >>>> +++ b/include/hw/pci/pci_bus.h > >> >>>> @@ -16,6 +16,7 @@ struct PCIBus { > >> >>>> pci_set_irq_fn set_irq; > >> >>>> pci_map_irq_fn map_irq; > >> >>>> pci_route_irq_fn route_intx_to_irq; > >> >>>> + pci_map_msi_fn map_msi; > >> >>>> pci_hotplug_fn hotplug; > >> >>>> DeviceState *hotplug_qdev; > >> >>>> void *irq_opaque; > >> >>> > >> >>> > >> >>> > >> >> > >> >> > >> > > >> > > >> > > >> > >> > > > > > > > >
On Sun, 2013-06-23 at 16:58 -0500, Anthony Liguori wrote: > On Sun, Jun 23, 2013 at 4:39 PM, Benjamin Herrenschmidt > <benh@kernel.crashing.org> wrote: > > On Sun, 2013-06-23 at 10:02 -0500, Anthony Liguori wrote: > >> It doesn't make a lot of sense to have Power do MSI decoding in > >> userspace while x86 does it in the kernel. I assume the kernel > >> doesn't have enough information to do the translation on Power? > > > > Well, it would need the host bridge which it doesn't have and it > > would need a concept of mapping that to interrupt numbers which > > we don't want in the kernel. > > > > We can keep the kernel side a LOT simpler (completely avoiding > > the whole route bloatware) by sticking to our "simple" direct map kernel > > stuff and doing the mapping from address/data to interrupts in qemu, so > > we'll stick to that. > > Yeah, but none of this is Power specific... so we can do the same for x86, no? > > I'm still trying to wrap my head around why we need MSI knowledge at > all in the kernel for x86. I presume it's to fast-path irqfd when > doing vhost? Or device assignment. Any paths where we want to inject an MSI interrupt without going through userspace. Thanks, Alex
On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote: > On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson > <alex.williamson@redhat.com> wrote: > > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: > >> On 06/21/2013 12:34 PM, Alex Williamson wrote: > >> > >> > >> Do not follow you, sorry. For x86, is it that MSI routing table which is > >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of > >> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? > > > > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) > > > > This writes directly to the interrupt block on the vCPU. With KVM, the > > in-kernel APIC does the same write, where the pin to MSIMessage is setup > > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. > > What is this "interrupt block on the vCPU" you speak of? I reviewed FEE00000H address as seen from PCI bus is a special address range (see 10.11.1 in SDM). Any write by a PCI device to that address range is interpreted as MSI. We do not model this correctly in QEMU yet since all devices, including vcpus, see exactly same memory map. > the SDM and see nothing in the APIC protocol or the brief description > of MSI as a PCI concept that would indicate anything except that the > PHB handles MSI writes and feeds them to the I/O APIC. > I/O APIC? Did you mean APIC, but even that will probably be incorrect. I'd say it translates the data to APIC bus message. And with interrupt remapping there is more magic happens between MSI and APIC bus. > In fact, the wikipedia article on MSI has: > > "A common misconception with Message Signaled Interrupts is that they > allow the device to send data to a processor as part of the interrupt. > The data that is sent as part of the write is used by the chipset to > determine which interrupt to trigger on which processor; it is not > available for the device to communicate additional information to the > interrupt handler." 
> Not sure who claimed otherwise. > > Do I understand that on POWER the MSI from the device is intercepted at > > the PHB and converted to an IRQ that's triggered by some means other > > than a MSI write? > > This is exactly the same thing that happens on x86, no? Can you point > me to something in the SDM that says otherwise? > > Regards, > > Anthony Liguori > > > So to correctly model the hardware, vfio should do a > > msi_notify() that does a stl_le_phys that terminates at this IRQ > > remapper thing and in turn toggles a qemu_irq. MSIMessage is only > > extraneous data if you want to skip over hardware blocks. > > > > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so > > that it can be implemented on POWER without this pci_bus_map_msi > > interface that seems very unique to POWER. Thanks, > > > > Alex > > > >> >>>> --- > >> >>>> hw/misc/vfio.c | 11 +++++++++-- > >> >>>> hw/pci/pci.c | 13 +++++++++++++ > >> >>>> hw/ppc/spapr_pci.c | 13 +++++++++++++ > >> >>>> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ > >> >>>> include/hw/pci/pci.h | 4 ++++ > >> >>>> include/hw/pci/pci_bus.h | 1 + > >> >>>> 6 files changed, 60 insertions(+), 8 deletions(-) > >> >>>> > >> >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > >> >>>> index 14aac04..2d9eef7 100644 > >> >>>> --- a/hw/misc/vfio.c > >> >>>> +++ b/hw/misc/vfio.c > >> >>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, > >> >>>> * Attempt to enable route through KVM irqchip, > >> >>>> * default to userspace handling if unavailable. > >> >>>> */ > >> >>>> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >> >>>> + > >> >>>> + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; > >> >>>> + if (vector->virq < 0) { > >> >>>> + vector->virq = msg ? 
kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; > >> >>>> + } > >> >>>> if (vector->virq < 0 || > >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >> >>>> vector->virq) < 0) { > >> >>>> @@ -807,7 +811,10 @@ retry: > >> >>>> * Attempt to enable route through KVM irqchip, > >> >>>> * default to userspace handling if unavailable. > >> >>>> */ > >> >>>> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >> >>>> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); > >> >>>> + if (vector->virq < 0) { > >> >>>> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); > >> >>>> + } > >> >>>> if (vector->virq < 0 || > >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, > >> >>>> vector->virq) < 0) { > >> >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c > >> >>>> index a976e46..a9875e9 100644 > >> >>>> --- a/hw/pci/pci.c > >> >>>> +++ b/hw/pci/pci.c > >> >>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >> >>>> dev->intx_routing_notifier = notifier; > >> >>>> } > >> >>>> > >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) > >> >>>> +{ > >> >>>> + bus->map_msi = map_msi_fn; > >> >>>> +} > >> >>>> + > >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) > >> >>>> +{ > >> >>>> + if (bus->map_msi) { > >> >>>> + return bus->map_msi(bus, msg); > >> >>>> + } > >> >>>> + return -1; > >> >>>> +} > >> >>>> + > >> >>>> /* > >> >>>> * PCI-to-PCI bridge specification > >> >>>> * 9.1: Interrupt routing. 
Table 9-1 > >> >>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > >> >>>> index 80408c9..9ef9a29 100644 > >> >>>> --- a/hw/ppc/spapr_pci.c > >> >>>> +++ b/hw/ppc/spapr_pci.c > >> >>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, > >> >>>> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); > >> >>>> } > >> >>>> > >> >>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) > >> >>>> +{ > >> >>>> + DeviceState *par = bus->qbus.parent; > >> >>>> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; > >> >>>> + unsigned long addr = msg.address - sphb->msi_win_addr; > >> >>>> + int ndev = addr >> 16; > >> >>>> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; > >> >>>> + uint32_t irq = sphb->msi_table[ndev].irq + vec; > >> >>>> + > >> >>>> + return (int)irq; > >> >>>> +} > >> >>>> + > >> >>>> static const MemoryRegionOps spapr_msi_ops = { > >> >>>> /* There is no .read as the read result is undefined by PCI spec */ > >> >>>> .read = NULL, > >> >>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) > >> >>>> > >> >>>> sphb->lsi_table[i].irq = irq; > >> >>>> } > >> >>>> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); > >> >>>> > >> >>>> return 0; > >> >>>> } > >> >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c > >> >>>> index d309416..587f53e 100644 > >> >>>> --- a/hw/virtio/virtio-pci.c > >> >>>> +++ b/hw/virtio/virtio-pci.c > >> >>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) > >> >>>> return proxy->host_features; > >> >>>> } > >> >>>> > >> >>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); > >> >>>> + > >> >>>> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >> >>>> unsigned int queue_no, > >> >>>> unsigned int vector, > >> >>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, > >> >>>> int ret; > >> >>>> > >> >>>> if (irqfd->users == 0) { > >> >>>> - ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >> 
>>>> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >> >>>> + if (ret < 0) { > >> >>>> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); > >> >>>> + } > >> >>>> if (ret < 0) { > >> >>>> return ret; > >> >>>> } > >> >>>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, > >> >>>> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); > >> >>>> EventNotifier *n = virtio_queue_get_guest_notifier(vq); > >> >>>> VirtIOIRQFD *irqfd; > >> >>>> - int ret = 0; > >> >>>> + int ret = 0, tmp; > >> >>>> > >> >>>> if (proxy->vector_irqfd) { > >> >>>> irqfd = &proxy->vector_irqfd[vector]; > >> >>>> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >> >>>> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >> >>>> - if (ret < 0) { > >> >>>> - return ret; > >> >>>> + > >> >>>> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); > >> >>>> + if (tmp >= 0) { > >> >>>> + if (irqfd->virq != tmp) { > >> >>>> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", > >> >>>> + irqfd->virq, tmp); > >> >>>> + } > >> >>>> + } else { > >> >>>> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { > >> >>>> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); > >> >>>> + if (ret < 0) { > >> >>>> + return ret; > >> >>>> + } > >> >>>> } > >> >>>> } > >> >>>> } > >> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > >> >>>> index 8797802..632739a 100644 > >> >>>> --- a/include/hw/pci/pci.h > >> >>>> +++ b/include/hw/pci/pci.h > >> >>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); > >> >>>> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); > >> >>>> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); > >> >>>> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); > >> >>>> +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); > >> >>>> > >> >>>> typedef enum { > >> >>>> 
PCI_HOTPLUG_DISABLED, > >> >>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); > >> >>>> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); > >> >>>> void pci_device_set_intx_routing_notifier(PCIDevice *dev, > >> >>>> PCIINTxRoutingNotifier notifier); > >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); > >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); > >> >>>> + > >> >>>> void pci_device_reset(PCIDevice *dev); > >> >>>> void pci_bus_reset(PCIBus *bus); > >> >>>> > >> >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h > >> >>>> index 66762f6..81efd2b 100644 > >> >>>> --- a/include/hw/pci/pci_bus.h > >> >>>> +++ b/include/hw/pci/pci_bus.h > >> >>>> @@ -16,6 +16,7 @@ struct PCIBus { > >> >>>> pci_set_irq_fn set_irq; > >> >>>> pci_map_irq_fn map_irq; > >> >>>> pci_route_irq_fn route_intx_to_irq; > >> >>>> + pci_map_msi_fn map_msi; > >> >>>> pci_hotplug_fn hotplug; > >> >>>> DeviceState *hotplug_qdev; > >> >>>> void *irq_opaque; > >> >>> > >> >>> > >> >>> > >> >> > >> >> > >> > > >> > > >> > > >> > >> > > > > > > > > -- Gleb.
On Mon, Jun 24, 2013 at 07:36:25AM +1000, Benjamin Herrenschmidt wrote: > On Sun, 2013-06-23 at 17:07 +0300, Michael S. Tsirkin wrote: > > Yes I think that's fine really. > > > > Basically devices all speak MSIMessage as they should - > > this is what the PCI spec says. > > On all normal systems guests also speak MSIMessage so > > the API which uses these makes sense for kvm. > > Except, ppc architecture in its wisdom decided to hide this > > in firmware. QEMU does not have firmware so the translation > > has to be maintained in QEMU powerpc code. > > It is still not enough. There is nothing that says that an MSI > address/message is unique. On pseries (among others I'm sure), this is a > per-host-bridge concept since the MSIs are decoded by the PHB and turned > into standard fabric interrupts there. > > So at least the bus needs to be passed as well. > > Cheers, > Ben. If passing bus (or dma context, etc) turns out to actually be helpful, why not.
Alex Williamson <alex.williamson@redhat.com> writes: > On Sun, 2013-06-23 at 16:58 -0500, Anthony Liguori wrote: >> Yeah, but none of this is Power specific... so we can do the same for x86, no? >> >> I'm still trying to wrap my head around why we need MSI knowledge at >> all in the kernel for x86. I presume it's to fast-path irqfd when >> doing vhost? > > Or device assignment. Any paths where we want to inject an MSI > interrupt without going through userspace. Thanks, With VFIO, we use irqfd, if we program the irqfd as the underlying IRQ that MSI would route to, isn't that sufficient? That's more or less what Ben is proposing for Power... Regards, Anthony Liguori > > Alex
Alex Williamson <alex.williamson@redhat.com> writes: > On Sun, 2013-06-23 at 10:06 -0500, Anthony Liguori wrote: >> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson >> <alex.williamson@redhat.com> wrote: >> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: >> >> On 06/21/2013 12:34 PM, Alex Williamson wrote: >> >> >> >> >> >> Do not follow you, sorry. For x86, is it that MSI routing table which is >> >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of >> >> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? >> > >> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) >> > >> > This writes directly to the interrupt block on the vCPU. With KVM, the >> > in-kernel APIC does the same write, where the pin to MSIMessage is setup >> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. >> >> What is this "interrupt block on the vCPU" you speak of? I reviewed >> the SDM and see nothing in the APIC protocol or the brief description >> of MSI as a PCI concept that would indicate anything except that the >> PHB handles MSI writes and feeds them to the I/O APIC. > > In all likelihood I'm recalling ia64 details and trying to apply them to > x86. Does the MSIMessage not actually get written to the LAPIC on the > CPU? Thanks, There definitely isn't an APIC message for MSI specifically. I think the only question is whether the PHB sits on the APIC bus and can generate an APIC message directly or whether it has a private interface to the IO APIC to do it. I suspect that there are systems that do either. But the important point is that MSI writes are interpreted by the PHB either way. Regards, Anthony Liguori > > Alex > >> In fact, the wikipedia article on MSI has: >> >> "A common misconception with Message Signaled Interrupts is that they >> allow the device to send data to a processor as part of the interrupt. 
>> The data that is sent as part of the write is used by the chipset to >> determine which interrupt to trigger on which processor; it is not >> available for the device to communicate additional information to the >> interrupt handler." >> >> > Do I understand that on POWER the MSI from the device is intercepted at >> > the PHB and converted to an IRQ that's triggered by some means other >> > than a MSI write? >> >> This is exactly the same thing that happens on x86, no? Can you point >> me to something in the SDM that says otherwise? >> >> Regards, >> >> Anthony Liguori >> >> > So to correctly model the hardware, vfio should do a >> > msi_notify() that does a stl_le_phys that terminates at this IRQ >> > remapper thing and in turn toggles a qemu_irq. MSIMessage is only >> > extraneous data if you want to skip over hardware blocks. >> > >> > Maybe you could add a device parameter to kvm_irqchip_add_msi_route so >> > that it can be implemented on POWER without this pci_bus_map_msi >> > interface that seems very unique to POWER. Thanks, >> > >> > Alex >> > >> >> >>>> --- >> >> >>>> hw/misc/vfio.c | 11 +++++++++-- >> >> >>>> hw/pci/pci.c | 13 +++++++++++++ >> >> >>>> hw/ppc/spapr_pci.c | 13 +++++++++++++ >> >> >>>> hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ >> >> >>>> include/hw/pci/pci.h | 4 ++++ >> >> >>>> include/hw/pci/pci_bus.h | 1 + >> >> >>>> 6 files changed, 60 insertions(+), 8 deletions(-) >> >> >>>> >> >> >>>> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c >> >> >>>> index 14aac04..2d9eef7 100644 >> >> >>>> --- a/hw/misc/vfio.c >> >> >>>> +++ b/hw/misc/vfio.c >> >> >>>> @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, >> >> >>>> * Attempt to enable route through KVM irqchip, >> >> >>>> * default to userspace handling if unavailable. >> >> >>>> */ >> >> >>>> - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> >> >>>> + >> >> >>>> + vector->virq = msg ? 
pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; >> >> >>>> + if (vector->virq < 0) { >> >> >>>> + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; >> >> >>>> + } >> >> >>>> if (vector->virq < 0 || >> >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> >> >>>> vector->virq) < 0) { >> >> >>>> @@ -807,7 +811,10 @@ retry: >> >> >>>> * Attempt to enable route through KVM irqchip, >> >> >>>> * default to userspace handling if unavailable. >> >> >>>> */ >> >> >>>> - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> >> >>>> + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); >> >> >>>> + if (vector->virq < 0) { >> >> >>>> + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); >> >> >>>> + } >> >> >>>> if (vector->virq < 0 || >> >> >>>> kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, >> >> >>>> vector->virq) < 0) { >> >> >>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c >> >> >>>> index a976e46..a9875e9 100644 >> >> >>>> --- a/hw/pci/pci.c >> >> >>>> +++ b/hw/pci/pci.c >> >> >>>> @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, >> >> >>>> dev->intx_routing_notifier = notifier; >> >> >>>> } >> >> >>>> >> >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) >> >> >>>> +{ >> >> >>>> + bus->map_msi = map_msi_fn; >> >> >>>> +} >> >> >>>> + >> >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) >> >> >>>> +{ >> >> >>>> + if (bus->map_msi) { >> >> >>>> + return bus->map_msi(bus, msg); >> >> >>>> + } >> >> >>>> + return -1; >> >> >>>> +} >> >> >>>> + >> >> >>>> /* >> >> >>>> * PCI-to-PCI bridge specification >> >> >>>> * 9.1: Interrupt routing. 
Table 9-1 >> >> >>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c >> >> >>>> index 80408c9..9ef9a29 100644 >> >> >>>> --- a/hw/ppc/spapr_pci.c >> >> >>>> +++ b/hw/ppc/spapr_pci.c >> >> >>>> @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, >> >> >>>> qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); >> >> >>>> } >> >> >>>> >> >> >>>> +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) >> >> >>>> +{ >> >> >>>> + DeviceState *par = bus->qbus.parent; >> >> >>>> + sPAPRPHBState *sphb = (sPAPRPHBState *) par; >> >> >>>> + unsigned long addr = msg.address - sphb->msi_win_addr; >> >> >>>> + int ndev = addr >> 16; >> >> >>>> + int vec = ((addr & 0xFFFF) >> 2) | msg.data; >> >> >>>> + uint32_t irq = sphb->msi_table[ndev].irq + vec; >> >> >>>> + >> >> >>>> + return (int)irq; >> >> >>>> +} >> >> >>>> + >> >> >>>> static const MemoryRegionOps spapr_msi_ops = { >> >> >>>> /* There is no .read as the read result is undefined by PCI spec */ >> >> >>>> .read = NULL, >> >> >>>> @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) >> >> >>>> >> >> >>>> sphb->lsi_table[i].irq = irq; >> >> >>>> } >> >> >>>> + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); >> >> >>>> >> >> >>>> return 0; >> >> >>>> } >> >> >>>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c >> >> >>>> index d309416..587f53e 100644 >> >> >>>> --- a/hw/virtio/virtio-pci.c >> >> >>>> +++ b/hw/virtio/virtio-pci.c >> >> >>>> @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) >> >> >>>> return proxy->host_features; >> >> >>>> } >> >> >>>> >> >> >>>> +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); >> >> >>>> + >> >> >>>> static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >> >> >>>> unsigned int queue_no, >> >> >>>> unsigned int vector, >> >> >>>> @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, >> >> >>>> int ret; >> >> >>>> >> >> >>>> if (irqfd->users == 0) { >> >> >>>> - ret = 
kvm_irqchip_add_msi_route(kvm_state, msg); >> >> >>>> + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); >> >> >>>> + if (ret < 0) { >> >> >>>> + ret = kvm_irqchip_add_msi_route(kvm_state, msg); >> >> >>>> + } >> >> >>>> if (ret < 0) { >> >> >>>> return ret; >> >> >>>> } >> >> >>>> @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, >> >> >>>> VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); >> >> >>>> EventNotifier *n = virtio_queue_get_guest_notifier(vq); >> >> >>>> VirtIOIRQFD *irqfd; >> >> >>>> - int ret = 0; >> >> >>>> + int ret = 0, tmp; >> >> >>>> >> >> >>>> if (proxy->vector_irqfd) { >> >> >>>> irqfd = &proxy->vector_irqfd[vector]; >> >> >>>> - if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >> >> >>>> - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >> >> >>>> - if (ret < 0) { >> >> >>>> - return ret; >> >> >>>> + >> >> >>>> + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); >> >> >>>> + if (tmp >= 0) { >> >> >>>> + if (irqfd->virq != tmp) { >> >> >>>> + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", >> >> >>>> + irqfd->virq, tmp); >> >> >>>> + } >> >> >>>> + } else { >> >> >>>> + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { >> >> >>>> + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); >> >> >>>> + if (ret < 0) { >> >> >>>> + return ret; >> >> >>>> + } >> >> >>>> } >> >> >>>> } >> >> >>>> } >> >> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h >> >> >>>> index 8797802..632739a 100644 >> >> >>>> --- a/include/hw/pci/pci.h >> >> >>>> +++ b/include/hw/pci/pci.h >> >> >>>> @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); >> >> >>>> typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); >> >> >>>> typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); >> >> >>>> typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); >> >> >>>> +typedef int 
(*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); >> >> >>>> >> >> >>>> typedef enum { >> >> >>>> PCI_HOTPLUG_DISABLED, >> >> >>>> @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); >> >> >>>> void pci_bus_fire_intx_routing_notifier(PCIBus *bus); >> >> >>>> void pci_device_set_intx_routing_notifier(PCIDevice *dev, >> >> >>>> PCIINTxRoutingNotifier notifier); >> >> >>>> +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); >> >> >>>> +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); >> >> >>>> + >> >> >>>> void pci_device_reset(PCIDevice *dev); >> >> >>>> void pci_bus_reset(PCIBus *bus); >> >> >>>> >> >> >>>> diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h >> >> >>>> index 66762f6..81efd2b 100644 >> >> >>>> --- a/include/hw/pci/pci_bus.h >> >> >>>> +++ b/include/hw/pci/pci_bus.h >> >> >>>> @@ -16,6 +16,7 @@ struct PCIBus { >> >> >>>> pci_set_irq_fn set_irq; >> >> >>>> pci_map_irq_fn map_irq; >> >> >>>> pci_route_irq_fn route_intx_to_irq; >> >> >>>> + pci_map_msi_fn map_msi; >> >> >>>> pci_hotplug_fn hotplug; >> >> >>>> DeviceState *hotplug_qdev; >> >> >>>> void *irq_opaque; >> >> >>> >> >> >>> >> >> >>> >> >> >> >> >> >> >> >> > >> >> > >> >> > >> >> >> >> >> > >> > >> > >> >
Gleb Natapov <gleb@redhat.com> writes: > On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote: >> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson >> <alex.williamson@redhat.com> wrote: >> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: >> >> On 06/21/2013 12:34 PM, Alex Williamson wrote: >> >> >> >> >> >> Do not follow you, sorry. For x86, is it that MSI routing table which is >> >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of >> >> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? >> > >> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) >> > >> > This writes directly to the interrupt block on the vCPU. With KVM, the >> > in-kernel APIC does the same write, where the pin to MSIMessage is setup >> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. >> >> What is this "interrupt block on the vCPU" you speak of? I reviewed > FEE00000H address as seen from PCI bus is a special address range (see > 10.11.1 in SDM). Ack. > Any write by a PCI device to that address range is > interpreted as MSI. We do not model this correctly in QEMU yet since > all devices, including vcpus, see exactly same memory map. This should be a per-device mapping, yes. But I'm not sure that VCPUs should even see anything. I don't think a VCPU can generate an MSI interrupt by writing to this location. >> the SDM and see nothing in the APIC protocol or the brief description >> of MSI as a PCI concept that would indicate anything except that the >> PHB handles MSI writes and feeds them to the I/O APIC. >> > I/O APIC? Did you mean APIC, but even that will probably be incorrect. > I'd say it translates the data to APIC bus message. And with interrupt > remapping there is more magic happens between MSI and APIC bus. I think the wording in the SDM allows either. 
>> In fact, the wikipedia article on MSI has: >> >> "A common misconception with Message Signaled Interrupts is that they >> allow the device to send data to a processor as part of the interrupt. >> The data that is sent as part of the write is used by the chipset to >> determine which interrupt to trigger on which processor; it is not >> available for the device to communicate additional information to the >> interrupt handler." >> > Not sure who claimed otherwise. So to summarize: 1) MSI writes are intercepted by the PHB and generates an appropriate IRQ. 2) The PHB has a tuple of (src device, address, data) plus whatever information it maintains to do the translation. 3) On Power, we can have multiple PHBs. 4) The kernel interface assumes a single flat table mapping (address, data) to interrupts. We try to keep that table up-to-date in QEMU. 5) The reason the kernel has MSI info at all is to allow for IRQFDs to generate MSI interrupts. Is there anything that prevents us from using IRQFDs corresponding to the target of an MSI mapping and get rid of the MSI info in the kernel? It seems like the only sane way to actually support (2) and (3). Regards, Anthony Liguori
On 24.06.2013, at 14:32, Anthony Liguori wrote: > Gleb Natapov <gleb@redhat.com> writes: > >> On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote: >>> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson >>> <alex.williamson@redhat.com> wrote: >>>> On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: >>>>> On 06/21/2013 12:34 PM, Alex Williamson wrote: >>>>> >>>>> >>>>> Do not follow you, sorry. For x86, is it that MSI routing table which is >>>>> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of >>>>> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? >>>> >>>> vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) >>>> >>>> This writes directly to the interrupt block on the vCPU. With KVM, the >>>> in-kernel APIC does the same write, where the pin to MSIMessage is setup >>>> by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. >>> >>> What is this "interrupt block on the vCPU" you speak of? I reviewed >> FEE00000H address as seen from PCI bus is a special address range (see >> 10.11.1 in SDM). > > Ack. > >> Any write by a PCI device to that address range is >> interpreted as MSI. We do not model this correctly in QEMU yet since >> all devices, including vcpus, see exactly same memory map. > > This should be a per-device mapping, yes. But I'm not sure that VCPUs > should even see anything. I don't think a VCPU can generate an MSI > interrupt by writing to this location. > >>> the SDM and see nothing in the APIC protocol or the brief description >>> of MSI as a PCI concept that would indicate anything except that the >>> PHB handles MSI writes and feeds them to the I/O APIC. >>> >> I/O APIC? Did you mean APIC, but even that will probably be incorrect. >> I'd say it translates the data to APIC bus message. And with interrupt >> remapping there is more magic happens between MSI and APIC bus. > > I think the wording in the SDM allows either. 
> >>> In fact, the wikipedia article on MSI has: >>> >>> "A common misconception with Message Signaled Interrupts is that they >>> allow the device to send data to a processor as part of the interrupt. >>> The data that is sent as part of the write is used by the chipset to >>> determine which interrupt to trigger on which processor; it is not >>> available for the device to communicate additional information to the >>> interrupt handler." >>> >> Not sure who claimed otherwise. > > So to summarize: > > 1) MSI writes are intercepted by the PHB and generates an appropriate > IRQ. > > 2) The PHB has a tuple of (src device, address, data) plus whatever > information it maintains to do the translation. > > 3) On Power, we can have multiple PHBs. > > 4) The kernel interface assumes a single flat table mapping (address, > data) to interrupts. We try to keep that table up-to-date in QEMU. > > 5) The reason the kernel has MSI info at all is to allow for IRQFDs to > generate MSI interrupts. > > Is there anything that prevents us from using IRQFDs corresponding to > the target of an MSI mapping and get rid of the MSI info in the kernel? What would that interface look like? An MSI does not arrive at an I/O APIC pin, so we can't use the existing "give me an irqfd for this pin" command. Alex
On Mon, Jun 24, 2013 at 07:24:03AM -0500, Anthony Liguori wrote: > Alex Williamson <alex.williamson@redhat.com> writes: > > > On Sun, 2013-06-23 at 16:58 -0500, Anthony Liguori wrote: > >> Yeah, but none of this is Power specific... so we can do the same for x86, no? > >> > >> I'm still trying to wrap my head around why we need MSI knowledge at > >> all in the kernel for x86. I presume it's to fast-path irqfd when > >> doing vhost? > > > > Or device assignment. Any paths where we want to inject an MSI > > interrupt without going through userspace. Thanks, > > With VFIO, we use irqfd, if we program the irqfd as the underlying IRQ > that MSI would route to, isn't that sufficient? > Not sure I understand what you mean by IRQ here, there is no "IRQ pin" on x86 CPU with APIC architecture (there is ExtINT but it is not relevant here). If by IRQ you mean "information needed to inject interrupt through APIC" then this is how things work now. > That's more or less what Ben is proposing for Power... > > Regards, > > Anthony Liguori > > > > > Alex -- Gleb.
On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote: > Gleb Natapov <gleb@redhat.com> writes: > > > On Sun, Jun 23, 2013 at 10:06:05AM -0500, Anthony Liguori wrote: > >> On Thu, Jun 20, 2013 at 11:46 PM, Alex Williamson > >> <alex.williamson@redhat.com> wrote: > >> > On Fri, 2013-06-21 at 12:49 +1000, Alexey Kardashevskiy wrote: > >> >> On 06/21/2013 12:34 PM, Alex Williamson wrote: > >> >> > >> >> > >> >> Do not follow you, sorry. For x86, is it that MSI routing table which is > >> >> updated via KVM_SET_GSI_ROUTING in KVM? When there is no KVM, what piece of > >> >> code responds on msi_notify() in qemu-x86 and does qemu_irq_pulse()? > >> > > >> > vfio_msi_interrupt->msi[x]_notify->stl_le_phys(msg.address, msg.data) > >> > > >> > This writes directly to the interrupt block on the vCPU. With KVM, the > >> > in-kernel APIC does the same write, where the pin to MSIMessage is setup > >> > by kvm_irqchip_add_msi_route and the pin is pulled by an irqfd. > >> > >> What is this "interrupt block on the vCPU" you speak of? I reviewed > > FEE00000H address as seen from PCI bus is a special address range (see > > 10.11.1 in SDM). > > Ack. > > > Any write by a PCI device to that address range is > > interpreted as MSI. We do not model this correctly in QEMU yet since > > all devices, including vcpus, see exactly same memory map. > > This should be a per-device mapping, yes. But I'm not sure that VCPUs > should even see anything. I don't think a VCPU can generate an MSI > interrupt by writing to this location. > No, and lower 4k of this space is where APIC is mapped as seen from CPU. > >> the SDM and see nothing in the APIC protocol or the brief description > >> of MSI as a PCI concept that would indicate anything except that the > >> PHB handles MSI writes and feeds them to the I/O APIC. > >> > > I/O APIC? Did you mean APIC, but even that will probably be incorrect. > > I'd say it translates the data to APIC bus message. 
And with interrupt > > remapping there is more magic happens between MSI and APIC bus. > > I think the wording in the SDM allows either. > SDM says nothing about it, but since we are guessing anyway my last guess would be I/O APIC. I/O APIC has a well-defined role: it detects level/edge interrupts on its input pins and sends a preconfigured APIC message to the APIC bus if one is detected. In fact it's even mapped at a different address 0FEC00000H. Of course since I/O APIC and the logic that maps MSI writes to APIC bus message are probably on the same chips anyway you can call this logic a part of I/O APIC, but this is stretching it too much IMO :) > >> In fact, the wikipedia article on MSI has: > >> > >> "A common misconception with Message Signaled Interrupts is that they > >> allow the device to send data to a processor as part of the interrupt. > >> The data that is sent as part of the write is used by the chipset to > >> determine which interrupt to trigger on which processor; it is not > >> available for the device to communicate additional information to the > >> interrupt handler." > >> > > Not sure who claimed otherwise. > > So to summarize: > > 1) MSI writes are intercepted by the PHB and generates an appropriate > IRQ. > > 2) The PHB has a tuple of (src device, address, data) plus whatever > information it maintains to do the translation. > > 3) On Power, we can have multiple PHBs. > Looks like that means we need to put more information into the kernel, not less. > 4) The kernel interface assumes a single flat table mapping (address, > data) to interrupts. We try to keep that table up-to-date in QEMU. > > 5) The reason the kernel has MSI info at all is to allow for IRQFDs to > generate MSI interrupts. > > Is there anything that prevents us from using IRQFDs corresponding to > the target of an MSI mapping and get rid of the MSI info in the kernel? > Again, you assume that x86 has some pin that MSI triggers.
This is not the case; address/data is minimum that is needed to inject interrupt there (or moving APIC into userspace, since this is where "translation" is happening). > It seems like the only sane way to actually support (2) and (3). > > Regards, > > Anthony Liguori -- Gleb.
Gleb Natapov <gleb@redhat.com> writes: > On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote: >> Gleb Natapov <gleb@redhat.com> writes: >> >> This should be a per-device mapping, yes. But I'm not sure that VCPUs >> should even see anything. I don't think a VCPU can generate an MSI >> interrupt by writing to this location. >> > No, and lower 4k of this space is where APIC is mapped as seen from CPU. >> Is there anything that prevents us from using IRQFDs corresponding to >> the target of an MSI mapping and get rid of the MSI info in the kernel? >> > Again, you assume that x86 has some pin that MSI triggers. This is not > the case; address/data is minimum that is needed to inject interrupt > there (or moving APIC into userspace, since this is where "translation" > is happening). An APIC message contains: 1) Destination Mode 2) Delivery mode 3) Level 4) Trigger mode 5) Vector 6) Destination Which is more or less what the MSI addr/data pair encodes. But we can certainly have a userspace interface to inject such a message into the LAPICs. In fact, this is more or less what KVM_SIGNAL_MSI is doing except that it's called MSI and encodes things in an addr/data pair. Such an interface would also allow for a QEMU implementation of an IO APIC while still having the in-kernel LAPIC. It would also allow QEMU to do per-device MSI decoding. Isn't this more or less what Avi's previous proposal was around changing the APIC interfaces to userspace? Regards, Anthony Liguori > >> It seems like the only sane way to actually support (2) and (3). >> >> Regards, >> >> Anthony Liguori > > -- > Gleb.
On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote: > Gleb Natapov <gleb@redhat.com> writes: > > > On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote: > >> Gleb Natapov <gleb@redhat.com> writes: > >> > >> This should be a per-device mapping, yes. But I'm not sure that VCPUs > >> should even see anything. I don't think a VCPU can generate an MSI > >> interrupt by writing to this location. > >> > > No, and lower 4k of this space is where APIC is mapped as seen from CPU. > >> Is there anything that prevents us from using IRQFDs corresponding to > >> the target of an MSI mapping and get rid of the MSI info in the kernel? > >> > > Again, you assume that x86 has some pin that MSI triggers. This is not > > the case; address/data is minimum that is needed to inject interrupt > > there (or moving APIC into userspace, since this is where "translation" > > is happening). > > An APIC message contains: > > 1) Destination Mode > 2) Delivery mode > 3) Level > 4) Trigger mode > 5) Vector > 6) Destination > > Which is more or less what the MSI addr/data pair encodes. > > But we can certainly have a userspace interface to inject such a message > into the LAPICs. In fact, this is more or less what KVM_SIGNAL_MSI is > doing except that it's called MSI and encodes things in an addr/pair. > > Such an interface would also allow for a QEMU implementation of an IO > APIC while still having the in-kernel LAPIC. > > It would also allow QEMU to do per-device MSI decoding. > > Isn't this more or less what Avi's previous proposal was around changing > the APIC interfaces to userspace? > > Regards, > > Anthony Liguori While that's not very elegant, I think we can use the existing interface for this: just encode things in a fake "msi message" in the format that kernel expects. > > > >> It seems like the only sane way to actually support (2) and (3). > >> > >> Regards, > >> > >> Anthony Liguori > > > > -- > > Gleb.
"Michael S. Tsirkin" <mst@redhat.com> writes: > On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote: >> Gleb Natapov <gleb@redhat.com> writes: >> >> Isn't this more or less what Avi's previous proposal was around changing >> the APIC interfaces to userspace? >> >> Regards, >> >> Anthony Liguori > > While that's not very elegant, I think we can use the existing > interface for this: just encode things in a fake > "msi message" in the format that kernel expects. This is, in fact, exactly what we do today. The MSI interfaces aren't for MSI. They are for sending messages to the APIC bus. What we should do is: #define KVM_SIGNAL_LAPIC KVM_SIGNAL_MSI #define KVM_IRQ_ROUTING_LAPIC KVM_IRQ_ROUTING_MSI And switch to using the new #defines in QEMU. That would make it more obvious where we need to refactor things. We currently hard code routing via a local APIC with MSI. We need to change this into a bus specific function that can choose the right interface. I think Power is fine with just doing all routing through the irqchip interface. We may need other routing interfaces for other architectures though. Regards, Anthony Liguori > >> > >> >> It seems like the only sane way to actually support (2) and (3). >> >> >> >> Regards, >> >> >> >> Anthony Liguori >> > >> > -- >> > Gleb.
On 24.06.2013, at 16:31, Anthony Liguori wrote: > "Michael S. Tsirkin" <mst@redhat.com> writes: > >> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote: >>> Gleb Natapov <gleb@redhat.com> writes: >>> >>> Isn't this more or less what Avi's previous proposal was around changing >>> the APIC interfaces to userspace? >>> >>> Regards, >>> >>> Anthony Liguori >> >> While that's not very elegant, I think we can use the existing >> interface for this: just encode things in a fake >> "msi message" in the format that kernel expects. > > This is, in fact, exactly what we do today. The MSI interfaces aren't > for MSI. They are for sending messages to the APIC bus. What we should > do is: > > #define KVM_SIGNAL_LAPIC KVM_SIGNAL_MSI > #define KVM_IRQ_ROUTING_LAPIC KVM_IRQ_ROUTING_MSI > > And switch to using the new #defines in QEMU. That would make it more > obvious where we need to refactor things. We currently hard code > routing via a local APIC with MSI. We need to change this into a bus > specific function that can choose the right interface. > > I think Power is fine with just doing all routing through the irqchip > interface. We may need other routing interfaces for other architectures > though. I'm not quite sure what problem exactly you're trying to solve here. The main user of MSI injects through irqfd are in the kernel and definitely want a fast path to inject those without any involvement of QEMU at all. Alex
Alexander Graf <agraf@suse.de> writes: > On 24.06.2013, at 16:31, Anthony Liguori wrote: > >> "Michael S. Tsirkin" <mst@redhat.com> writes: >> >>> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote: >>>> Gleb Natapov <gleb@redhat.com> writes: >>>> >>>> Isn't this more or less what Avi's previous proposal was around changing >>>> the APIC interfaces to userspace? >>>> >>>> Regards, >>>> >>>> Anthony Liguori >>> >>> While that's not very elegant, I think we can use the existing >>> interface for this: just encode things in a fake >>> "msi message" in the format that kernel expects. >> >> This is, in fact, exactly what we do today. The MSI interfaces aren't >> for MSI. They are for sending messages to the APIC bus. What we should >> do is: >> >> #define KVM_SIGNAL_LAPIC KVM_SIGNAL_MSI >> #define KVM_IRQ_ROUTING_LAPIC KVM_IRQ_ROUTING_MSI >> >> And switch to using the new #defines in QEMU. That would make it more >> obvious where we need to refactor things. We currently hard code >> routing via a local APIC with MSI. We need to change this into a bus >> specific function that can choose the right interface. >> >> I think Power is fine with just doing all routing through the irqchip >> interface. We may need other routing interfaces for other architectures >> though. > > I'm not quite sure what problem exactly you're trying to solve > here. The main user of MSI injects through irqfd are in the kernel and > definitely want a fast path to inject those without any involvement of > QEMU at all. Internally the communication between the IO APIC and LAPICs uses a protocol that does not map to a simple "pin numbering". Because we implement the IO APIC in the kernel, the kernel interface can be pin numbering more or less. But MSI bypasses the IO APIC (potentially at least) which means that we need a userspace interface for sending APIC messages. Unfortunately, when this was added to support MSI, this was called "MSI" but it is most certainly not. 
In the very least, you also need the bus state + the source device to do proper MSI translation. We get around this by assigning global MSI addresses and only having a single PHB. If you make those assumptions, then the APIC protocol looks a lot like an MSI address/data pair. Regards, Anthony Liguori > > > Alex
On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote: > Gleb Natapov <gleb@redhat.com> writes: > > > On Mon, Jun 24, 2013 at 07:32:32AM -0500, Anthony Liguori wrote: > >> Gleb Natapov <gleb@redhat.com> writes: > >> > >> This should be a per-device mapping, yes. But I'm not sure that VCPUs > >> should even see anything. I don't think a VCPU can generate an MSI > >> interrupt by writing to this location. > >> > > No, and lower 4k of this space is where APIC is mapped as seen from CPU. > >> Is there anything that prevents us from using IRQFDs corresponding to > >> the target of an MSI mapping and get rid of the MSI info in the kernel? > >> > > Again, you assume that x86 has some pin that MSI triggers. This is not > > the case; address/data is minimum that is needed to inject interrupt > > there (or moving APIC into userspace, since this is where "translation" > > is happening). > > An APIC message contains: > > 1) Destination Mode > 2) Delivery mode > 3) Level > 4) Trigger mode > 5) Vector > 6) Destination > > Which is more or less what the MSI addr/data pair encodes. > Not if interrupt remapping is in use. > But we can certainly have a userspace interface to inject such a message > into the LAPICs. In fact, this is more or less what KVM_SIGNAL_MSI is > doing except that it's called MSI and encodes things in an addr/data pair. > Good that it does that, otherwise it would have been broken after interrupt remapping implementation. > Such an interface would also allow for a QEMU implementation of an IO > APIC while still having the in-kernel LAPIC. > Yes. > It would also allow QEMU to do per-device MSI decoding. > Why can't it be done now with existing interfaces? > Isn't this more or less what Avi's previous proposal was around changing > the APIC interfaces to userspace? > Avi actually was against adding KVM_SIGNAL_MSI initially since it duplicated the functionality we already had. Don't remember how Jan managed to persuade him in the end :) -- Gleb.
On Mon, Jun 24, 2013 at 10:17:41AM -0500, Anthony Liguori wrote: > Alexander Graf <agraf@suse.de> writes: > > > On 24.06.2013, at 16:31, Anthony Liguori wrote: > > > >> "Michael S. Tsirkin" <mst@redhat.com> writes: > >> > >>> On Mon, Jun 24, 2013 at 08:34:52AM -0500, Anthony Liguori wrote: > >>>> Gleb Natapov <gleb@redhat.com> writes: > >>>> > >>>> Isn't this more or less what Avi's previous proposal was around changing > >>>> the APIC interfaces to userspace? > >>>> > >>>> Regards, > >>>> > >>>> Anthony Liguori > >>> > >>> While that's not very elegant, I think we can use the existing > >>> interface for this: just encode things in a fake > >>> "msi message" in the format that kernel expects. > >> > >> This is, in fact, exactly what we do today. The MSI interfaces aren't > >> for MSI. They are for sending messages to the APIC bus. What we should > >> do is: > >> > >> #define KVM_SIGNAL_LAPIC KVM_SIGNAL_MSI > >> #define KVM_IRQ_ROUTING_LAPIC KVM_IRQ_ROUTING_MSI > >> > >> And switch to using the new #defines in QEMU. That would make it more > >> obvious where we need to refactor things. We currently hard code > >> routing via a local APIC with MSI. We need to change this into a bus > >> specific function that can choose the right interface. > >> > >> I think Power is fine with just doing all routing through the irqchip > >> interface. We may need other routing interfaces for other architectures > >> though. > > > > I'm not quite sure what problem exactly you're trying to solve > > here. The main user of MSI injects through irqfd are in the kernel and > > definitely want a fast path to inject those without any involvement of > > QEMU at all. > > Internally the communication between the IO APIC and LAPICs uses a > protocol that does not map to a simple "pin numbering". > > Because we implement the IO APIC in the kernel, the kernel interface can > be pin numbering more or less. 
> > But MSI bypasses the IO APIC (potentially at least) which means that we > need a userspace interface for sending APIC messages. > > Unfortunately, when this was added to support MSI, this was called "MSI" > but it is most certainly not. Why not? MSI is when a PCI device writes something somewhere. This is what the interface provides. Interpretation of MSI data/address as APIC message is x86 specific and shouldn't even be compiled by other arches. > > In the very least, you also need the bus state + the source device to do > proper MSI translation. We get around this by assigning global MSI > addresses and only having a single PHB. > It looks to me that you are arguing that each architecture needs to have its own logic to deliver MSI interrupts and for some arches the current interface does not provide enough data. I am not arguing with that. > If you make those assumptions, then the APIC protocol looks a lot like > an MSI address/data pair. > -- Gleb.
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 14aac04..2d9eef7 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -639,7 +639,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; + + vector->virq = msg ? pci_bus_map_msi(vdev->pdev.bus, *msg) : -1; + if (vector->virq < 0) { + vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1; + } if (vector->virq < 0 || kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, vector->virq) < 0) { @@ -807,7 +811,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); + vector->virq = pci_bus_map_msi(vdev->pdev.bus, msg); + if (vector->virq < 0) { + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); + } if (vector->virq < 0 || kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, vector->virq) < 0) { diff --git a/hw/pci/pci.c b/hw/pci/pci.c index a976e46..a9875e9 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1254,6 +1254,19 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, dev->intx_routing_notifier = notifier; } +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn) +{ + bus->map_msi = map_msi_fn; +} + +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg) +{ + if (bus->map_msi) { + return bus->map_msi(bus, msg); + } + return -1; +} + /* * PCI-to-PCI bridge specification * 9.1: Interrupt routing. 
Table 9-1 diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 80408c9..9ef9a29 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -500,6 +500,18 @@ static void spapr_msi_write(void *opaque, hwaddr addr, qemu_irq_pulse(xics_get_qirq(spapr->icp, irq)); } +static int spapr_msi_get_irq(PCIBus *bus, MSIMessage msg) +{ + DeviceState *par = bus->qbus.parent; + sPAPRPHBState *sphb = (sPAPRPHBState *) par; + unsigned long addr = msg.address - sphb->msi_win_addr; + int ndev = addr >> 16; + int vec = ((addr & 0xFFFF) >> 2) | msg.data; + uint32_t irq = sphb->msi_table[ndev].irq + vec; + + return (int)irq; +} + static const MemoryRegionOps spapr_msi_ops = { /* There is no .read as the read result is undefined by PCI spec */ .read = NULL, @@ -664,6 +676,7 @@ static int _spapr_phb_init(SysBusDevice *s) sphb->lsi_table[i].irq = irq; } + pci_bus_set_map_msi_fn(bus, spapr_msi_get_irq); return 0; } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index d309416..587f53e 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -472,6 +472,8 @@ static unsigned virtio_pci_get_features(DeviceState *d) return proxy->host_features; } +extern int spapr_msi_get_irq(PCIBus *bus, MSIMessage *msg); + static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, unsigned int queue_no, unsigned int vector, @@ -481,7 +483,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, int ret; if (irqfd->users == 0) { - ret = kvm_irqchip_add_msi_route(kvm_state, msg); + ret = pci_bus_map_msi(proxy->pci_dev.bus, msg); + if (ret < 0) { + ret = kvm_irqchip_add_msi_route(kvm_state, msg); + } if (ret < 0) { return ret; } @@ -609,14 +614,23 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, VirtQueue *vq = virtio_get_queue(proxy->vdev, queue_no); EventNotifier *n = virtio_queue_get_guest_notifier(vq); VirtIOIRQFD *irqfd; - int ret = 0; + int ret = 0, tmp; if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; - if (irqfd->msg.data != 
msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); - if (ret < 0) { - return ret; + + tmp = pci_bus_map_msi(proxy->pci_dev.bus, msg); + if (tmp >= 0) { + if (irqfd->virq != tmp) { + fprintf(stderr, "FIXME: MSI(-X) vector has changed from %X to %x\n", + irqfd->virq, tmp); + } + } else { + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); + if (ret < 0) { + return ret; + } } } } diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 8797802..632739a 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -332,6 +332,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev); typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level); typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); +typedef int (*pci_map_msi_fn)(PCIBus *bus, MSIMessage msg); typedef enum { PCI_HOTPLUG_DISABLED, @@ -375,6 +376,9 @@ bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); void pci_bus_fire_intx_routing_notifier(PCIBus *bus); void pci_device_set_intx_routing_notifier(PCIDevice *dev, PCIINTxRoutingNotifier notifier); +void pci_bus_set_map_msi_fn(PCIBus *bus, pci_map_msi_fn map_msi_fn); +int pci_bus_map_msi(PCIBus *bus, MSIMessage msg); + void pci_device_reset(PCIDevice *dev); void pci_bus_reset(PCIBus *bus); diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h index 66762f6..81efd2b 100644 --- a/include/hw/pci/pci_bus.h +++ b/include/hw/pci/pci_bus.h @@ -16,6 +16,7 @@ struct PCIBus { pci_set_irq_fn set_irq; pci_map_irq_fn map_irq; pci_route_irq_fn route_intx_to_irq; + pci_map_msi_fn map_msi; pci_hotplug_fn hotplug; DeviceState *hotplug_qdev; void *irq_opaque;
At the moment QEMU creates a route for every MSI IRQ. Now we are about to add IRQFD support on PPC64-pseries platform. pSeries already has in-kernel emulated interrupt controller with 8192 IRQs. Also, pSeries PHB already supports MSIMessage to IRQ mapping as a part of PAPR requirements for MSI/MSIX guests. Specifically, the pSeries guest does not touch MSIMessage's at all, instead it uses rtas_ibm_change_msi and rtas_ibm_query_interrupt_source rtas calls to do the mapping. Therefore we do not really need more routing than we got already. The patch introduces the infrastructure to enable direct IRQ mapping. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- The patch is raw and ugly indeed, I made it only to demonstrate the idea and see if it has right to live or not. For some reason which I do not really understand (limited GSI numbers?) the existing code always adds routing and I do not see why we would need it. Thanks! --- hw/misc/vfio.c | 11 +++++++++-- hw/pci/pci.c | 13 +++++++++++++ hw/ppc/spapr_pci.c | 13 +++++++++++++ hw/virtio/virtio-pci.c | 26 ++++++++++++++++++++------ include/hw/pci/pci.h | 4 ++++ include/hw/pci/pci_bus.h | 1 + 6 files changed, 60 insertions(+), 8 deletions(-)