Message ID | ee9ba574e6ccc27d33585d09f0bd66c1b6051379.1318843693.git.jan.kiszka@siemens.com |
---|---|
State | New |
Headers | show |
On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: > This optimization was only required to keep KVM route usage low. Now > that we solve that problem via lazy updates, we can drop the field. We > still need interfaces to clear pending vectors, though (and we have to > make use of them more broadly - but that's unrelated to this patch). > > Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> Lazy updates should be an implementation detail. IMO resource tracking of vectors makes sense as an API. Making devices deal with pending vectors as a concept, IMO, does not. > --- > hw/ivshmem.c | 16 ++----------- > hw/msix.c | 62 +++++++++++------------------------------------------- > hw/msix.h | 5 +-- > hw/pci.h | 2 - > hw/virtio-pci.c | 20 +++++++---------- > 5 files changed, 26 insertions(+), 79 deletions(-) > > diff --git a/hw/ivshmem.c b/hw/ivshmem.c > index 242fbea..a402c98 100644 > --- a/hw/ivshmem.c > +++ b/hw/ivshmem.c > @@ -535,10 +535,8 @@ static uint64_t ivshmem_get_size(IVShmemState * s) { > return value; > } > > -static void ivshmem_setup_msi(IVShmemState * s) { > - > - int i; > - > +static void ivshmem_setup_msi(IVShmemState *s) > +{ > /* allocate the MSI-X vectors */ > > memory_region_init(&s->msix_bar, "ivshmem-msix", 4096); > @@ -551,11 +549,6 @@ static void ivshmem_setup_msi(IVShmemState * s) { > exit(1); > } > > - /* 'activate' the vectors */ > - for (i = 0; i < s->vectors; i++) { > - msix_vector_use(&s->dev, i); > - } > - > /* allocate Qemu char devices for receiving interrupts */ > s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry)); > } > @@ -581,7 +574,7 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id) > IVSHMEM_DPRINTF("ivshmem_load\n"); > > IVShmemState *proxy = opaque; > - int ret, i; > + int ret; > > if (version_id > 0) { > return -EINVAL; > @@ -599,9 +592,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id) > > if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) { > msix_load(&proxy->dev, f); > 
- for (i = 0; i < proxy->vectors; i++) { > - msix_vector_use(&proxy->dev, i); > - } > } else { > proxy->intrstatus = qemu_get_be32(f); > proxy->intrmask = qemu_get_be32(f); > diff --git a/hw/msix.c b/hw/msix.c > index ce3375a..f1b97b5 100644 > --- a/hw/msix.c > +++ b/hw/msix.c > @@ -292,9 +292,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries, > if (nentries > MSIX_MAX_ENTRIES) > return -EINVAL; > > - dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES * > - sizeof *dev->msix_entry_used); > - > dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE); > msix_mask_all(dev, nentries); > > @@ -317,21 +314,9 @@ err_config: > memory_region_destroy(&dev->msix_mmio); > g_free(dev->msix_table_page); > dev->msix_table_page = NULL; > - g_free(dev->msix_entry_used); > - dev->msix_entry_used = NULL; > return ret; > } > > -static void msix_free_irq_entries(PCIDevice *dev) > -{ > - int vector; > - > - for (vector = 0; vector < dev->msix_entries_nr; ++vector) { > - dev->msix_entry_used[vector] = 0; > - msix_clr_pending(dev, vector); > - } > -} > - > /* Clean up resources for the device. 
*/ > int msix_uninit(PCIDevice *dev, MemoryRegion *bar) > { > @@ -340,14 +325,11 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar) > } > pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH); > dev->msix_cap = 0; > - msix_free_irq_entries(dev); > dev->msix_entries_nr = 0; > memory_region_del_subregion(bar, &dev->msix_mmio); > memory_region_destroy(&dev->msix_mmio); > g_free(dev->msix_table_page); > dev->msix_table_page = NULL; > - g_free(dev->msix_entry_used); > - dev->msix_entry_used = NULL; > > kvm_msix_free(dev); > g_free(dev->msix_cache); > @@ -376,7 +358,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f) > return; > } > > - msix_free_irq_entries(dev); > qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE); > qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8); > } > @@ -407,7 +388,7 @@ void msix_notify(PCIDevice *dev, unsigned vector) > { > MSIMessage msg; > > - if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) > + if (vector >= dev->msix_entries_nr) > return; > if (msix_is_masked(dev, vector)) { > msix_set_pending(dev, vector); > @@ -424,48 +405,31 @@ void msix_reset(PCIDevice *dev) > if (!msix_present(dev)) { > return; > } > - msix_free_irq_entries(dev); > + msix_clear_all_vectors(dev); > dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &= > ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET]; > memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE); > msix_mask_all(dev, dev->msix_entries_nr); > } > > -/* PCI spec suggests that devices make it possible for software to configure > - * less vectors than supported by the device, but does not specify a standard > - * mechanism for devices to do so. > - * > - * We support this by asking devices to declare vectors software is going to > - * actually use, and checking this on the notification path. Devices that > - * don't want to follow the spec suggestion can declare all vectors as used. */ > - > -/* Mark vector as used. 
*/ > -int msix_vector_use(PCIDevice *dev, unsigned vector) > +/* Clear pending vector. */ > +void msix_clear_vector(PCIDevice *dev, unsigned vector) > { > - if (vector >= dev->msix_entries_nr) > - return -EINVAL; > - ++dev->msix_entry_used[vector]; > - return 0; > -} > - > -/* Mark vector as unused. */ > -void msix_vector_unuse(PCIDevice *dev, unsigned vector) > -{ > - if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) { > - return; > - } > - if (--dev->msix_entry_used[vector]) { > - return; > + if (msix_present(dev) && vector < dev->msix_entries_nr) { > + msix_clr_pending(dev, vector); > } > - msix_clr_pending(dev, vector); > } > > -void msix_unuse_all_vectors(PCIDevice *dev) > +void msix_clear_all_vectors(PCIDevice *dev) > { > + unsigned int vector; > + > if (!msix_present(dev)) { > return; > } > - msix_free_irq_entries(dev); > + for (vector = 0; vector < dev->msix_entries_nr; ++vector) { > + msix_clr_pending(dev, vector); > + } > } > > /* Invoke the notifier if vector entry is used and unmasked. 
*/ > @@ -476,7 +440,7 @@ msix_notify_if_unmasked(PCIDevice *dev, unsigned int vector, bool masked) > > assert(dev->msix_vector_config_notifier); > > - if (!dev->msix_entry_used[vector] || msix_is_masked(dev, vector)) { > + if (msix_is_masked(dev, vector)) { > return 0; > } > msix_message_from_vector(dev, vector, &msg); > diff --git a/hw/msix.h b/hw/msix.h > index 978f417..9cd54cf 100644 > --- a/hw/msix.h > +++ b/hw/msix.h > @@ -21,9 +21,8 @@ int msix_present(PCIDevice *dev); > > uint32_t msix_bar_size(PCIDevice *dev); > > -int msix_vector_use(PCIDevice *dev, unsigned vector); > -void msix_vector_unuse(PCIDevice *dev, unsigned vector); > -void msix_unuse_all_vectors(PCIDevice *dev); > +void msix_clear_vector(PCIDevice *dev, unsigned vector); > +void msix_clear_all_vectors(PCIDevice *dev); > > void msix_notify(PCIDevice *dev, unsigned vector); > > diff --git a/hw/pci.h b/hw/pci.h > index d7a652e..5cf9a16 100644 > --- a/hw/pci.h > +++ b/hw/pci.h > @@ -178,8 +178,6 @@ struct PCIDevice { > uint8_t *msix_table_page; > /* MMIO index used to map MSIX table and pending bit entries. */ > MemoryRegion msix_mmio; > - /* Reference-count for entries actually in use by driver. 
*/ > - unsigned *msix_entry_used; > /* Region including the MSI-X table */ > uint32_t msix_bar_size; > /* Version id needed for VMState */ > diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c > index 85d6771..5004d7d 100644 > --- a/hw/virtio-pci.c > +++ b/hw/virtio-pci.c > @@ -136,9 +136,6 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f) > } else { > proxy->vdev->config_vector = VIRTIO_NO_VECTOR; > } > - if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) { > - return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector); > - } > return 0; > } > > @@ -152,9 +149,6 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f) > vector = VIRTIO_NO_VECTOR; > } > virtio_queue_set_vector(proxy->vdev, n, vector); > - if (vector != VIRTIO_NO_VECTOR) { > - return msix_vector_use(&proxy->pci_dev, vector); > - } > return 0; > } > > @@ -304,7 +298,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) > if (pa == 0) { > virtio_pci_stop_ioeventfd(proxy); > virtio_reset(proxy->vdev); > - msix_unuse_all_vectors(&proxy->pci_dev); > + msix_clear_all_vectors(&proxy->pci_dev); > } > else > virtio_queue_set_addr(vdev, vdev->queue_sel, pa); > @@ -331,7 +325,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) > > if (vdev->status == 0) { > virtio_reset(proxy->vdev); > - msix_unuse_all_vectors(&proxy->pci_dev); > + msix_clear_all_vectors(&proxy->pci_dev); > } > > /* Linux before 2.6.34 sets the device as OK without enabling > @@ -343,18 +337,20 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) > } > break; > case VIRTIO_MSI_CONFIG_VECTOR: > - msix_vector_unuse(&proxy->pci_dev, vdev->config_vector); > + msix_clear_vector(&proxy->pci_dev, vdev->config_vector); > /* Make it possible for guest to discover an error took place. 
*/ > - if (msix_vector_use(&proxy->pci_dev, val) < 0) > + if (val >= vdev->nvectors) { > val = VIRTIO_NO_VECTOR; > + } > vdev->config_vector = val; > break; > case VIRTIO_MSI_QUEUE_VECTOR: > - msix_vector_unuse(&proxy->pci_dev, > + msix_clear_vector(&proxy->pci_dev, > virtio_queue_vector(vdev, vdev->queue_sel)); > /* Make it possible for guest to discover an error took place. */ > - if (msix_vector_use(&proxy->pci_dev, val) < 0) > + if (val >= vdev->nvectors) { > val = VIRTIO_NO_VECTOR; > + } > virtio_queue_set_vector(vdev, vdev->queue_sel, val); > break; > default: > -- > 1.7.3.4
On 2011-10-17 17:48, Michael S. Tsirkin wrote: > On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: >> This optimization was only required to keep KVM route usage low. Now >> that we solve that problem via lazy updates, we can drop the field. We >> still need interfaces to clear pending vectors, though (and we have to >> make use of them more broadly - but that's unrelated to this patch). >> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > Lazy updates should be an implementation detail. > IMO resource tracking of vectors makes sense > as an API. Making devices deal with pending > vectors as a concept, IMO, does not. There is really no use for tracking the vector lifecycle once we have lazy updates (except for static routes). It's a way too invasive concept, and it's not needed for anything but KVM. If you want an example, check http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and compare it to the changes done to hpet in this series. Jan
On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote: > On 2011-10-17 17:48, Michael S. Tsirkin wrote: > > On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: > >> This optimization was only required to keep KVM route usage low. Now > >> that we solve that problem via lazy updates, we can drop the field. We > >> still need interfaces to clear pending vectors, though (and we have to > >> make use of them more broadly - but that's unrelated to this patch). > >> > >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > > > > Lazy updates should be an implementation detail. > > IMO resource tracking of vectors makes sense > > as an API. Making devices deal with pending > > vectors as a concept, IMO, does not. > > There is really no use for tracking the vector lifecycle once we have > lazy updates (except for static routes). It's a way too invasive > concept, and it's not needed for anything but KVM. I think it's needed. The PCI spec states that when the device does not need an interrupt anymore, it should clear the pending bit. The use/unuse is IMO a decent API for this, because it uses a familiar resource tracking concept. Exposing this knowledge of msix to devices seems like a worse API. > > If you want an example, check > http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and > compare it to the changes done to hpet in this series. > > Jan > This seems to be a general argument that lazy updates are good? I have no real problem with them, besides the fact that we need an API to reserve space in the routing table so that device setup can fail upfront.
On 2011-10-18 13:58, Michael S. Tsirkin wrote: > On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote: >> On 2011-10-17 17:48, Michael S. Tsirkin wrote: >>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: >>>> This optimization was only required to keep KVM route usage low. Now >>>> that we solve that problem via lazy updates, we can drop the field. We >>>> still need interfaces to clear pending vectors, though (and we have to >>>> make use of them more broadly - but that's unrelated to this patch). >>>> >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>> >>> Lazy updates should be an implementation detail. >>> IMO resource tracking of vectors makes sense >>> as an API. Making devices deal with pending >>> vectors as a concept, IMO, does not. >> >> There is really no use for tracking the vector lifecycle once we have >> lazy updates (except for static routes). It's a way too invasive >> concept, and it's not needed for anything but KVM. > > I think it's needed. The PCI spec states that when the device > does not need an interrupt anymore, it should clear the pending > bit. That should be done explicitly if it is required outside existing clearing points. We already have that service, it's called msix_clear_vector. That alone does not justify msix_vector_use and all the state and logic behind it IMHO. > The use/unuse is IMO a decent API for this, > because it uses a familiar resource tracking concept. > Exposing this knowledge of msix to devices seems > like a worse API. > >> >> If you want an example, check >> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and >> compare it to the changes done to hpet in this series. >> >> Jan >> > > This seems to be a general argument that lazy updates are good? > I have no real problem with them, besides the fact that > we need an API to reserve space in the routing > table so that device setup can fail upfront. 
That's not possible, even with used vectors, as devices change their vector usage depending on how the guest configures the devices. If you (pre-)allocate all possible vectors, you may run out of resources earlier than needed actually. That's also why we do those data == 0 checks to skip used but unconfigured vectors. Jan
On Tue, Oct 18, 2011 at 02:08:59PM +0200, Jan Kiszka wrote: > On 2011-10-18 13:58, Michael S. Tsirkin wrote: > > On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote: > >> On 2011-10-17 17:48, Michael S. Tsirkin wrote: > >>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: > >>>> This optimization was only required to keep KVM route usage low. Now > >>>> that we solve that problem via lazy updates, we can drop the field. We > >>>> still need interfaces to clear pending vectors, though (and we have to > >>>> make use of them more broadly - but that's unrelated to this patch). > >>>> > >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >>> > >>> Lazy updates should be an implementation detail. > >>> IMO resource tracking of vectors makes sense > >>> as an API. Making devices deal with pending > >>> vectors as a concept, IMO, does not. > >> > >> There is really no use for tracking the vector lifecycle once we have > >> lazy updates (except for static routes). It's a way too invasive > >> concept, and it's not needed for anything but KVM. > > > > I think it's needed. The PCI spec states that when the device > > does not need an interrupt anymore, it should clear the pending > > bit. > > That should be done explicitly if it is required outside existing > clearing points. We already have that service, it's called > msix_clear_vector. We do? I don't seem to see it upstream... > That alone does not justify msix_vector_use and all > the state and logic behind it IMHO. To me it looks like an abstraction that solves both this problem and the resource allocation problem. Resources are actually limited BTW, this is not just a KVM thing. qemu.git currently lets guests decide what to do with them, but it might turn out to be beneficial to warn the management application that it is shooting itself in the foot. > > The use/unuse is IMO a decent API for this, > > because it uses a familiar resource tracking concept. 
> > Exposing this knowledge of msix to devices seems > > like a worse API. > > > >> > >> If you want an example, check > >> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and > >> compare it to the changes done to hpet in this series. > >> > >> Jan > >> > > > > This seems to be a general argument that lazy updates are good? > > I have no real problem with them, besides the fact that > > we need an API to reserve space in the routing > > table so that device setup can fail upfront. > > That's not possible, even with used vectors, as devices change their > vector usage depending on how the guest configures the devices. If you > (pre-)allocate all possible vectors, you may run out of resources > earlier than needed actually. This really depends, but please do look at how with virtio we report resource shortage to guest and let it fall back to level interrupts. You seem to remove that capability. I actually would not mind preallocating everything upfront which is much easier. But with your patch we get a silent failure or a drastic slowdown which is much more painful IMO. > That's also why we do those data == 0 > checks to skip used but unconfigured vectors. > > Jan These checks work more or less by luck BTW. It's a hack which I hope lazy allocation will replace. > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-18 14:33, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 02:08:59PM +0200, Jan Kiszka wrote: >> On 2011-10-18 13:58, Michael S. Tsirkin wrote: >>> On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote: >>>> On 2011-10-17 17:48, Michael S. Tsirkin wrote: >>>>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: >>>>>> This optimization was only required to keep KVM route usage low. Now >>>>>> that we solve that problem via lazy updates, we can drop the field. We >>>>>> still need interfaces to clear pending vectors, though (and we have to >>>>>> make use of them more broadly - but that's unrelated to this patch). >>>>>> >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> >>>>> >>>>> Lazy updates should be an implementation detail. >>>>> IMO resource tracking of vectors makes sense >>>>> as an API. Making devices deal with pending >>>>> vectors as a concept, IMO, does not. >>>> >>>> There is really no use for tracking the vector lifecycle once we have >>>> lazy updates (except for static routes). It's a way too invasive >>>> concept, and it's not needed for anything but KVM. >>> >>> I think it's needed. The PCI spec states that when the device >>> does not need an interrupt anymore, it should clear the pending >>> bit. >> >> That should be done explicitly if it is required outside existing >> clearing points. We already have that service, it's called >> msix_clear_vector. > > We do? I don't seem to see it upstream... True. From the device's POV, MSI-X (and also MSI!) vectors are actually level-triggered. So we should communicate the level to the MSI core and not just the edge. Needs more fixing > >> That alone does not justify msix_vector_use and all >> the state and logic behind it IMHO. > > To me it looks like an abstraction that solves both > this problem and the resource allocation problem. > Resources are actually limited BTW, this is not just > a KVM thing. 
qemu.git currently lets guests decide > what to do with them, but it might turn out to > be benefitial to warn the management application > that it is shooting itself in the foot. > >>> The use/unuse is IMO a decent API for this, >>> because it uses a familiar resource tracking concept. >>> Exposing this knowledge of msix to devices seems >>> like a worse API. >>> >>>> >>>> If you want an example, check >>>> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and >>>> compare it to the changes done to hpet in this series. >>>> >>>> Jan >>>> >>> >>> This seems to be a general argument that lazy updates are good? >>> I have no real problem with them, besides the fact that >>> we need an API to reserve space in the routing >>> table so that device setup can fail upfront. >> >> That's not possible, even with used vectors, as devices change their >> vector usage depending on how the guest configures the devices. If you >> (pre-)allocate all possible vectors, you may run out of resources >> earlier than needed actually. > > This really depends, but please do look at how with virtio > we report resource shortage to guest and let it fall back to > level interrups. You seem to remove that capability. To my understanding, virtio will be the exception as no other device will have a chance to react on resource shortage while sending(!) an MSI message. > > I actually would not mind preallocating everything upfront which is much > easier. But with your patch we get a silent failure or a drastic > slowdown which is much more painful IMO. Again: did we already saw that limit? And where does it come from if not from KVM? > >> That's also why we do those data == 0 >> checks to skip used but unconfigured vectors. >> >> Jan > > These checks work more or less by luck BTW. It's > a hack which I hope lazy allocation will replace. The check is still valid (for x86) when we have to use static routes (device assignment, vhost). For lazy updates, it's obsolete, that's true. Jan
On Tue, Oct 18, 2011 at 02:38:36PM +0200, Jan Kiszka wrote: > On 2011-10-18 14:33, Michael S. Tsirkin wrote: > > On Tue, Oct 18, 2011 at 02:08:59PM +0200, Jan Kiszka wrote: > >> On 2011-10-18 13:58, Michael S. Tsirkin wrote: > >>> On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote: > >>>> On 2011-10-17 17:48, Michael S. Tsirkin wrote: > >>>>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote: > >>>>>> This optimization was only required to keep KVM route usage low. Now > >>>>>> that we solve that problem via lazy updates, we can drop the field. We > >>>>>> still need interfaces to clear pending vectors, though (and we have to > >>>>>> make use of them more broadly - but that's unrelated to this patch). > >>>>>> > >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> > >>>>> > >>>>> Lazy updates should be an implementation detail. > >>>>> IMO resource tracking of vectors makes sense > >>>>> as an API. Making devices deal with pending > >>>>> vectors as a concept, IMO, does not. > >>>> > >>>> There is really no use for tracking the vector lifecycle once we have > >>>> lazy updates (except for static routes). It's a way too invasive > >>>> concept, and it's not needed for anything but KVM. > >>> > >>> I think it's needed. The PCI spec states that when the device > >>> does not need an interrupt anymore, it should clear the pending > >>> bit. > >> > >> That should be done explicitly if it is required outside existing > >> clearing points. We already have that service, it's called > >> msix_clear_vector. > > > > We do? I don't seem to see it upstream... > > True. From the device's POV, MSI-X (and also MSI!) vectors are actually > level-triggered. This definitely takes adjusting to. > So we should communicate the level to the MSI core and > not just the edge. Needs more fixing > > > > >> That alone does not justify msix_vector_use and all > >> the state and logic behind it IMHO. 
> > > > To me it looks like an abstraction that solves both > > this problem and the resource allocation problem. > > Resources are actually limited BTW, this is not just > > a KVM thing. qemu.git currently lets guests decide > > what to do with them, but it might turn out to > > be benefitial to warn the management application > > that it is shooting itself in the foot. > > > >>> The use/unuse is IMO a decent API for this, > >>> because it uses a familiar resource tracking concept. > >>> Exposing this knowledge of msix to devices seems > >>> like a worse API. > >>> > >>>> > >>>> If you want an example, check > >>>> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and > >>>> compare it to the changes done to hpet in this series. > >>>> > >>>> Jan > >>>> > >>> > >>> This seems to be a general argument that lazy updates are good? > >>> I have no real problem with them, besides the fact that > >>> we need an API to reserve space in the routing > >>> table so that device setup can fail upfront. > >> > >> That's not possible, even with used vectors, as devices change their > >> vector usage depending on how the guest configures the devices. If you > >> (pre-)allocate all possible vectors, you may run out of resources > >> earlier than needed actually. > > > > This really depends, but please do look at how with virtio > > we report resource shortage to guest and let it fall back to > > level interrups. You seem to remove that capability. > > To my understanding, virtio will be the exception as no other device > will have a chance to react on resource shortage while sending(!) an MSI > message. Hmm, are you familiar with that spec? This is not what virtio does, resource shortage is detected during setup. This is exactly the problem with lazy registration as you don't allocate until it's too late. > > > > I actually would not mind preallocating everything upfront which is much > > easier. 
But with your patch we get a silent failure or a drastic > > slowdown which is much more painful IMO. > > Again: did we already saw that limit? And where does it come from if not > from KVM? It's a hardware limitation of intel APICs. interrupt vector is encoded in an 8 bit field in msi address. So you can have at most 256 of these. > > > >> That's also why we do those data == 0 > >> checks to skip used but unconfigured vectors. > >> > >> Jan > > > > These checks work more or less by luck BTW. It's > > a hack which I hope lazy allocation will replace. > > The check is still valid (for x86) when we have to use static routes > (device assignment, vhost). It's not valid at all - we are just lucky that linux and windows guests seem to zero out the vector when it's not in use. They do not have to do that. > For lazy updates, it's obsolete, that's true. > > Jan > > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-18 14:48, Michael S. Tsirkin wrote: >> To my understanding, virtio will be the exception as no other device >> will have a chance to react on resource shortage while sending(!) an MSI >> message. > > Hmm, are you familiar with that spec? Not by heart. > This is not what virtio does, > resource shortage is detected during setup. > This is exactly the problem with lazy registration as you don't > allocate until it's too late. When is that setup phase? Does it actually come after every change to an MSI vector? I doubt so. Thus virtio can only estimate the guest usage as well (a guest may or may not actually write a non-null data into a vector and unmask it). > >>> >>> I actually would not mind preallocating everything upfront which is much >>> easier. But with your patch we get a silent failure or a drastic >>> slowdown which is much more painful IMO. >> >> Again: did we already saw that limit? And where does it come from if not >> from KVM? > > It's a hardware limitation of intel APICs. interrupt vector is encoded > in an 8 bit field in msi address. So you can have at most 256 of these. There should be no such limitation with pseudo GSIs we use for MSI injection. They end up as MSI messages again, so actually 256 (-reserved vectors) * number-of-cpus (on x86). > >>> >>>> That's also why we do those data == 0 >>>> checks to skip used but unconfigured vectors. >>>> >>>> Jan >>> >>> These checks work more or less by luck BTW. It's >>> a hack which I hope lazy allocation will replace. >> >> The check is still valid (for x86) when we have to use static routes >> (device assignment, vhost). > > It's not valid at all - we are just lucky that linux and > windows guests seem to zero out the vector when it's not in use. > They do not have to do that. It is valid as it is just an optimization. If an unused vector has a non-null data field, we just redundantly register a route where we do not actually have to. 
But we do need to be prepared for potentially arriving messages on that virtual GSI, either via irqfd or kvm device assignment. Jan
On Tue, Oct 18, 2011 at 03:00:29PM +0200, Jan Kiszka wrote: > On 2011-10-18 14:48, Michael S. Tsirkin wrote: > >> To my understanding, virtio will be the exception as no other device > >> will have a chance to react on resource shortage while sending(!) an MSI > >> message. > > > > Hmm, are you familiar with that spec? > > Not by heart. > > > This is not what virtio does, > > resource shortage is detected during setup. > > This is exactly the problem with lazy registration as you don't > > allocate until it's too late. > > When is that setup phase? Does it actually come after every change to an > MSI vector? I doubt so. No. During setup, driver requests vectors from the OS, and then tells the device which vector should each VQ use. It then checks that the assignment was successful. If not, it retries with less vectors. Other devices can do this during initialization, and signal resource availability to guest using msix vector number field. > Thus virtio can only estimate the guest usage as > well At some level, this is fundamental: some guest operations have no failure mode. So we must preallocate some resources to make sure they won't fail. > (a guest may or may not actually write a non-null data into a > vector and unmask it). Please, forget the non-NULL thing. virtio driver knows exactly how many vectors we use and communicates this info to the device. This is not uncommon at all. > > > >>> > >>> I actually would not mind preallocating everything upfront which is much > >>> easier. But with your patch we get a silent failure or a drastic > >>> slowdown which is much more painful IMO. > >> > >> Again: did we already saw that limit? And where does it come from if not > >> from KVM? > > > > It's a hardware limitation of intel APICs. interrupt vector is encoded > > in an 8 bit field in msi address. So you can have at most 256 of these. > > There should be no such limitation with pseudo GSIs we use for MSI > injection. 
They end up as MSI messages again, so actually 256 (-reserved > vectors) * number-of-cpus (on x86). This limits which CPUs can get the interrupt though. Linux seems to have a global pool as it wants to be able to freely balance vectors between CPUs. Or, consider a guest with a single CPU :) Anyway, why argue - there is a limitation, and it's not coming from KVM, right? > > > >>> > >>>> That's also why we do those data == 0 > >>>> checks to skip used but unconfigured vectors. > >>>> > >>>> Jan > >>> > >>> These checks work more or less by luck BTW. It's > >>> a hack which I hope lazy allocation will replace. > >> > >> The check is still valid (for x86) when we have to use static routes > >> (device assignment, vhost). > > > > It's not valid at all - we are just lucky that linux and > > windows guests seem to zero out the vector when it's not in use. > > They do not have to do that. > > It is valid as it is just an optimization. If an unused vector has a > non-null data field, we just redundantly register a route where we do > not actually have to. Well, the only reason we even have this code is because it was claimed that some devices declare support for a huge number of vectors which then go unused. So if the guest does not do this we'll run out of vectors ... > But we do need to be prepared And ATM, we aren't, and probably can't be without kernel changes, right? > for potentially > arriving messages on that virtual GSI, either via irqfd or kvm device > assignment. > > Jan Why irqfd? Device assignment is ATM the only place where we use these ugly hacks. > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-18 15:37, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 03:00:29PM +0200, Jan Kiszka wrote: >> On 2011-10-18 14:48, Michael S. Tsirkin wrote: >>>> To my understanding, virtio will be the exception as no other device >>>> will have a chance to react on resource shortage while sending(!) an MSI >>>> message. >>> >>> Hmm, are you familiar with that spec? >> >> Not by heart. >> >>> This is not what virtio does, >>> resource shortage is detected during setup. >>> This is exactly the problem with lazy registration as you don't >>> allocate until it's too late. >> >> When is that setup phase? Does it actually come after every change to an >> MSI vector? I doubt so. > > No. During setup, driver requests vectors from the OS, and then tells > the device which vector should each VQ use. It then checks that the > assignment was successful. If not, it retries with less vectors. > > Other devices can do this during initialization, and signal > resource availability to guest using msix vector number field. > >> Thus virtio can only estimate the guest usage as >> well > > At some level, this is fundamental: some guest operations > have no failure mode. So we must preallocate > some resources to make sure they won't fail. We can still track the expected maximum number of active vectors at core level, collect them from the KVM layer, and warn if we expect conflicts. Anxious MSI users could then refrain from using this feature, others might be fine with risking a slow-down on conflicts. > >> (a guest may or may not actually write a non-null data into a >> vector and unmask it). > > Please, forget the non-NULL thing. virtio driver knows exactly > how many vectors we use and communicates this info to the device. > This is not uncommon at all. > >>> >>>>> >>>>> I actually would not mind preallocating everything upfront which is much >>>>> easier. But with your patch we get a silent failure or a drastic >>>>> slowdown which is much more painful IMO. 
>>>> >>>> Again: did we already saw that limit? And where does it come from if not >>>> from KVM? >>> >>> It's a hardware limitation of intel APICs. interrupt vector is encoded >>> in an 8 bit field in msi address. So you can have at most 256 of these. >> >> There should be no such limitation with pseudo GSIs we use for MSI >> injection. They end up as MSI messages again, so actually 256 (-reserved >> vectors) * number-of-cpus (on x86). > > This limits which CPUs can get the interrupt though. > Linux seems to have a global pool as it wants to be able to freely > balance vectors between CPUs. Or, consider a guest with a single CPU :) > > Anyway, why argue - there is a limitation, and it's not coming from KVM, > right? No, our limit we hit with MSI message routing are first of all KVM GSIs, and there only pseudo GSIs that do not go to any interrupt controller with limited pins. That could easily be lifted in the kernel if we run into shortages in practice. > >>> >>>>> >>>>>> That's also why we do those data == 0 >>>>>> checks to skip used but unconfigured vectors. >>>>>> >>>>>> Jan >>>>> >>>>> These checks work more or less by luck BTW. It's >>>>> a hack which I hope lazy allocation will replace. >>>> >>>> The check is still valid (for x86) when we have to use static routes >>>> (device assignment, vhost). >>> >>> It's not valid at all - we are just lucky that linux and >>> windows guests seem to zero out the vector when it's not in use. >>> They do not have to do that. >> >> It is valid as it is just an optimization. If an unused vector has a >> non-null data field, we just redundantly register a route where we do >> not actually have to. > > Well, the only reason we even have this code is because > it was claimed that some devices declare support for a huge number > of vectors which then go unused. So if the guest does not > do this we'll run out of vectors ... 
> >> But we do need to be prepared > > And ATM, we aren't, and probably can't be without kernel > changes, right? > >> for potentially >> arriving messages on that virtual GSI, either via irqfd or kvm device >> assignment. >> >> Jan > > Why irqfd? Device assignment is ATM the only place where we use these > ugly hacks. vfio will use irqfds. And the fact that virtio is partly out of the picture is only because we know much more about virtio internals (specifically: "will not advertise more vectors than guests will want to use"). Jan
On Tue, Oct 18, 2011 at 03:46:06PM +0200, Jan Kiszka wrote: > On 2011-10-18 15:37, Michael S. Tsirkin wrote: > > On Tue, Oct 18, 2011 at 03:00:29PM +0200, Jan Kiszka wrote: > >> On 2011-10-18 14:48, Michael S. Tsirkin wrote: > >>>> To my understanding, virtio will be the exception as no other device > >>>> will have a chance to react on resource shortage while sending(!) an MSI > >>>> message. > >>> > >>> Hmm, are you familiar with that spec? > >> > >> Not by heart. > >> > >>> This is not what virtio does, > >>> resource shortage is detected during setup. > >>> This is exactly the problem with lazy registration as you don't > >>> allocate until it's too late. > >> > >> When is that setup phase? Does it actually come after every change to an > >> MSI vector? I doubt so. > > > > No. During setup, driver requests vectors from the OS, and then tells > > the device which vector should each VQ use. It then checks that the > > assignment was successful. If not, it retries with less vectors. > > > > Other devices can do this during initialization, and signal > > resource availability to guest using msix vector number field. > > > >> Thus virtio can only estimate the guest usage as > >> well > > > > At some level, this is fundamental: some guest operations > > have no failure mode. So we must preallocate > > some resources to make sure they won't fail. > > We can still track the expected maximum number of active vectors at core > level, collect them from the KVM layer, and warn if we expect conflicts. > Anxious MSI users could then refrain from using this feature, others > might be fine with risking a slow-down on conflicts. It seems like a nice feature until you have to debug it in the field :). If you really think it's worthwhile, let's add a 'force' flag so that advanced users at least can declare that they know what they are doing. > > > >> (a guest may or may not actually write a non-null data into a > >> vector and unmask it). 
> > > > Please, forget the non-NULL thing. virtio driver knows exactly > > how many vectors we use and communicates this info to the device. > > This is not uncommon at all. > > > >>> > >>>>> > >>>>> I actually would not mind preallocating everything upfront which is much > >>>>> easier. But with your patch we get a silent failure or a drastic > >>>>> slowdown which is much more painful IMO. > >>>> > >>>> Again: did we already saw that limit? And where does it come from if not > >>>> from KVM? > >>> > >>> It's a hardware limitation of intel APICs. interrupt vector is encoded > >>> in an 8 bit field in msi address. So you can have at most 256 of these. > >> > >> There should be no such limitation with pseudo GSIs we use for MSI > >> injection. They end up as MSI messages again, so actually 256 (-reserved > >> vectors) * number-of-cpus (on x86). > > > > This limits which CPUs can get the interrupt though. > > Linux seems to have a global pool as it wants to be able to freely > > balance vectors between CPUs. Or, consider a guest with a single CPU :) > > > > Anyway, why argue - there is a limitation, and it's not coming from KVM, > > right? > > No, our limit we hit with MSI message routing are first of all KVM GSIs, > and there only pseudo GSIs that do not go to any interrupt controller > with limited pins. I see KVM_MAX_IRQ_ROUTES 1024 This is > 256 so KVM does not seem to be the problem. > That could easily be lifted in the kernel if we run > into shortages in practice. What I was saying is that resources are limited even without kvm. > > > >>> > >>>>> > >>>>>> That's also why we do those data == 0 > >>>>>> checks to skip used but unconfigured vectors. > >>>>>> > >>>>>> Jan > >>>>> > >>>>> These checks work more or less by luck BTW. It's > >>>>> a hack which I hope lazy allocation will replace. > >>>> > >>>> The check is still valid (for x86) when we have to use static routes > >>>> (device assignment, vhost). 
> >>> > >>> It's not valid at all - we are just lucky that linux and > >>> windows guests seem to zero out the vector when it's not in use. > >>> They do not have to do that. > >> > >> It is valid as it is just an optimization. If an unused vector has a > >> non-null data field, we just redundantly register a route where we do > >> not actually have to. > > > > Well, the only reason we even have this code is because > > it was claimed that some devices declare support for a huge number > > of vectors which then go unused. So if the guest does not > > do this we'll run out of vectors ... > > > >> But we do need to be prepared > > > > And ATM, we aren't, and probably can't be without kernel > > changes, right? > > > >> for potentially > >> arriving messages on that virtual GSI, either via irqfd or kvm device > >> assignment. > >> > >> Jan > > > > Why irqfd? Device assignment is ATM the only place where we use these > > ugly hacks. > > vfio will use irqfds. And that virtio is partly out of the picture is > only because we know much more about virtio internals (specifically: > "will not advertise more vectors than guests will want to use"). > > Jan > > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-18 16:01, Michael S. Tsirkin wrote: >>>>>>> >>>>>>> I actually would not mind preallocating everything upfront which is much >>>>>>> easier. But with your patch we get a silent failure or a drastic >>>>>>> slowdown which is much more painful IMO. >>>>>> >>>>>> Again: did we already saw that limit? And where does it come from if not >>>>>> from KVM? >>>>> >>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded >>>>> in an 8 bit field in msi address. So you can have at most 256 of these. >>>> >>>> There should be no such limitation with pseudo GSIs we use for MSI >>>> injection. They end up as MSI messages again, so actually 256 (-reserved >>>> vectors) * number-of-cpus (on x86). >>> >>> This limits which CPUs can get the interrupt though. >>> Linux seems to have a global pool as it wants to be able to freely >>> balance vectors between CPUs. Or, consider a guest with a single CPU :) >>> >>> Anyway, why argue - there is a limitation, and it's not coming from KVM, >>> right? >> >> No, our limit we hit with MSI message routing are first of all KVM GSIs, >> and there only pseudo GSIs that do not go to any interrupt controller >> with limited pins. > > I see KVM_MAX_IRQ_ROUTES 1024 > This is > 256 so KVM does not seem to be the problem. We can generate way more different MSI messages than 256. A message may encode the target CPU, so you have this number in the equation e.g. > >> That could easily be lifted in the kernel if we run >> into shortages in practice. > > What I was saying is that resources are limited even without kvm. What other resources related to this particular case are exhausted before GSI numbers? Jan
On Tue, Oct 18, 2011 at 04:08:46PM +0200, Jan Kiszka wrote: > On 2011-10-18 16:01, Michael S. Tsirkin wrote: > >>>>>>> > >>>>>>> I actually would not mind preallocating everything upfront which is much > >>>>>>> easier. But with your patch we get a silent failure or a drastic > >>>>>>> slowdown which is much more painful IMO. > >>>>>> > >>>>>> Again: did we already saw that limit? And where does it come from if not > >>>>>> from KVM? > >>>>> > >>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded > >>>>> in an 8 bit field in msi address. So you can have at most 256 of these. > >>>> > >>>> There should be no such limitation with pseudo GSIs we use for MSI > >>>> injection. They end up as MSI messages again, so actually 256 (-reserved > >>>> vectors) * number-of-cpus (on x86). > >>> > >>> This limits which CPUs can get the interrupt though. > >>> Linux seems to have a global pool as it wants to be able to freely > >>> balance vectors between CPUs. Or, consider a guest with a single CPU :) > >>> > >>> Anyway, why argue - there is a limitation, and it's not coming from KVM, > >>> right? > >> > >> No, our limit we hit with MSI message routing are first of all KVM GSIs, > >> and there only pseudo GSIs that do not go to any interrupt controller > >> with limited pins. > > > > I see KVM_MAX_IRQ_ROUTES 1024 > > This is > 256 so KVM does not seem to be the problem. > > We can generate way more different MSI messages than 256. A message may > encode the target CPU, so you have this number in the equation e.g. Yes but the vector is encoded in 256 bits. The rest is stuff like delivery mode, which won't affect which handler is run AFAIK. So while the problem might appear with vector sharing, in practice there is no vector sharing so no problem :) > > > >> That could easily be lifted in the kernel if we run > >> into shortages in practice. > > > > What I was saying is that resources are limited even without kvm. 
> > What other resources related to this particular case are exhausted > before GSI numbers? > > Jan distinct vectors > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-18 17:08, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 04:08:46PM +0200, Jan Kiszka wrote: >> On 2011-10-18 16:01, Michael S. Tsirkin wrote: >>>>>>>>> >>>>>>>>> I actually would not mind preallocating everything upfront which is much >>>>>>>>> easier. But with your patch we get a silent failure or a drastic >>>>>>>>> slowdown which is much more painful IMO. >>>>>>>> >>>>>>>> Again: did we already saw that limit? And where does it come from if not >>>>>>>> from KVM? >>>>>>> >>>>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded >>>>>>> in an 8 bit field in msi address. So you can have at most 256 of these. >>>>>> >>>>>> There should be no such limitation with pseudo GSIs we use for MSI >>>>>> injection. They end up as MSI messages again, so actually 256 (-reserved >>>>>> vectors) * number-of-cpus (on x86). >>>>> >>>>> This limits which CPUs can get the interrupt though. >>>>> Linux seems to have a global pool as it wants to be able to freely >>>>> balance vectors between CPUs. Or, consider a guest with a single CPU :) >>>>> >>>>> Anyway, why argue - there is a limitation, and it's not coming from KVM, >>>>> right? >>>> >>>> No, our limit we hit with MSI message routing are first of all KVM GSIs, >>>> and there only pseudo GSIs that do not go to any interrupt controller >>>> with limited pins. >>> >>> I see KVM_MAX_IRQ_ROUTES 1024 >>> This is > 256 so KVM does not seem to be the problem. >> >> We can generate way more different MSI messages than 256. A message may >> encode the target CPU, so you have this number in the equation e.g. > > Yes but the vector is encoded in 256 bits. The rest is > stuff like delivery mode, which won't affect which > handler is run AFAIK. So while the problem might > appear with vector sharing, in practice there is > no vector sharing so no problem :) > >>> >>>> That could easily be lifted in the kernel if we run >>>> into shortages in practice. 
>>> >>> What I was saying is that resources are limited even without kvm. >> >> What other resources related to this particular case are exhausted >> before GSI numbers? >> >> Jan > > distinct vectors The guest is responsible for managing vectors, not KVM, not QEMU. And the guest will notice first when it runs out of them. So a virtio guest driver may not even request MSI-X support if that happens. What KVM has to do is just mapping an arbitrary MSI message (theoretically 64+32 bits, in practice it's of course much less) to a single GSI and vice versa. As there are fewer GSIs than possible MSI messages, we could run out of them when creating routes, statically or lazily. What would probably help us long-term out of your concerns regarding lazy routing is to bypass that redundant GSI translation for dynamic messages, i.e. those that are not associated with an irqfd number or an assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts address and data directly. Jan
On 2011-10-18 17:22, Jan Kiszka wrote: > What KVM has to do is just mapping an arbitrary MSI message > (theoretically 64+32 bits, in practice it's much of course much less) to ( There are 24 distinguishing bits in an MSI message on x86, but that's only a current interpretation of one specific arch. ) > a single GSI and vice versa. As there are less GSIs than possible MSI > messages, we could run out of them when creating routes, statically or > lazily. > > What would probably help us long-term out of your concerns regarding > lazy routing is to bypass that redundant GSI translation for dynamic > messages, i.e. those that are not associated with an irqfd number or an > assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > address and data directly. This would be a trivial extension in fact. Given its beneficial impact on our GSI limitation issue, I think I will hack up something like that. And maybe this makes a transparent cache more reasonable. Then only old host kernels would force us to do searches for already cached messages. Jan
On Tue, Oct 18, 2011 at 05:22:38PM +0200, Jan Kiszka wrote: > On 2011-10-18 17:08, Michael S. Tsirkin wrote: > > On Tue, Oct 18, 2011 at 04:08:46PM +0200, Jan Kiszka wrote: > >> On 2011-10-18 16:01, Michael S. Tsirkin wrote: > >>>>>>>>> > >>>>>>>>> I actually would not mind preallocating everything upfront which is much > >>>>>>>>> easier. But with your patch we get a silent failure or a drastic > >>>>>>>>> slowdown which is much more painful IMO. > >>>>>>>> > >>>>>>>> Again: did we already saw that limit? And where does it come from if not > >>>>>>>> from KVM? > >>>>>>> > >>>>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded > >>>>>>> in an 8 bit field in msi address. So you can have at most 256 of these. > >>>>>> > >>>>>> There should be no such limitation with pseudo GSIs we use for MSI > >>>>>> injection. They end up as MSI messages again, so actually 256 (-reserved > >>>>>> vectors) * number-of-cpus (on x86). > >>>>> > >>>>> This limits which CPUs can get the interrupt though. > >>>>> Linux seems to have a global pool as it wants to be able to freely > >>>>> balance vectors between CPUs. Or, consider a guest with a single CPU :) > >>>>> > >>>>> Anyway, why argue - there is a limitation, and it's not coming from KVM, > >>>>> right? > >>>> > >>>> No, our limit we hit with MSI message routing are first of all KVM GSIs, > >>>> and there only pseudo GSIs that do not go to any interrupt controller > >>>> with limited pins. > >>> > >>> I see KVM_MAX_IRQ_ROUTES 1024 > >>> This is > 256 so KVM does not seem to be the problem. > >> > >> We can generate way more different MSI messages than 256. A message may > >> encode the target CPU, so you have this number in the equation e.g. > > > > Yes but the vector is encoded in 256 bits. The rest is > > stuff like delivery mode, which won't affect which > > handler is run AFAIK. 
So while the problem might > > appear with vector sharing, in practice there is > > no vector sharing so no problem :) > > > >>> > >>>> That could easily be lifted in the kernel if we run > >>>> into shortages in practice. > >>> > >>> What I was saying is that resources are limited even without kvm. > >> > >> What other resources related to this particular case are exhausted > >> before GSI numbers? > >> > >> Jan > > > > distinct vectors > > The guest is responsible for managing vectors, not KVM, not QEMU. And > the guest will notice first when it runs out of them. So a virtio guest > driver may not even request MSI-X support if that happens. Absolutely. You can solve the problem from guest in theory. But what I was saying is, in practice what happens first X devices get msix, others don't. Guests aren't doing anything smart as they are not designed with a huge number of devices in mind. What we can do is solve the problem from management. And to do that we can't delay allocation until it's used. > What KVM has to do is just mapping an arbitrary MSI message > (theoretically 64+32 bits, in practice it's much of course much less) to > a single GSI and vice versa. As there are less GSIs than possible MSI > messages, we could run out of them when creating routes, statically or > lazily. Possible MSI messages != possible MSI vectors. If two devices share a vector, APIC won't be able to distinguish even though e.g. delivery mode is different. > What would probably help us long-term out of your concerns regarding > lazy routing is to bypass that redundant GSI translation for dynamic > messages, i.e. those that are not associated with an irqfd number or an > assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > address and data directly. > > Jan You are trying to work around the problem by not requiring any resources per MSI vector. This just might work for some uses (ioctl) but isn't a generic solution (e.g. won't work for irqfd). 
> -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-18 17:56, Michael S. Tsirkin wrote: >> What would probably help us long-term out of your concerns regarding >> lazy routing is to bypass that redundant GSI translation for dynamic >> messages, i.e. those that are not associated with an irqfd number or an >> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts >> address and data directly. >> >> Jan > > You are trying to work around the problem by not requiring > any resources per MSI vector. This just might work for some > uses (ioctl) but isn't a generic solution (e.g. won't work for irqfd). irqfd is not affected anymore in that model as it cannot participate in lazy routing anyway. Jan
On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: > On 2011-10-18 17:22, Jan Kiszka wrote: > > What KVM has to do is just mapping an arbitrary MSI message > > (theoretically 64+32 bits, in practice it's much of course much less) to > > ( There are 24 distinguishing bits in an MSI message on x86, but that's > only a current interpretation of one specific arch. ) Confused. vector mask is 8 bits. The rest is destination id etc. > > a single GSI and vice versa. As there are less GSIs than possible MSI > > messages, we could run out of them when creating routes, statically or > > lazily. > > > > What would probably help us long-term out of your concerns regarding > > lazy routing is to bypass that redundant GSI translation for dynamic > > messages, i.e. those that are not associated with an irqfd number or an > > assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > > address and data directly. > > This would be a trivial extension in fact. Given its beneficial impact > on our GSI limitation issue, I think I will hack up something like that. > > And maybe this makes a transparent cache more reasonable. Then only old > host kernels would force us to do searches for already cached messages. > > Jan Hmm, I'm not all that sure. Existing design really allows caching the route in various smart ways. We currently do this for irqfd but this can be extended to ioctls. If we just let the guest inject arbitrary messages, that becomes much more complex. Another concern is mask bit emulation. We currently handle mask bit in userspace but patches to do them in kernel for assigned devices were seen and IMO we might want to do that for virtio as well. For that to work the mask bit needs to be tied to a specific gsi or specific device, which does not work if we just inject arbitrary writes.
On 2011-10-18 19:06, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: >> On 2011-10-18 17:22, Jan Kiszka wrote: >>> What KVM has to do is just mapping an arbitrary MSI message >>> (theoretically 64+32 bits, in practice it's much of course much less) to >> >> ( There are 24 distinguishing bits in an MSI message on x86, but that's >> only a current interpretation of one specific arch. ) > > Confused. vector mask is 8 bits. the rest is destination id etc. Right, but those additional bits like the destination make different messages. We have to encode those 24 bits into a unique GSI number and restore them (by table lookup) on APIC injection inside the kernel. If we only had to encode 256 different vectors, we would be done already. > >>> a single GSI and vice versa. As there are less GSIs than possible MSI >>> messages, we could run out of them when creating routes, statically or >>> lazily. >>> >>> What would probably help us long-term out of your concerns regarding >>> lazy routing is to bypass that redundant GSI translation for dynamic >>> messages, i.e. those that are not associated with an irqfd number or an >>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts >>> address and data directly. >> >> This would be a trivial extension in fact. Given its beneficial impact >> on our GSI limitation issue, I think I will hack up something like that. >> >> And maybe this makes a transparent cache more reasonable. Then only old >> host kernels would force us to do searches for already cached messages. >> >> Jan > > Hmm, I'm not all that sure. Existing design really allows > caching the route in various smart ways. We currently do > this for irqfd but this can be extended to ioctls. > If we just let the guest inject arbitrary messages, > that becomes much more complex. irqfd and kvm device assignment do not allow us to inject arbitrary messages at arbitrary points. 
The new API offers kvm_msi_irqfd_set and kvm_device_msix_set_vector (etc.) for those scenarios to set static routes from an MSI message to a GSI number (+they configure the related backends). > > Another concern is mask bit emulation. We currently > handle mask bit in userspace but patches > to do them in kernel for assigned devices where seen > and IMO we might want to do that for virtio as well. > > For that to work the mask bit needs to be tied to > a specific gsi or specific device, which does not > work if we just inject arbitrary writes. Yes, but I do not see those valuable plans being negatively affected. Jan
On 2011-10-18 19:06, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: >> On 2011-10-18 17:22, Jan Kiszka wrote: >>> What KVM has to do is just mapping an arbitrary MSI message >>> (theoretically 64+32 bits, in practice it's much of course much less) to >> >> ( There are 24 distinguishing bits in an MSI message on x86, but that's >> only a current interpretation of one specific arch. ) > > Confused. vector mask is 8 bits. the rest is destination id etc. Right, but those additional bits like the destination make different messages. We have to encode those 24 bits into a unique GSI number and restore them (by table lookup) on APIC injection inside the kernel. If we only had to encode 256 different vectors, we would be done already. > >>> a single GSI and vice versa. As there are less GSIs than possible MSI >>> messages, we could run out of them when creating routes, statically or >>> lazily. >>> >>> What would probably help us long-term out of your concerns regarding >>> lazy routing is to bypass that redundant GSI translation for dynamic >>> messages, i.e. those that are not associated with an irqfd number or an >>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts >>> address and data directly. >> >> This would be a trivial extension in fact. Given its beneficial impact >> on our GSI limitation issue, I think I will hack up something like that. >> >> And maybe this makes a transparent cache more reasonable. Then only old >> host kernels would force us to do searches for already cached messages. >> >> Jan > > Hmm, I'm not all that sure. Existing design really allows > caching the route in various smart ways. We currently do > this for irqfd but this can be extended to ioctls. > If we just let the guest inject arbitrary messages, > that becomes much more complex. irqfd and kvm device assignment do not allow us to inject arbitrary messages at arbitrary points. 
The new API offers kvm_msi_irqfd_set and kvm_device_msix_set_vector (etc.) for those scenarios to set static routes from an MSI message to a GSI number (+they configure the related backends). > > Another concern is mask bit emulation. We currently > handle mask bit in userspace but patches > to do them in kernel for assigned devices where seen > and IMO we might want to do that for virtio as well. > > For that to work the mask bit needs to be tied to > a specific gsi or specific device, which does not > work if we just inject arbitrary writes. Yes, but I do not see those valuable plans being negatively affected. Jan
On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote: > On 2011-10-18 19:06, Michael S. Tsirkin wrote: > > On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: > >> On 2011-10-18 17:22, Jan Kiszka wrote: > >>> What KVM has to do is just mapping an arbitrary MSI message > >>> (theoretically 64+32 bits, in practice it's much of course much less) to > >> > >> ( There are 24 distinguishing bits in an MSI message on x86, but that's > >> only a current interpretation of one specific arch. ) > > > > Confused. vector mask is 8 bits. the rest is destination id etc. > > Right, but those additional bits like the destination make different > messages. We have to encode those 24 bits into a unique GSI number and > restore them (by table lookup) on APIC injection inside the kernel. If > we only had to encode 256 different vectors, we would be done already. Right. But in practice guests always use distinct vectors (from the 256 available) for distinct messages. This is because the vector seems to be the only thing that gets communicated by the APIC to the software. So e.g. a table with 256 entries, with extra 1024-256 used for spill-over for guests that do something unexpected, would work really well. > > > >>> a single GSI and vice versa. As there are less GSIs than possible MSI > >>> messages, we could run out of them when creating routes, statically or > >>> lazily. > >>> > >>> What would probably help us long-term out of your concerns regarding > >>> lazy routing is to bypass that redundant GSI translation for dynamic > >>> messages, i.e. those that are not associated with an irqfd number or an > >>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > >>> address and data directly. > >> > >> This would be a trivial extension in fact. Given its beneficial impact > >> on our GSI limitation issue, I think I will hack up something like that. > >> > >> And maybe this makes a transparent cache more reasonable. 
Then only old > >> host kernels would force us to do searches for already cached messages. > >> > >> Jan > > > > Hmm, I'm not all that sure. Existing design really allows > > caching the route in various smart ways. We currently do > > this for irqfd but this can be extended to ioctls. > > If we just let the guest inject arbitrary messages, > > that becomes much more complex. > > irqfd and kvm device assignment do not allow us to inject arbitrary > messages at arbitrary points. The new API offers kvm_msi_irqfd_set and > kvm_device_msix_set_vector (etc.) for those scenarios to set static > routes from an MSI message to a GSI number (+they configure the related > backends). Yes, it's a very flexible API but it would be very hard to optimize. GSIs let us do the slow path setup, but they make it easy to optimize target lookup in kernel. An analogy would be if read/write operated on file paths. fd makes it easy to do permission checks and slow lookups in one place. GSI happens to work like this (maybe, by accident). > > > > Another concern is mask bit emulation. We currently > > handle mask bit in userspace but patches > > to do them in kernel for assigned devices where seen > > and IMO we might want to do that for virtio as well. > > > > For that to work the mask bit needs to be tied to > > a specific gsi or specific device, which does not > > work if we just inject arbitrary writes. > > Yes, but I do not see those valuable plans being negatively affected. > > Jan > I do. How would we maintain a mask/pending bit in kernel if we are not supplied info on all available vectors even?
On 2011-10-18 20:40, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote: >> On 2011-10-18 19:06, Michael S. Tsirkin wrote: >>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: >>>> On 2011-10-18 17:22, Jan Kiszka wrote: >>>>> What KVM has to do is just mapping an arbitrary MSI message >>>>> (theoretically 64+32 bits, in practice it's of course much less) to >>>> >>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's >>>> only a current interpretation of one specific arch. ) >>> >>> Confused. vector mask is 8 bits. the rest is destination id etc. >> >> Right, but those additional bits like the destination make different >> messages. We have to encode those 24 bits into a unique GSI number and >> restore them (by table lookup) on APIC injection inside the kernel. If >> we only had to encode 256 different vectors, we would be done already. > > Right. But in practice guests always use distinct vectors (from the > 256 available) for distinct messages. This is because > the vector seems to be the only thing that gets communicated by the APIC > to the software. > > So e.g. a table with 256 entries, with extra 1024-256 > used for spill-over for guests that do something unexpected, > would work really well. Already Linux manages vectors on a per-CPU basis. For efficiency reasons, it does not exploit the full range of 256 vectors but actually allocates them in - IIRC - steps of 16. So I would not be surprised to find lots of vector number "collisions" when looking over a full set of CPUs in a system. Really, these considerations do not help us. We must store all 96 bits, already for the sake of other KVM architectures that want MSI routing. > > >>> >>>>> a single GSI and vice versa. As there are less GSIs than possible MSI >>>>> messages, we could run out of them when creating routes, statically or >>>>> lazily. 
>>>>> >>>>> What would probably help us long-term out of your concerns regarding >>>>> lazy routing is to bypass that redundant GSI translation for dynamic >>>>> messages, i.e. those that are not associated with an irqfd number or an >>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts >>>>> address and data directly. >>>> >>>> This would be a trivial extension in fact. Given its beneficial impact >>>> on our GSI limitation issue, I think I will hack up something like that. >>>> >>>> And maybe this makes a transparent cache more reasonable. Then only old >>>> host kernels would force us to do searches for already cached messages. >>>> >>>> Jan >>> >>> Hmm, I'm not all that sure. Existing design really allows >>> caching the route in various smart ways. We currently do >>> this for irqfd but this can be extended to ioctls. >>> If we just let the guest inject arbitrary messages, >>> that becomes much more complex. >> >> irqfd and kvm device assignment do not allow us to inject arbitrary >> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and >> kvm_device_msix_set_vector (etc.) for those scenarios to set static >> routes from an MSI message to a GSI number (+they configure the related >> backends). > > Yes, it's a very flexible API but it would be very hard to optimize. > GSIs let us do the slow path setup, but they make it easy > to optimize target lookup in kernel. Users of the API above have no need to know anything about GSIs. They are an artifact of the KVM-internal interface between user space and kernel now - thanks to the MSIRoutingCache encapsulation. > > An analogy would be if read/write operated on file paths. > fd makes it easy to do permission checks and slow lookups > in one place. GSI happens to work like this (maybe, by accident). Think of an opaque file handle as a MSIRoutingCache object. 
And it encodes not only the routing handle but also other useful associated information we need from time to time - internally, not in the device models. >>> >>> Another concern is mask bit emulation. We currently >>> handle mask bit in userspace but patches >>> to do them in kernel for assigned devices were seen >>> and IMO we might want to do that for virtio as well. >>> >>> For that to work the mask bit needs to be tied to >>> a specific gsi or specific device, which does not >>> work if we just inject arbitrary writes. >> >> Yes, but I do not see those valuable plans being negatively affected. >> >> Jan >> > > I do. > How would we maintain a mask/pending bit in kernel if we are not > supplied info on all available vectors even? It's tricky to discuss an undefined interface (there only exists an outdated proposal for kvm device assignment). But I suppose that user space will have to define the maximum number of vectors when creating an in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. The number of used vectors will correlate with the number of registered irqfds (in the case of vhost or vfio, device assignment still has SET_MSIX_NR). As kernel space would then be responsible for mask processing, user space would keep vectors registered with irqfds, even if they are masked. It could just continue to play the trick and drop data=0 vectors. The point here is: All those steps have _nothing_ to do with the generic MSI-X core. They are KVM-specific "side channels" for which KVM provides an API. In contrast, msix_vector_use/unuse were generic services that were actually created to please KVM requirements. But if we split that up, we can address the generic MSI-X requirements in a way that makes more sense for emulated devices (and particularly msix_vector_use makes no sense for emulation). Jan
On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote: > On 2011-10-18 20:40, Michael S. Tsirkin wrote: > > On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote: > >> On 2011-10-18 19:06, Michael S. Tsirkin wrote: > >>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: > >>>> On 2011-10-18 17:22, Jan Kiszka wrote: > >>>>> What KVM has to do is just mapping an arbitrary MSI message > >>>>> (theoretically 64+32 bits, in practice it's much of course much less) to > >>>> > >>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's > >>>> only a current interpretation of one specific arch. ) > >>> > >>> Confused. vector mask is 8 bits. the rest is destination id etc. > >> > >> Right, but those additional bits like the destination make different > >> messages. We have to encode those 24 bits into a unique GSI number and > >> restore them (by table lookup) on APIC injection inside the kernel. If > >> we only had to encode 256 different vectors, we would be done already. > > > > Right. But in practice guests always use distinct vectors (from the > > 256 available) for distinct messages. This is because > > the vector seems to be the only thing that gets communicated by the APIC > > to the software. > > > > So e.g. a table with 256 entries, with extra 1024-256 > > used for spill-over for guests that do something unexpected, > > would work really well. > > Already Linux manages vectors on a pre-CPU basis. For efficiency > reasons, it does not exploit the full range of 256 vectors but actually > allocates them in - IIRC - steps of 16. So I would not be surprised to > find lots of vector number "collisions" when looking over a full set of > CPUs in a system. > > Really, these considerations do not help us. We must store all 96 bits, > already for the sake of other KVM architectures that want MSI routing. > > > > > >>> > >>>>> a single GSI and vice versa. 
As there are less GSIs than possible MSI > >>>>> messages, we could run out of them when creating routes, statically or > >>>>> lazily. > >>>>> > >>>>> What would probably help us long-term out of your concerns regarding > >>>>> lazy routing is to bypass that redundant GSI translation for dynamic > >>>>> messages, i.e. those that are not associated with an irqfd number or an > >>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > >>>>> address and data directly. > >>>> > >>>> This would be a trivial extension in fact. Given its beneficial impact > >>>> on our GSI limitation issue, I think I will hack up something like that. > >>>> > >>>> And maybe this makes a transparent cache more reasonable. Then only old > >>>> host kernels would force us to do searches for already cached messages. > >>>> > >>>> Jan > >>> > >>> Hmm, I'm not all that sure. Existing design really allows > >>> caching the route in various smart ways. We currently do > >>> this for irqfd but this can be extended to ioctls. > >>> If we just let the guest inject arbitrary messages, > >>> that becomes much more complex. > >> > >> irqfd and kvm device assignment do not allow us to inject arbitrary > >> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and > >> kvm_device_msix_set_vector (etc.) for those scenarios to set static > >> routes from an MSI message to a GSI number (+they configure the related > >> backends). > > > > Yes, it's a very flexible API but it would be very hard to optimize. > > GSIs let us do the slow path setup, but they make it easy > > to optimize target lookup in kernel. > > Users of the API above have no need to know anything about GSIs. They > are an artifact of the KVM-internal interface between user space and > kernel now - thanks to the MSIRoutingCache encapsulation. Yes but I am saying that the API above can't be implemented more efficiently than now: you will have to scan all apics on each MSI. 
The GSI implementation can be optimized: decode the vector once, if it matches a single vcpu, store that vcpu and use when sending interrupts. > > > > An analogy would be if read/write operated on file paths. > > fd makes it easy to do permission checks and slow lookups > > in one place. GSI happens to work like this (maybe, by accident). > > Think of an opaque file handle as a MSIRoutingCache object. And it > encodes not only the routing handle but also other useful associated > information we need from time to time - internally, not in the device > models. Forget qemu abstractions, I am talking about data path optimizations in kernel in kvm. From that POV the point of an fd is not that it is opaque. It is that it's an index in an array that can be used for fast lookups. > >>> > >>> Another concern is mask bit emulation. We currently > >>> handle mask bit in userspace but patches > >>> to do them in kernel for assigned devices where seen > >>> and IMO we might want to do that for virtio as well. > >>> > >>> For that to work the mask bit needs to be tied to > >>> a specific gsi or specific device, which does not > >>> work if we just inject arbitrary writes. > >> > >> Yes, but I do not see those valuable plans being negatively affected. > >> > >> Jan > >> > > > > I do. > > How would we maintain a mask/pending bit in kernel if we are not > > supplied info on all available vectors even? > > It's tricky to discuss an undefined interface (there only exists an > outdated proposal for kvm device assignment). But I suppose that user > space will have to define the maximum number of vectors when creating an > in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. > > The number of used vectors will correlate with the number of registered > irqfds (in the case of vhost or vfio, device assignment still has > SET_MSIX_NR). 
As kernel space would then be responsible for mask > processing, user space would keep vectors registered with irqfds, even > if they are masked. It could just continue to play the trick and drop > data=0 vectors. Which trick? We don't play any tricks except for device assignment. > The point here is: All those steps have _nothing_ to do with the generic > MSI-X core. They are KVM-specific "side channels" for which KVM provides > an API. In contrast, msix_vector_use/unuse were generic services that > were actually created to please KVM requirements. But if we split that > up, we can address the generic MSI-X requirements in a way that makes > more sense for emulated devices (and particularly msix_vector_use makes > no sense for emulation). > > Jan > We need at least msix_vector_unuse - IMO it makes more sense than "clear pending vector". msix_vector_use is good to keep around for symmetry: who knows whether we'll need to allocate resources per vector in the future.
On 2011-10-18 23:40, Michael S. Tsirkin wrote: > On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote: >> On 2011-10-18 20:40, Michael S. Tsirkin wrote: >>> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote: >>>> On 2011-10-18 19:06, Michael S. Tsirkin wrote: >>>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: >>>>>> On 2011-10-18 17:22, Jan Kiszka wrote: >>>>>>> What KVM has to do is just mapping an arbitrary MSI message >>>>>>> (theoretically 64+32 bits, in practice it's much of course much less) to >>>>>> >>>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's >>>>>> only a current interpretation of one specific arch. ) >>>>> >>>>> Confused. vector mask is 8 bits. the rest is destination id etc. >>>> >>>> Right, but those additional bits like the destination make different >>>> messages. We have to encode those 24 bits into a unique GSI number and >>>> restore them (by table lookup) on APIC injection inside the kernel. If >>>> we only had to encode 256 different vectors, we would be done already. >>> >>> Right. But in practice guests always use distinct vectors (from the >>> 256 available) for distinct messages. This is because >>> the vector seems to be the only thing that gets communicated by the APIC >>> to the software. >>> >>> So e.g. a table with 256 entries, with extra 1024-256 >>> used for spill-over for guests that do something unexpected, >>> would work really well. >> >> Already Linux manages vectors on a pre-CPU basis. For efficiency >> reasons, it does not exploit the full range of 256 vectors but actually >> allocates them in - IIRC - steps of 16. So I would not be surprised to >> find lots of vector number "collisions" when looking over a full set of >> CPUs in a system. >> >> Really, these considerations do not help us. We must store all 96 bits, >> already for the sake of other KVM architectures that want MSI routing. >>> >>> >>>>> >>>>>>> a single GSI and vice versa. 
As there are less GSIs than possible MSI >>>>>>> messages, we could run out of them when creating routes, statically or >>>>>>> lazily. >>>>>>> >>>>>>> What would probably help us long-term out of your concerns regarding >>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic >>>>>>> messages, i.e. those that are not associated with an irqfd number or an >>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts >>>>>>> address and data directly. >>>>>> >>>>>> This would be a trivial extension in fact. Given its beneficial impact >>>>>> on our GSI limitation issue, I think I will hack up something like that. >>>>>> >>>>>> And maybe this makes a transparent cache more reasonable. Then only old >>>>>> host kernels would force us to do searches for already cached messages. >>>>>> >>>>>> Jan >>>>> >>>>> Hmm, I'm not all that sure. Existing design really allows >>>>> caching the route in various smart ways. We currently do >>>>> this for irqfd but this can be extended to ioctls. >>>>> If we just let the guest inject arbitrary messages, >>>>> that becomes much more complex. >>>> >>>> irqfd and kvm device assignment do not allow us to inject arbitrary >>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and >>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static >>>> routes from an MSI message to a GSI number (+they configure the related >>>> backends). >>> >>> Yes, it's a very flexible API but it would be very hard to optimize. >>> GSIs let us do the slow path setup, but they make it easy >>> to optimize target lookup in kernel. >> >> Users of the API above have no need to know anything about GSIs. They >> are an artifact of the KVM-internal interface between user space and >> kernel now - thanks to the MSIRoutingCache encapsulation. > > Yes but I am saying that the API above can't be implemented > more efficiently than now: you will have to scan all apics on each MSI. 
> The GSI implementation can be optimized: decode the vector once, > if it matches a single vcpu, store that vcpu and use when sending > interrupts. Sorry, missed that you switched to kernel. What information do you want to cache there that cannot be easily obtained by looking at a concrete message? I do not see any. Once you checked that the delivery mode targets a specific cpu, you could address it directly. Or are you thinking about some cluster mode? > > >>> >>> An analogy would be if read/write operated on file paths. >>> fd makes it easy to do permission checks and slow lookups >>> in one place. GSI happens to work like this (maybe, by accident). >> >> Think of an opaque file handle as a MSIRoutingCache object. And it >> encodes not only the routing handle but also other useful associated >> information we need from time to time - internally, not in the device >> models. > > Forget qemu abstractions, I am talking about data path > optimizations in kernel in kvm. From that POV the point of an fd is not > that it is opaque. It is that it's an index in an array that > can be used for fast lookups. > >>>>> >>>>> Another concern is mask bit emulation. We currently >>>>> handle mask bit in userspace but patches >>>>> to do them in kernel for assigned devices where seen >>>>> and IMO we might want to do that for virtio as well. >>>>> >>>>> For that to work the mask bit needs to be tied to >>>>> a specific gsi or specific device, which does not >>>>> work if we just inject arbitrary writes. >>>> >>>> Yes, but I do not see those valuable plans being negatively affected. >>>> >>>> Jan >>>> >>> >>> I do. >>> How would we maintain a mask/pending bit in kernel if we are not >>> supplied info on all available vectors even? >> >> It's tricky to discuss an undefined interface (there only exists an >> outdated proposal for kvm device assignment). But I suppose that user >> space will have to define the maximum number of vectors when creating an >> in-kernel MSI-X MMIO area. 
The device already has to tell this to msix_init. >> >> The number of used vectors will correlate with the number of registered >> irqfds (in the case of vhost or vfio, device assignment still has >> SET_MSIX_NR). As kernel space would then be responsible for mask >> processing, user space would keep vectors registered with irqfds, even >> if they are masked. It could just continue to play the trick and drop >> data=0 vectors. > > Which trick? We don't play any tricks except for device assignment. > >> The point here is: All those steps have _nothing_ to do with the generic >> MSI-X core. They are KVM-specific "side channels" for which KVM provides >> an API. In contrast, msix_vector_use/unuse were generic services that >> were actually created to please KVM requirements. But if we split that >> up, we can address the generic MSI-X requirements in a way that makes >> more sense for emulated devices (and particularly msix_vector_use makes >> no sense for emulation). >> >> Jan >> > > We need at least msix_vector_unuse Not at all. We rather need some qemu_irq_set(level) for MSI. The spec requires that the device clears pending when the reason for that is removed. And any removal that is device model-originated should simply be signaled like an irq de-assert. Vector "unusage" is just one reason here. > - IMO it makes more sense than "clear > pending vector". msix_vector_use is good to keep around for symmetry: > who knows whether we'll need to allocate resources per vector > in the future. For MSI[-X], the spec is already there, and we know that there no need for further resources when emulating it. Only KVM has special needs. Jan
On Wed, Oct 19, 2011 at 12:13:49AM +0200, Jan Kiszka wrote: > On 2011-10-18 23:40, Michael S. Tsirkin wrote: > > On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote: > >> On 2011-10-18 20:40, Michael S. Tsirkin wrote: > >>> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote: > >>>> On 2011-10-18 19:06, Michael S. Tsirkin wrote: > >>>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: > >>>>>> On 2011-10-18 17:22, Jan Kiszka wrote: > >>>>>>> What KVM has to do is just mapping an arbitrary MSI message > >>>>>>> (theoretically 64+32 bits, in practice it's much of course much less) to > >>>>>> > >>>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's > >>>>>> only a current interpretation of one specific arch. ) > >>>>> > >>>>> Confused. vector mask is 8 bits. the rest is destination id etc. > >>>> > >>>> Right, but those additional bits like the destination make different > >>>> messages. We have to encode those 24 bits into a unique GSI number and > >>>> restore them (by table lookup) on APIC injection inside the kernel. If > >>>> we only had to encode 256 different vectors, we would be done already. > >>> > >>> Right. But in practice guests always use distinct vectors (from the > >>> 256 available) for distinct messages. This is because > >>> the vector seems to be the only thing that gets communicated by the APIC > >>> to the software. > >>> > >>> So e.g. a table with 256 entries, with extra 1024-256 > >>> used for spill-over for guests that do something unexpected, > >>> would work really well. > >> > >> Already Linux manages vectors on a pre-CPU basis. For efficiency > >> reasons, it does not exploit the full range of 256 vectors but actually > >> allocates them in - IIRC - steps of 16. So I would not be surprised to > >> find lots of vector number "collisions" when looking over a full set of > >> CPUs in a system. > >> > >> Really, these considerations do not help us. 
We must store all 96 bits, > >> already for the sake of other KVM architectures that want MSI routing. > >>> > >>> > >>>>> > >>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI > >>>>>>> messages, we could run out of them when creating routes, statically or > >>>>>>> lazily. > >>>>>>> > >>>>>>> What would probably help us long-term out of your concerns regarding > >>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic > >>>>>>> messages, i.e. those that are not associated with an irqfd number or an > >>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > >>>>>>> address and data directly. > >>>>>> > >>>>>> This would be a trivial extension in fact. Given its beneficial impact > >>>>>> on our GSI limitation issue, I think I will hack up something like that. > >>>>>> > >>>>>> And maybe this makes a transparent cache more reasonable. Then only old > >>>>>> host kernels would force us to do searches for already cached messages. > >>>>>> > >>>>>> Jan > >>>>> > >>>>> Hmm, I'm not all that sure. Existing design really allows > >>>>> caching the route in various smart ways. We currently do > >>>>> this for irqfd but this can be extended to ioctls. > >>>>> If we just let the guest inject arbitrary messages, > >>>>> that becomes much more complex. > >>>> > >>>> irqfd and kvm device assignment do not allow us to inject arbitrary > >>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and > >>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static > >>>> routes from an MSI message to a GSI number (+they configure the related > >>>> backends). > >>> > >>> Yes, it's a very flexible API but it would be very hard to optimize. > >>> GSIs let us do the slow path setup, but they make it easy > >>> to optimize target lookup in kernel. > >> > >> Users of the API above have no need to know anything about GSIs. 
They > >> are an artifact of the KVM-internal interface between user space and > >> kernel now - thanks to the MSIRoutingCache encapsulation. > > > > Yes but I am saying that the API above can't be implemented > > more efficiently than now: you will have to scan all apics on each MSI. > > The GSI implementation can be optimized: decode the vector once, > > if it matches a single vcpu, store that vcpu and use when sending > > interrupts. > > Sorry, missed that you switched to kernel. > > What information do you want to cache there that cannot be easily > obtained by looking at a concrete message? I do not see any. Once you > checked that the delivery mode targets a specific cpu, you could address > it directly. I thought we need to match APIC ID. That needs a table lookup, no? > Or are you thinking about some cluster mode? That too. > > > > > >>> > >>> An analogy would be if read/write operated on file paths. > >>> fd makes it easy to do permission checks and slow lookups > >>> in one place. GSI happens to work like this (maybe, by accident). > >> > >> Think of an opaque file handle as a MSIRoutingCache object. And it > >> encodes not only the routing handle but also other useful associated > >> information we need from time to time - internally, not in the device > >> models. > > > > Forget qemu abstractions, I am talking about data path > > optimizations in kernel in kvm. From that POV the point of an fd is not > > that it is opaque. It is that it's an index in an array that > > can be used for fast lookups. > > > >>>>> > >>>>> Another concern is mask bit emulation. We currently > >>>>> handle mask bit in userspace but patches > >>>>> to do them in kernel for assigned devices where seen > >>>>> and IMO we might want to do that for virtio as well. > >>>>> > >>>>> For that to work the mask bit needs to be tied to > >>>>> a specific gsi or specific device, which does not > >>>>> work if we just inject arbitrary writes. 
> >>>> > >>>> Yes, but I do not see those valuable plans being negatively affected. > >>>> > >>>> Jan > >>>> > >>> > >>> I do. > >>> How would we maintain a mask/pending bit in kernel if we are not > >>> supplied info on all available vectors even? > >> > >> It's tricky to discuss an undefined interface (there only exists an > >> outdated proposal for kvm device assignment). But I suppose that user > >> space will have to define the maximum number of vectors when creating an > >> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. > >> > >> The number of used vectors will correlate with the number of registered > >> irqfds (in the case of vhost or vfio, device assignment still has > >> SET_MSIX_NR). As kernel space would then be responsible for mask > >> processing, user space would keep vectors registered with irqfds, even > >> if they are masked. It could just continue to play the trick and drop > >> data=0 vectors. > > > > Which trick? We don't play any tricks except for device assignment. > > > >> The point here is: All those steps have _nothing_ to do with the generic > >> MSI-X core. They are KVM-specific "side channels" for which KVM provides > >> an API. In contrast, msix_vector_use/unuse were generic services that > >> were actually created to please KVM requirements. But if we split that > >> up, we can address the generic MSI-X requirements in a way that makes > >> more sense for emulated devices (and particularly msix_vector_use makes > >> no sense for emulation). > >> > >> Jan > >> > > > > We need at least msix_vector_unuse > > Not at all. We rather need some qemu_irq_set(level) for MSI. > The spec > requires that the device clears pending when the reason for that is > removed. And any removal that is device model-originated should simply > be signaled like an irq de-assert. OK, this is a good argument. 
In particular virtio ISR read could clear msix pending bit (note: it would also need to clear irqfd as that is where we get the pending bit). I would prefer not to use qemu_irq_set for this though. We can add a level flag to msix_notify. > Vector "unusage" is just one reason here. I don't see removing the use/unuse functions as a priority though, but if we add an API that also lets devices say 'reason for interrupt is removed', that would be nice. Removing extra code can then be done separately, and on qemu.git not on qemu-kvm. > > - IMO it makes more sense than "clear > pending vector". msix_vector_use is good to keep around for symmetry: > who knows whether we'll need to allocate resources per vector > in the future. > > For MSI[-X], the spec is already there, and we know that there is no need > for further resources when emulating it. > Only KVM has special needs. > > Jan > It's not hard to speculate. Imagine an out-of-process device that shares guest memory and sends interrupts to qemu using eventfd. Suddenly we need an fd per vector, and this without KVM.
On 2011-10-19 02:56, Michael S. Tsirkin wrote: > On Wed, Oct 19, 2011 at 12:13:49AM +0200, Jan Kiszka wrote: >> On 2011-10-18 23:40, Michael S. Tsirkin wrote: >>> On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote: >>>> On 2011-10-18 20:40, Michael S. Tsirkin wrote: >>>>> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote: >>>>>> On 2011-10-18 19:06, Michael S. Tsirkin wrote: >>>>>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote: >>>>>>>> On 2011-10-18 17:22, Jan Kiszka wrote: >>>>>>>>> What KVM has to do is just mapping an arbitrary MSI message >>>>>>>>> (theoretically 64+32 bits, in practice it's much of course much less) to >>>>>>>> >>>>>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's >>>>>>>> only a current interpretation of one specific arch. ) >>>>>>> >>>>>>> Confused. vector mask is 8 bits. the rest is destination id etc. >>>>>> >>>>>> Right, but those additional bits like the destination make different >>>>>> messages. We have to encode those 24 bits into a unique GSI number and >>>>>> restore them (by table lookup) on APIC injection inside the kernel. If >>>>>> we only had to encode 256 different vectors, we would be done already. >>>>> >>>>> Right. But in practice guests always use distinct vectors (from the >>>>> 256 available) for distinct messages. This is because >>>>> the vector seems to be the only thing that gets communicated by the APIC >>>>> to the software. >>>>> >>>>> So e.g. a table with 256 entries, with extra 1024-256 >>>>> used for spill-over for guests that do something unexpected, >>>>> would work really well. >>>> >>>> Already Linux manages vectors on a pre-CPU basis. For efficiency >>>> reasons, it does not exploit the full range of 256 vectors but actually >>>> allocates them in - IIRC - steps of 16. So I would not be surprised to >>>> find lots of vector number "collisions" when looking over a full set of >>>> CPUs in a system. 
>>>> >>>> Really, these considerations do not help us. We must store all 96 bits, >>>> already for the sake of other KVM architectures that want MSI routing. >>>>> >>>>> >>>>>>> >>>>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI >>>>>>>>> messages, we could run out of them when creating routes, statically or >>>>>>>>> lazily. >>>>>>>>> >>>>>>>>> What would probably help us long-term out of your concerns regarding >>>>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic >>>>>>>>> messages, i.e. those that are not associated with an irqfd number or an >>>>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts >>>>>>>>> address and data directly. >>>>>>>> >>>>>>>> This would be a trivial extension in fact. Given its beneficial impact >>>>>>>> on our GSI limitation issue, I think I will hack up something like that. >>>>>>>> >>>>>>>> And maybe this makes a transparent cache more reasonable. Then only old >>>>>>>> host kernels would force us to do searches for already cached messages. >>>>>>>> >>>>>>>> Jan >>>>>>> >>>>>>> Hmm, I'm not all that sure. Existing design really allows >>>>>>> caching the route in various smart ways. We currently do >>>>>>> this for irqfd but this can be extended to ioctls. >>>>>>> If we just let the guest inject arbitrary messages, >>>>>>> that becomes much more complex. >>>>>> >>>>>> irqfd and kvm device assignment do not allow us to inject arbitrary >>>>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and >>>>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static >>>>>> routes from an MSI message to a GSI number (+they configure the related >>>>>> backends). >>>>> >>>>> Yes, it's a very flexible API but it would be very hard to optimize. >>>>> GSIs let us do the slow path setup, but they make it easy >>>>> to optimize target lookup in kernel. >>>> >>>> Users of the API above have no need to know anything about GSIs. 
They >>>> are an artifact of the KVM-internal interface between user space and >>>> kernel now - thanks to the MSIRoutingCache encapsulation. >>> >>> Yes but I am saying that the API above can't be implemented >>> more efficiently than now: you will have to scan all apics on each MSI. >>> The GSI implementation can be optimized: decode the vector once, >>> if it matches a single vcpu, store that vcpu and use when sending >>> interrupts. >> >> Sorry, missed that you switched to kernel. >> >> What information do you want to cache there that cannot be easily >> obtained by looking at a concrete message? I do not see any. Once you >> checked that the delivery mode targets a specific cpu, you could address >> it directly. > > I thought we need to match APIC ID. That needs a table lookup, no? Yes. But that's completely independent of a concrete MSI message. In fact, this is the same thing we need when interpreting an IOAPIC redirection table entry. So let's create an APIC ID lookup table for the destination ID field, maybe multiple of them to match different modes, but not a MSI message table. > >> Or are you thinking about some cluster mode? > > That too. > >>> >>> >>>>> >>>>> An analogy would be if read/write operated on file paths. >>>>> fd makes it easy to do permission checks and slow lookups >>>>> in one place. GSI happens to work like this (maybe, by accident). >>>> >>>> Think of an opaque file handle as a MSIRoutingCache object. And it >>>> encodes not only the routing handle but also other useful associated >>>> information we need from time to time - internally, not in the device >>>> models. >>> >>> Forget qemu abstractions, I am talking about data path >>> optimizations in kernel in kvm. From that POV the point of an fd is not >>> that it is opaque. It is that it's an index in an array that >>> can be used for fast lookups. >>> >>>>>>> >>>>>>> Another concern is mask bit emulation. 
We currently >>>>>>> handle mask bit in userspace but patches >>>>>>> to do them in kernel for assigned devices were seen >>>>>>> and IMO we might want to do that for virtio as well. >>>>>>> >>>>>>> For that to work the mask bit needs to be tied to >>>>>>> a specific gsi or specific device, which does not >>>>>>> work if we just inject arbitrary writes. >>>>>> >>>>>> Yes, but I do not see those valuable plans being negatively affected. >>>>>> >>>>>> Jan >>>>>> >>>>> >>>>> I do. >>>>> How would we maintain a mask/pending bit in kernel if we are not >>>>> supplied info on all available vectors even? >>>> >>>> It's tricky to discuss an undefined interface (there only exists an >>>> outdated proposal for kvm device assignment). But I suppose that user >>>> space will have to define the maximum number of vectors when creating an >>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. >>>> >>>> The number of used vectors will correlate with the number of registered >>>> irqfds (in the case of vhost or vfio, device assignment still has >>>> SET_MSIX_NR). As kernel space would then be responsible for mask >>>> processing, user space would keep vectors registered with irqfds, even >>>> if they are masked. It could just continue to play the trick and drop >>>> data=0 vectors. >>> >>> Which trick? We don't play any tricks except for device assignment. >>> >>>> The point here is: All those steps have _nothing_ to do with the generic >>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides >>>> an API. In contrast, msix_vector_use/unuse were generic services that >>>> were actually created to please KVM requirements. But if we split that >>>> up, we can address the generic MSI-X requirements in a way that makes >>>> more sense for emulated devices (and particularly msix_vector_use makes >>>> no sense for emulation). >>>> >>>> Jan >>>> >>> >>> We need at least msix_vector_unuse >> >> Not at all. 
We rather need some qemu_irq_set(level) for MSI. >> The spec >> requires that the device clears pending when the reason for that is >> removed. And any removal that is device model-originated should simply >> be signaled like an irq de-assert. > > OK, this is a good argument. > In particular virtio ISR read could clear msix pending bit > (note: it would also need to clear irqfd as that is where > we get the pending bit). > > I would prefer not to use qemu_irq_set for this though. > We can add a level flag to msix_notify. No concerns. > >> Vector "unusage" is just one reason here. > > I don't see removing the use/unuse functions as a priority though, > but if we add an API that also lets devices say > 'reason for interrupt is removed', that would be nice. > > Removing extra code can then be done separately, and on qemu.git > not on qemu-kvm. If we refrain from hacking KVM logic into the use/unuse services upstream, we can do this later on. For me it is important that those obsolete services do not block or complicate further cleanups of the MSI layer nor bother device model creators with tasks they should not worry about. > >>> - IMO it makes more sense than "clear >>> pending vector". msix_vector_use is good to keep around for symmetry: >>> who knows whether we'll need to allocate resources per vector >>> in the future. >> >> For MSI[-X], the spec is already there, and we know that there no need >> for further resources when emulating it. >> Only KVM has special needs. >> >> Jan >> > > It's not hard to speculate. Imagine an out of process device that > shares guest memory and sends interrupts to qemu using eventfd. Suddenly > we need an fd per vector, and this without KVM. That's what irqfd was invented for. Already works for vhost, and there is nothing that prevents communicating the irqfd fd between two processes. But note: irqfd handle, not a KVM-internal GSI. Jan
On Wed, Oct 19, 2011 at 08:41:48AM +0200, Jan Kiszka wrote: > >>>>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI > >>>>>>>>> messages, we could run out of them when creating routes, statically or > >>>>>>>>> lazily. > >>>>>>>>> > >>>>>>>>> What would probably help us long-term out of your concerns regarding > >>>>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic > >>>>>>>>> messages, i.e. those that are not associated with an irqfd number or an > >>>>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts > >>>>>>>>> address and data directly. > >>>>>>>> > >>>>>>>> This would be a trivial extension in fact. Given its beneficial impact > >>>>>>>> on our GSI limitation issue, I think I will hack up something like that. > >>>>>>>> > >>>>>>>> And maybe this makes a transparent cache more reasonable. Then only old > >>>>>>>> host kernels would force us to do searches for already cached messages. > >>>>>>>> > >>>>>>>> Jan > >>>>>>> > >>>>>>> Hmm, I'm not all that sure. Existing design really allows > >>>>>>> caching the route in various smart ways. We currently do > >>>>>>> this for irqfd but this can be extended to ioctls. > >>>>>>> If we just let the guest inject arbitrary messages, > >>>>>>> that becomes much more complex. > >>>>>> > >>>>>> irqfd and kvm device assignment do not allow us to inject arbitrary > >>>>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and > >>>>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static > >>>>>> routes from an MSI message to a GSI number (+they configure the related > >>>>>> backends). > >>>>> > >>>>> Yes, it's a very flexible API but it would be very hard to optimize. > >>>>> GSIs let us do the slow path setup, but they make it easy > >>>>> to optimize target lookup in kernel. > >>>> > >>>> Users of the API above have no need to know anything about GSIs. 
They > >>>> are an artifact of the KVM-internal interface between user space and > >>>> kernel now - thanks to the MSIRoutingCache encapsulation. > >>> > >>> Yes but I am saying that the API above can't be implemented > >>> more efficiently than now: you will have to scan all apics on each MSI. > >>> The GSI implementation can be optimized: decode the vector once, > >>> if it matches a single vcpu, store that vcpu and use when sending > >>> interrupts. > >> > >> Sorry, missed that you switched to kernel. > >> > >> What information do you want to cache there that cannot be easily > >> obtained by looking at a concrete message? I do not see any. Once you > >> checked that the delivery mode targets a specific cpu, you could address > >> it directly. > > > > I thought we need to match APIC ID. That needs a table lookup, no? > > Yes. But that's completely independent of a concrete MSI message. In > fact, this is the same thing we need when interpreting an IOAPIC > redirection table entry. So let's create an APIC ID lookup table for the > destination ID field, maybe multiple of them to match different modes, > but not a MSI message table. > > > >> Or are you thinking about some cluster mode? > > > > That too. Hmm, might be a good idea. APIC IDs are 8 bit, right? > >>> > >>> > >>>>> > >>>>> An analogy would be if read/write operated on file paths. > >>>>> fd makes it easy to do permission checks and slow lookups > >>>>> in one place. GSI happens to work like this (maybe, by accident). > >>>> > >>>> Think of an opaque file handle as a MSIRoutingCache object. And it > >>>> encodes not only the routing handle but also other useful associated > >>>> information we need from time to time - internally, not in the device > >>>> models. > >>> > >>> Forget qemu abstractions, I am talking about data path > >>> optimizations in kernel in kvm. From that POV the point of an fd is not > >>> that it is opaque. It is that it's an index in an array that > >>> can be used for fast lookups. 
> >>> > >>>>>>> > >>>>>>> Another concern is mask bit emulation. We currently > >>>>>>> handle mask bit in userspace but patches > >>>>>>> to do them in kernel for assigned devices where seen > >>>>>>> and IMO we might want to do that for virtio as well. > >>>>>>> > >>>>>>> For that to work the mask bit needs to be tied to > >>>>>>> a specific gsi or specific device, which does not > >>>>>>> work if we just inject arbitrary writes. > >>>>>> > >>>>>> Yes, but I do not see those valuable plans being negatively affected. > >>>>>> > >>>>>> Jan > >>>>>> > >>>>> > >>>>> I do. > >>>>> How would we maintain a mask/pending bit in kernel if we are not > >>>>> supplied info on all available vectors even? > >>>> > >>>> It's tricky to discuss an undefined interface (there only exists an > >>>> outdated proposal for kvm device assignment). But I suppose that user > >>>> space will have to define the maximum number of vectors when creating an > >>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. > >>>> > >>>> The number of used vectors will correlate with the number of registered > >>>> irqfds (in the case of vhost or vfio, device assignment still has > >>>> SET_MSIX_NR). As kernel space would then be responsible for mask > >>>> processing, user space would keep vectors registered with irqfds, even > >>>> if they are masked. It could just continue to play the trick and drop > >>>> data=0 vectors. > >>> > >>> Which trick? We don't play any tricks except for device assignment. > >>> > >>>> The point here is: All those steps have _nothing_ to do with the generic > >>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides > >>>> an API. In contrast, msix_vector_use/unuse were generic services that > >>>> were actually created to please KVM requirements. 
But if we split that > >>>> up, we can address the generic MSI-X requirements in a way that makes > >>>> more sense for emulated devices (and particularly msix_vector_use makes > >>>> no sense for emulation). > >>>> > >>>> Jan > >>>> > >>> > >>> We need at least msix_vector_unuse > >> > >> Not at all. We rather need some qemu_irq_set(level) for MSI. > >> The spec > >> requires that the device clears pending when the reason for that is > >> removed. And any removal that is device model-originated should simply > >> be signaled like an irq de-assert. > > > > OK, this is a good argument. > > In particular virtio ISR read could clear msix pending bit > > (note: it would also need to clear irqfd as that is where > > we get the pending bit). > > > > I would prefer not to use qemu_irq_set for this though. > > We can add a level flag to msix_notify. > > No concerns. > > > > >> Vector "unusage" is just one reason here. > > > > I don't see removing the use/unuse functions as a priority though, > > but if we add an API that also lets devices say > > 'reason for interrupt is removed', that would be nice. > > > > Removing extra code can then be done separately, and on qemu.git > > not on qemu-kvm. > > If we refrain from hacking KVM logic into the use/unuse services > upstream, we can do this later on. For me it is important that those > obsolete services do not block or complicate further cleanups of the MSI > layer nor bother device model creators with tasks they should not worry > about. My assumption is devices shall keep calling use/unuse until we drop it. Does not seem like a major bother. If you like, use all vectors or just those with message != 0. > > > >>> - IMO it makes more sense than "clear > >>> pending vector". msix_vector_use is good to keep around for symmetry: > >>> who knows whether we'll need to allocate resources per vector > >>> in the future. 
> >> > >> For MSI[-X], the spec is already there, and we know that there no need > >> for further resources when emulating it. > >> Only KVM has special needs. > >> > >> Jan > >> > > > > It's not hard to speculate. Imagine an out of process device that > > shares guest memory and sends interrupts to qemu using eventfd. Suddenly > > we need an fd per vector, and this without KVM. > > That's what irqfd was invented for. Already works for vhost, and there > is nothing that prevents communicating the irqfd fd between two > processes. But note: irqfd handle, not a KVM-internal GSI. > > Jan > Yes. But this still makes an API for acquiring per-vector resources a requirement.
On 2011-10-19 11:03, Michael S. Tsirkin wrote: >>> I thought we need to match APIC ID. That needs a table lookup, no? >> >> Yes. But that's completely independent of a concrete MSI message. In >> fact, this is the same thing we need when interpreting an IOAPIC >> redirection table entry. So let's create an APIC ID lookup table for the >> destination ID field, maybe multiple of them to match different modes, >> but not a MSI message table. >>> >>>> Or are you thinking about some cluster mode? >>> >>> That too. > > Hmm, might be a good idea. APIC IDs are 8 bit, right? Yep (more generally: destination IDs). So even if we have to create multiple lookup tables for the various modes, that won't consume megabytes of RAM. > > >>>>> >>>>> >>>>>>> >>>>>>> An analogy would be if read/write operated on file paths. >>>>>>> fd makes it easy to do permission checks and slow lookups >>>>>>> in one place. GSI happens to work like this (maybe, by accident). >>>>>> >>>>>> Think of an opaque file handle as a MSIRoutingCache object. And it >>>>>> encodes not only the routing handle but also other useful associated >>>>>> information we need from time to time - internally, not in the device >>>>>> models. >>>>> >>>>> Forget qemu abstractions, I am talking about data path >>>>> optimizations in kernel in kvm. From that POV the point of an fd is not >>>>> that it is opaque. It is that it's an index in an array that >>>>> can be used for fast lookups. >>>>> >>>>>>>>> >>>>>>>>> Another concern is mask bit emulation. We currently >>>>>>>>> handle mask bit in userspace but patches >>>>>>>>> to do them in kernel for assigned devices where seen >>>>>>>>> and IMO we might want to do that for virtio as well. >>>>>>>>> >>>>>>>>> For that to work the mask bit needs to be tied to >>>>>>>>> a specific gsi or specific device, which does not >>>>>>>>> work if we just inject arbitrary writes. >>>>>>>> >>>>>>>> Yes, but I do not see those valuable plans being negatively affected. 
>>>>>>>> >>>>>>>> Jan >>>>>>>> >>>>>>> >>>>>>> I do. >>>>>>> How would we maintain a mask/pending bit in kernel if we are not >>>>>>> supplied info on all available vectors even? >>>>>> >>>>>> It's tricky to discuss an undefined interface (there only exists an >>>>>> outdated proposal for kvm device assignment). But I suppose that user >>>>>> space will have to define the maximum number of vectors when creating an >>>>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. >>>>>> >>>>>> The number of used vectors will correlate with the number of registered >>>>>> irqfds (in the case of vhost or vfio, device assignment still has >>>>>> SET_MSIX_NR). As kernel space would then be responsible for mask >>>>>> processing, user space would keep vectors registered with irqfds, even >>>>>> if they are masked. It could just continue to play the trick and drop >>>>>> data=0 vectors. >>>>> >>>>> Which trick? We don't play any tricks except for device assignment. >>>>> >>>>>> The point here is: All those steps have _nothing_ to do with the generic >>>>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides >>>>>> an API. In contrast, msix_vector_use/unuse were generic services that >>>>>> were actually created to please KVM requirements. But if we split that >>>>>> up, we can address the generic MSI-X requirements in a way that makes >>>>>> more sense for emulated devices (and particularly msix_vector_use makes >>>>>> no sense for emulation). >>>>>> >>>>>> Jan >>>>>> >>>>> >>>>> We need at least msix_vector_unuse >>>> >>>> Not at all. We rather need some qemu_irq_set(level) for MSI. >>>> The spec >>>> requires that the device clears pending when the reason for that is >>>> removed. And any removal that is device model-originated should simply >>>> be signaled like an irq de-assert. >>> >>> OK, this is a good argument. 
>>> In particular virtio ISR read could clear msix pending bit >>> (note: it would also need to clear irqfd as that is where >>> we get the pending bit). >>> >>> I would prefer not to use qemu_irq_set for this though. >>> We can add a level flag to msix_notify. >> >> No concerns. >> >>> >>>> Vector "unusage" is just one reason here. >>> >>> I don't see removing the use/unuse functions as a priority though, >>> but if we add an API that also lets devices say >>> 'reason for interrupt is removed', that would be nice. >>> >>> Removing extra code can then be done separately, and on qemu.git >>> not on qemu-kvm. >> >> If we refrain from hacking KVM logic into the use/unuse services >> upstream, we can do this later on. For me it is important that those >> obsolete services do not block or complicate further cleanups of the MSI >> layer nor bother device model creators with tasks they should not worry >> about. > > My assumption is devices shall keep calling use/unuse until we drop it. > Does not seem like a major bother. If you like, use all vectors > or just those with message != 0. What about letting only those devices call use/unuse that sometimes need less than the maximum amount? All others would benefit from a use_all executed on enable and an unuse_all on disable/reset/uninit. > >>> >>>>> - IMO it makes more sense than "clear >>>>> pending vector". msix_vector_use is good to keep around for symmetry: >>>>> who knows whether we'll need to allocate resources per vector >>>>> in the future. >>>> >>>> For MSI[-X], the spec is already there, and we know that there is no need >>>> for further resources when emulating it. >>>> Only KVM has special needs. >>>> >>>> Jan >>>> >>> >>> It's not hard to speculate. Imagine an out of process device that >>> shares guest memory and sends interrupts to qemu using eventfd. Suddenly >>> we need an fd per vector, and this without KVM. >> >> That's what irqfd was invented for. 
Already works for vhost, and there >> is nothing that prevents communicating the irqfd fd between two >> processes. But note: irqfd handle, not a KVM-internal GSI. >> >> Jan >> > > Yes. But this still makes an API for acquiring per-vector resources a requirement. Yes, but a different one than current use/unuse. And it will be an optional one, only for those devices that need to establish irq/eventfd channels. Jan
On Wed, Oct 19, 2011 at 01:17:03PM +0200, Jan Kiszka wrote: > On 2011-10-19 11:03, Michael S. Tsirkin wrote: > >>> I thought we need to match APIC ID. That needs a table lookup, no? > >> > >> Yes. But that's completely independent of a concrete MSI message. In > >> fact, this is the same thing we need when interpreting an IOAPIC > >> redirection table entry. So let's create an APIC ID lookup table for the > >> destination ID field, maybe multiple of them to match different modes, > >> but not a MSI message table. > >>> > >>>> Or are you thinking about some cluster mode? > >>> > >>> That too. > > > > Hmm, might be a good idea. APIC IDs are 8 bit, right? > > Yep (more generally: destination IDs). So even if we have to create > multiple lookup tables for the various modes, that won't consume > megabytes of RAM. > > > > > > >>>>> > >>>>> > >>>>>>> > >>>>>>> An analogy would be if read/write operated on file paths. > >>>>>>> fd makes it easy to do permission checks and slow lookups > >>>>>>> in one place. GSI happens to work like this (maybe, by accident). > >>>>>> > >>>>>> Think of an opaque file handle as a MSIRoutingCache object. And it > >>>>>> encodes not only the routing handle but also other useful associated > >>>>>> information we need from time to time - internally, not in the device > >>>>>> models. > >>>>> > >>>>> Forget qemu abstractions, I am talking about data path > >>>>> optimizations in kernel in kvm. From that POV the point of an fd is not > >>>>> that it is opaque. It is that it's an index in an array that > >>>>> can be used for fast lookups. > >>>>> > >>>>>>>>> > >>>>>>>>> Another concern is mask bit emulation. We currently > >>>>>>>>> handle mask bit in userspace but patches > >>>>>>>>> to do them in kernel for assigned devices where seen > >>>>>>>>> and IMO we might want to do that for virtio as well. 
> >>>>>>>>> > >>>>>>>>> For that to work the mask bit needs to be tied to > >>>>>>>>> a specific gsi or specific device, which does not > >>>>>>>>> work if we just inject arbitrary writes. > >>>>>>>> > >>>>>>>> Yes, but I do not see those valuable plans being negatively affected. > >>>>>>>> > >>>>>>>> Jan > >>>>>>>> > >>>>>>> > >>>>>>> I do. > >>>>>>> How would we maintain a mask/pending bit in kernel if we are not > >>>>>>> supplied info on all available vectors even? > >>>>>> > >>>>>> It's tricky to discuss an undefined interface (there only exists an > >>>>>> outdated proposal for kvm device assignment). But I suppose that user > >>>>>> space will have to define the maximum number of vectors when creating an > >>>>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init. > >>>>>> > >>>>>> The number of used vectors will correlate with the number of registered > >>>>>> irqfds (in the case of vhost or vfio, device assignment still has > >>>>>> SET_MSIX_NR). As kernel space would then be responsible for mask > >>>>>> processing, user space would keep vectors registered with irqfds, even > >>>>>> if they are masked. It could just continue to play the trick and drop > >>>>>> data=0 vectors. > >>>>> > >>>>> Which trick? We don't play any tricks except for device assignment. > >>>>> > >>>>>> The point here is: All those steps have _nothing_ to do with the generic > >>>>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides > >>>>>> an API. In contrast, msix_vector_use/unuse were generic services that > >>>>>> were actually created to please KVM requirements. But if we split that > >>>>>> up, we can address the generic MSI-X requirements in a way that makes > >>>>>> more sense for emulated devices (and particularly msix_vector_use makes > >>>>>> no sense for emulation). > >>>>>> > >>>>>> Jan > >>>>>> > >>>>> > >>>>> We need at least msix_vector_unuse > >>>> > >>>> Not at all. We rather need some qemu_irq_set(level) for MSI. 
> >>>> The spec > >>>> requires that the device clears pending when the reason for that is > >>>> removed. And any removal that is device model-originated should simply > >>>> be signaled like an irq de-assert. > >>> > >>> OK, this is a good argument. > >>> In particular virtio ISR read could clear msix pending bit > >>> (note: it would also need to clear irqfd as that is where > >>> we get the pending bit). > >>> > >>> I would prefer not to use qemu_irq_set for this though. > >>> We can add a level flag to msix_notify. > >> > >> No concerns. > >> > >>> > >>>> Vector "unusage" is just one reason here. > >>> > >>> I don't see removing the use/unuse functions as a priority though, > >>> but if we add an API that also lets devices say > >>> 'reason for interrupt is removed', that would be nice. > >>> > >>> Removing extra code can then be done separately, and on qemu.git > >>> not on qemu-kvm. > >> > >> If we refrain from hacking KVM logic into the use/unuse services > >> upstream, we can do this later on. For me it is important that those > >> obsolete services do not block or complicate further cleanups of the MSI > >> layer nor bother device model creators with tasks they should not worry > >> about. > > > > My assumption is devices shall keep calling use/unuse until we drop it. > > Does not seem like a major bother. If you like, use all vectors > > or just those with message != 0. > > What about letting only those devices call use/unuse that sometimes need > less than the maximum amount? All other would benefit for an use_all > executed on enable and a unuse_all on disable/reset/uninit. Sure, I don't mind adding use_all/unuse_all wrappers. > > > >>> > >>>>> - IMO it makes more sense than "clear > >>>>> pending vector". msix_vector_use is good to keep around for symmetry: > >>>>> who knows whether we'll need to allocate resources per vector > >>>>> in the future. 
> >>>> > >>>> For MSI[-X], the spec is already there, and we know that there no need > >>>> for further resources when emulating it. > >>>> Only KVM has special needs. > >>>> > >>>> Jan > >>>> > >>> > >>> It's not hard to speculate. Imagine an out of process device that > >>> shares guest memory and sends interrupts to qemu using eventfd. Suddenly > >>> we need an fd per vector, and this without KVM. > >> > >> That's what irqfd was invented for. Already works for vhost, and there > >> is nothing that prevents communicating the irqfd fd between two > >> processes. But note: irqfd handle, not a KVM-internal GSI. > >> > >> Jan > >> > > > > Yes. But this still makes an API for acquiring per-vector resources a requirement. > > Yes, but a different one than current use/unuse. What's wrong with use/unuse as an API? It's already in place and virtio calls it. > And it will be an > optional one, only for those devices that need to establish irq/eventfd > channels. > > Jan Not sure this should be up to the device. > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-21 00:02, Michael S. Tsirkin wrote: >>> Yes. But this still makes an API for acquiring per-vector resources a requirement. >> >> Yes, but a different one than current use/unuse. > > What's wrong with use/unuse as an API? It's already in place > and virtio calls it. Not for that purpose. It remains a useless API in the absence of KVM's requirements. > >> And it will be an >> optional one, only for those devices that need to establish irq/eventfd >> channels. >> >> Jan > > Not sure this should be up to the device. The device provides the fd. At least it acquires and associates it. Jan
On Fri, Oct 21, 2011 at 09:09:10AM +0200, Jan Kiszka wrote: > On 2011-10-21 00:02, Michael S. Tsirkin wrote: > >>> Yes. But this still makes an API for acquiring per-vector resources a requirement. > >> > >> Yes, but a different one than current use/unuse. > > > > What's wrong with use/unuse as an API? It's already in place > > and virtio calls it. > > Not for that purpose. > It remains a useless API in the absence of KVM's > requirements. > Sorry, I don't understand. This can acquire whatever resources necessary. It does not seem to make sense to rip it out only to add a different one back in. > > > >> And it will be an > >> optional one, only for those devices that need to establish irq/eventfd > >> channels. > >> > >> Jan > > > > Not sure this should be up to the device. > > The device provides the fd. At least it acquires and associates it. > > Jan It would surely be beneficial to be able to have a uniform API so that devices don't need to be recoded to be moved in this way. > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
On 2011-10-21 09:54, Michael S. Tsirkin wrote: > On Fri, Oct 21, 2011 at 09:09:10AM +0200, Jan Kiszka wrote: >> On 2011-10-21 00:02, Michael S. Tsirkin wrote: >>>>> Yes. But this still makes an API for acquiring per-vector resources a requirement. >>>> >>>> Yes, but a different one than current use/unuse. >>> >>> What's wrong with use/unuse as an API? It's already in place >>> and virtio calls it. >> >> Not for that purpose. >> It remains a useless API in the absence of KVM's >> requirements. >> > > Sorry, I don't understand. This can acquire whatever resources > necessary. It does not seem to make sense to rip it out > only to add a different one back in. > >>> >>>> And it will be an >>>> optional one, only for those devices that need to establish irq/eventfd >>>> channels. >>>> >>>> Jan >>> >>> Not sure this should be up to the device. >> >> The device provides the fd. At least it acquires and associates it. >> >> Jan > > It would surely be beneficial to be able to have a uniform > API so that devices don't need to be recoded to be moved > in this way. The point is that the current API is useless for devices that do not have to declare any vector to the core. By forcing them to call into that API, we solve no current problem automatically. We rather need associate_vector_with_x (and the reverse). And that only for device that have different backends than user space models. Jan
On Fri, Oct 21, 2011 at 11:27:48AM +0200, Jan Kiszka wrote: > On 2011-10-21 09:54, Michael S. Tsirkin wrote: > > On Fri, Oct 21, 2011 at 09:09:10AM +0200, Jan Kiszka wrote: > >> On 2011-10-21 00:02, Michael S. Tsirkin wrote: > >>>>> Yes. But this still makes an API for acquiring per-vector resources a requirement. > >>>> > >>>> Yes, but a different one than current use/unuse. > >>> > >>> What's wrong with use/unuse as an API? It's already in place > >>> and virtio calls it. > >> > >> Not for that purpose. > >> It remains a useless API in the absence of KVM's > >> requirements. > >> > > > > Sorry, I don't understand. This can acquire whatever resources > > necessary. It does not seem to make sense to rip it out > > only to add a different one back in. > > > >>> > >>>> And it will be an > >>>> optional one, only for those devices that need to establish irq/eventfd > >>>> channels. > >>>> > >>>> Jan > >>> > >>> Not sure this should be up to the device. > >> > >> The device provides the fd. At least it acquires and associates it. > >> > >> Jan > > > > It would surely be beneficial to be able to have a uniform > > API so that devices don't need to be recoded to be moved > > in this way. > > The point is that the current API is useless for devices that do not > have to declare any vector to the core. Don't assigned devices want this as well? They handle 0-address vectors specially, and this hack absolutely doesn't belong in pci core ... > By forcing them to call into > that API, we solve no current problem automatically. We rather need > associate_vector_with_x (and the reverse). And that only for device that > have different backends than user space models. > > Jan I'll need to think about this, would prefer this series not to get blocked on this issue. We more or less agreed to add _use_all/unuse_all for now? > -- > Siemens AG, Corporate Technology, CT T DE IT 1 > Corporate Competence Center Embedded Linux
diff --git a/hw/ivshmem.c b/hw/ivshmem.c index 242fbea..a402c98 100644 --- a/hw/ivshmem.c +++ b/hw/ivshmem.c @@ -535,10 +535,8 @@ static uint64_t ivshmem_get_size(IVShmemState * s) { return value; } -static void ivshmem_setup_msi(IVShmemState * s) { - - int i; - +static void ivshmem_setup_msi(IVShmemState *s) +{ /* allocate the MSI-X vectors */ memory_region_init(&s->msix_bar, "ivshmem-msix", 4096); @@ -551,11 +549,6 @@ static void ivshmem_setup_msi(IVShmemState * s) { exit(1); } - /* 'activate' the vectors */ - for (i = 0; i < s->vectors; i++) { - msix_vector_use(&s->dev, i); - } - /* allocate Qemu char devices for receiving interrupts */ s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry)); } @@ -581,7 +574,7 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id) IVSHMEM_DPRINTF("ivshmem_load\n"); IVShmemState *proxy = opaque; - int ret, i; + int ret; if (version_id > 0) { return -EINVAL; @@ -599,9 +592,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id) if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) { msix_load(&proxy->dev, f); - for (i = 0; i < proxy->vectors; i++) { - msix_vector_use(&proxy->dev, i); - } } else { proxy->intrstatus = qemu_get_be32(f); proxy->intrmask = qemu_get_be32(f); diff --git a/hw/msix.c b/hw/msix.c index ce3375a..f1b97b5 100644 --- a/hw/msix.c +++ b/hw/msix.c @@ -292,9 +292,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries, if (nentries > MSIX_MAX_ENTRIES) return -EINVAL; - dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES * - sizeof *dev->msix_entry_used); - dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE); msix_mask_all(dev, nentries); @@ -317,21 +314,9 @@ err_config: memory_region_destroy(&dev->msix_mmio); g_free(dev->msix_table_page); dev->msix_table_page = NULL; - g_free(dev->msix_entry_used); - dev->msix_entry_used = NULL; return ret; } -static void msix_free_irq_entries(PCIDevice *dev) -{ - int vector; - - for (vector = 0; vector < dev->msix_entries_nr; ++vector) { - 
dev->msix_entry_used[vector] = 0; - msix_clr_pending(dev, vector); - } -} - /* Clean up resources for the device. */ int msix_uninit(PCIDevice *dev, MemoryRegion *bar) { @@ -340,14 +325,11 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar) } pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH); dev->msix_cap = 0; - msix_free_irq_entries(dev); dev->msix_entries_nr = 0; memory_region_del_subregion(bar, &dev->msix_mmio); memory_region_destroy(&dev->msix_mmio); g_free(dev->msix_table_page); dev->msix_table_page = NULL; - g_free(dev->msix_entry_used); - dev->msix_entry_used = NULL; kvm_msix_free(dev); g_free(dev->msix_cache); @@ -376,7 +358,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f) return; } - msix_free_irq_entries(dev); qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE); qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8); } @@ -407,7 +388,7 @@ void msix_notify(PCIDevice *dev, unsigned vector) { MSIMessage msg; - if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) + if (vector >= dev->msix_entries_nr) return; if (msix_is_masked(dev, vector)) { msix_set_pending(dev, vector); @@ -424,48 +405,31 @@ void msix_reset(PCIDevice *dev) if (!msix_present(dev)) { return; } - msix_free_irq_entries(dev); + msix_clear_all_vectors(dev); dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &= ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET]; memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE); msix_mask_all(dev, dev->msix_entries_nr); } -/* PCI spec suggests that devices make it possible for software to configure - * less vectors than supported by the device, but does not specify a standard - * mechanism for devices to do so. - * - * We support this by asking devices to declare vectors software is going to - * actually use, and checking this on the notification path. Devices that - * don't want to follow the spec suggestion can declare all vectors as used. */ - -/* Mark vector as used. 
*/ -int msix_vector_use(PCIDevice *dev, unsigned vector) +/* Clear pending vector. */ +void msix_clear_vector(PCIDevice *dev, unsigned vector) { - if (vector >= dev->msix_entries_nr) - return -EINVAL; - ++dev->msix_entry_used[vector]; - return 0; -} - -/* Mark vector as unused. */ -void msix_vector_unuse(PCIDevice *dev, unsigned vector) -{ - if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) { - return; - } - if (--dev->msix_entry_used[vector]) { - return; + if (msix_present(dev) && vector < dev->msix_entries_nr) { + msix_clr_pending(dev, vector); } - msix_clr_pending(dev, vector); } -void msix_unuse_all_vectors(PCIDevice *dev) +void msix_clear_all_vectors(PCIDevice *dev) { + unsigned int vector; + if (!msix_present(dev)) { return; } - msix_free_irq_entries(dev); + for (vector = 0; vector < dev->msix_entries_nr; ++vector) { + msix_clr_pending(dev, vector); + } } /* Invoke the notifier if vector entry is used and unmasked. */ @@ -476,7 +440,7 @@ msix_notify_if_unmasked(PCIDevice *dev, unsigned int vector, bool masked) assert(dev->msix_vector_config_notifier); - if (!dev->msix_entry_used[vector] || msix_is_masked(dev, vector)) { + if (msix_is_masked(dev, vector)) { return 0; } msix_message_from_vector(dev, vector, &msg); diff --git a/hw/msix.h b/hw/msix.h index 978f417..9cd54cf 100644 --- a/hw/msix.h +++ b/hw/msix.h @@ -21,9 +21,8 @@ int msix_present(PCIDevice *dev); uint32_t msix_bar_size(PCIDevice *dev); -int msix_vector_use(PCIDevice *dev, unsigned vector); -void msix_vector_unuse(PCIDevice *dev, unsigned vector); -void msix_unuse_all_vectors(PCIDevice *dev); +void msix_clear_vector(PCIDevice *dev, unsigned vector); +void msix_clear_all_vectors(PCIDevice *dev); void msix_notify(PCIDevice *dev, unsigned vector); diff --git a/hw/pci.h b/hw/pci.h index d7a652e..5cf9a16 100644 --- a/hw/pci.h +++ b/hw/pci.h @@ -178,8 +178,6 @@ struct PCIDevice { uint8_t *msix_table_page; /* MMIO index used to map MSIX table and pending bit entries. 
*/ MemoryRegion msix_mmio; - /* Reference-count for entries actually in use by driver. */ - unsigned *msix_entry_used; /* Region including the MSI-X table */ uint32_t msix_bar_size; /* Version id needed for VMState */ diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index 85d6771..5004d7d 100644 --- a/hw/virtio-pci.c +++ b/hw/virtio-pci.c @@ -136,9 +136,6 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f) } else { proxy->vdev->config_vector = VIRTIO_NO_VECTOR; } - if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) { - return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector); - } return 0; } @@ -152,9 +149,6 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f) vector = VIRTIO_NO_VECTOR; } virtio_queue_set_vector(proxy->vdev, n, vector); - if (vector != VIRTIO_NO_VECTOR) { - return msix_vector_use(&proxy->pci_dev, vector); - } return 0; } @@ -304,7 +298,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) if (pa == 0) { virtio_pci_stop_ioeventfd(proxy); virtio_reset(proxy->vdev); - msix_unuse_all_vectors(&proxy->pci_dev); + msix_clear_all_vectors(&proxy->pci_dev); } else virtio_queue_set_addr(vdev, vdev->queue_sel, pa); @@ -331,7 +325,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) if (vdev->status == 0) { virtio_reset(proxy->vdev); - msix_unuse_all_vectors(&proxy->pci_dev); + msix_clear_all_vectors(&proxy->pci_dev); } /* Linux before 2.6.34 sets the device as OK without enabling @@ -343,18 +337,20 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) } break; case VIRTIO_MSI_CONFIG_VECTOR: - msix_vector_unuse(&proxy->pci_dev, vdev->config_vector); + msix_clear_vector(&proxy->pci_dev, vdev->config_vector); /* Make it possible for guest to discover an error took place. 
*/ - if (msix_vector_use(&proxy->pci_dev, val) < 0) + if (val >= vdev->nvectors) { val = VIRTIO_NO_VECTOR; + } vdev->config_vector = val; break; case VIRTIO_MSI_QUEUE_VECTOR: - msix_vector_unuse(&proxy->pci_dev, + msix_clear_vector(&proxy->pci_dev, virtio_queue_vector(vdev, vdev->queue_sel)); /* Make it possible for guest to discover an error took place. */ - if (msix_vector_use(&proxy->pci_dev, val) < 0) + if (val >= vdev->nvectors) { val = VIRTIO_NO_VECTOR; + } virtio_queue_set_vector(vdev, vdev->queue_sel, val); break; default:
This optimization was only required to keep KVM route usage low. Now that we solve that problem via lazy updates, we can drop the field. We still need interfaces to clear pending vectors, though (and we have to make use of them more broadly - but that's unrelated to this patch). Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> --- hw/ivshmem.c | 16 ++----------- hw/msix.c | 62 +++++++++++------------------------------------------- hw/msix.h | 5 +-- hw/pci.h | 2 - hw/virtio-pci.c | 20 +++++++---------- 5 files changed, 26 insertions(+), 79 deletions(-)