Patchwork [RFC,28/45] qemu-kvm: msix: Drop tracking of used vectors

login
register
mail settings
Submitter Jan Kiszka
Date Oct. 17, 2011, 9:28 a.m.
Message ID <ee9ba574e6ccc27d33585d09f0bd66c1b6051379.1318843693.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/120123/
State New
Headers show

Comments

Jan Kiszka - Oct. 17, 2011, 9:28 a.m.
This optimization was only required to keep KVM route usage low. Now
that we solve that problem via lazy updates, we can drop the field. We
still need interfaces to clear pending vectors, though (and we have to
make use of them more broadly - but that's unrelated to this patch).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 hw/ivshmem.c    |   16 ++-----------
 hw/msix.c       |   62 +++++++++++-------------------------------------------
 hw/msix.h       |    5 +--
 hw/pci.h        |    2 -
 hw/virtio-pci.c |   20 +++++++----------
 5 files changed, 26 insertions(+), 79 deletions(-)
Michael S. Tsirkin - Oct. 17, 2011, 3:48 p.m.
On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
> This optimization was only required to keep KVM route usage low. Now
> that we solve that problem via lazy updates, we can drop the field. We
> still need interfaces to clear pending vectors, though (and we have to
> make use of them more broadly - but that's unrelated to this patch).
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>

Lazy updates should be an implementation detail.
IMO resource tracking of vectors makes sense
as an API. Making devices deal with pending
vectors as a concept, IMO, does not.

> ---
>  hw/ivshmem.c    |   16 ++-----------
>  hw/msix.c       |   62 +++++++++++-------------------------------------------
>  hw/msix.h       |    5 +--
>  hw/pci.h        |    2 -
>  hw/virtio-pci.c |   20 +++++++----------
>  5 files changed, 26 insertions(+), 79 deletions(-)
> 
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> index 242fbea..a402c98 100644
> --- a/hw/ivshmem.c
> +++ b/hw/ivshmem.c
> @@ -535,10 +535,8 @@ static uint64_t ivshmem_get_size(IVShmemState * s) {
>      return value;
>  }
>  
> -static void ivshmem_setup_msi(IVShmemState * s) {
> -
> -    int i;
> -
> +static void ivshmem_setup_msi(IVShmemState *s)
> +{
>      /* allocate the MSI-X vectors */
>  
>      memory_region_init(&s->msix_bar, "ivshmem-msix", 4096);
> @@ -551,11 +549,6 @@ static void ivshmem_setup_msi(IVShmemState * s) {
>          exit(1);
>      }
>  
> -    /* 'activate' the vectors */
> -    for (i = 0; i < s->vectors; i++) {
> -        msix_vector_use(&s->dev, i);
> -    }
> -
>      /* allocate Qemu char devices for receiving interrupts */
>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
>  }
> @@ -581,7 +574,7 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>      IVSHMEM_DPRINTF("ivshmem_load\n");
>  
>      IVShmemState *proxy = opaque;
> -    int ret, i;
> +    int ret;
>  
>      if (version_id > 0) {
>          return -EINVAL;
> @@ -599,9 +592,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>  
>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>          msix_load(&proxy->dev, f);
> -        for (i = 0; i < proxy->vectors; i++) {
> -            msix_vector_use(&proxy->dev, i);
> -        }
>      } else {
>          proxy->intrstatus = qemu_get_be32(f);
>          proxy->intrmask = qemu_get_be32(f);
> diff --git a/hw/msix.c b/hw/msix.c
> index ce3375a..f1b97b5 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -292,9 +292,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>      if (nentries > MSIX_MAX_ENTRIES)
>          return -EINVAL;
>  
> -    dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
> -                                        sizeof *dev->msix_entry_used);
> -
>      dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
>      msix_mask_all(dev, nentries);
>  
> @@ -317,21 +314,9 @@ err_config:
>      memory_region_destroy(&dev->msix_mmio);
>      g_free(dev->msix_table_page);
>      dev->msix_table_page = NULL;
> -    g_free(dev->msix_entry_used);
> -    dev->msix_entry_used = NULL;
>      return ret;
>  }
>  
> -static void msix_free_irq_entries(PCIDevice *dev)
> -{
> -    int vector;
> -
> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> -        dev->msix_entry_used[vector] = 0;
> -        msix_clr_pending(dev, vector);
> -    }
> -}
> -
>  /* Clean up resources for the device. */
>  int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
>  {
> @@ -340,14 +325,11 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
>      }
>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
>      dev->msix_cap = 0;
> -    msix_free_irq_entries(dev);
>      dev->msix_entries_nr = 0;
>      memory_region_del_subregion(bar, &dev->msix_mmio);
>      memory_region_destroy(&dev->msix_mmio);
>      g_free(dev->msix_table_page);
>      dev->msix_table_page = NULL;
> -    g_free(dev->msix_entry_used);
> -    dev->msix_entry_used = NULL;
>  
>      kvm_msix_free(dev);
>      g_free(dev->msix_cache);
> @@ -376,7 +358,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
>          return;
>      }
>  
> -    msix_free_irq_entries(dev);
>      qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
>      qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
>  }
> @@ -407,7 +388,7 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>  {
>      MSIMessage msg;
>  
> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
> +    if (vector >= dev->msix_entries_nr)
>          return;
>      if (msix_is_masked(dev, vector)) {
>          msix_set_pending(dev, vector);
> @@ -424,48 +405,31 @@ void msix_reset(PCIDevice *dev)
>      if (!msix_present(dev)) {
>          return;
>      }
> -    msix_free_irq_entries(dev);
> +    msix_clear_all_vectors(dev);
>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
>      memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
>      msix_mask_all(dev, dev->msix_entries_nr);
>  }
>  
> -/* PCI spec suggests that devices make it possible for software to configure
> - * less vectors than supported by the device, but does not specify a standard
> - * mechanism for devices to do so.
> - *
> - * We support this by asking devices to declare vectors software is going to
> - * actually use, and checking this on the notification path. Devices that
> - * don't want to follow the spec suggestion can declare all vectors as used. */
> -
> -/* Mark vector as used. */
> -int msix_vector_use(PCIDevice *dev, unsigned vector)
> +/* Clear pending vector. */
> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
>  {
> -    if (vector >= dev->msix_entries_nr)
> -        return -EINVAL;
> -    ++dev->msix_entry_used[vector];
> -    return 0;
> -}
> -
> -/* Mark vector as unused. */
> -void msix_vector_unuse(PCIDevice *dev, unsigned vector)
> -{
> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
> -        return;
> -    }
> -    if (--dev->msix_entry_used[vector]) {
> -        return;
> +    if (msix_present(dev) && vector < dev->msix_entries_nr) {
> +        msix_clr_pending(dev, vector);
>      }
> -    msix_clr_pending(dev, vector);
>  }
>  
> -void msix_unuse_all_vectors(PCIDevice *dev)
> +void msix_clear_all_vectors(PCIDevice *dev)
>  {
> +    unsigned int vector;
> +
>      if (!msix_present(dev)) {
>          return;
>      }
> -    msix_free_irq_entries(dev);
> +    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> +        msix_clr_pending(dev, vector);
> +    }
>  }
>  
>  /* Invoke the notifier if vector entry is used and unmasked. */
> @@ -476,7 +440,7 @@ msix_notify_if_unmasked(PCIDevice *dev, unsigned int vector, bool masked)
>  
>      assert(dev->msix_vector_config_notifier);
>  
> -    if (!dev->msix_entry_used[vector] || msix_is_masked(dev, vector)) {
> +    if (msix_is_masked(dev, vector)) {
>          return 0;
>      }
>      msix_message_from_vector(dev, vector, &msg);
> diff --git a/hw/msix.h b/hw/msix.h
> index 978f417..9cd54cf 100644
> --- a/hw/msix.h
> +++ b/hw/msix.h
> @@ -21,9 +21,8 @@ int msix_present(PCIDevice *dev);
>  
>  uint32_t msix_bar_size(PCIDevice *dev);
>  
> -int msix_vector_use(PCIDevice *dev, unsigned vector);
> -void msix_vector_unuse(PCIDevice *dev, unsigned vector);
> -void msix_unuse_all_vectors(PCIDevice *dev);
> +void msix_clear_vector(PCIDevice *dev, unsigned vector);
> +void msix_clear_all_vectors(PCIDevice *dev);
>  
>  void msix_notify(PCIDevice *dev, unsigned vector);
>  
> diff --git a/hw/pci.h b/hw/pci.h
> index d7a652e..5cf9a16 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -178,8 +178,6 @@ struct PCIDevice {
>      uint8_t *msix_table_page;
>      /* MMIO index used to map MSIX table and pending bit entries. */
>      MemoryRegion msix_mmio;
> -    /* Reference-count for entries actually in use by driver. */
> -    unsigned *msix_entry_used;
>      /* Region including the MSI-X table */
>      uint32_t msix_bar_size;
>      /* Version id needed for VMState */
> diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
> index 85d6771..5004d7d 100644
> --- a/hw/virtio-pci.c
> +++ b/hw/virtio-pci.c
> @@ -136,9 +136,6 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f)
>      } else {
>          proxy->vdev->config_vector = VIRTIO_NO_VECTOR;
>      }
> -    if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) {
> -        return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector);
> -    }
>      return 0;
>  }
>  
> @@ -152,9 +149,6 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
>          vector = VIRTIO_NO_VECTOR;
>      }
>      virtio_queue_set_vector(proxy->vdev, n, vector);
> -    if (vector != VIRTIO_NO_VECTOR) {
> -        return msix_vector_use(&proxy->pci_dev, vector);
> -    }
>      return 0;
>  }
>  
> @@ -304,7 +298,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          if (pa == 0) {
>              virtio_pci_stop_ioeventfd(proxy);
>              virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> +            msix_clear_all_vectors(&proxy->pci_dev);
>          }
>          else
>              virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
> @@ -331,7 +325,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>  
>          if (vdev->status == 0) {
>              virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> +            msix_clear_all_vectors(&proxy->pci_dev);
>          }
>  
>          /* Linux before 2.6.34 sets the device as OK without enabling
> @@ -343,18 +337,20 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          }
>          break;
>      case VIRTIO_MSI_CONFIG_VECTOR:
> -        msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
> +        msix_clear_vector(&proxy->pci_dev, vdev->config_vector);
>          /* Make it possible for guest to discover an error took place. */
> -        if (msix_vector_use(&proxy->pci_dev, val) < 0)
> +        if (val >= vdev->nvectors) {
>              val = VIRTIO_NO_VECTOR;
> +        }
>          vdev->config_vector = val;
>          break;
>      case VIRTIO_MSI_QUEUE_VECTOR:
> -        msix_vector_unuse(&proxy->pci_dev,
> +        msix_clear_vector(&proxy->pci_dev,
>                            virtio_queue_vector(vdev, vdev->queue_sel));
>          /* Make it possible for guest to discover an error took place. */
> -        if (msix_vector_use(&proxy->pci_dev, val) < 0)
> +        if (val >= vdev->nvectors) {
>              val = VIRTIO_NO_VECTOR;
> +        }
>          virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>          break;
>      default:
> -- 
> 1.7.3.4
Jan Kiszka - Oct. 17, 2011, 7:28 p.m.
On 2011-10-17 17:48, Michael S. Tsirkin wrote:
> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
>> This optimization was only required to keep KVM route usage low. Now
>> that we solve that problem via lazy updates, we can drop the field. We
>> still need interfaces to clear pending vectors, though (and we have to
>> make use of them more broadly - but that's unrelated to this patch).
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Lazy updates should be an implementation detail.
> IMO resource tracking of vectors makes sense
> as an API. Making devices deal with pending
> vectors as a concept, IMO, does not.

There is really no use for tracking the vector lifecycle once we have
lazy updates (except for static routes). It's a way too invasive
concept, and it's not needed for anything but KVM.

If you want an example, check
http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and
compare it to the changes done to hpet in this series.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 11:58 a.m.
On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote:
> On 2011-10-17 17:48, Michael S. Tsirkin wrote:
> > On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
> >> This optimization was only required to keep KVM route usage low. Now
> >> that we solve that problem via lazy updates, we can drop the field. We
> >> still need interfaces to clear pending vectors, though (and we have to
> >> make use of them more broadly - but that's unrelated to this patch).
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> > 
> > Lazy updates should be an implementation detail.
> > IMO resource tracking of vectors makes sense
> > as an API. Making devices deal with pending
> > vectors as a concept, IMO, does not.
> 
> There is really no use for tracking the vector lifecycle once we have
> lazy updates (except for static routes). It's a way too invasive
> concept, and it's not needed for anything but KVM.

I think it's needed. The PCI spec states that when the device
does not need an interrupt anymore, it should clear the pending
bit. The use/unuse is IMO a decent API for this,
because it uses a familiar resource tracking concept.
Exposing this knowledge of msix to devices seems
like a worse API.

> 
> If you want an example, check
> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and
> compare it to the changes done to hpet in this series.
> 
> Jan
> 

This seems to be a general argument that lazy updates are good?
I have no real problem with them, besides the fact that
we need an API to reserve space in the routing
table so that device setup can fail upfront.
Jan Kiszka - Oct. 18, 2011, 12:08 p.m.
On 2011-10-18 13:58, Michael S. Tsirkin wrote:
> On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote:
>> On 2011-10-17 17:48, Michael S. Tsirkin wrote:
>>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
>>>> This optimization was only required to keep KVM route usage low. Now
>>>> that we solve that problem via lazy updates, we can drop the field. We
>>>> still need interfaces to clear pending vectors, though (and we have to
>>>> make use of them more broadly - but that's unrelated to this patch).
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>
>>> Lazy updates should be an implementation detail.
>>> IMO resource tracking of vectors makes sense
>>> as an API. Making devices deal with pending
>>> vectors as a concept, IMO, does not.
>>
>> There is really no use for tracking the vector lifecycle once we have
>> lazy updates (except for static routes). It's a way too invasive
>> concept, and it's not needed for anything but KVM.
> 
> I think it's needed. The PCI spec states that when the device
> does not need an interrupt anymore, it should clear the pending
> bit.

That should be done explicitly if it is required outside existing
clearing points. We already have that service, it's called
msix_clear_vector. That alone does not justify msix_vector_use and all
the state and logic behind it IMHO.

> The use/unuse is IMO a decent API for this,
> because it uses a familiar resource tracking concept.
> Exposing this knowledge of msix to devices seems
> like a worse API.
> 
>>
>> If you want an example, check
>> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and
>> compare it to the changes done to hpet in this series.
>>
>> Jan
>>
> 
> This seems to be a general argument that lazy updates are good?
> I have no real problem with them, besides the fact that
> we need an API to reserve space in the routing
> table so that device setup can fail upfront.

That's not possible, even with used vectors, as devices change their
vector usage depending on how the guest configures the devices. If you
(pre-)allocate all possible vectors, you may run out of resources
earlier than needed actually. That's also why we do those data == 0
checks to skip used but unconfigured vectors.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 12:33 p.m.
On Tue, Oct 18, 2011 at 02:08:59PM +0200, Jan Kiszka wrote:
> On 2011-10-18 13:58, Michael S. Tsirkin wrote:
> > On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote:
> >> On 2011-10-17 17:48, Michael S. Tsirkin wrote:
> >>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
> >>>> This optimization was only required to keep KVM route usage low. Now
> >>>> that we solve that problem via lazy updates, we can drop the field. We
> >>>> still need interfaces to clear pending vectors, though (and we have to
> >>>> make use of them more broadly - but that's unrelated to this patch).
> >>>>
> >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>
> >>> Lazy updates should be an implementation detail.
> >>> IMO resource tracking of vectors makes sense
> >>> as an API. Making devices deal with pending
> >>> vectors as a concept, IMO, does not.
> >>
> >> There is really no use for tracking the vector lifecycle once we have
> >> lazy updates (except for static routes). It's a way too invasive
> >> concept, and it's not needed for anything but KVM.
> > 
> > I think it's needed. The PCI spec states that when the device
> > does not need an interrupt anymore, it should clear the pending
> > bit.
> 
> That should be done explicitly if it is required outside existing
> clearing points. We already have that service, it's called
> msix_clear_vector.

We do? I don't seem to see it upstream...

> That alone does not justify msix_vector_use and all
> the state and logic behind it IMHO.

To me it looks like an abstraction that solves both
this problem and the resource allocation problem.
Resources are actually limited BTW, this is not just
a KVM thing. qemu.git currently lets guests decide
what to do with them, but it might turn out to
be beneficial to warn the management application
that it is shooting itself in the foot.

> > The use/unuse is IMO a decent API for this,
> > because it uses a familiar resource tracking concept.
> > Exposing this knowledge of msix to devices seems
> > like a worse API.
> > 
> >>
> >> If you want an example, check
> >> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and
> >> compare it to the changes done to hpet in this series.
> >>
> >> Jan
> >>
> > 
> > This seems to be a general argument that lazy updates are good?
> > I have no real problem with them, besides the fact that
> > we need an API to reserve space in the routing
> > table so that device setup can fail upfront.
> 
> That's not possible, even with used vectors, as devices change their
> vector usage depending on how the guest configures the devices. If you
> (pre-)allocate all possible vectors, you may run out of resources
> earlier than needed actually.

This really depends, but please do look at how with virtio
we report resource shortage to guest and let it fall back to
level interrupts. You seem to remove that capability.

I actually would not mind preallocating everything upfront which is much
easier.  But with your patch we get a silent failure or a drastic
slowdown which is much more painful IMO.

> That's also why we do those data == 0
> checks to skip used but unconfigured vectors.
> 
> Jan

These checks work more or less by luck BTW. It's
a hack which I hope lazy allocation will replace.

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 18, 2011, 12:38 p.m.
On 2011-10-18 14:33, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 02:08:59PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 13:58, Michael S. Tsirkin wrote:
>>> On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote:
>>>> On 2011-10-17 17:48, Michael S. Tsirkin wrote:
>>>>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
>>>>>> This optimization was only required to keep KVM route usage low. Now
>>>>>> that we solve that problem via lazy updates, we can drop the field. We
>>>>>> still need interfaces to clear pending vectors, though (and we have to
>>>>>> make use of them more broadly - but that's unrelated to this patch).
>>>>>>
>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>
>>>>> Lazy updates should be an implementation detail.
>>>>> IMO resource tracking of vectors makes sense
>>>>> as an API. Making devices deal with pending
>>>>> vectors as a concept, IMO, does not.
>>>>
>>>> There is really no use for tracking the vector lifecycle once we have
>>>> lazy updates (except for static routes). It's a way too invasive
>>>> concept, and it's not needed for anything but KVM.
>>>
>>> I think it's needed. The PCI spec states that when the device
>>> does not need an interrupt anymore, it should clear the pending
>>> bit.
>>
>> That should be done explicitly if it is required outside existing
>> clearing points. We already have that service, it's called
>> msix_clear_vector.
> 
> We do? I don't seem to see it upstream...

True. From the device's POV, MSI-X (and also MSI!) vectors are actually
level-triggered. So we should communicate the level to the MSI core and
not just the edge. Needs more fixing.

> 
>> That alone does not justify msix_vector_use and all
>> the state and logic behind it IMHO.
> 
> To me it looks like an abstraction that solves both
> this problem and the resource allocation problem.
> Resources are actually limited BTW, this is not just
> a KVM thing. qemu.git currently lets guests decide
> what to do with them, but it might turn out to
> be benefitial to warn the management application
> that it is shooting itself in the foot.
> 
>>> The use/unuse is IMO a decent API for this,
>>> because it uses a familiar resource tracking concept.
>>> Exposing this knowledge of msix to devices seems
>>> like a worse API.
>>>
>>>>
>>>> If you want an example, check
>>>> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and
>>>> compare it to the changes done to hpet in this series.
>>>>
>>>> Jan
>>>>
>>>
>>> This seems to be a general argument that lazy updates are good?
>>> I have no real problem with them, besides the fact that
>>> we need an API to reserve space in the routing
>>> table so that device setup can fail upfront.
>>
>> That's not possible, even with used vectors, as devices change their
>> vector usage depending on how the guest configures the devices. If you
>> (pre-)allocate all possible vectors, you may run out of resources
>> earlier than needed actually.
> 
> This really depends, but please do look at how with virtio
> we report resource shortage to guest and let it fall back to
> level interrups. You seem to remove that capability.

To my understanding, virtio will be the exception as no other device
will have a chance to react on resource shortage while sending(!) an MSI
message.

> 
> I actually would not mind preallocating everything upfront which is much
> easier.  But with your patch we get a silent failure or a drastic
> slowdown which is much more painful IMO.

Again: did we already see that limit? And where does it come from if not
from KVM?

> 
>> That's also why we do those data == 0
>> checks to skip used but unconfigured vectors.
>>
>> Jan
> 
> These checks work more or less by luck BTW. It's
> a hack which I hope lazy allocation will replace.

The check is still valid (for x86) when we have to use static routes
(device assignment, vhost). For lazy updates, it's obsolete, that's true.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 12:48 p.m.
On Tue, Oct 18, 2011 at 02:38:36PM +0200, Jan Kiszka wrote:
> On 2011-10-18 14:33, Michael S. Tsirkin wrote:
> > On Tue, Oct 18, 2011 at 02:08:59PM +0200, Jan Kiszka wrote:
> >> On 2011-10-18 13:58, Michael S. Tsirkin wrote:
> >>> On Mon, Oct 17, 2011 at 09:28:12PM +0200, Jan Kiszka wrote:
> >>>> On 2011-10-17 17:48, Michael S. Tsirkin wrote:
> >>>>> On Mon, Oct 17, 2011 at 11:28:02AM +0200, Jan Kiszka wrote:
> >>>>>> This optimization was only required to keep KVM route usage low. Now
> >>>>>> that we solve that problem via lazy updates, we can drop the field. We
> >>>>>> still need interfaces to clear pending vectors, though (and we have to
> >>>>>> make use of them more broadly - but that's unrelated to this patch).
> >>>>>>
> >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>
> >>>>> Lazy updates should be an implementation detail.
> >>>>> IMO resource tracking of vectors makes sense
> >>>>> as an API. Making devices deal with pending
> >>>>> vectors as a concept, IMO, does not.
> >>>>
> >>>> There is really no use for tracking the vector lifecycle once we have
> >>>> lazy updates (except for static routes). It's a way too invasive
> >>>> concept, and it's not needed for anything but KVM.
> >>>
> >>> I think it's needed. The PCI spec states that when the device
> >>> does not need an interrupt anymore, it should clear the pending
> >>> bit.
> >>
> >> That should be done explicitly if it is required outside existing
> >> clearing points. We already have that service, it's called
> >> msix_clear_vector.
> > 
> > We do? I don't seem to see it upstream...
> 
> True. From the device's POV, MSI-X (and also MSI!) vectors are actually
> level-triggered.

This definitely takes adjusting to.

> So we should communicate the level to the MSI core and
> not just the edge. Needs more fixing
>
> > 
> >> That alone does not justify msix_vector_use and all
> >> the state and logic behind it IMHO.
> > 
> > To me it looks like an abstraction that solves both
> > this problem and the resource allocation problem.
> > Resources are actually limited BTW, this is not just
> > a KVM thing. qemu.git currently lets guests decide
> > what to do with them, but it might turn out to
> > be benefitial to warn the management application
> > that it is shooting itself in the foot.
> > 
> >>> The use/unuse is IMO a decent API for this,
> >>> because it uses a familiar resource tracking concept.
> >>> Exposing this knowledge of msix to devices seems
> >>> like a worse API.
> >>>
> >>>>
> >>>> If you want an example, check
> >>>> http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/70915 and
> >>>> compare it to the changes done to hpet in this series.
> >>>>
> >>>> Jan
> >>>>
> >>>
> >>> This seems to be a general argument that lazy updates are good?
> >>> I have no real problem with them, besides the fact that
> >>> we need an API to reserve space in the routing
> >>> table so that device setup can fail upfront.
> >>
> >> That's not possible, even with used vectors, as devices change their
> >> vector usage depending on how the guest configures the devices. If you
> >> (pre-)allocate all possible vectors, you may run out of resources
> >> earlier than needed actually.
> > 
> > This really depends, but please do look at how with virtio
> > we report resource shortage to guest and let it fall back to
> > level interrups. You seem to remove that capability.
> 
> To my understanding, virtio will be the exception as no other device
> will have a chance to react on resource shortage while sending(!) an MSI
> message.

Hmm, are you familiar with that spec? This is not what virtio does,
resource shortage is detected during setup.
This is exactly the problem with lazy registration as you don't
allocate until it's too late.

> > 
> > I actually would not mind preallocating everything upfront which is much
> > easier.  But with your patch we get a silent failure or a drastic
> > slowdown which is much more painful IMO.
> 
> Again: did we already saw that limit? And where does it come from if not
> from KVM?

It's a hardware limitation of Intel APICs. The interrupt vector is encoded
in an 8-bit field in the MSI address. So you can have at most 256 of these.

> > 
> >> That's also why we do those data == 0
> >> checks to skip used but unconfigured vectors.
> >>
> >> Jan
> > 
> > These checks work more or less by luck BTW. It's
> > a hack which I hope lazy allocation will replace.
> 
> The check is still valid (for x86) when we have to use static routes
> (device assignment, vhost).

It's not valid at all - we are just lucky that Linux and
Windows guests seem to zero out the vector when it's not in use.
They do not have to do that.

> For lazy updates, it's obsolete, that's true.
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 18, 2011, 1 p.m.
On 2011-10-18 14:48, Michael S. Tsirkin wrote:
>> To my understanding, virtio will be the exception as no other device
>> will have a chance to react on resource shortage while sending(!) an MSI
>> message.
> 
> Hmm, are you familiar with that spec?

Not by heart.

> This is not what virtio does,
> resource shortage is detected during setup.
> This is exactly the problem with lazy registration as you don't
> allocate until it's too late.

When is that setup phase? Does it actually come after every change to an
MSI vector? I doubt so. Thus virtio can only estimate the guest usage as
well (a guest may or may not actually write a non-null data into a
vector and unmask it).

> 
>>>
>>> I actually would not mind preallocating everything upfront which is much
>>> easier.  But with your patch we get a silent failure or a drastic
>>> slowdown which is much more painful IMO.
>>
>> Again: did we already saw that limit? And where does it come from if not
>> from KVM?
> 
> It's a hardware limitation of intel APICs. interrupt vector is encoded
> in an 8 bit field in msi address. So you can have at most 256 of these.

There should be no such limitation with pseudo GSIs we use for MSI
injection. They end up as MSI messages again, so actually 256 (-reserved
vectors) * number-of-cpus (on x86).

> 
>>>
>>>> That's also why we do those data == 0
>>>> checks to skip used but unconfigured vectors.
>>>>
>>>> Jan
>>>
>>> These checks work more or less by luck BTW. It's
>>> a hack which I hope lazy allocation will replace.
>>
>> The check is still valid (for x86) when we have to use static routes
>> (device assignment, vhost).
> 
> It's not valid at all - we are just lucky that linux and
> windows guests seem to zero out the vector when it's not in use.
> They do not have to do that.

It is valid as it is just an optimization. If an unused vector has a
non-null data field, we just redundantly register a route where we do
not actually have to. But we do need to be prepared for potentially
arriving messages on that virtual GSI, either via irqfd or kvm device
assignment.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 1:37 p.m.
On Tue, Oct 18, 2011 at 03:00:29PM +0200, Jan Kiszka wrote:
> On 2011-10-18 14:48, Michael S. Tsirkin wrote:
> >> To my understanding, virtio will be the exception as no other device
> >> will have a chance to react on resource shortage while sending(!) an MSI
> >> message.
> > 
> > Hmm, are you familiar with that spec?
> 
> Not by heart.
> 
> > This is not what virtio does,
> > resource shortage is detected during setup.
> > This is exactly the problem with lazy registration as you don't
> > allocate until it's too late.
> 
> When is that setup phase? Does it actually come after every change to an
> MSI vector? I doubt so.

No. During setup, the driver requests vectors from the OS, and then tells
the device which vector each VQ should use.  It then checks that the
assignment was successful. If not, it retries with fewer vectors.

Other devices can do this during initialization, and signal
resource availability to guest using msix vector number field.

> Thus virtio can only estimate the guest usage as
> well

At some level, this is fundamental: some guest operations
have no failure mode. So we must preallocate
some resources to make sure they won't fail.

> (a guest may or may not actually write a non-null data into a
> vector and unmask it).

Please, forget the non-NULL thing. virtio driver knows exactly
how many vectors we use and communicates this info to the device.
This is not uncommon at all.

> > 
> >>>
> >>> I actually would not mind preallocating everything upfront which is much
> >>> easier.  But with your patch we get a silent failure or a drastic
> >>> slowdown which is much more painful IMO.
> >>
> >> Again: did we already saw that limit? And where does it come from if not
> >> from KVM?
> > 
> > It's a hardware limitation of intel APICs. interrupt vector is encoded
> > in an 8 bit field in msi address. So you can have at most 256 of these.
> 
> There should be no such limitation with pseudo GSIs we use for MSI
> injection. They end up as MSI messages again, so actually 256 (-reserved
> vectors) * number-of-cpus (on x86).

This limits which CPUs can get the interrupt though.
Linux seems to have a global pool as it wants to be able to freely
balance vectors between CPUs. Or, consider a guest with a single CPU :)

Anyway, why argue - there is a limitation, and it's not coming from KVM,
right?

> > 
> >>>
> >>>> That's also why we do those data == 0
> >>>> checks to skip used but unconfigured vectors.
> >>>>
> >>>> Jan
> >>>
> >>> These checks work more or less by luck BTW. It's
> >>> a hack which I hope lazy allocation will replace.
> >>
> >> The check is still valid (for x86) when we have to use static routes
> >> (device assignment, vhost).
> > 
> > It's not valid at all - we are just lucky that linux and
> > windows guests seem to zero out the vector when it's not in use.
> > They do not have to do that.
> 
> It is valid as it is just an optimization. If an unused vector has a
> non-null data field, we just redundantly register a route where we do
> not actually have to.

Well, the only reason we even have this code is because
it was claimed that some devices declare support for a huge number
of vectors which then go unused. So if the guest does not
do this we'll run out of vectors ...

> But we do need to be prepared

And ATM, we aren't, and probably can't be without kernel
changes, right?

> for potentially
> arriving messages on that virtual GSI, either via irqfd or kvm device
> assignment.
> 
> Jan

Why irqfd?  Device assignment is ATM the only place where we use these
ugly hacks.

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 18, 2011, 1:46 p.m.
On 2011-10-18 15:37, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 03:00:29PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 14:48, Michael S. Tsirkin wrote:
>>>> To my understanding, virtio will be the exception as no other device
>>>> will have a chance to react on resource shortage while sending(!) an MSI
>>>> message.
>>>
>>> Hmm, are you familiar with that spec?
>>
>> Not by heart.
>>
>>> This is not what virtio does,
>>> resource shortage is detected during setup.
>>> This is exactly the problem with lazy registration as you don't
>>> allocate until it's too late.
>>
>> When is that setup phase? Does it actually come after every change to an
>> MSI vector? I doubt so.
> 
> No. During setup, driver requests vectors from the OS, and then tells
> the device which vector should each VQ use.  It then checks that the
> assignment was successful. If not, it retries with less vectors.
> 
> Other devices can do this during initialization, and signal
> resource availability to guest using msix vector number field.
> 
>> Thus virtio can only estimate the guest usage as
>> well
> 
> At some level, this is fundamental: some guest operations
> have no failure mode. So we must preallocate
> some resources to make sure they won't fail.

We can still track the expected maximum number of active vectors at core
level, collect them from the KVM layer, and warn if we expect conflicts.
Anxious MSI users could then refrain from using this feature, others
might be fine with risking a slow-down on conflicts.

> 
>> (a guest may or may not actually write a non-null data into a
>> vector and unmask it).
> 
> Please, forget the non-NULL thing. virtio driver knows exactly
> how many vectors we use and communicates this info to the device.
> This is not uncommon at all.
> 
>>>
>>>>>
>>>>> I actually would not mind preallocating everything upfront which is much
>>>>> easier.  But with your patch we get a silent failure or a drastic
>>>>> slowdown which is much more painful IMO.
>>>>
>>>> Again: did we already saw that limit? And where does it come from if not
>>>> from KVM?
>>>
>>> It's a hardware limitation of intel APICs. interrupt vector is encoded
>>> in an 8 bit field in msi address. So you can have at most 256 of these.
>>
>> There should be no such limitation with pseudo GSIs we use for MSI
>> injection. They end up as MSI messages again, so actually 256 (-reserved
>> vectors) * number-of-cpus (on x86).
> 
> This limits which CPUs can get the interrupt though.
> Linux seems to have a global pool as it wants to be able to freely
> balance vectors between CPUs. Or, consider a guest with a single CPU :)
> 
> Anyway, why argue - there is a limitation, and it's not coming from KVM,
> right?

No, our limit we hit with MSI message routing are first of all KVM GSIs,
and there only pseudo GSIs that do not go to any interrupt controller
with limited pins. That could easily be lifted in the kernel if we run
into shortages in practice.

> 
>>>
>>>>>
>>>>>> That's also why we do those data == 0
>>>>>> checks to skip used but unconfigured vectors.
>>>>>>
>>>>>> Jan
>>>>>
>>>>> These checks work more or less by luck BTW. It's
>>>>> a hack which I hope lazy allocation will replace.
>>>>
>>>> The check is still valid (for x86) when we have to use static routes
>>>> (device assignment, vhost).
>>>
>>> It's not valid at all - we are just lucky that linux and
>>> windows guests seem to zero out the vector when it's not in use.
>>> They do not have to do that.
>>
>> It is valid as it is just an optimization. If an unused vector has a
>> non-null data field, we just redundantly register a route where we do
>> not actually have to.
> 
> Well, the only reason we even have this code is because
> it was claimed that some devices declare support for a huge number
> of vectors which then go unused. So if the guest does not
> do this we'll run out of vectors ...
> 
>> But we do need to be prepared
> 
> And ATM, we aren't, and probably can't be without kernel
> changes, right?
> 
>> for potentially
>> arriving messages on that virtual GSI, either via irqfd or kvm device
>> assignment.
>>
>> Jan
> 
> Why irqfd?  Device assignment is ATM the only place where we use these
> ugly hacks.

vfio will use irqfds. And that virtio is partly out of the picture is
only because we know much more about virtio internals (specifically:
"will not advertise more vectors than guests will want to use").

Jan
Michael S. Tsirkin - Oct. 18, 2011, 2:01 p.m.
On Tue, Oct 18, 2011 at 03:46:06PM +0200, Jan Kiszka wrote:
> On 2011-10-18 15:37, Michael S. Tsirkin wrote:
> > On Tue, Oct 18, 2011 at 03:00:29PM +0200, Jan Kiszka wrote:
> >> On 2011-10-18 14:48, Michael S. Tsirkin wrote:
> >>>> To my understanding, virtio will be the exception as no other device
> >>>> will have a chance to react on resource shortage while sending(!) an MSI
> >>>> message.
> >>>
> >>> Hmm, are you familiar with that spec?
> >>
> >> Not by heart.
> >>
> >>> This is not what virtio does,
> >>> resource shortage is detected during setup.
> >>> This is exactly the problem with lazy registration as you don't
> >>> allocate until it's too late.
> >>
> >> When is that setup phase? Does it actually come after every change to an
> >> MSI vector? I doubt so.
> > 
> > No. During setup, driver requests vectors from the OS, and then tells
> > the device which vector should each VQ use.  It then checks that the
> > assignment was successful. If not, it retries with less vectors.
> > 
> > Other devices can do this during initialization, and signal
> > resource availability to guest using msix vector number field.
> > 
> >> Thus virtio can only estimate the guest usage as
> >> well
> > 
> > At some level, this is fundamental: some guest operations
> > have no failure mode. So we must preallocate
> > some resources to make sure they won't fail.
> 
> We can still track the expected maximum number of active vectors at core
> level, collect them from the KVM layer, and warn if we expect conflicts.
> Anxious MSI users could then refrain from using this feature, others
> might be fine with risking a slow-down on conflicts.

It seems like a nice feature until you have to debug it in the field :).
If you really think it's worthwhile, let's add a 'force' flag so that
advanced users at least can declare that they know what they are doing.

> > 
> >> (a guest may or may not actually write a non-null data into a
> >> vector and unmask it).
> > 
> > Please, forget the non-NULL thing. virtio driver knows exactly
> > how many vectors we use and communicates this info to the device.
> > This is not uncommon at all.
> > 
> >>>
> >>>>>
> >>>>> I actually would not mind preallocating everything upfront which is much
> >>>>> easier.  But with your patch we get a silent failure or a drastic
> >>>>> slowdown which is much more painful IMO.
> >>>>
> >>>> Again: did we already saw that limit? And where does it come from if not
> >>>> from KVM?
> >>>
> >>> It's a hardware limitation of intel APICs. interrupt vector is encoded
> >>> in an 8 bit field in msi address. So you can have at most 256 of these.
> >>
> >> There should be no such limitation with pseudo GSIs we use for MSI
> >> injection. They end up as MSI messages again, so actually 256 (-reserved
> >> vectors) * number-of-cpus (on x86).
> > 
> > This limits which CPUs can get the interrupt though.
> > Linux seems to have a global pool as it wants to be able to freely
> > balance vectors between CPUs. Or, consider a guest with a single CPU :)
> > 
> > Anyway, why argue - there is a limitation, and it's not coming from KVM,
> > right?
> 
> No, our limit we hit with MSI message routing are first of all KVM GSIs,
> and there only pseudo GSIs that do not go to any interrupt controller
> with limited pins.

I see KVM_MAX_IRQ_ROUTES 1024
This is > 256 so KVM does not seem to be the problem.

> That could easily be lifted in the kernel if we run
> into shortages in practice.

What I was saying is that resources are limited even without kvm.

> > 
> >>>
> >>>>>
> >>>>>> That's also why we do those data == 0
> >>>>>> checks to skip used but unconfigured vectors.
> >>>>>>
> >>>>>> Jan
> >>>>>
> >>>>> These checks work more or less by luck BTW. It's
> >>>>> a hack which I hope lazy allocation will replace.
> >>>>
> >>>> The check is still valid (for x86) when we have to use static routes
> >>>> (device assignment, vhost).
> >>>
> >>> It's not valid at all - we are just lucky that linux and
> >>> windows guests seem to zero out the vector when it's not in use.
> >>> They do not have to do that.
> >>
> >> It is valid as it is just an optimization. If an unused vector has a
> >> non-null data field, we just redundantly register a route where we do
> >> not actually have to.
> > 
> > Well, the only reason we even have this code is because
> > it was claimed that some devices declare support for a huge number
> > of vectors which then go unused. So if the guest does not
> > do this we'll run out of vectors ...
> > 
> >> But we do need to be prepared
> > 
> > And ATM, we aren't, and probably can't be without kernel
> > changes, right?
> > 
> >> for potentially
> >> arriving messages on that virtual GSI, either via irqfd or kvm device
> >> assignment.
> >>
> >> Jan
> > 
> > Why irqfd?  Device assignment is ATM the only place where we use these
> > ugly hacks.
> 
> vfio will use irqfds. And that virtio is partly out of the picture is
> only because we know much more about virtio internals (specifically:
> "will not advertise more vectors than guests will want to use").
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 18, 2011, 2:08 p.m.
On 2011-10-18 16:01, Michael S. Tsirkin wrote:
>>>>>>>
>>>>>>> I actually would not mind preallocating everything upfront which is much
>>>>>>> easier.  But with your patch we get a silent failure or a drastic
>>>>>>> slowdown which is much more painful IMO.
>>>>>>
>>>>>> Again: did we already saw that limit? And where does it come from if not
>>>>>> from KVM?
>>>>>
>>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded
>>>>> in an 8 bit field in msi address. So you can have at most 256 of these.
>>>>
>>>> There should be no such limitation with pseudo GSIs we use for MSI
>>>> injection. They end up as MSI messages again, so actually 256 (-reserved
>>>> vectors) * number-of-cpus (on x86).
>>>
>>> This limits which CPUs can get the interrupt though.
>>> Linux seems to have a global pool as it wants to be able to freely
>>> balance vectors between CPUs. Or, consider a guest with a single CPU :)
>>>
>>> Anyway, why argue - there is a limitation, and it's not coming from KVM,
>>> right?
>>
>> No, our limit we hit with MSI message routing are first of all KVM GSIs,
>> and there only pseudo GSIs that do not go to any interrupt controller
>> with limited pins.
> 
> I see KVM_MAX_IRQ_ROUTES 1024
> This is > 256 so KVM does not seem to be the problem.

We can generate way more different MSI messages than 256. A message may
encode the target CPU, so you have this number in the equation e.g.

> 
>> That could easily be lifted in the kernel if we run
>> into shortages in practice.
> 
> What I was saying is that resources are limited even without kvm.

What other resources related to this particular case are exhausted
before GSI numbers?

Jan
Michael S. Tsirkin - Oct. 18, 2011, 3:08 p.m.
On Tue, Oct 18, 2011 at 04:08:46PM +0200, Jan Kiszka wrote:
> On 2011-10-18 16:01, Michael S. Tsirkin wrote:
> >>>>>>>
> >>>>>>> I actually would not mind preallocating everything upfront which is much
> >>>>>>> easier.  But with your patch we get a silent failure or a drastic
> >>>>>>> slowdown which is much more painful IMO.
> >>>>>>
> >>>>>> Again: did we already saw that limit? And where does it come from if not
> >>>>>> from KVM?
> >>>>>
> >>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded
> >>>>> in an 8 bit field in msi address. So you can have at most 256 of these.
> >>>>
> >>>> There should be no such limitation with pseudo GSIs we use for MSI
> >>>> injection. They end up as MSI messages again, so actually 256 (-reserved
> >>>> vectors) * number-of-cpus (on x86).
> >>>
> >>> This limits which CPUs can get the interrupt though.
> >>> Linux seems to have a global pool as it wants to be able to freely
> >>> balance vectors between CPUs. Or, consider a guest with a single CPU :)
> >>>
> >>> Anyway, why argue - there is a limitation, and it's not coming from KVM,
> >>> right?
> >>
> >> No, our limit we hit with MSI message routing are first of all KVM GSIs,
> >> and there only pseudo GSIs that do not go to any interrupt controller
> >> with limited pins.
> > 
> > I see KVM_MAX_IRQ_ROUTES 1024
> > This is > 256 so KVM does not seem to be the problem.
> 
> We can generate way more different MSI messages than 256. A message may
> encode the target CPU, so you have this number in the equation e.g.

Yes, but the vector is encoded in 8 bits. The rest is
stuff like delivery mode, which won't affect which
handler is run AFAIK. So while the problem might
appear with vector sharing, in practice there is
no vector sharing so no problem :)

> > 
> >> That could easily be lifted in the kernel if we run
> >> into shortages in practice.
> > 
> > What I was saying is that resources are limited even without kvm.
> 
> What other resources related to this particular case are exhausted
> before GSI numbers?
> 
> Jan

distinct vectors

> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 18, 2011, 3:22 p.m.
On 2011-10-18 17:08, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 04:08:46PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 16:01, Michael S. Tsirkin wrote:
>>>>>>>>>
>>>>>>>>> I actually would not mind preallocating everything upfront which is much
>>>>>>>>> easier.  But with your patch we get a silent failure or a drastic
>>>>>>>>> slowdown which is much more painful IMO.
>>>>>>>>
>>>>>>>> Again: did we already saw that limit? And where does it come from if not
>>>>>>>> from KVM?
>>>>>>>
>>>>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded
>>>>>>> in an 8 bit field in msi address. So you can have at most 256 of these.
>>>>>>
>>>>>> There should be no such limitation with pseudo GSIs we use for MSI
>>>>>> injection. They end up as MSI messages again, so actually 256 (-reserved
>>>>>> vectors) * number-of-cpus (on x86).
>>>>>
>>>>> This limits which CPUs can get the interrupt though.
>>>>> Linux seems to have a global pool as it wants to be able to freely
>>>>> balance vectors between CPUs. Or, consider a guest with a single CPU :)
>>>>>
>>>>> Anyway, why argue - there is a limitation, and it's not coming from KVM,
>>>>> right?
>>>>
>>>> No, our limit we hit with MSI message routing are first of all KVM GSIs,
>>>> and there only pseudo GSIs that do not go to any interrupt controller
>>>> with limited pins.
>>>
>>> I see KVM_MAX_IRQ_ROUTES 1024
>>> This is > 256 so KVM does not seem to be the problem.
>>
>> We can generate way more different MSI messages than 256. A message may
>> encode the target CPU, so you have this number in the equation e.g.
> 
> Yes but the vector is encoded in 256 bits. The rest is
> stuff like delivery mode, which won't affect which
> handler is run AFAIK. So while the problem might
> appear with vector sharing, in practice there is
> no vector sharing so no problem :)
> 
>>>
>>>> That could easily be lifted in the kernel if we run
>>>> into shortages in practice.
>>>
>>> What I was saying is that resources are limited even without kvm.
>>
>> What other resources related to this particular case are exhausted
>> before GSI numbers?
>>
>> Jan
> 
> distinct vectors

The guest is responsible for managing vectors, not KVM, not QEMU. And
the guest will notice first when it runs out of them. So a virtio guest
driver may not even request MSI-X support if that happens.

What KVM has to do is just mapping an arbitrary MSI message
(theoretically 64+32 bits, in practice of course much less) to
a single GSI and vice versa. As there are less GSIs than possible MSI
messages, we could run out of them when creating routes, statically or
lazily.

What would probably help us long-term out of your concerns regarding
lazy routing is to bypass that redundant GSI translation for dynamic
messages, i.e. those that are not associated with an irqfd number or an
assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
address and data directly.

Jan
Jan Kiszka - Oct. 18, 2011, 3:55 p.m.
On 2011-10-18 17:22, Jan Kiszka wrote:
> What KVM has to do is just mapping an arbitrary MSI message
> (theoretically 64+32 bits, in practice it's much of course much less) to

( There are 24 distinguishing bits in an MSI message on x86, but that's
only a current interpretation of one specific arch. )

> a single GSI and vice versa. As there are less GSIs than possible MSI
> messages, we could run out of them when creating routes, statically or
> lazily.
> 
> What would probably help us long-term out of your concerns regarding
> lazy routing is to bypass that redundant GSI translation for dynamic
> messages, i.e. those that are not associated with an irqfd number or an
> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> address and data directly.

This would be a trivial extension in fact. Given its beneficial impact
on our GSI limitation issue, I think I will hack up something like that.

And maybe this makes a transparent cache more reasonable. Then only old
host kernels would force us to do searches for already cached messages.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 3:56 p.m.
On Tue, Oct 18, 2011 at 05:22:38PM +0200, Jan Kiszka wrote:
> On 2011-10-18 17:08, Michael S. Tsirkin wrote:
> > On Tue, Oct 18, 2011 at 04:08:46PM +0200, Jan Kiszka wrote:
> >> On 2011-10-18 16:01, Michael S. Tsirkin wrote:
> >>>>>>>>>
> >>>>>>>>> I actually would not mind preallocating everything upfront which is much
> >>>>>>>>> easier.  But with your patch we get a silent failure or a drastic
> >>>>>>>>> slowdown which is much more painful IMO.
> >>>>>>>>
> >>>>>>>> Again: did we already saw that limit? And where does it come from if not
> >>>>>>>> from KVM?
> >>>>>>>
> >>>>>>> It's a hardware limitation of intel APICs. interrupt vector is encoded
> >>>>>>> in an 8 bit field in msi address. So you can have at most 256 of these.
> >>>>>>
> >>>>>> There should be no such limitation with pseudo GSIs we use for MSI
> >>>>>> injection. They end up as MSI messages again, so actually 256 (-reserved
> >>>>>> vectors) * number-of-cpus (on x86).
> >>>>>
> >>>>> This limits which CPUs can get the interrupt though.
> >>>>> Linux seems to have a global pool as it wants to be able to freely
> >>>>> balance vectors between CPUs. Or, consider a guest with a single CPU :)
> >>>>>
> >>>>> Anyway, why argue - there is a limitation, and it's not coming from KVM,
> >>>>> right?
> >>>>
> >>>> No, our limit we hit with MSI message routing are first of all KVM GSIs,
> >>>> and there only pseudo GSIs that do not go to any interrupt controller
> >>>> with limited pins.
> >>>
> >>> I see KVM_MAX_IRQ_ROUTES 1024
> >>> This is > 256 so KVM does not seem to be the problem.
> >>
> >> We can generate way more different MSI messages than 256. A message may
> >> encode the target CPU, so you have this number in the equation e.g.
> > 
> > Yes but the vector is encoded in 256 bits. The rest is
> > stuff like delivery mode, which won't affect which
> > handler is run AFAIK. So while the problem might
> > appear with vector sharing, in practice there is
> > no vector sharing so no problem :)
> > 
> >>>
> >>>> That could easily be lifted in the kernel if we run
> >>>> into shortages in practice.
> >>>
> >>> What I was saying is that resources are limited even without kvm.
> >>
> >> What other resources related to this particular case are exhausted
> >> before GSI numbers?
> >>
> >> Jan
> > 
> > distinct vectors
> 
> The guest is responsible for managing vectors, not KVM, not QEMU. And
> the guest will notice first when it runs out of them. So a virtio guest
> driver may not even request MSI-X support if that happens.

Absolutely. You can solve the problem from the guest in theory.
But what I was saying is that, in practice, what happens is the first X
devices get MSI-X and the others don't. Guests aren't doing anything smart,
as they are not designed with a huge number of devices in mind.

What we can do is solve the problem from management.
And to do that we can't delay allocation until it's used.

> What KVM has to do is just mapping an arbitrary MSI message
> (theoretically 64+32 bits, in practice it's much of course much less) to
> a single GSI and vice versa. As there are less GSIs than possible MSI
> messages, we could run out of them when creating routes, statically or
> lazily.

Possible MSI messages != possible MSI vectors.
If two devices share a vector, APIC won't be able
to distinguish even though e.g. delivery mode is
different.

> What would probably help us long-term out of your concerns regarding
> lazy routing is to bypass that redundant GSI translation for dynamic
> messages, i.e. those that are not associated with an irqfd number or an
> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> address and data directly.
> 
> Jan

You are trying to work around the problem by not requiring
any resources per MSI vector. This just might work for some
uses (ioctl) but isn't a generic solution (e.g. won't work for irqfd).

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 18, 2011, 3:58 p.m.
On 2011-10-18 17:56, Michael S. Tsirkin wrote:
>> What would probably help us long-term out of your concerns regarding
>> lazy routing is to bypass that redundant GSI translation for dynamic
>> messages, i.e. those that are not associated with an irqfd number or an
>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
>> address and data directly.
>>
>> Jan
> 
> You are trying to work around the problem by not requiring
> any resources per MSI vector. This just might work for some
> uses (ioctl) but isn't a generic solution (e.g. won't work for irqfd).

irqfd is not affected anymore in that model as it cannot participate in
lazy routing anyway.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 5:06 p.m.
On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
> On 2011-10-18 17:22, Jan Kiszka wrote:
> > What KVM has to do is just mapping an arbitrary MSI message
> > (theoretically 64+32 bits, in practice it's much of course much less) to
> 
> ( There are 24 distinguishing bits in an MSI message on x86, but that's
> only a current interpretation of one specific arch. )

Confused. vector mask is 8 bits. the rest is destination id etc.

> > a single GSI and vice versa. As there are less GSIs than possible MSI
> > messages, we could run out of them when creating routes, statically or
> > lazily.
> > 
> > What would probably help us long-term out of your concerns regarding
> > lazy routing is to bypass that redundant GSI translation for dynamic
> > messages, i.e. those that are not associated with an irqfd number or an
> > assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> > address and data directly.
> 
> This would be a trivial extension in fact. Given its beneficial impact
> on our GSI limitation issue, I think I will hack up something like that.
> 
> And maybe this makes a transparent cache more reasonable. Then only old
> host kernels would force us to do searches for already cached messages.
> 
> Jan

Hmm, I'm not all that sure. Existing design really allows
caching the route in various smart ways. We currently do
this for irqfd but this can be extended to ioctls.
If we just let the guest inject arbitrary messages,
that becomes much more complex.

Another concern is mask bit emulation. We currently
handle the mask bit in userspace, but patches
to do this in the kernel for assigned devices were seen,
and IMO we might want to do that for virtio as well.

For that to work the mask bit needs to be tied to
a specific gsi or specific device, which does not
work if we just inject arbitrary writes.
Jan Kiszka - Oct. 18, 2011, 6:24 p.m.
On 2011-10-18 19:06, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 17:22, Jan Kiszka wrote:
>>> What KVM has to do is just mapping an arbitrary MSI message
>>> (theoretically 64+32 bits, in practice it's much of course much less) to
>>
>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
>> only a current interpretation of one specific arch. )
> 
> Confused. vector mask is 8 bits. the rest is destination id etc.

Right, but those additional bits like the destination make different
messages. We have to encode those 24 bits into a unique GSI number and
restore them (by table lookup) on APIC injection inside the kernel. If
we only had to encode 256 different vectors, we would be done already.

> 
>>> a single GSI and vice versa. As there are less GSIs than possible MSI
>>> messages, we could run out of them when creating routes, statically or
>>> lazily.
>>>
>>> What would probably help us long-term out of your concerns regarding
>>> lazy routing is to bypass that redundant GSI translation for dynamic
>>> messages, i.e. those that are not associated with an irqfd number or an
>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
>>> address and data directly.
>>
>> This would be a trivial extension in fact. Given its beneficial impact
>> on our GSI limitation issue, I think I will hack up something like that.
>>
>> And maybe this makes a transparent cache more reasonable. Then only old
>> host kernels would force us to do searches for already cached messages.
>>
>> Jan
> 
> Hmm, I'm not all that sure. Existing design really allows
> caching the route in various smart ways. We currently do
> this for irqfd but this can be extended to ioctls.
> If we just let the guest inject arbitrary messages,
> that becomes much more complex.

irqfd and kvm device assignment do not allow us to inject arbitrary
messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
kvm_device_msix_set_vector (etc.) for those scenarios to set static
routes from an MSI message to a GSI number (+they configure the related
backends).

> 
> Another concern is mask bit emulation. We currently
> handle mask bit in userspace but patches
> to do them in kernel for assigned devices where seen
> and IMO we might want to do that for virtio as well.
> 
> For that to work the mask bit needs to be tied to
> a specific gsi or specific device, which does not
> work if we just inject arbitrary writes.

Yes, but I do not see those valuable plans being negatively affected.

Jan
Jan Kiszka - Oct. 18, 2011, 6:26 p.m.
On 2011-10-18 19:06, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 17:22, Jan Kiszka wrote:
>>> What KVM has to do is just mapping an arbitrary MSI message
>>> (theoretically 64+32 bits, in practice it's much of course much less) to
>>
>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
>> only a current interpretation of one specific arch. )
> 
> Confused. vector mask is 8 bits. the rest is destination id etc.

Right, but those additional bits like the destination make different
messages. We have to encode those 24 bits into a unique GSI number and
restore them (by table lookup) on APIC injection inside the kernel. If
we only had to encode 256 different vectors, we would be done already.

> 
>>> a single GSI and vice versa. As there are less GSIs than possible MSI
>>> messages, we could run out of them when creating routes, statically or
>>> lazily.
>>>
>>> What would probably help us long-term out of your concerns regarding
>>> lazy routing is to bypass that redundant GSI translation for dynamic
>>> messages, i.e. those that are not associated with an irqfd number or an
>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
>>> address and data directly.
>>
>> This would be a trivial extension in fact. Given its beneficial impact
>> on our GSI limitation issue, I think I will hack up something like that.
>>
>> And maybe this makes a transparent cache more reasonable. Then only old
>> host kernels would force us to do searches for already cached messages.
>>
>> Jan
> 
> Hmm, I'm not all that sure. Existing design really allows
> caching the route in various smart ways. We currently do
> this for irqfd but this can be extended to ioctls.
> If we just let the guest inject arbitrary messages,
> that becomes much more complex.

irqfd and kvm device assignment do not allow us to inject arbitrary
messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
kvm_device_msix_set_vector (etc.) for those scenarios to set static
routes from an MSI message to a GSI number (+they configure the related
backends).

> 
> Another concern is mask bit emulation. We currently
> handle mask bit in userspace but patches
> to do them in kernel for assigned devices were seen
> and IMO we might want to do that for virtio as well.
> 
> For that to work the mask bit needs to be tied to
> a specific gsi or specific device, which does not
> work if we just inject arbitrary writes.

Yes, but I do not see those valuable plans being negatively affected.

Jan
Michael S. Tsirkin - Oct. 18, 2011, 6:40 p.m.
On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote:
> On 2011-10-18 19:06, Michael S. Tsirkin wrote:
> > On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
> >> On 2011-10-18 17:22, Jan Kiszka wrote:
> >>> What KVM has to do is just mapping an arbitrary MSI message
> >>> (theoretically 64+32 bits, in practice it's much of course much less) to
> >>
> >> ( There are 24 distinguishing bits in an MSI message on x86, but that's
> >> only a current interpretation of one specific arch. )
> > 
> > Confused. vector mask is 8 bits. the rest is destination id etc.
> 
> Right, but those additional bits like the destination make different
> messages. We have to encode those 24 bits into a unique GSI number and
> restore them (by table lookup) on APIC injection inside the kernel. If
> we only had to encode 256 different vectors, we would be done already.

Right. But in practice guests always use distinct vectors (from the
256 available) for distinct messages. This is because
the vector seems to be the only thing that gets communicated by the APIC
to the software.

So e.g. a table with 256 entries, with extra 1024-256
used for spill-over for guests that do something unexpected,
would work really well.


> > 
> >>> a single GSI and vice versa. As there are less GSIs than possible MSI
> >>> messages, we could run out of them when creating routes, statically or
> >>> lazily.
> >>>
> >>> What would probably help us long-term out of your concerns regarding
> >>> lazy routing is to bypass that redundant GSI translation for dynamic
> >>> messages, i.e. those that are not associated with an irqfd number or an
> >>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> >>> address and data directly.
> >>
> >> This would be a trivial extension in fact. Given its beneficial impact
> >> on our GSI limitation issue, I think I will hack up something like that.
> >>
> >> And maybe this makes a transparent cache more reasonable. Then only old
> >> host kernels would force us to do searches for already cached messages.
> >>
> >> Jan
> > 
> > Hmm, I'm not all that sure. Existing design really allows
> > caching the route in various smart ways. We currently do
> > this for irqfd but this can be extended to ioctls.
> > If we just let the guest inject arbitrary messages,
> > that becomes much more complex.
> 
> irqfd and kvm device assignment do not allow us to inject arbitrary
> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
> kvm_device_msix_set_vector (etc.) for those scenarios to set static
> routes from an MSI message to a GSI number (+they configure the related
> backends).

Yes, it's a very flexible API but it would be very hard to optimize.
GSIs let us do the slow path setup, but they make it easy
to optimize target lookup in kernel.

An analogy would be if read/write operated on file paths.
fd makes it easy to do permission checks and slow lookups
in one place. GSI happens to work like this (maybe, by accident).

> > 
> > Another concern is mask bit emulation. We currently
> > handle mask bit in userspace but patches
> > to do them in kernel for assigned devices where seen
> > and IMO we might want to do that for virtio as well.
> > 
> > For that to work the mask bit needs to be tied to
> > a specific gsi or specific device, which does not
> > work if we just inject arbitrary writes.
> 
> Yes, but I do not see those valuable plans being negatively affected.
> 
> Jan
> 

I do.
How would we maintain a mask/pending bit in kernel if we are not even
supplied info on all available vectors?
Jan Kiszka - Oct. 18, 2011, 7:37 p.m.
On 2011-10-18 20:40, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 19:06, Michael S. Tsirkin wrote:
>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
>>>> On 2011-10-18 17:22, Jan Kiszka wrote:
>>>>> What KVM has to do is just mapping an arbitrary MSI message
>>>>> (theoretically 64+32 bits, in practice it's much of course much less) to
>>>>
>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
>>>> only a current interpretation of one specific arch. )
>>>
>>> Confused. vector mask is 8 bits. the rest is destination id etc.
>>
>> Right, but those additional bits like the destination make different
>> messages. We have to encode those 24 bits into a unique GSI number and
>> restore them (by table lookup) on APIC injection inside the kernel. If
>> we only had to encode 256 different vectors, we would be done already.
> 
> Right. But in practice guests always use distinct vectors (from the
> 256 available) for distinct messages. This is because
> the vector seems to be the only thing that gets communicated by the APIC
> to the software.
> 
> So e.g. a table with 256 entries, with extra 1024-256
> used for spill-over for guests that do something unexpected,
> would work really well.

Linux already manages vectors on a per-CPU basis. For efficiency
reasons, it does not exploit the full range of 256 vectors but actually
allocates them in - IIRC - steps of 16. So I would not be surprised to
find lots of vector number "collisions" when looking over a full set of
CPUs in a system.

Really, these considerations do not help us. We must store all 96 bits,
already for the sake of other KVM architectures that want MSI routing.

> 
> 
>>>
>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI
>>>>> messages, we could run out of them when creating routes, statically or
>>>>> lazily.
>>>>>
>>>>> What would probably help us long-term out of your concerns regarding
>>>>> lazy routing is to bypass that redundant GSI translation for dynamic
>>>>> messages, i.e. those that are not associated with an irqfd number or an
>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
>>>>> address and data directly.
>>>>
>>>> This would be a trivial extension in fact. Given its beneficial impact
>>>> on our GSI limitation issue, I think I will hack up something like that.
>>>>
>>>> And maybe this makes a transparent cache more reasonable. Then only old
>>>> host kernels would force us to do searches for already cached messages.
>>>>
>>>> Jan
>>>
>>> Hmm, I'm not all that sure. Existing design really allows
>>> caching the route in various smart ways. We currently do
>>> this for irqfd but this can be extended to ioctls.
>>> If we just let the guest inject arbitrary messages,
>>> that becomes much more complex.
>>
>> irqfd and kvm device assignment do not allow us to inject arbitrary
>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
>> kvm_device_msix_set_vector (etc.) for those scenarios to set static
>> routes from an MSI message to a GSI number (+they configure the related
>> backends).
> 
> Yes, it's a very flexible API but it would be very hard to optimize.
> GSIs let us do the slow path setup, but they make it easy
> to optimize target lookup in kernel.

Users of the API above have no need to know anything about GSIs. They
are an artifact of the KVM-internal interface between user space and
kernel now - thanks to the MSIRoutingCache encapsulation.

> 
> An analogy would be if read/write operated on file paths.
> fd makes it easy to do permission checks and slow lookups
> in one place. GSI happens to work like this (maybe, by accident).

Think of an opaque file handle as a MSIRoutingCache object. And it
encodes not only the routing handle but also other useful associated
information we need from time to time - internally, not in the device
models.

>>>
>>> Another concern is mask bit emulation. We currently
>>> handle mask bit in userspace but patches
>>> to do them in kernel for assigned devices where seen
>>> and IMO we might want to do that for virtio as well.
>>>
>>> For that to work the mask bit needs to be tied to
>>> a specific gsi or specific device, which does not
>>> work if we just inject arbitrary writes.
>>
>> Yes, but I do not see those valuable plans being negatively affected.
>>
>> Jan
>>
> 
> I do.
> How would we maintain a mask/pending bit in kernel if we are not
> supplied info on all available vectors even?

It's tricky to discuss an undefined interface (there only exists an
outdated proposal for kvm device assignment). But I suppose that user
space will have to define the maximum number of vectors when creating an
in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.

The number of used vectors will correlate with the number of registered
irqfds (in the case of vhost or vfio, device assignment still has
SET_MSIX_NR). As kernel space would then be responsible for mask
processing, user space would keep vectors registered with irqfds, even
if they are masked. It could just continue to play the trick and drop
data=0 vectors.

The point here is: All those steps have _nothing_ to do with the generic
MSI-X core. They are KVM-specific "side channels" for which KVM provides
an API. In contrast, msix_vector_use/unuse were generic services that
were actually created to please KVM requirements. But if we split that
up, we can address the generic MSI-X requirements in a way that makes
more sense for emulated devices (and particularly msix_vector_use makes
no sense for emulation).

Jan
Michael S. Tsirkin - Oct. 18, 2011, 9:40 p.m.
On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote:
> On 2011-10-18 20:40, Michael S. Tsirkin wrote:
> > On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote:
> >> On 2011-10-18 19:06, Michael S. Tsirkin wrote:
> >>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
> >>>> On 2011-10-18 17:22, Jan Kiszka wrote:
> >>>>> What KVM has to do is just mapping an arbitrary MSI message
> >>>>> (theoretically 64+32 bits, in practice it's much of course much less) to
> >>>>
> >>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
> >>>> only a current interpretation of one specific arch. )
> >>>
> >>> Confused. vector mask is 8 bits. the rest is destination id etc.
> >>
> >> Right, but those additional bits like the destination make different
> >> messages. We have to encode those 24 bits into a unique GSI number and
> >> restore them (by table lookup) on APIC injection inside the kernel. If
> >> we only had to encode 256 different vectors, we would be done already.
> > 
> > Right. But in practice guests always use distinct vectors (from the
> > 256 available) for distinct messages. This is because
> > the vector seems to be the only thing that gets communicated by the APIC
> > to the software.
> > 
> > So e.g. a table with 256 entries, with extra 1024-256
> > used for spill-over for guests that do something unexpected,
> > would work really well.
> 
> Already Linux manages vectors on a pre-CPU basis. For efficiency
> reasons, it does not exploit the full range of 256 vectors but actually
> allocates them in - IIRC - steps of 16. So I would not be surprised to
> find lots of vector number "collisions" when looking over a full set of
> CPUs in a system.
> 
> Really, these considerations do not help us. We must store all 96 bits,
> already for the sake of other KVM architectures that want MSI routing.
> > 
> > 
> >>>
> >>>>> a single GSI and vice versa. As there are less GSIs than possible MSI
> >>>>> messages, we could run out of them when creating routes, statically or
> >>>>> lazily.
> >>>>>
> >>>>> What would probably help us long-term out of your concerns regarding
> >>>>> lazy routing is to bypass that redundant GSI translation for dynamic
> >>>>> messages, i.e. those that are not associated with an irqfd number or an
> >>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> >>>>> address and data directly.
> >>>>
> >>>> This would be a trivial extension in fact. Given its beneficial impact
> >>>> on our GSI limitation issue, I think I will hack up something like that.
> >>>>
> >>>> And maybe this makes a transparent cache more reasonable. Then only old
> >>>> host kernels would force us to do searches for already cached messages.
> >>>>
> >>>> Jan
> >>>
> >>> Hmm, I'm not all that sure. Existing design really allows
> >>> caching the route in various smart ways. We currently do
> >>> this for irqfd but this can be extended to ioctls.
> >>> If we just let the guest inject arbitrary messages,
> >>> that becomes much more complex.
> >>
> >> irqfd and kvm device assignment do not allow us to inject arbitrary
> >> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
> >> kvm_device_msix_set_vector (etc.) for those scenarios to set static
> >> routes from an MSI message to a GSI number (+they configure the related
> >> backends).
> > 
> > Yes, it's a very flexible API but it would be very hard to optimize.
> > GSIs let us do the slow path setup, but they make it easy
> > to optimize target lookup in kernel.
> 
> Users of the API above have no need to know anything about GSIs. They
> are an artifact of the KVM-internal interface between user space and
> kernel now - thanks to the MSIRoutingCache encapsulation.

Yes but I am saying that the API above can't be implemented
more efficiently than now: you will have to scan all apics on each MSI.
The GSI implementation can be optimized: decode the vector once,
if it matches a single vcpu, store that vcpu and use when sending
interrupts.


> > 
> > An analogy would be if read/write operated on file paths.
> > fd makes it easy to do permission checks and slow lookups
> > in one place. GSI happens to work like this (maybe, by accident).
> 
> Think of an opaque file handle as a MSIRoutingCache object. And it
> encodes not only the routing handle but also other useful associated
> information we need from time to time - internally, not in the device
> models.

Forget qemu abstractions, I am talking about data path
optimizations in kernel in kvm. From that POV the point of an fd is not
that it is opaque. It is that it's an index in an array that
can be used for fast lookups.

> >>>
> >>> Another concern is mask bit emulation. We currently
> >>> handle mask bit in userspace but patches
> >>> to do them in kernel for assigned devices where seen
> >>> and IMO we might want to do that for virtio as well.
> >>>
> >>> For that to work the mask bit needs to be tied to
> >>> a specific gsi or specific device, which does not
> >>> work if we just inject arbitrary writes.
> >>
> >> Yes, but I do not see those valuable plans being negatively affected.
> >>
> >> Jan
> >>
> > 
> > I do.
> > How would we maintain a mask/pending bit in kernel if we are not
> > supplied info on all available vectors even?
> 
> It's tricky to discuss an undefined interface (there only exists an
> outdated proposal for kvm device assignment). But I suppose that user
> space will have to define the maximum number of vectors when creating an
> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
> 
> The number of used vectors will correlate with the number of registered
> irqfds (in the case of vhost or vfio, device assignment still has
> SET_MSIX_NR). As kernel space would then be responsible for mask
> processing, user space would keep vectors registered with irqfds, even
> if they are masked. It could just continue to play the trick and drop
> data=0 vectors.

Which trick?  We don't play any tricks except for device assignment.

> The point here is: All those steps have _nothing_ to do with the generic
> MSI-X core. They are KVM-specific "side channels" for which KVM provides
> an API. In contrast, msix_vector_use/unuse were generic services that
> were actually created to please KVM requirements. But if we split that
> up, we can address the generic MSI-X requirements in a way that makes
> more sense for emulated devices (and particularly msix_vector_use makes
> no sense for emulation).
> 
> Jan
> 

We need at least msix_vector_unuse - IMO it makes more sense than "clear
pending vector". msix_vector_use is good to keep around for symmetry:
who knows whether we'll need to allocate resources per vector
in the future.
Jan Kiszka - Oct. 18, 2011, 10:13 p.m.
On 2011-10-18 23:40, Michael S. Tsirkin wrote:
> On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote:
>> On 2011-10-18 20:40, Michael S. Tsirkin wrote:
>>> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote:
>>>> On 2011-10-18 19:06, Michael S. Tsirkin wrote:
>>>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
>>>>>> On 2011-10-18 17:22, Jan Kiszka wrote:
>>>>>>> What KVM has to do is just mapping an arbitrary MSI message
>>>>>>> (theoretically 64+32 bits, in practice it's much of course much less) to
>>>>>>
>>>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
>>>>>> only a current interpretation of one specific arch. )
>>>>>
>>>>> Confused. vector mask is 8 bits. the rest is destination id etc.
>>>>
>>>> Right, but those additional bits like the destination make different
>>>> messages. We have to encode those 24 bits into a unique GSI number and
>>>> restore them (by table lookup) on APIC injection inside the kernel. If
>>>> we only had to encode 256 different vectors, we would be done already.
>>>
>>> Right. But in practice guests always use distinct vectors (from the
>>> 256 available) for distinct messages. This is because
>>> the vector seems to be the only thing that gets communicated by the APIC
>>> to the software.
>>>
>>> So e.g. a table with 256 entries, with extra 1024-256
>>> used for spill-over for guests that do something unexpected,
>>> would work really well.
>>
>> Already Linux manages vectors on a pre-CPU basis. For efficiency
>> reasons, it does not exploit the full range of 256 vectors but actually
>> allocates them in - IIRC - steps of 16. So I would not be surprised to
>> find lots of vector number "collisions" when looking over a full set of
>> CPUs in a system.
>>
>> Really, these considerations do not help us. We must store all 96 bits,
>> already for the sake of other KVM architectures that want MSI routing.
>>>
>>>
>>>>>
>>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI
>>>>>>> messages, we could run out of them when creating routes, statically or
>>>>>>> lazily.
>>>>>>>
>>>>>>> What would probably help us long-term out of your concerns regarding
>>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic
>>>>>>> messages, i.e. those that are not associated with an irqfd number or an
>>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
>>>>>>> address and data directly.
>>>>>>
>>>>>> This would be a trivial extension in fact. Given its beneficial impact
>>>>>> on our GSI limitation issue, I think I will hack up something like that.
>>>>>>
>>>>>> And maybe this makes a transparent cache more reasonable. Then only old
>>>>>> host kernels would force us to do searches for already cached messages.
>>>>>>
>>>>>> Jan
>>>>>
>>>>> Hmm, I'm not all that sure. Existing design really allows
>>>>> caching the route in various smart ways. We currently do
>>>>> this for irqfd but this can be extended to ioctls.
>>>>> If we just let the guest inject arbitrary messages,
>>>>> that becomes much more complex.
>>>>
>>>> irqfd and kvm device assignment do not allow us to inject arbitrary
>>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
>>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static
>>>> routes from an MSI message to a GSI number (+they configure the related
>>>> backends).
>>>
>>> Yes, it's a very flexible API but it would be very hard to optimize.
>>> GSIs let us do the slow path setup, but they make it easy
>>> to optimize target lookup in kernel.
>>
>> Users of the API above have no need to know anything about GSIs. They
>> are an artifact of the KVM-internal interface between user space and
>> kernel now - thanks to the MSIRoutingCache encapsulation.
> 
> Yes but I am saying that the API above can't be implemented
> more efficiently than now: you will have to scan all apics on each MSI.
> The GSI implementation can be optimized: decode the vector once,
> if it matches a single vcpu, store that vcpu and use when sending
> interrupts.

Sorry, missed that you switched to kernel.

What information do you want to cache there that cannot be easily
obtained by looking at a concrete message? I do not see any. Once you
checked that the delivery mode targets a specific cpu, you could address
it directly. Or are you thinking about some cluster mode?

> 
> 
>>>
>>> An analogy would be if read/write operated on file paths.
>>> fd makes it easy to do permission checks and slow lookups
>>> in one place. GSI happens to work like this (maybe, by accident).
>>
>> Think of an opaque file handle as a MSIRoutingCache object. And it
>> encodes not only the routing handle but also other useful associated
>> information we need from time to time - internally, not in the device
>> models.
> 
> Forget qemu abstractions, I am talking about data path
> optimizations in kernel in kvm. From that POV the point of an fd is not
> that it is opaque. It is that it's an index in an array that
> can be used for fast lookups.
> 
>>>>>
>>>>> Another concern is mask bit emulation. We currently
>>>>> handle mask bit in userspace but patches
>>>>> to do them in kernel for assigned devices where seen
>>>>> and IMO we might want to do that for virtio as well.
>>>>>
>>>>> For that to work the mask bit needs to be tied to
>>>>> a specific gsi or specific device, which does not
>>>>> work if we just inject arbitrary writes.
>>>>
>>>> Yes, but I do not see those valuable plans being negatively affected.
>>>>
>>>> Jan
>>>>
>>>
>>> I do.
>>> How would we maintain a mask/pending bit in kernel if we are not
>>> supplied info on all available vectors even?
>>
>> It's tricky to discuss an undefined interface (there only exists an
>> outdated proposal for kvm device assignment). But I suppose that user
>> space will have to define the maximum number of vectors when creating an
>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
>>
>> The number of used vectors will correlate with the number of registered
>> irqfds (in the case of vhost or vfio, device assignment still has
>> SET_MSIX_NR). As kernel space would then be responsible for mask
>> processing, user space would keep vectors registered with irqfds, even
>> if they are masked. It could just continue to play the trick and drop
>> data=0 vectors.
> 
> Which trick?  We don't play any tricks except for device assignment.
> 
>> The point here is: All those steps have _nothing_ to do with the generic
>> MSI-X core. They are KVM-specific "side channels" for which KVM provides
>> an API. In contrast, msix_vector_use/unuse were generic services that
>> were actually created to please KVM requirements. But if we split that
>> up, we can address the generic MSI-X requirements in a way that makes
>> more sense for emulated devices (and particularly msix_vector_use makes
>> no sense for emulation).
>>
>> Jan
>>
> 
> We need at least msix_vector_unuse

Not at all. We rather need some qemu_irq_set(level) for MSI. The spec
requires that the device clears pending when the reason for that is
removed. And any removal that is device model-originated should simply
be signaled like an irq de-assert. Vector "unusage" is just one reason here.

> - IMO it makes more sense than "clear
> pending vector". msix_vector_use is good to keep around for symmetry:
> who knows whether we'll need to allocate resources per vector
> in the future.

For MSI[-X], the spec is already there, and we know that there no need
for further resources when emulating it. Only KVM has special needs.

Jan
Michael S. Tsirkin - Oct. 19, 2011, 12:56 a.m.
On Wed, Oct 19, 2011 at 12:13:49AM +0200, Jan Kiszka wrote:
> On 2011-10-18 23:40, Michael S. Tsirkin wrote:
> > On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote:
> >> On 2011-10-18 20:40, Michael S. Tsirkin wrote:
> >>> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote:
> >>>> On 2011-10-18 19:06, Michael S. Tsirkin wrote:
> >>>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
> >>>>>> On 2011-10-18 17:22, Jan Kiszka wrote:
> >>>>>>> What KVM has to do is just mapping an arbitrary MSI message
> >>>>>>> (theoretically 64+32 bits, in practice it's much of course much less) to
> >>>>>>
> >>>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
> >>>>>> only a current interpretation of one specific arch. )
> >>>>>
> >>>>> Confused. vector mask is 8 bits. the rest is destination id etc.
> >>>>
> >>>> Right, but those additional bits like the destination make different
> >>>> messages. We have to encode those 24 bits into a unique GSI number and
> >>>> restore them (by table lookup) on APIC injection inside the kernel. If
> >>>> we only had to encode 256 different vectors, we would be done already.
> >>>
> >>> Right. But in practice guests always use distinct vectors (from the
> >>> 256 available) for distinct messages. This is because
> >>> the vector seems to be the only thing that gets communicated by the APIC
> >>> to the software.
> >>>
> >>> So e.g. a table with 256 entries, with extra 1024-256
> >>> used for spill-over for guests that do something unexpected,
> >>> would work really well.
> >>
> >> Already Linux manages vectors on a pre-CPU basis. For efficiency
> >> reasons, it does not exploit the full range of 256 vectors but actually
> >> allocates them in - IIRC - steps of 16. So I would not be surprised to
> >> find lots of vector number "collisions" when looking over a full set of
> >> CPUs in a system.
> >>
> >> Really, these considerations do not help us. We must store all 96 bits,
> >> already for the sake of other KVM architectures that want MSI routing.
> >>>
> >>>
> >>>>>
> >>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI
> >>>>>>> messages, we could run out of them when creating routes, statically or
> >>>>>>> lazily.
> >>>>>>>
> >>>>>>> What would probably help us long-term out of your concerns regarding
> >>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic
> >>>>>>> messages, i.e. those that are not associated with an irqfd number or an
> >>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> >>>>>>> address and data directly.
> >>>>>>
> >>>>>> This would be a trivial extension in fact. Given its beneficial impact
> >>>>>> on our GSI limitation issue, I think I will hack up something like that.
> >>>>>>
> >>>>>> And maybe this makes a transparent cache more reasonable. Then only old
> >>>>>> host kernels would force us to do searches for already cached messages.
> >>>>>>
> >>>>>> Jan
> >>>>>
> >>>>> Hmm, I'm not all that sure. Existing design really allows
> >>>>> caching the route in various smart ways. We currently do
> >>>>> this for irqfd but this can be extended to ioctls.
> >>>>> If we just let the guest inject arbitrary messages,
> >>>>> that becomes much more complex.
> >>>>
> >>>> irqfd and kvm device assignment do not allow us to inject arbitrary
> >>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
> >>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static
> >>>> routes from an MSI message to a GSI number (+they configure the related
> >>>> backends).
> >>>
> >>> Yes, it's a very flexible API but it would be very hard to optimize.
> >>> GSIs let us do the slow path setup, but they make it easy
> >>> to optimize target lookup in kernel.
> >>
> >> Users of the API above have no need to know anything about GSIs. They
> >> are an artifact of the KVM-internal interface between user space and
> >> kernel now - thanks to the MSIRoutingCache encapsulation.
> > 
> > Yes but I am saying that the API above can't be implemented
> > more efficiently than now: you will have to scan all apics on each MSI.
> > The GSI implementation can be optimized: decode the vector once,
> > if it matches a single vcpu, store that vcpu and use when sending
> > interrupts.
> 
> Sorry, missed that you switched to kernel.
> 
> What information do you want to cache there that cannot be easily
> obtained by looking at a concrete message? I do not see any. Once you
> checked that the delivery mode targets a specific cpu, you could address
> it directly.

I thought we need to match APIC ID. That needs a table lookup, no?

> Or are you thinking about some cluster mode?

That too.

> > 
> > 
> >>>
> >>> An analogy would be if read/write operated on file paths.
> >>> fd makes it easy to do permission checks and slow lookups
> >>> in one place. GSI happens to work like this (maybe, by accident).
> >>
> >> Think of an opaque file handle as a MSIRoutingCache object. And it
> >> encodes not only the routing handle but also other useful associated
> >> information we need from time to time - internally, not in the device
> >> models.
> > 
> > Forget qemu abstractions, I am talking about data path
> > optimizations in kernel in kvm. From that POV the point of an fd is not
> > that it is opaque. It is that it's an index in an array that
> > can be used for fast lookups.
> > 
> >>>>>
> >>>>> Another concern is mask bit emulation. We currently
> >>>>> handle mask bit in userspace but patches
> >>>>> to do them in kernel for assigned devices where seen
> >>>>> and IMO we might want to do that for virtio as well.
> >>>>>
> >>>>> For that to work the mask bit needs to be tied to
> >>>>> a specific gsi or specific device, which does not
> >>>>> work if we just inject arbitrary writes.
> >>>>
> >>>> Yes, but I do not see those valuable plans being negatively affected.
> >>>>
> >>>> Jan
> >>>>
> >>>
> >>> I do.
> >>> How would we maintain a mask/pending bit in kernel if we are not
> >>> supplied info on all available vectors even?
> >>
> >> It's tricky to discuss an undefined interface (there only exists an
> >> outdated proposal for kvm device assignment). But I suppose that user
> >> space will have to define the maximum number of vectors when creating an
> >> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
> >>
> >> The number of used vectors will correlate with the number of registered
> >> irqfds (in the case of vhost or vfio, device assignment still has
> >> SET_MSIX_NR). As kernel space would then be responsible for mask
> >> processing, user space would keep vectors registered with irqfds, even
> >> if they are masked. It could just continue to play the trick and drop
> >> data=0 vectors.
> > 
> > Which trick?  We don't play any tricks except for device assignment.
> > 
> >> The point here is: All those steps have _nothing_ to do with the generic
> >> MSI-X core. They are KVM-specific "side channels" for which KVM provides
> >> an API. In contrast, msix_vector_use/unuse were generic services that
> >> were actually created to please KVM requirements. But if we split that
> >> up, we can address the generic MSI-X requirements in a way that makes
> >> more sense for emulated devices (and particularly msix_vector_use makes
> >> no sense for emulation).
> >>
> >> Jan
> >>
> > 
> > We need at least msix_vector_unuse
> 
> Not at all. We rather need some qemu_irq_set(level) for MSI.
> The spec
> requires that the device clears pending when the reason for that is
> removed. And any removal that is device model-originated should simply
> be signaled like an irq de-assert.

OK, this is a good argument.
In particular virtio ISR read could clear msix pending bit
(note: it would also need to clear irqfd as that is where
 we get the pending bit).

I would prefer not to use qemu_irq_set for this though.
We can add a level flag to msix_notify.

> Vector "unusage" is just one reason here.

I don't see removing the use/unuse functions as a priority though,
but if we add an API that also lets devices say
'reason for interrupt is removed', that would be nice.

Removing extra code can then be done separately, and on qemu.git
not on qemu-kvm.

> > - IMO it makes more sense than "clear
> > pending vector". msix_vector_use is good to keep around for symmetry:
> > who knows whether we'll need to allocate resources per vector
> > in the future.
> 
> For MSI[-X], the spec is already there, and we know that there is no need
> for further resources when emulating it.
> Only KVM has special needs.
> 
> Jan
> 

It's not hard to speculate.  Imagine an out of process device that
shares guest memory and sends interrupts to qemu using eventfd. Suddenly
we need an fd per vector, and this without KVM.
Jan Kiszka - Oct. 19, 2011, 6:41 a.m.
On 2011-10-19 02:56, Michael S. Tsirkin wrote:
> On Wed, Oct 19, 2011 at 12:13:49AM +0200, Jan Kiszka wrote:
>> On 2011-10-18 23:40, Michael S. Tsirkin wrote:
>>> On Tue, Oct 18, 2011 at 09:37:14PM +0200, Jan Kiszka wrote:
>>>> On 2011-10-18 20:40, Michael S. Tsirkin wrote:
>>>>> On Tue, Oct 18, 2011 at 08:24:39PM +0200, Jan Kiszka wrote:
>>>>>> On 2011-10-18 19:06, Michael S. Tsirkin wrote:
>>>>>>> On Tue, Oct 18, 2011 at 05:55:54PM +0200, Jan Kiszka wrote:
>>>>>>>> On 2011-10-18 17:22, Jan Kiszka wrote:
>>>>>>>>> What KVM has to do is just mapping an arbitrary MSI message
> >>>>>>>>> (theoretically 64+32 bits, in practice it's of course much less) to
>>>>>>>>
>>>>>>>> ( There are 24 distinguishing bits in an MSI message on x86, but that's
>>>>>>>> only a current interpretation of one specific arch. )
>>>>>>>
>>>>>>> Confused. vector mask is 8 bits. the rest is destination id etc.
>>>>>>
>>>>>> Right, but those additional bits like the destination make different
>>>>>> messages. We have to encode those 24 bits into a unique GSI number and
>>>>>> restore them (by table lookup) on APIC injection inside the kernel. If
>>>>>> we only had to encode 256 different vectors, we would be done already.
>>>>>
>>>>> Right. But in practice guests always use distinct vectors (from the
>>>>> 256 available) for distinct messages. This is because
>>>>> the vector seems to be the only thing that gets communicated by the APIC
>>>>> to the software.
>>>>>
>>>>> So e.g. a table with 256 entries, with extra 1024-256
>>>>> used for spill-over for guests that do something unexpected,
>>>>> would work really well.
>>>>
> >>>> Already Linux manages vectors on a per-CPU basis. For efficiency
>>>> reasons, it does not exploit the full range of 256 vectors but actually
>>>> allocates them in - IIRC - steps of 16. So I would not be surprised to
>>>> find lots of vector number "collisions" when looking over a full set of
>>>> CPUs in a system.
>>>>
>>>> Really, these considerations do not help us. We must store all 96 bits,
>>>> already for the sake of other KVM architectures that want MSI routing.
>>>>>
>>>>>
>>>>>>>
>>>>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI
>>>>>>>>> messages, we could run out of them when creating routes, statically or
>>>>>>>>> lazily.
>>>>>>>>>
>>>>>>>>> What would probably help us long-term out of your concerns regarding
>>>>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic
>>>>>>>>> messages, i.e. those that are not associated with an irqfd number or an
>>>>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
>>>>>>>>> address and data directly.
>>>>>>>>
>>>>>>>> This would be a trivial extension in fact. Given its beneficial impact
>>>>>>>> on our GSI limitation issue, I think I will hack up something like that.
>>>>>>>>
>>>>>>>> And maybe this makes a transparent cache more reasonable. Then only old
>>>>>>>> host kernels would force us to do searches for already cached messages.
>>>>>>>>
>>>>>>>> Jan
>>>>>>>
>>>>>>> Hmm, I'm not all that sure. Existing design really allows
>>>>>>> caching the route in various smart ways. We currently do
>>>>>>> this for irqfd but this can be extended to ioctls.
>>>>>>> If we just let the guest inject arbitrary messages,
>>>>>>> that becomes much more complex.
>>>>>>
>>>>>> irqfd and kvm device assignment do not allow us to inject arbitrary
>>>>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
>>>>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static
>>>>>> routes from an MSI message to a GSI number (+they configure the related
>>>>>> backends).
>>>>>
>>>>> Yes, it's a very flexible API but it would be very hard to optimize.
>>>>> GSIs let us do the slow path setup, but they make it easy
>>>>> to optimize target lookup in kernel.
>>>>
>>>> Users of the API above have no need to know anything about GSIs. They
>>>> are an artifact of the KVM-internal interface between user space and
>>>> kernel now - thanks to the MSIRoutingCache encapsulation.
>>>
>>> Yes but I am saying that the API above can't be implemented
>>> more efficiently than now: you will have to scan all apics on each MSI.
>>> The GSI implementation can be optimized: decode the vector once,
>>> if it matches a single vcpu, store that vcpu and use when sending
>>> interrupts.
>>
>> Sorry, missed that you switched to kernel.
>>
>> What information do you want to cache there that cannot be easily
>> obtained by looking at a concrete message? I do not see any. Once you
>> checked that the delivery mode targets a specific cpu, you could address
>> it directly.
> 
> I thought we need to match APIC ID. That needs a table lookup, no?

Yes. But that's completely independent of a concrete MSI message. In
fact, this is the same thing we need when interpreting an IOAPIC
redirection table entry. So let's create an APIC ID lookup table for the
destination ID field, maybe multiple of them to match different modes,
but not a MSI message table.

> 
>> Or are you thinking about some cluster mode?
> 
> That too.
> 
>>>
>>>
>>>>>
>>>>> An analogy would be if read/write operated on file paths.
>>>>> fd makes it easy to do permission checks and slow lookups
>>>>> in one place. GSI happens to work like this (maybe, by accident).
>>>>
>>>> Think of an opaque file handle as a MSIRoutingCache object. And it
>>>> encodes not only the routing handle but also other useful associated
>>>> information we need from time to time - internally, not in the device
>>>> models.
>>>
>>> Forget qemu abstractions, I am talking about data path
>>> optimizations in kernel in kvm. From that POV the point of an fd is not
>>> that it is opaque. It is that it's an index in an array that
>>> can be used for fast lookups.
>>>
>>>>>>>
>>>>>>> Another concern is mask bit emulation. We currently
>>>>>>> handle mask bit in userspace but patches
>>>>>>> to do them in kernel for assigned devices where seen
>>>>>>> and IMO we might want to do that for virtio as well.
>>>>>>>
>>>>>>> For that to work the mask bit needs to be tied to
>>>>>>> a specific gsi or specific device, which does not
>>>>>>> work if we just inject arbitrary writes.
>>>>>>
>>>>>> Yes, but I do not see those valuable plans being negatively affected.
>>>>>>
>>>>>> Jan
>>>>>>
>>>>>
>>>>> I do.
>>>>> How would we maintain a mask/pending bit in kernel if we are not
>>>>> supplied info on all available vectors even?
>>>>
>>>> It's tricky to discuss an undefined interface (there only exists an
>>>> outdated proposal for kvm device assignment). But I suppose that user
>>>> space will have to define the maximum number of vectors when creating an
>>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
>>>>
>>>> The number of used vectors will correlate with the number of registered
>>>> irqfds (in the case of vhost or vfio, device assignment still has
>>>> SET_MSIX_NR). As kernel space would then be responsible for mask
>>>> processing, user space would keep vectors registered with irqfds, even
>>>> if they are masked. It could just continue to play the trick and drop
>>>> data=0 vectors.
>>>
>>> Which trick?  We don't play any tricks except for device assignment.
>>>
>>>> The point here is: All those steps have _nothing_ to do with the generic
>>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides
>>>> an API. In contrast, msix_vector_use/unuse were generic services that
>>>> were actually created to please KVM requirements. But if we split that
>>>> up, we can address the generic MSI-X requirements in a way that makes
>>>> more sense for emulated devices (and particularly msix_vector_use makes
>>>> no sense for emulation).
>>>>
>>>> Jan
>>>>
>>>
>>> We need at least msix_vector_unuse
>>
>> Not at all. We rather need some qemu_irq_set(level) for MSI.
>> The spec
>> requires that the device clears pending when the reason for that is
>> removed. And any removal that is device model-originated should simply
>> be signaled like an irq de-assert.
> 
> OK, this is a good argument.
> In particular virtio ISR read could clear msix pending bit
> (note: it would also need to clear irqfd as that is where
>  we get the pending bit).
> 
> I would prefer not to use qemu_irq_set for this though.
> We can add a level flag to msix_notify.

No concerns.

> 
>> Vector "unusage" is just one reason here.
> 
> I don't see removing the use/unuse functions as a priority though,
> but if we add an API that also lets devices say
> 'reason for interrupt is removed', that would be nice.
> 
> Removing extra code can then be done separately, and on qemu.git
> not on qemu-kvm.

If we refrain from hacking KVM logic into the use/unuse services
upstream, we can do this later on. For me it is important that those
obsolete services do not block or complicate further cleanups of the MSI
layer nor bother device model creators with tasks they should not worry
about.

> 
>>> - IMO it makes more sense than "clear
>>> pending vector". msix_vector_use is good to keep around for symmetry:
>>> who knows whether we'll need to allocate resources per vector
>>> in the future.
>>
>> For MSI[-X], the spec is already there, and we know that there is no need
>> for further resources when emulating it.
>> Only KVM has special needs.
>>
>> Jan
>>
> 
> It's not hard to speculate.  Imagine an out of process device that
> shares guest memory and sends interrupts to qemu using eventfd. Suddenly
> we need an fd per vector, and this without KVM.

That's what irqfd was invented for. Already works for vhost, and there
is nothing that prevents communicating the irqfd fd between two
processes. But note: irqfd handle, not a KVM-internal GSI.

Jan
Michael S. Tsirkin - Oct. 19, 2011, 9:03 a.m.
On Wed, Oct 19, 2011 at 08:41:48AM +0200, Jan Kiszka wrote:
> >>>>>>>>> a single GSI and vice versa. As there are less GSIs than possible MSI
> >>>>>>>>> messages, we could run out of them when creating routes, statically or
> >>>>>>>>> lazily.
> >>>>>>>>>
> >>>>>>>>> What would probably help us long-term out of your concerns regarding
> >>>>>>>>> lazy routing is to bypass that redundant GSI translation for dynamic
> >>>>>>>>> messages, i.e. those that are not associated with an irqfd number or an
> >>>>>>>>> assigned device irq. Something like a KVM_DELIVER_MSI IOCTL that accepts
> >>>>>>>>> address and data directly.
> >>>>>>>>
> >>>>>>>> This would be a trivial extension in fact. Given its beneficial impact
> >>>>>>>> on our GSI limitation issue, I think I will hack up something like that.
> >>>>>>>>
> >>>>>>>> And maybe this makes a transparent cache more reasonable. Then only old
> >>>>>>>> host kernels would force us to do searches for already cached messages.
> >>>>>>>>
> >>>>>>>> Jan
> >>>>>>>
> >>>>>>> Hmm, I'm not all that sure. Existing design really allows
> >>>>>>> caching the route in various smart ways. We currently do
> >>>>>>> this for irqfd but this can be extended to ioctls.
> >>>>>>> If we just let the guest inject arbitrary messages,
> >>>>>>> that becomes much more complex.
> >>>>>>
> >>>>>> irqfd and kvm device assignment do not allow us to inject arbitrary
> >>>>>> messages at arbitrary points. The new API offers kvm_msi_irqfd_set and
> >>>>>> kvm_device_msix_set_vector (etc.) for those scenarios to set static
> >>>>>> routes from an MSI message to a GSI number (+they configure the related
> >>>>>> backends).
> >>>>>
> >>>>> Yes, it's a very flexible API but it would be very hard to optimize.
> >>>>> GSIs let us do the slow path setup, but they make it easy
> >>>>> to optimize target lookup in kernel.
> >>>>
> >>>> Users of the API above have no need to know anything about GSIs. They
> >>>> are an artifact of the KVM-internal interface between user space and
> >>>> kernel now - thanks to the MSIRoutingCache encapsulation.
> >>>
> >>> Yes but I am saying that the API above can't be implemented
> >>> more efficiently than now: you will have to scan all apics on each MSI.
> >>> The GSI implementation can be optimized: decode the vector once,
> >>> if it matches a single vcpu, store that vcpu and use when sending
> >>> interrupts.
> >>
> >> Sorry, missed that you switched to kernel.
> >>
> >> What information do you want to cache there that cannot be easily
> >> obtained by looking at a concrete message? I do not see any. Once you
> >> checked that the delivery mode targets a specific cpu, you could address
> >> it directly.
> > 
> > I thought we need to match APIC ID. That needs a table lookup, no?
> 
> Yes. But that's completely independent of a concrete MSI message. In
> fact, this is the same thing we need when interpreting an IOAPIC
> redirection table entry. So let's create an APIC ID lookup table for the
> destination ID field, maybe multiple of them to match different modes,
> but not a MSI message table.
> > 
> >> Or are you thinking about some cluster mode?
> > 
> > That too.

Hmm, might be a good idea. APIC IDs are 8 bit, right?


> >>>
> >>>
> >>>>>
> >>>>> An analogy would be if read/write operated on file paths.
> >>>>> fd makes it easy to do permission checks and slow lookups
> >>>>> in one place. GSI happens to work like this (maybe, by accident).
> >>>>
> >>>> Think of an opaque file handle as a MSIRoutingCache object. And it
> >>>> encodes not only the routing handle but also other useful associated
> >>>> information we need from time to time - internally, not in the device
> >>>> models.
> >>>
> >>> Forget qemu abstractions, I am talking about data path
> >>> optimizations in kernel in kvm. From that POV the point of an fd is not
> >>> that it is opaque. It is that it's an index in an array that
> >>> can be used for fast lookups.
> >>>
> >>>>>>>
> >>>>>>> Another concern is mask bit emulation. We currently
> >>>>>>> handle mask bit in userspace but patches
> >>>>>>> to do them in kernel for assigned devices were seen
> >>>>>>> and IMO we might want to do that for virtio as well.
> >>>>>>>
> >>>>>>> For that to work the mask bit needs to be tied to
> >>>>>>> a specific gsi or specific device, which does not
> >>>>>>> work if we just inject arbitrary writes.
> >>>>>>
> >>>>>> Yes, but I do not see those valuable plans being negatively affected.
> >>>>>>
> >>>>>> Jan
> >>>>>>
> >>>>>
> >>>>> I do.
> >>>>> How would we maintain a mask/pending bit in kernel if we are not
> >>>>> supplied info on all available vectors even?
> >>>>
> >>>> It's tricky to discuss an undefined interface (there only exists an
> >>>> outdated proposal for kvm device assignment). But I suppose that user
> >>>> space will have to define the maximum number of vectors when creating an
> >>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
> >>>>
> >>>> The number of used vectors will correlate with the number of registered
> >>>> irqfds (in the case of vhost or vfio, device assignment still has
> >>>> SET_MSIX_NR). As kernel space would then be responsible for mask
> >>>> processing, user space would keep vectors registered with irqfds, even
> >>>> if they are masked. It could just continue to play the trick and drop
> >>>> data=0 vectors.
> >>>
> >>> Which trick?  We don't play any tricks except for device assignment.
> >>>
> >>>> The point here is: All those steps have _nothing_ to do with the generic
> >>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides
> >>>> an API. In contrast, msix_vector_use/unuse were generic services that
> >>>> were actually created to please KVM requirements. But if we split that
> >>>> up, we can address the generic MSI-X requirements in a way that makes
> >>>> more sense for emulated devices (and particularly msix_vector_use makes
> >>>> no sense for emulation).
> >>>>
> >>>> Jan
> >>>>
> >>>
> >>> We need at least msix_vector_unuse
> >>
> >> Not at all. We rather need some qemu_irq_set(level) for MSI.
> >> The spec
> >> requires that the device clears pending when the reason for that is
> >> removed. And any removal that is device model-originated should simply
> >> be signaled like an irq de-assert.
> > 
> > OK, this is a good argument.
> > In particular virtio ISR read could clear msix pending bit
> > (note: it would also need to clear irqfd as that is where
> >  we get the pending bit).
> > 
> > I would prefer not to use qemu_irq_set for this though.
> > We can add a level flag to msix_notify.
> 
> No concerns.
> 
> > 
> >> Vector "unusage" is just one reason here.
> > 
> > I don't see removing the use/unuse functions as a priority though,
> > but if we add an API that also lets devices say
> > 'reason for interrupt is removed', that would be nice.
> > 
> > Removing extra code can then be done separately, and on qemu.git
> > not on qemu-kvm.
> 
> If we refrain from hacking KVM logic into the use/unuse services
> upstream, we can do this later on. For me it is important that those
> obsolete services do not block or complicate further cleanups of the MSI
> layer nor bother device model creators with tasks they should not worry
> about.

My assumption is devices shall keep calling use/unuse until we drop it.
Does not seem like a major bother. If you like, use all vectors
or just those with message != 0.

> > 
> >>> - IMO it makes more sense than "clear
> >>> pending vector". msix_vector_use is good to keep around for symmetry:
> >>> who knows whether we'll need to allocate resources per vector
> >>> in the future.
> >>
> >> For MSI[-X], the spec is already there, and we know that there is no need
> >> for further resources when emulating it.
> >> Only KVM has special needs.
> >>
> >> Jan
> >>
> > 
> > It's not hard to speculate.  Imagine an out of process device that
> > shares guest memory and sends interrupts to qemu using eventfd. Suddenly
> > we need an fd per vector, and this without KVM.
> 
> That's what irqfd was invented for. Already works for vhost, and there
> is nothing that prevents communicating the irqfd fd between two
> processes. But note: irqfd handle, not a KVM-internal GSI.
> 
> Jan
> 

Yes. But this still makes an API for acquiring per-vector resources a requirement.
Jan Kiszka - Oct. 19, 2011, 11:17 a.m.
On 2011-10-19 11:03, Michael S. Tsirkin wrote:
>>> I thought we need to match APIC ID. That needs a table lookup, no?
>>
>> Yes. But that's completely independent of a concrete MSI message. In
>> fact, this is the same thing we need when interpreting an IOAPIC
>> redirection table entry. So let's create an APIC ID lookup table for the
>> destination ID field, maybe multiple of them to match different modes,
>> but not a MSI message table.
>>>
>>>> Or are you thinking about some cluster mode?
>>>
>>> That too.
> 
> Hmm, might be a good idea. APIC IDs are 8 bit, right?

Yep (more generally: destination IDs). So even if we have to create
multiple lookup tables for the various modes, that won't consume
megabytes of RAM.

> 
> 
>>>>>
>>>>>
>>>>>>>
>>>>>>> An analogy would be if read/write operated on file paths.
>>>>>>> fd makes it easy to do permission checks and slow lookups
>>>>>>> in one place. GSI happens to work like this (maybe, by accident).
>>>>>>
>>>>>> Think of an opaque file handle as a MSIRoutingCache object. And it
>>>>>> encodes not only the routing handle but also other useful associated
>>>>>> information we need from time to time - internally, not in the device
>>>>>> models.
>>>>>
>>>>> Forget qemu abstractions, I am talking about data path
>>>>> optimizations in kernel in kvm. From that POV the point of an fd is not
>>>>> that it is opaque. It is that it's an index in an array that
>>>>> can be used for fast lookups.
>>>>>
>>>>>>>>>
>>>>>>>>> Another concern is mask bit emulation. We currently
>>>>>>>>> handle mask bit in userspace but patches
>>>>>>>>> to do them in kernel for assigned devices were seen
>>>>>>>>> and IMO we might want to do that for virtio as well.
>>>>>>>>>
>>>>>>>>> For that to work the mask bit needs to be tied to
>>>>>>>>> a specific gsi or specific device, which does not
>>>>>>>>> work if we just inject arbitrary writes.
>>>>>>>>
>>>>>>>> Yes, but I do not see those valuable plans being negatively affected.
>>>>>>>>
>>>>>>>> Jan
>>>>>>>>
>>>>>>>
>>>>>>> I do.
>>>>>>> How would we maintain a mask/pending bit in kernel if we are not
>>>>>>> supplied info on all available vectors even?
>>>>>>
>>>>>> It's tricky to discuss an undefined interface (there only exists an
>>>>>> outdated proposal for kvm device assignment). But I suppose that user
>>>>>> space will have to define the maximum number of vectors when creating an
>>>>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
>>>>>>
>>>>>> The number of used vectors will correlate with the number of registered
>>>>>> irqfds (in the case of vhost or vfio, device assignment still has
>>>>>> SET_MSIX_NR). As kernel space would then be responsible for mask
>>>>>> processing, user space would keep vectors registered with irqfds, even
>>>>>> if they are masked. It could just continue to play the trick and drop
>>>>>> data=0 vectors.
>>>>>
>>>>> Which trick?  We don't play any tricks except for device assignment.
>>>>>
>>>>>> The point here is: All those steps have _nothing_ to do with the generic
>>>>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides
>>>>>> an API. In contrast, msix_vector_use/unuse were generic services that
>>>>>> were actually created to please KVM requirements. But if we split that
>>>>>> up, we can address the generic MSI-X requirements in a way that makes
>>>>>> more sense for emulated devices (and particularly msix_vector_use makes
>>>>>> no sense for emulation).
>>>>>>
>>>>>> Jan
>>>>>>
>>>>>
>>>>> We need at least msix_vector_unuse
>>>>
>>>> Not at all. We rather need some qemu_irq_set(level) for MSI.
>>>> The spec
>>>> requires that the device clears pending when the reason for that is
>>>> removed. And any removal that is device model-originated should simply
>>>> be signaled like an irq de-assert.
>>>
>>> OK, this is a good argument.
>>> In particular virtio ISR read could clear msix pending bit
>>> (note: it would also need to clear irqfd as that is where
>>>  we get the pending bit).
>>>
>>> I would prefer not to use qemu_irq_set for this though.
>>> We can add a level flag to msix_notify.
>>
>> No concerns.
>>
>>>
>>>> Vector "unusage" is just one reason here.
>>>
>>> I don't see removing the use/unuse functions as a priority though,
>>> but if we add an API that also lets devices say
>>> 'reason for interrupt is removed', that would be nice.
>>>
>>> Removing extra code can then be done separately, and on qemu.git
>>> not on qemu-kvm.
>>
>> If we refrain from hacking KVM logic into the use/unuse services
>> upstream, we can do this later on. For me it is important that those
>> obsolete services do not block or complicate further cleanups of the MSI
>> layer nor bother device model creators with tasks they should not worry
>> about.
> 
> My assumption is devices shall keep calling use/unuse until we drop it.
> Does not seem like a major bother. If you like, use all vectors
> or just those with message != 0.

What about letting only those devices call use/unuse that sometimes need
less than the maximum amount? All other would benefit for an use_all
executed on enable and a unuse_all on disable/reset/uninit.

> 
>>>
>>>>> - IMO it makes more sense than "clear
>>>>> pending vector". msix_vector_use is good to keep around for symmetry:
>>>>> who knows whether we'll need to allocate resources per vector
>>>>> in the future.
>>>>
> >>>> For MSI[-X], the spec is already there, and we know that there is no need
>>>> for further resources when emulating it.
>>>> Only KVM has special needs.
>>>>
>>>> Jan
>>>>
>>>
>>> It's not hard to speculate.  Imagine an out of process device that
>>> shares guest memory and sends interrupts to qemu using eventfd. Suddenly
>>> we need an fd per vector, and this without KVM.
>>
>> That's what irqfd was invented for. Already works for vhost, and there
>> is nothing that prevents communicating the irqfd fd between two
>> processes. But note: irqfd handle, not a KVM-internal GSI.
>>
>> Jan
>>
> 
> Yes. But this still makes an API for acquiring per-vector resources a requirement.

Yes, but a different one than current use/unuse. And it will be an
optional one, only for those devices that need to establish irq/eventfd
channels.

Jan
Michael S. Tsirkin - Oct. 20, 2011, 10:02 p.m.
On Wed, Oct 19, 2011 at 01:17:03PM +0200, Jan Kiszka wrote:
> On 2011-10-19 11:03, Michael S. Tsirkin wrote:
> >>> I thought we need to match APIC ID. That needs a table lookup, no?
> >>
> >> Yes. But that's completely independent of a concrete MSI message. In
> >> fact, this is the same thing we need when interpreting an IOAPIC
> >> redirection table entry. So let's create an APIC ID lookup table for the
> >> destination ID field, maybe multiple of them to match different modes,
> >> but not a MSI message table.
> >>>
> >>>> Or are you thinking about some cluster mode?
> >>>
> >>> That too.
> > 
> > Hmm, might be a good idea. APIC IDs are 8 bit, right?
> 
> Yep (more generally: destination IDs). So even if we have to create
> multiple lookup tables for the various modes, that won't consume
> megabytes of RAM.
> 
> > 
> > 
> >>>>>
> >>>>>
> >>>>>>>
> >>>>>>> An analogy would be if read/write operated on file paths.
> >>>>>>> fd makes it easy to do permission checks and slow lookups
> >>>>>>> in one place. GSI happens to work like this (maybe, by accident).
> >>>>>>
> >>>>>> Think of an opaque file handle as a MSIRoutingCache object. And it
> >>>>>> encodes not only the routing handle but also other useful associated
> >>>>>> information we need from time to time - internally, not in the device
> >>>>>> models.
> >>>>>
> >>>>> Forget qemu abstractions, I am talking about data path
> >>>>> optimizations in kernel in kvm. From that POV the point of an fd is not
> >>>>> that it is opaque. It is that it's an index in an array that
> >>>>> can be used for fast lookups.
> >>>>>
> >>>>>>>>>
> >>>>>>>>> Another concern is mask bit emulation. We currently
> >>>>>>>>> handle mask bit in userspace but patches
> >>>>>>>>> to do them in kernel for assigned devices were seen
> >>>>>>>>> and IMO we might want to do that for virtio as well.
> >>>>>>>>>
> >>>>>>>>> For that to work the mask bit needs to be tied to
> >>>>>>>>> a specific gsi or specific device, which does not
> >>>>>>>>> work if we just inject arbitrary writes.
> >>>>>>>>
> >>>>>>>> Yes, but I do not see those valuable plans being negatively affected.
> >>>>>>>>
> >>>>>>>> Jan
> >>>>>>>>
> >>>>>>>
> >>>>>>> I do.
> >>>>>>> How would we maintain a mask/pending bit in kernel if we are not
> >>>>>>> supplied info on all available vectors even?
> >>>>>>
> >>>>>> It's tricky to discuss an undefined interface (there only exists an
> >>>>>> outdated proposal for kvm device assignment). But I suppose that user
> >>>>>> space will have to define the maximum number of vectors when creating an
> >>>>>> in-kernel MSI-X MMIO area. The device already has to tell this to msix_init.
> >>>>>>
> >>>>>> The number of used vectors will correlate with the number of registered
> >>>>>> irqfds (in the case of vhost or vfio, device assignment still has
> >>>>>> SET_MSIX_NR). As kernel space would then be responsible for mask
> >>>>>> processing, user space would keep vectors registered with irqfds, even
> >>>>>> if they are masked. It could just continue to play the trick and drop
> >>>>>> data=0 vectors.
> >>>>>
> >>>>> Which trick?  We don't play any tricks except for device assignment.
> >>>>>
> >>>>>> The point here is: All those steps have _nothing_ to do with the generic
> >>>>>> MSI-X core. They are KVM-specific "side channels" for which KVM provides
> >>>>>> an API. In contrast, msix_vector_use/unuse were generic services that
> >>>>>> were actually created to please KVM requirements. But if we split that
> >>>>>> up, we can address the generic MSI-X requirements in a way that makes
> >>>>>> more sense for emulated devices (and particularly msix_vector_use makes
> >>>>>> no sense for emulation).
> >>>>>>
> >>>>>> Jan
> >>>>>>
> >>>>>
> >>>>> We need at least msix_vector_unuse
> >>>>
> >>>> Not at all. We rather need some qemu_irq_set(level) for MSI.
> >>>> The spec
> >>>> requires that the device clears pending when the reason for that is
> >>>> removed. And any removal that is device model-originated should simply
> >>>> be signaled like an irq de-assert.
> >>>
> >>> OK, this is a good argument.
> >>> In particular virtio ISR read could clear msix pending bit
> >>> (note: it would also need to clear irqfd as that is where
> >>>  we get the pending bit).
> >>>
> >>> I would prefer not to use qemu_irq_set for this though.
> >>> We can add a level flag to msix_notify.
> >>
> >> No concerns.
> >>
> >>>
> >>>> Vector "unusage" is just one reason here.
> >>>
> >>> I don't see removing the use/unuse functions as a priority though,
> >>> but if we add an API that also lets devices say
> >>> 'reason for interrupt is removed', that would be nice.
> >>>
> >>> Removing extra code can then be done separately, and on qemu.git
> >>> not on qemu-kvm.
> >>
> >> If we refrain from hacking KVM logic into the use/unuse services
> >> upstream, we can do this later on. For me it is important that those
> >> obsolete services do not block or complicate further cleanups of the MSI
> >> layer nor bother device model creators with tasks they should not worry
> >> about.
> > 
> > My assumption is devices shall keep calling use/unuse until we drop it.
> > Does not seem like a major bother. If you like, use all vectors
> > or just those with message != 0.
> 
> What about letting only those devices call use/unuse that sometimes need
> less than the maximum amount? All other would benefit for an use_all
> executed on enable and a unuse_all on disable/reset/uninit.

Sure, I don't mind adding use_all/unuse_all wrappers.

> > 
> >>>
> >>>>> - IMO it makes more sense than "clear
> >>>>> pending vector". msix_vector_use is good to keep around for symmetry:
> >>>>> who knows whether we'll need to allocate resources per vector
> >>>>> in the future.
> >>>>
> >>>> For MSI[-X], the spec is already there, and we know that there is no need
> >>>> for further resources when emulating it.
> >>>> Only KVM has special needs.
> >>>>
> >>>> Jan
> >>>>
> >>>
> >>> It's not hard to speculate.  Imagine an out of process device that
> >>> shares guest memory and sends interrupts to qemu using eventfd. Suddenly
> >>> we need an fd per vector, and this without KVM.
> >>
> >> That's what irqfd was invented for. Already works for vhost, and there
> >> is nothing that prevents communicating the irqfd fd between two
> >> processes. But note: irqfd handle, not a KVM-internal GSI.
> >>
> >> Jan
> >>
> > 
> > Yes. But this still makes an API for acquiring per-vector resources a requirement.
> 
> Yes, but a different one than current use/unuse.

What's wrong with use/unuse as an API? It's already in place
and virtio calls it.

> And it will be an
> optional one, only for those devices that need to establish irq/eventfd
> channels.
> 
> Jan

Not sure this should be up to the device.

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 21, 2011, 7:09 a.m.
On 2011-10-21 00:02, Michael S. Tsirkin wrote:
>>> Yes. But this still makes an API for acquiring per-vector resources a requirement.
>>
>> Yes, but a different one than current use/unuse.
> 
> What's wrong with use/unuse as an API? It's already in place
> and virtio calls it.

Not for that purpose. It remains a useless API in the absence of KVM's
requirements.

> 
>> And it will be an
>> optional one, only for those devices that need to establish irq/eventfd
>> channels.
>>
>> Jan
> 
> Not sure this should be up to the device.

The device provides the fd. At least it acquires and associates it.

Jan
Michael S. Tsirkin - Oct. 21, 2011, 7:54 a.m.
On Fri, Oct 21, 2011 at 09:09:10AM +0200, Jan Kiszka wrote:
> On 2011-10-21 00:02, Michael S. Tsirkin wrote:
> >>> Yes. But this still makes an API for acquiring per-vector resources a requirement.
> >>
> >> Yes, but a different one than current use/unuse.
> > 
> > What's wrong with use/unuse as an API? It's already in place
> > and virtio calls it.
> 
> Not for that purpose.
> It remains a useless API in the absence of KVM's
> requirements.
> 

Sorry, I don't understand. This can acquire whatever resources
necessary. It does not seem to make sense to rip it out
only to add a different one back in.

> > 
> >> And it will be an
> >> optional one, only for those devices that need to establish irq/eventfd
> >> channels.
> >>
> >> Jan
> > 
> > Not sure this should be up to the device.
> 
> The device provides the fd. At least it acquires and associates it.
> 
> Jan

It would surely be beneficial to be able to have a uniform
API so that devices don't need to be recoded to be moved
in this way.

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
Jan Kiszka - Oct. 21, 2011, 9:27 a.m.
On 2011-10-21 09:54, Michael S. Tsirkin wrote:
> On Fri, Oct 21, 2011 at 09:09:10AM +0200, Jan Kiszka wrote:
>> On 2011-10-21 00:02, Michael S. Tsirkin wrote:
>>>>> Yes. But this still makes an API for acquiring per-vector resources a requirement.
>>>>
>>>> Yes, but a different one than current use/unuse.
>>>
>>> What's wrong with use/unuse as an API? It's already in place
>>> and virtio calls it.
>>
>> Not for that purpose.
>> It remains a useless API in the absence of KVM's
>> requirements.
>>
> 
> Sorry, I don't understand. This can acquire whatever resources
> necessary. It does not seem to make sense to rip it out
> only to add a different one back in.
> 
>>>
>>>> And it will be an
>>>> optional one, only for those devices that need to establish irq/eventfd
>>>> channels.
>>>>
>>>> Jan
>>>
>>> Not sure this should be up to the device.
>>
>> The device provides the fd. At least it acquires and associates it.
>>
>> Jan
> 
> It would surely be beneficial to be able to have a uniform
> API so that devices don't need to be recoded to be moved
> in this way.

The point is that the current API is useless for devices that do not
have to declare any vector to the core. By forcing them to call into
that API, we solve no current problem automatically. We rather need
associate_vector_with_x (and the reverse). And that only for devices that
have different backends than user space models.

Jan
Michael S. Tsirkin - Oct. 21, 2011, 10:57 a.m.
On Fri, Oct 21, 2011 at 11:27:48AM +0200, Jan Kiszka wrote:
> On 2011-10-21 09:54, Michael S. Tsirkin wrote:
> > On Fri, Oct 21, 2011 at 09:09:10AM +0200, Jan Kiszka wrote:
> >> On 2011-10-21 00:02, Michael S. Tsirkin wrote:
> >>>>> Yes. But this still makes an API for acquiring per-vector resources a requirement.
> >>>>
> >>>> Yes, but a different one than current use/unuse.
> >>>
> >>> What's wrong with use/unuse as an API? It's already in place
> >>> and virtio calls it.
> >>
> >> Not for that purpose.
> >> It remains a useless API in the absence of KVM's
> >> requirements.
> >>
> > 
> > Sorry, I don't understand. This can acquire whatever resources
> > necessary. It does not seem to make sense to rip it out
> > only to add a different one back in.
> > 
> >>>
> >>>> And it will be an
> >>>> optional one, only for those devices that need to establish irq/eventfd
> >>>> channels.
> >>>>
> >>>> Jan
> >>>
> >>> Not sure this should be up to the device.
> >>
> >> The device provides the fd. At least it acquires and associates it.
> >>
> >> Jan
> > 
> > It would surely be beneficial to be able to have a uniform
> > API so that devices don't need to be recoded to be moved
> > in this way.
> 
> The point is that the current API is useless for devices that do not
> have to declare any vector to the core.

Don't assigned devices want this as well?
They handle 0-address vectors specially, and
this hack absolutely doesn't belong in pci core ...

> By forcing them to call into
> that API, we solve no current problem automatically. We rather need
> associate_vector_with_x (and the reverse). And that only for device that
> have different backends than user space models.
> 
> Jan

I'll need to think about this, would prefer this series not
to get blocked on this issue. We more or less agreed
to add _use_all/unuse_all for now?

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

Patch

diff --git a/hw/ivshmem.c b/hw/ivshmem.c
index 242fbea..a402c98 100644
--- a/hw/ivshmem.c
+++ b/hw/ivshmem.c
@@ -535,10 +535,8 @@  static uint64_t ivshmem_get_size(IVShmemState * s) {
     return value;
 }
 
-static void ivshmem_setup_msi(IVShmemState * s) {
-
-    int i;
-
+static void ivshmem_setup_msi(IVShmemState *s)
+{
     /* allocate the MSI-X vectors */
 
     memory_region_init(&s->msix_bar, "ivshmem-msix", 4096);
@@ -551,11 +549,6 @@  static void ivshmem_setup_msi(IVShmemState * s) {
         exit(1);
     }
 
-    /* 'activate' the vectors */
-    for (i = 0; i < s->vectors; i++) {
-        msix_vector_use(&s->dev, i);
-    }
-
     /* allocate Qemu char devices for receiving interrupts */
     s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
 }
@@ -581,7 +574,7 @@  static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
     IVSHMEM_DPRINTF("ivshmem_load\n");
 
     IVShmemState *proxy = opaque;
-    int ret, i;
+    int ret;
 
     if (version_id > 0) {
         return -EINVAL;
@@ -599,9 +592,6 @@  static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
 
     if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
         msix_load(&proxy->dev, f);
-        for (i = 0; i < proxy->vectors; i++) {
-            msix_vector_use(&proxy->dev, i);
-        }
     } else {
         proxy->intrstatus = qemu_get_be32(f);
         proxy->intrmask = qemu_get_be32(f);
diff --git a/hw/msix.c b/hw/msix.c
index ce3375a..f1b97b5 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -292,9 +292,6 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
     if (nentries > MSIX_MAX_ENTRIES)
         return -EINVAL;
 
-    dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
-                                        sizeof *dev->msix_entry_used);
-
     dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
     msix_mask_all(dev, nentries);
 
@@ -317,21 +314,9 @@  err_config:
     memory_region_destroy(&dev->msix_mmio);
     g_free(dev->msix_table_page);
     dev->msix_table_page = NULL;
-    g_free(dev->msix_entry_used);
-    dev->msix_entry_used = NULL;
     return ret;
 }
 
-static void msix_free_irq_entries(PCIDevice *dev)
-{
-    int vector;
-
-    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
-        dev->msix_entry_used[vector] = 0;
-        msix_clr_pending(dev, vector);
-    }
-}
-
 /* Clean up resources for the device. */
 int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
 {
@@ -340,14 +325,11 @@  int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
     }
     pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
     dev->msix_cap = 0;
-    msix_free_irq_entries(dev);
     dev->msix_entries_nr = 0;
     memory_region_del_subregion(bar, &dev->msix_mmio);
     memory_region_destroy(&dev->msix_mmio);
     g_free(dev->msix_table_page);
     dev->msix_table_page = NULL;
-    g_free(dev->msix_entry_used);
-    dev->msix_entry_used = NULL;
 
     kvm_msix_free(dev);
     g_free(dev->msix_cache);
@@ -376,7 +358,6 @@  void msix_load(PCIDevice *dev, QEMUFile *f)
         return;
     }
 
-    msix_free_irq_entries(dev);
     qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
     qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
 }
@@ -407,7 +388,7 @@  void msix_notify(PCIDevice *dev, unsigned vector)
 {
     MSIMessage msg;
 
-    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
+    if (vector >= dev->msix_entries_nr)
         return;
     if (msix_is_masked(dev, vector)) {
         msix_set_pending(dev, vector);
@@ -424,48 +405,31 @@  void msix_reset(PCIDevice *dev)
     if (!msix_present(dev)) {
         return;
     }
-    msix_free_irq_entries(dev);
+    msix_clear_all_vectors(dev);
     dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
 	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
     memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
     msix_mask_all(dev, dev->msix_entries_nr);
 }
 
-/* PCI spec suggests that devices make it possible for software to configure
- * less vectors than supported by the device, but does not specify a standard
- * mechanism for devices to do so.
- *
- * We support this by asking devices to declare vectors software is going to
- * actually use, and checking this on the notification path. Devices that
- * don't want to follow the spec suggestion can declare all vectors as used. */
-
-/* Mark vector as used. */
-int msix_vector_use(PCIDevice *dev, unsigned vector)
+/* Clear pending vector. */
+void msix_clear_vector(PCIDevice *dev, unsigned vector)
 {
-    if (vector >= dev->msix_entries_nr)
-        return -EINVAL;
-    ++dev->msix_entry_used[vector];
-    return 0;
-}
-
-/* Mark vector as unused. */
-void msix_vector_unuse(PCIDevice *dev, unsigned vector)
-{
-    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
-        return;
-    }
-    if (--dev->msix_entry_used[vector]) {
-        return;
+    if (msix_present(dev) && vector < dev->msix_entries_nr) {
+        msix_clr_pending(dev, vector);
     }
-    msix_clr_pending(dev, vector);
 }
 
-void msix_unuse_all_vectors(PCIDevice *dev)
+void msix_clear_all_vectors(PCIDevice *dev)
 {
+    unsigned int vector;
+
     if (!msix_present(dev)) {
         return;
     }
-    msix_free_irq_entries(dev);
+    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
+        msix_clr_pending(dev, vector);
+    }
 }
 
 /* Invoke the notifier if vector entry is used and unmasked. */
@@ -476,7 +440,7 @@  msix_notify_if_unmasked(PCIDevice *dev, unsigned int vector, bool masked)
 
     assert(dev->msix_vector_config_notifier);
 
-    if (!dev->msix_entry_used[vector] || msix_is_masked(dev, vector)) {
+    if (msix_is_masked(dev, vector)) {
         return 0;
     }
     msix_message_from_vector(dev, vector, &msg);
diff --git a/hw/msix.h b/hw/msix.h
index 978f417..9cd54cf 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -21,9 +21,8 @@  int msix_present(PCIDevice *dev);
 
 uint32_t msix_bar_size(PCIDevice *dev);
 
-int msix_vector_use(PCIDevice *dev, unsigned vector);
-void msix_vector_unuse(PCIDevice *dev, unsigned vector);
-void msix_unuse_all_vectors(PCIDevice *dev);
+void msix_clear_vector(PCIDevice *dev, unsigned vector);
+void msix_clear_all_vectors(PCIDevice *dev);
 
 void msix_notify(PCIDevice *dev, unsigned vector);
 
diff --git a/hw/pci.h b/hw/pci.h
index d7a652e..5cf9a16 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -178,8 +178,6 @@  struct PCIDevice {
     uint8_t *msix_table_page;
     /* MMIO index used to map MSIX table and pending bit entries. */
     MemoryRegion msix_mmio;
-    /* Reference-count for entries actually in use by driver. */
-    unsigned *msix_entry_used;
     /* Region including the MSI-X table */
     uint32_t msix_bar_size;
     /* Version id needed for VMState */
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 85d6771..5004d7d 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -136,9 +136,6 @@  static int virtio_pci_load_config(void * opaque, QEMUFile *f)
     } else {
         proxy->vdev->config_vector = VIRTIO_NO_VECTOR;
     }
-    if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) {
-        return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector);
-    }
     return 0;
 }
 
@@ -152,9 +149,6 @@  static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
         vector = VIRTIO_NO_VECTOR;
     }
     virtio_queue_set_vector(proxy->vdev, n, vector);
-    if (vector != VIRTIO_NO_VECTOR) {
-        return msix_vector_use(&proxy->pci_dev, vector);
-    }
     return 0;
 }
 
@@ -304,7 +298,7 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         if (pa == 0) {
             virtio_pci_stop_ioeventfd(proxy);
             virtio_reset(proxy->vdev);
-            msix_unuse_all_vectors(&proxy->pci_dev);
+            msix_clear_all_vectors(&proxy->pci_dev);
         }
         else
             virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
@@ -331,7 +325,7 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 
         if (vdev->status == 0) {
             virtio_reset(proxy->vdev);
-            msix_unuse_all_vectors(&proxy->pci_dev);
+            msix_clear_all_vectors(&proxy->pci_dev);
         }
 
         /* Linux before 2.6.34 sets the device as OK without enabling
@@ -343,18 +337,20 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         }
         break;
     case VIRTIO_MSI_CONFIG_VECTOR:
-        msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
+        msix_clear_vector(&proxy->pci_dev, vdev->config_vector);
         /* Make it possible for guest to discover an error took place. */
-        if (msix_vector_use(&proxy->pci_dev, val) < 0)
+        if (val >= vdev->nvectors) {
             val = VIRTIO_NO_VECTOR;
+        }
         vdev->config_vector = val;
         break;
     case VIRTIO_MSI_QUEUE_VECTOR:
-        msix_vector_unuse(&proxy->pci_dev,
+        msix_clear_vector(&proxy->pci_dev,
                           virtio_queue_vector(vdev, vdev->queue_sel));
         /* Make it possible for guest to discover an error took place. */
-        if (msix_vector_use(&proxy->pci_dev, val) < 0)
+        if (val >= vdev->nvectors) {
             val = VIRTIO_NO_VECTOR;
+        }
         virtio_queue_set_vector(vdev, vdev->queue_sel, val);
         break;
     default: