Patchwork [v2,for,1.2?] msix: Drop tracking of used vectors

login
register
mail settings
Submitter Jan Kiszka
Date Aug. 24, 2012, 6:19 a.m.
Message ID <50371D06.3010706@web.de>
Download mbox | patch
Permalink /patch/179770/
State New
Headers show

Comments

Jan Kiszka - Aug. 24, 2012, 6:19 a.m.
From: Jan Kiszka <jan.kiszka@siemens.com>

This optimization was once used in qemu-kvm to keep KVM route usage low.
But now we solved that problem via lazy updates. It also tried to handle
the case of vectors shared between different sources of the same device.
However, this never really worked and will have to be addressed
differently anyway. So drop this obsolete interface.

We still need interfaces to clear pending vectors though. Provide
msix_clear_vector and msix_clear_all_vectors for this.

This also unbreaks MSI-X after reset for ivshmem and megasas as device
models can't easily mark their vectors used afterward (megasas didn't
even try).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

This patch has been posted some moons again, and we had a discussion
about the impact on the existing users. At that time, MSI-X refactoring
for KVM support was not yet merged. Now it is, and it should be better
much clearer that vector usage tracking has no business with that
feature.

 hw/ivshmem.c    |   20 -------------------
 hw/megasas.c    |    4 ---
 hw/msix.c       |   57 ++++++++++++------------------------------------------
 hw/msix.h       |    5 +--
 hw/pci.h        |    2 -
 hw/virtio-pci.c |   20 +++++++-----------
 6 files changed, 23 insertions(+), 85 deletions(-)
Michael S. Tsirkin - Aug. 24, 2012, 8:20 a.m.
On Fri, Aug 24, 2012 at 08:19:50AM +0200, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> This optimization was once used in qemu-kvm to keep KVM route usage low.
> But now we solved that problem via lazy updates. It also tried to handle
> the case of vectors shared between different sources of the same device.
> However, this never really worked and will have to be addressed
> differently anyway. So drop this obsolete interface.
> 
> We still need interfaces to clear pending vectors though. Provide
> msix_clear_vector and msix_clear_all_vectors for this.
> 
> This also unbreaks MSI-X after reset for ivshmem and megasas as device
> models can't easily mark their vectors used afterward (megasas didn't
> even try).
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
> 
> This patch has been posted some moons again, and we had a discussion
> about the impact on the existing users. At that time, MSI-X refactoring
> for KVM support was not yet merged. Now it is, and it should be better
> much clearer that vector usage tracking has no business with that
> feature.
> 
>  hw/ivshmem.c    |   20 -------------------
>  hw/megasas.c    |    4 ---
>  hw/msix.c       |   57 ++++++++++++------------------------------------------
>  hw/msix.h       |    5 +--
>  hw/pci.h        |    2 -
>  hw/virtio-pci.c |   20 +++++++-----------
>  6 files changed, 23 insertions(+), 85 deletions(-)
> 
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> index 47f2a16..5ffff48 100644
> --- a/hw/ivshmem.c
> +++ b/hw/ivshmem.c
> @@ -523,28 +523,11 @@ static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>      return;
>  }
>  
> -/* Select the MSI-X vectors used by device.
> - * ivshmem maps events to vectors statically, so
> - * we just enable all vectors on init and after reset. */
> -static void ivshmem_use_msix(IVShmemState * s)
> -{
> -    int i;
> -
> -    if (!msix_present(&s->dev)) {
> -        return;
> -    }
> -
> -    for (i = 0; i < s->vectors; i++) {
> -        msix_vector_use(&s->dev, i);
> -    }
> -}
> -
>  static void ivshmem_reset(DeviceState *d)
>  {
>      IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
>  
>      s->intrstatus = 0;
> -    ivshmem_use_msix(s);
>      return;
>  }
>  
> @@ -586,8 +569,6 @@ static void ivshmem_setup_msi(IVShmemState * s)
>  
>      /* allocate QEMU char devices for receiving interrupts */
>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
> -
> -    ivshmem_use_msix(s);
>  }
>  
>  static void ivshmem_save(QEMUFile* f, void *opaque)
> @@ -629,7 +610,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>  
>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>          msix_load(&proxy->dev, f);
> -	ivshmem_use_msix(proxy);
>      } else {
>          proxy->intrstatus = qemu_get_be32(f);
>          proxy->intrmask = qemu_get_be32(f);
> diff --git a/hw/megasas.c b/hw/megasas.c
> index c35a15d..4766117 100644
> --- a/hw/megasas.c
> +++ b/hw/megasas.c
> @@ -2121,10 +2121,6 @@ static int megasas_scsi_init(PCIDevice *dev)
>      pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
>      pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
>  
> -    if (megasas_use_msix(s)) {
> -        msix_vector_use(&s->dev, 0);
> -    }
> -
>      if (!s->sas_addr) {
>          s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
>                         IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
> diff --git a/hw/msix.c b/hw/msix.c
> index aea340b..163ce4c 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -273,7 +273,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>  
>      dev->msix_table = g_malloc0(table_size);
>      dev->msix_pba = g_malloc0(pba_size);
> -    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
>  
>      msix_mask_all(dev, nentries);
>  
> @@ -326,16 +325,6 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
>      return 0;
>  }
>  
> -static void msix_free_irq_entries(PCIDevice *dev)
> -{
> -    int vector;
> -
> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> -        dev->msix_entry_used[vector] = 0;
> -        msix_clr_pending(dev, vector);
> -    }
> -}
> -
>  /* Clean up resources for the device. */
>  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>  {
> @@ -344,7 +333,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>      }
>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
>      dev->msix_cap = 0;
> -    msix_free_irq_entries(dev);
>      dev->msix_entries_nr = 0;
>      memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
>      memory_region_destroy(&dev->msix_pba_mmio);
> @@ -354,8 +342,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>      memory_region_destroy(&dev->msix_table_mmio);
>      g_free(dev->msix_table);
>      dev->msix_table = NULL;
> -    g_free(dev->msix_entry_used);
> -    dev->msix_entry_used = NULL;
>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
>      return;
>  }
> @@ -390,7 +376,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
>          return;
>      }
>  
> -    msix_free_irq_entries(dev);
>      qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
>      qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
>      msix_update_function_masked(dev);
> @@ -419,8 +404,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>  {
>      MSIMessage msg;
>  
> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
> +    if (vector >= dev->msix_entries_nr) {
>          return;
> +    }
>      if (msix_is_masked(dev, vector)) {
>          msix_set_pending(dev, vector);
>          return;
> @@ -436,7 +422,7 @@ void msix_reset(PCIDevice *dev)
>      if (!msix_present(dev)) {
>          return;
>      }
> -    msix_free_irq_entries(dev);
> +    msix_clear_all_vectors(dev);
>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
>      memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> @@ -444,41 +430,24 @@ void msix_reset(PCIDevice *dev)
>      msix_mask_all(dev, dev->msix_entries_nr);
>  }
>  
> -/* PCI spec suggests that devices make it possible for software to configure
> - * less vectors than supported by the device, but does not specify a standard
> - * mechanism for devices to do so.
> - *
> - * We support this by asking devices to declare vectors software is going to
> - * actually use, and checking this on the notification path. Devices that
> - * don't want to follow the spec suggestion can declare all vectors as used. */
> -
> -/* Mark vector as used. */
> -int msix_vector_use(PCIDevice *dev, unsigned vector)
> +/* Clear pending vector. */
> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
>  {
> -    if (vector >= dev->msix_entries_nr)
> -        return -EINVAL;
> -    dev->msix_entry_used[vector]++;
> -    return 0;
> -}
> -

I keep thinking we should instead call vector notifier
here, so that virtio can detect and handle cases where
kvm_virtio_pci_vq_vector_use fails.
ATM it just asserts.

Maybe I am wrong.

Let's delay this decision until 1.3.

> -/* Mark vector as unused. */
> -void msix_vector_unuse(PCIDevice *dev, unsigned vector)
> -{
> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
> -        return;
> -    }
> -    if (--dev->msix_entry_used[vector]) {
> -        return;
> +    if (msix_present(dev) && vector < dev->msix_entries_nr) {
> +        msix_clr_pending(dev, vector);
>      }
> -    msix_clr_pending(dev, vector);
>  }
>  
> -void msix_unuse_all_vectors(PCIDevice *dev)
> +void msix_clear_all_vectors(PCIDevice *dev)
>  {
> +    unsigned int vector;
> +
>      if (!msix_present(dev)) {
>          return;
>      }
> -    msix_free_irq_entries(dev);
> +    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> +        msix_clr_pending(dev, vector);
> +    }
>  }
>  
>  unsigned int msix_nr_vectors_allocated(const PCIDevice *dev)
> diff --git a/hw/msix.h b/hw/msix.h
> index 15211cb..5c2512a 100644
> --- a/hw/msix.h
> +++ b/hw/msix.h
> @@ -26,9 +26,8 @@ void msix_load(PCIDevice *dev, QEMUFile *f);
>  int msix_enabled(PCIDevice *dev);
>  int msix_present(PCIDevice *dev);
>  
> -int msix_vector_use(PCIDevice *dev, unsigned vector);
> -void msix_vector_unuse(PCIDevice *dev, unsigned vector);
> -void msix_unuse_all_vectors(PCIDevice *dev);
> +void msix_clear_vector(PCIDevice *dev, unsigned vector);
> +void msix_clear_all_vectors(PCIDevice *dev);
>  
>  void msix_notify(PCIDevice *dev, unsigned vector);
>  
> diff --git a/hw/pci.h b/hw/pci.h
> index 4b6ab3d..1b07450 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -240,8 +240,6 @@ struct PCIDevice {
>      /* Memory Regions for MSIX table and pending bit entries. */
>      MemoryRegion msix_table_mmio;
>      MemoryRegion msix_pba_mmio;
> -    /* Reference-count for entries actually in use by driver. */
> -    unsigned *msix_entry_used;
>      /* MSIX function mask set or MSIX disabled */
>      bool msix_function_masked;
>      /* Version id needed for VMState */
> diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
> index 2a3d86f..f8ce01d 100644
> --- a/hw/virtio-pci.c
> +++ b/hw/virtio-pci.c
> @@ -137,9 +137,6 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f)
>      } else {
>          proxy->vdev->config_vector = VIRTIO_NO_VECTOR;
>      }
> -    if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) {
> -        return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector);
> -    }
>      return 0;
>  }
>  
> @@ -153,9 +150,6 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
>          vector = VIRTIO_NO_VECTOR;
>      }
>      virtio_queue_set_vector(proxy->vdev, n, vector);
> -    if (vector != VIRTIO_NO_VECTOR) {
> -        return msix_vector_use(&proxy->pci_dev, vector);
> -    }
>      return 0;
>  }
>  
> @@ -268,7 +262,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          if (pa == 0) {
>              virtio_pci_stop_ioeventfd(proxy);
>              virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> +            msix_clear_all_vectors(&proxy->pci_dev);
>          }
>          else
>              virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
> @@ -295,7 +289,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>  
>          if (vdev->status == 0) {
>              virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> +            msix_clear_all_vectors(&proxy->pci_dev);
>          }
>  
>          /* Linux before 2.6.34 sets the device as OK without enabling
> @@ -307,18 +301,20 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          }
>          break;
>      case VIRTIO_MSI_CONFIG_VECTOR:
> -        msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
> +        msix_clear_vector(&proxy->pci_dev, vdev->config_vector);
>          /* Make it possible for guest to discover an error took place. */
> -        if (msix_vector_use(&proxy->pci_dev, val) < 0)
> +        if (val >= vdev->nvectors) {
>              val = VIRTIO_NO_VECTOR;
> +        }
>          vdev->config_vector = val;
>          break;
>      case VIRTIO_MSI_QUEUE_VECTOR:
> -        msix_vector_unuse(&proxy->pci_dev,
> +        msix_clear_vector(&proxy->pci_dev,
>                            virtio_queue_vector(vdev, vdev->queue_sel));
>          /* Make it possible for guest to discover an error took place. */
> -        if (msix_vector_use(&proxy->pci_dev, val) < 0)
> +        if (val >= vdev->nvectors) {
>              val = VIRTIO_NO_VECTOR;
> +        }
>          virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>          break;
>      default:
> -- 
> 1.7.3.4
Jan Kiszka - Aug. 24, 2012, 8:21 a.m.
On 2012-08-24 10:20, Michael S. Tsirkin wrote:
> On Fri, Aug 24, 2012 at 08:19:50AM +0200, Jan Kiszka wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> This optimization was once used in qemu-kvm to keep KVM route usage low.
>> But now we solved that problem via lazy updates. It also tried to handle
>> the case of vectors shared between different sources of the same device.
>> However, this never really worked and will have to be addressed
>> differently anyway. So drop this obsolete interface.
>>
>> We still need interfaces to clear pending vectors though. Provide
>> msix_clear_vector and msix_clear_all_vectors for this.
>>
>> This also unbreaks MSI-X after reset for ivshmem and megasas as device
>> models can't easily mark their vectors used afterward (megasas didn't
>> even try).
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>
>> This patch has been posted some moons again, and we had a discussion
>> about the impact on the existing users. At that time, MSI-X refactoring
>> for KVM support was not yet merged. Now it is, and it should be better
>> much clearer that vector usage tracking has no business with that
>> feature.
>>
>>  hw/ivshmem.c    |   20 -------------------
>>  hw/megasas.c    |    4 ---
>>  hw/msix.c       |   57 ++++++++++++------------------------------------------
>>  hw/msix.h       |    5 +--
>>  hw/pci.h        |    2 -
>>  hw/virtio-pci.c |   20 +++++++-----------
>>  6 files changed, 23 insertions(+), 85 deletions(-)
>>
>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>> index 47f2a16..5ffff48 100644
>> --- a/hw/ivshmem.c
>> +++ b/hw/ivshmem.c
>> @@ -523,28 +523,11 @@ static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>>      return;
>>  }
>>  
>> -/* Select the MSI-X vectors used by device.
>> - * ivshmem maps events to vectors statically, so
>> - * we just enable all vectors on init and after reset. */
>> -static void ivshmem_use_msix(IVShmemState * s)
>> -{
>> -    int i;
>> -
>> -    if (!msix_present(&s->dev)) {
>> -        return;
>> -    }
>> -
>> -    for (i = 0; i < s->vectors; i++) {
>> -        msix_vector_use(&s->dev, i);
>> -    }
>> -}
>> -
>>  static void ivshmem_reset(DeviceState *d)
>>  {
>>      IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
>>  
>>      s->intrstatus = 0;
>> -    ivshmem_use_msix(s);
>>      return;
>>  }
>>  
>> @@ -586,8 +569,6 @@ static void ivshmem_setup_msi(IVShmemState * s)
>>  
>>      /* allocate QEMU char devices for receiving interrupts */
>>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
>> -
>> -    ivshmem_use_msix(s);
>>  }
>>  
>>  static void ivshmem_save(QEMUFile* f, void *opaque)
>> @@ -629,7 +610,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>>  
>>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>>          msix_load(&proxy->dev, f);
>> -	ivshmem_use_msix(proxy);
>>      } else {
>>          proxy->intrstatus = qemu_get_be32(f);
>>          proxy->intrmask = qemu_get_be32(f);
>> diff --git a/hw/megasas.c b/hw/megasas.c
>> index c35a15d..4766117 100644
>> --- a/hw/megasas.c
>> +++ b/hw/megasas.c
>> @@ -2121,10 +2121,6 @@ static int megasas_scsi_init(PCIDevice *dev)
>>      pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
>>      pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
>>  
>> -    if (megasas_use_msix(s)) {
>> -        msix_vector_use(&s->dev, 0);
>> -    }
>> -
>>      if (!s->sas_addr) {
>>          s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
>>                         IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
>> diff --git a/hw/msix.c b/hw/msix.c
>> index aea340b..163ce4c 100644
>> --- a/hw/msix.c
>> +++ b/hw/msix.c
>> @@ -273,7 +273,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>>  
>>      dev->msix_table = g_malloc0(table_size);
>>      dev->msix_pba = g_malloc0(pba_size);
>> -    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
>>  
>>      msix_mask_all(dev, nentries);
>>  
>> @@ -326,16 +325,6 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
>>      return 0;
>>  }
>>  
>> -static void msix_free_irq_entries(PCIDevice *dev)
>> -{
>> -    int vector;
>> -
>> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
>> -        dev->msix_entry_used[vector] = 0;
>> -        msix_clr_pending(dev, vector);
>> -    }
>> -}
>> -
>>  /* Clean up resources for the device. */
>>  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>>  {
>> @@ -344,7 +333,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>>      }
>>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
>>      dev->msix_cap = 0;
>> -    msix_free_irq_entries(dev);
>>      dev->msix_entries_nr = 0;
>>      memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
>>      memory_region_destroy(&dev->msix_pba_mmio);
>> @@ -354,8 +342,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>>      memory_region_destroy(&dev->msix_table_mmio);
>>      g_free(dev->msix_table);
>>      dev->msix_table = NULL;
>> -    g_free(dev->msix_entry_used);
>> -    dev->msix_entry_used = NULL;
>>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
>>      return;
>>  }
>> @@ -390,7 +376,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
>>          return;
>>      }
>>  
>> -    msix_free_irq_entries(dev);
>>      qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
>>      qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
>>      msix_update_function_masked(dev);
>> @@ -419,8 +404,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>>  {
>>      MSIMessage msg;
>>  
>> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
>> +    if (vector >= dev->msix_entries_nr) {
>>          return;
>> +    }
>>      if (msix_is_masked(dev, vector)) {
>>          msix_set_pending(dev, vector);
>>          return;
>> @@ -436,7 +422,7 @@ void msix_reset(PCIDevice *dev)
>>      if (!msix_present(dev)) {
>>          return;
>>      }
>> -    msix_free_irq_entries(dev);
>> +    msix_clear_all_vectors(dev);
>>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
>>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
>>      memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
>> @@ -444,41 +430,24 @@ void msix_reset(PCIDevice *dev)
>>      msix_mask_all(dev, dev->msix_entries_nr);
>>  }
>>  
>> -/* PCI spec suggests that devices make it possible for software to configure
>> - * less vectors than supported by the device, but does not specify a standard
>> - * mechanism for devices to do so.
>> - *
>> - * We support this by asking devices to declare vectors software is going to
>> - * actually use, and checking this on the notification path. Devices that
>> - * don't want to follow the spec suggestion can declare all vectors as used. */
>> -
>> -/* Mark vector as used. */
>> -int msix_vector_use(PCIDevice *dev, unsigned vector)
>> +/* Clear pending vector. */
>> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
>>  {
>> -    if (vector >= dev->msix_entries_nr)
>> -        return -EINVAL;
>> -    dev->msix_entry_used[vector]++;
>> -    return 0;
>> -}
>> -
> 
> I keep thinking we should instead call vector notifier
> here, so that virtio can detect and handle cases where
> kvm_virtio_pci_vq_vector_use fails.
> ATM it just asserts.
> 
> Maybe I am wrong.
> 
> Let's delay this decision until 1.3.

Yep, that feature can be added on top of this patch in 1.3, but we
should still remove the usage tracking to fix its issues. There is no
feature regression related to this.

Jan
Jan Kiszka - Aug. 24, 2012, 8:35 a.m.
On 2012-08-24 10:21, Jan Kiszka wrote:
> On 2012-08-24 10:20, Michael S. Tsirkin wrote:
>> On Fri, Aug 24, 2012 at 08:19:50AM +0200, Jan Kiszka wrote:
>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>
>>> This optimization was once used in qemu-kvm to keep KVM route usage low.
>>> But now we solved that problem via lazy updates. It also tried to handle
>>> the case of vectors shared between different sources of the same device.
>>> However, this never really worked and will have to be addressed
>>> differently anyway. So drop this obsolete interface.
>>>
>>> We still need interfaces to clear pending vectors though. Provide
>>> msix_clear_vector and msix_clear_all_vectors for this.
>>>
>>> This also unbreaks MSI-X after reset for ivshmem and megasas as device
>>> models can't easily mark their vectors used afterward (megasas didn't
>>> even try).
>>>
>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>> ---
>>>
>>> This patch has been posted some moons again, and we had a discussion
>>> about the impact on the existing users. At that time, MSI-X refactoring
>>> for KVM support was not yet merged. Now it is, and it should be better
>>> much clearer that vector usage tracking has no business with that
>>> feature.
>>>
>>>  hw/ivshmem.c    |   20 -------------------
>>>  hw/megasas.c    |    4 ---
>>>  hw/msix.c       |   57 ++++++++++++------------------------------------------
>>>  hw/msix.h       |    5 +--
>>>  hw/pci.h        |    2 -
>>>  hw/virtio-pci.c |   20 +++++++-----------
>>>  6 files changed, 23 insertions(+), 85 deletions(-)
>>>
>>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
>>> index 47f2a16..5ffff48 100644
>>> --- a/hw/ivshmem.c
>>> +++ b/hw/ivshmem.c
>>> @@ -523,28 +523,11 @@ static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>>>      return;
>>>  }
>>>  
>>> -/* Select the MSI-X vectors used by device.
>>> - * ivshmem maps events to vectors statically, so
>>> - * we just enable all vectors on init and after reset. */
>>> -static void ivshmem_use_msix(IVShmemState * s)
>>> -{
>>> -    int i;
>>> -
>>> -    if (!msix_present(&s->dev)) {
>>> -        return;
>>> -    }
>>> -
>>> -    for (i = 0; i < s->vectors; i++) {
>>> -        msix_vector_use(&s->dev, i);
>>> -    }
>>> -}
>>> -
>>>  static void ivshmem_reset(DeviceState *d)
>>>  {
>>>      IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
>>>  
>>>      s->intrstatus = 0;
>>> -    ivshmem_use_msix(s);
>>>      return;
>>>  }
>>>  
>>> @@ -586,8 +569,6 @@ static void ivshmem_setup_msi(IVShmemState * s)
>>>  
>>>      /* allocate QEMU char devices for receiving interrupts */
>>>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
>>> -
>>> -    ivshmem_use_msix(s);
>>>  }
>>>  
>>>  static void ivshmem_save(QEMUFile* f, void *opaque)
>>> @@ -629,7 +610,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>>>  
>>>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>>>          msix_load(&proxy->dev, f);
>>> -	ivshmem_use_msix(proxy);
>>>      } else {
>>>          proxy->intrstatus = qemu_get_be32(f);
>>>          proxy->intrmask = qemu_get_be32(f);
>>> diff --git a/hw/megasas.c b/hw/megasas.c
>>> index c35a15d..4766117 100644
>>> --- a/hw/megasas.c
>>> +++ b/hw/megasas.c
>>> @@ -2121,10 +2121,6 @@ static int megasas_scsi_init(PCIDevice *dev)
>>>      pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
>>>      pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
>>>  
>>> -    if (megasas_use_msix(s)) {
>>> -        msix_vector_use(&s->dev, 0);
>>> -    }
>>> -
>>>      if (!s->sas_addr) {
>>>          s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
>>>                         IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
>>> diff --git a/hw/msix.c b/hw/msix.c
>>> index aea340b..163ce4c 100644
>>> --- a/hw/msix.c
>>> +++ b/hw/msix.c
>>> @@ -273,7 +273,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>>>  
>>>      dev->msix_table = g_malloc0(table_size);
>>>      dev->msix_pba = g_malloc0(pba_size);
>>> -    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
>>>  
>>>      msix_mask_all(dev, nentries);
>>>  
>>> @@ -326,16 +325,6 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
>>>      return 0;
>>>  }
>>>  
>>> -static void msix_free_irq_entries(PCIDevice *dev)
>>> -{
>>> -    int vector;
>>> -
>>> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
>>> -        dev->msix_entry_used[vector] = 0;
>>> -        msix_clr_pending(dev, vector);
>>> -    }
>>> -}
>>> -
>>>  /* Clean up resources for the device. */
>>>  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>>>  {
>>> @@ -344,7 +333,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>>>      }
>>>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
>>>      dev->msix_cap = 0;
>>> -    msix_free_irq_entries(dev);
>>>      dev->msix_entries_nr = 0;
>>>      memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
>>>      memory_region_destroy(&dev->msix_pba_mmio);
>>> @@ -354,8 +342,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>>>      memory_region_destroy(&dev->msix_table_mmio);
>>>      g_free(dev->msix_table);
>>>      dev->msix_table = NULL;
>>> -    g_free(dev->msix_entry_used);
>>> -    dev->msix_entry_used = NULL;
>>>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
>>>      return;
>>>  }
>>> @@ -390,7 +376,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
>>>          return;
>>>      }
>>>  
>>> -    msix_free_irq_entries(dev);
>>>      qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
>>>      qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
>>>      msix_update_function_masked(dev);
>>> @@ -419,8 +404,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>>>  {
>>>      MSIMessage msg;
>>>  
>>> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
>>> +    if (vector >= dev->msix_entries_nr) {
>>>          return;
>>> +    }
>>>      if (msix_is_masked(dev, vector)) {
>>>          msix_set_pending(dev, vector);
>>>          return;
>>> @@ -436,7 +422,7 @@ void msix_reset(PCIDevice *dev)
>>>      if (!msix_present(dev)) {
>>>          return;
>>>      }
>>> -    msix_free_irq_entries(dev);
>>> +    msix_clear_all_vectors(dev);
>>>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
>>>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
>>>      memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
>>> @@ -444,41 +430,24 @@ void msix_reset(PCIDevice *dev)
>>>      msix_mask_all(dev, dev->msix_entries_nr);
>>>  }
>>>  
>>> -/* PCI spec suggests that devices make it possible for software to configure
>>> - * less vectors than supported by the device, but does not specify a standard
>>> - * mechanism for devices to do so.
>>> - *
>>> - * We support this by asking devices to declare vectors software is going to
>>> - * actually use, and checking this on the notification path. Devices that
>>> - * don't want to follow the spec suggestion can declare all vectors as used. */
>>> -
>>> -/* Mark vector as used. */
>>> -int msix_vector_use(PCIDevice *dev, unsigned vector)
>>> +/* Clear pending vector. */
>>> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
>>>  {
>>> -    if (vector >= dev->msix_entries_nr)
>>> -        return -EINVAL;
>>> -    dev->msix_entry_used[vector]++;
>>> -    return 0;
>>> -}
>>> -
>>
>> I keep thinking we should instead call vector notifier
>> here, so that virtio can detect and handle cases where
>> kvm_virtio_pci_vq_vector_use fails.
>> ATM it just asserts.
>>
>> Maybe I am wrong.
>>
>> Let's delay this decision until 1.3.
> 
> Yep, that feature can be added on top of this patch in 1.3, but we
> should still remove the usage tracking to fix its issues. There is no
> feature regression related to this.

OK, to address that virtio requirement (in 1.3) we can simply
preallocate an MSI route on VIRTIO_MSI_CONFIG/QUEUE_VECTOR and update
that route (I have a patch for kvm_irqchip_update_msi_route here
already) on vector_use.

The point is: That all is pure virtio business, nothing the generic
MSI-X layer nor any normal MSI-X device or even pci-assign/vfio have to
deal with.

Jan
Michael S. Tsirkin - Aug. 24, 2012, 8:37 a.m.
On Fri, Aug 24, 2012 at 10:21:44AM +0200, Jan Kiszka wrote:
> On 2012-08-24 10:20, Michael S. Tsirkin wrote:
> > On Fri, Aug 24, 2012 at 08:19:50AM +0200, Jan Kiszka wrote:
> >> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>
> >> This optimization was once used in qemu-kvm to keep KVM route usage low.
> >> But now we solved that problem via lazy updates. It also tried to handle
> >> the case of vectors shared between different sources of the same device.
> >> However, this never really worked and will have to be addressed
> >> differently anyway. So drop this obsolete interface.
> >>
> >> We still need interfaces to clear pending vectors though. Provide
> >> msix_clear_vector and msix_clear_all_vectors for this.
> >>
> >> This also unbreaks MSI-X after reset for ivshmem and megasas as device
> >> models can't easily mark their vectors used afterward (megasas didn't
> >> even try).
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> ---
> >>
> >> This patch has been posted some moons again, and we had a discussion
> >> about the impact on the existing users. At that time, MSI-X refactoring
> >> for KVM support was not yet merged. Now it is, and it should be better
> >> much clearer that vector usage tracking has no business with that
> >> feature.
> >>
> >>  hw/ivshmem.c    |   20 -------------------
> >>  hw/megasas.c    |    4 ---
> >>  hw/msix.c       |   57 ++++++++++++------------------------------------------
> >>  hw/msix.h       |    5 +--
> >>  hw/pci.h        |    2 -
> >>  hw/virtio-pci.c |   20 +++++++-----------
> >>  6 files changed, 23 insertions(+), 85 deletions(-)
> >>
> >> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> >> index 47f2a16..5ffff48 100644
> >> --- a/hw/ivshmem.c
> >> +++ b/hw/ivshmem.c
> >> @@ -523,28 +523,11 @@ static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> >>      return;
> >>  }
> >>  
> >> -/* Select the MSI-X vectors used by device.
> >> - * ivshmem maps events to vectors statically, so
> >> - * we just enable all vectors on init and after reset. */
> >> -static void ivshmem_use_msix(IVShmemState * s)
> >> -{
> >> -    int i;
> >> -
> >> -    if (!msix_present(&s->dev)) {
> >> -        return;
> >> -    }
> >> -
> >> -    for (i = 0; i < s->vectors; i++) {
> >> -        msix_vector_use(&s->dev, i);
> >> -    }
> >> -}
> >> -
> >>  static void ivshmem_reset(DeviceState *d)
> >>  {
> >>      IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
> >>  
> >>      s->intrstatus = 0;
> >> -    ivshmem_use_msix(s);
> >>      return;
> >>  }
> >>  
> >> @@ -586,8 +569,6 @@ static void ivshmem_setup_msi(IVShmemState * s)
> >>  
> >>      /* allocate QEMU char devices for receiving interrupts */
> >>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
> >> -
> >> -    ivshmem_use_msix(s);
> >>  }
> >>  
> >>  static void ivshmem_save(QEMUFile* f, void *opaque)
> >> @@ -629,7 +610,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
> >>  
> >>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
> >>          msix_load(&proxy->dev, f);
> >> -	ivshmem_use_msix(proxy);
> >>      } else {
> >>          proxy->intrstatus = qemu_get_be32(f);
> >>          proxy->intrmask = qemu_get_be32(f);
> >> diff --git a/hw/megasas.c b/hw/megasas.c
> >> index c35a15d..4766117 100644
> >> --- a/hw/megasas.c
> >> +++ b/hw/megasas.c
> >> @@ -2121,10 +2121,6 @@ static int megasas_scsi_init(PCIDevice *dev)
> >>      pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
> >>      pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
> >>  
> >> -    if (megasas_use_msix(s)) {
> >> -        msix_vector_use(&s->dev, 0);
> >> -    }
> >> -
> >>      if (!s->sas_addr) {
> >>          s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
> >>                         IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
> >> diff --git a/hw/msix.c b/hw/msix.c
> >> index aea340b..163ce4c 100644
> >> --- a/hw/msix.c
> >> +++ b/hw/msix.c
> >> @@ -273,7 +273,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> >>  
> >>      dev->msix_table = g_malloc0(table_size);
> >>      dev->msix_pba = g_malloc0(pba_size);
> >> -    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
> >>  
> >>      msix_mask_all(dev, nentries);
> >>  
> >> @@ -326,16 +325,6 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
> >>      return 0;
> >>  }
> >>  
> >> -static void msix_free_irq_entries(PCIDevice *dev)
> >> -{
> >> -    int vector;
> >> -
> >> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> >> -        dev->msix_entry_used[vector] = 0;
> >> -        msix_clr_pending(dev, vector);
> >> -    }
> >> -}
> >> -
> >>  /* Clean up resources for the device. */
> >>  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
> >>  {
> >> @@ -344,7 +333,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
> >>      }
> >>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
> >>      dev->msix_cap = 0;
> >> -    msix_free_irq_entries(dev);
> >>      dev->msix_entries_nr = 0;
> >>      memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
> >>      memory_region_destroy(&dev->msix_pba_mmio);
> >> @@ -354,8 +342,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
> >>      memory_region_destroy(&dev->msix_table_mmio);
> >>      g_free(dev->msix_table);
> >>      dev->msix_table = NULL;
> >> -    g_free(dev->msix_entry_used);
> >> -    dev->msix_entry_used = NULL;
> >>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
> >>      return;
> >>  }
> >> @@ -390,7 +376,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
> >>          return;
> >>      }
> >>  
> >> -    msix_free_irq_entries(dev);
> >>      qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> >>      qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
> >>      msix_update_function_masked(dev);
> >> @@ -419,8 +404,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
> >>  {
> >>      MSIMessage msg;
> >>  
> >> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
> >> +    if (vector >= dev->msix_entries_nr) {
> >>          return;
> >> +    }
> >>      if (msix_is_masked(dev, vector)) {
> >>          msix_set_pending(dev, vector);
> >>          return;
> >> @@ -436,7 +422,7 @@ void msix_reset(PCIDevice *dev)
> >>      if (!msix_present(dev)) {
> >>          return;
> >>      }
> >> -    msix_free_irq_entries(dev);
> >> +    msix_clear_all_vectors(dev);
> >>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
> >>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
> >>      memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> >> @@ -444,41 +430,24 @@ void msix_reset(PCIDevice *dev)
> >>      msix_mask_all(dev, dev->msix_entries_nr);
> >>  }
> >>  
> >> -/* PCI spec suggests that devices make it possible for software to configure
> >> - * less vectors than supported by the device, but does not specify a standard
> >> - * mechanism for devices to do so.
> >> - *
> >> - * We support this by asking devices to declare vectors software is going to
> >> - * actually use, and checking this on the notification path. Devices that
> >> - * don't want to follow the spec suggestion can declare all vectors as used. */
> >> -
> >> -/* Mark vector as used. */
> >> -int msix_vector_use(PCIDevice *dev, unsigned vector)
> >> +/* Clear pending vector. */
> >> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
> >>  {
> >> -    if (vector >= dev->msix_entries_nr)
> >> -        return -EINVAL;
> >> -    dev->msix_entry_used[vector]++;
> >> -    return 0;
> >> -}
> >> -
> > 
> > I keep thinking we should instead call vector notifier
> > here, so that virtio can detect and handle cases where
> > kvm_virtio_pci_vq_vector_use fails.
> > ATM it just asserts.
> > 
> > Maybe I am wrong.
> > 
> > Let's delay this decision until 1.3.
> 
> Yep, that feature can be added on top of this patch in 1.3, but we
> should still remove the usage tracking to fix its issues. There is no
> feature regression related to this.
> 
> Jan
> 
> 

I'd like to keep it for virtio and remove for everyone else.
Draft patch sent, it's weekend here so consider it
pseudo-code, pls take a look.
Michael S. Tsirkin - Aug. 24, 2012, 8:39 a.m.
On Fri, Aug 24, 2012 at 10:35:34AM +0200, Jan Kiszka wrote:
> On 2012-08-24 10:21, Jan Kiszka wrote:
> > On 2012-08-24 10:20, Michael S. Tsirkin wrote:
> >> On Fri, Aug 24, 2012 at 08:19:50AM +0200, Jan Kiszka wrote:
> >>> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>>
> >>> This optimization was once used in qemu-kvm to keep KVM route usage low.
> >>> But now we solved that problem via lazy updates. It also tried to handle
> >>> the case of vectors shared between different sources of the same device.
> >>> However, this never really worked and will have to be addressed
> >>> differently anyway. So drop this obsolete interface.
> >>>
> >>> We still need interfaces to clear pending vectors though. Provide
> >>> msix_clear_vector and msix_clear_all_vectors for this.
> >>>
> >>> This also unbreaks MSI-X after reset for ivshmem and megasas as device
> >>> models can't easily mark their vectors used afterward (megasas didn't
> >>> even try).
> >>>
> >>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>> ---
> >>>
> >>> This patch has been posted some moons again, and we had a discussion
> >>> about the impact on the existing users. At that time, MSI-X refactoring
> >>> for KVM support was not yet merged. Now it is, and it should be better
> >>> much clearer that vector usage tracking has no business with that
> >>> feature.
> >>>
> >>>  hw/ivshmem.c    |   20 -------------------
> >>>  hw/megasas.c    |    4 ---
> >>>  hw/msix.c       |   57 ++++++++++++------------------------------------------
> >>>  hw/msix.h       |    5 +--
> >>>  hw/pci.h        |    2 -
> >>>  hw/virtio-pci.c |   20 +++++++-----------
> >>>  6 files changed, 23 insertions(+), 85 deletions(-)
> >>>
> >>> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> >>> index 47f2a16..5ffff48 100644
> >>> --- a/hw/ivshmem.c
> >>> +++ b/hw/ivshmem.c
> >>> @@ -523,28 +523,11 @@ static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
> >>>      return;
> >>>  }
> >>>  
> >>> -/* Select the MSI-X vectors used by device.
> >>> - * ivshmem maps events to vectors statically, so
> >>> - * we just enable all vectors on init and after reset. */
> >>> -static void ivshmem_use_msix(IVShmemState * s)
> >>> -{
> >>> -    int i;
> >>> -
> >>> -    if (!msix_present(&s->dev)) {
> >>> -        return;
> >>> -    }
> >>> -
> >>> -    for (i = 0; i < s->vectors; i++) {
> >>> -        msix_vector_use(&s->dev, i);
> >>> -    }
> >>> -}
> >>> -
> >>>  static void ivshmem_reset(DeviceState *d)
> >>>  {
> >>>      IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
> >>>  
> >>>      s->intrstatus = 0;
> >>> -    ivshmem_use_msix(s);
> >>>      return;
> >>>  }
> >>>  
> >>> @@ -586,8 +569,6 @@ static void ivshmem_setup_msi(IVShmemState * s)
> >>>  
> >>>      /* allocate QEMU char devices for receiving interrupts */
> >>>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
> >>> -
> >>> -    ivshmem_use_msix(s);
> >>>  }
> >>>  
> >>>  static void ivshmem_save(QEMUFile* f, void *opaque)
> >>> @@ -629,7 +610,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
> >>>  
> >>>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
> >>>          msix_load(&proxy->dev, f);
> >>> -	ivshmem_use_msix(proxy);
> >>>      } else {
> >>>          proxy->intrstatus = qemu_get_be32(f);
> >>>          proxy->intrmask = qemu_get_be32(f);
> >>> diff --git a/hw/megasas.c b/hw/megasas.c
> >>> index c35a15d..4766117 100644
> >>> --- a/hw/megasas.c
> >>> +++ b/hw/megasas.c
> >>> @@ -2121,10 +2121,6 @@ static int megasas_scsi_init(PCIDevice *dev)
> >>>      pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
> >>>      pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
> >>>  
> >>> -    if (megasas_use_msix(s)) {
> >>> -        msix_vector_use(&s->dev, 0);
> >>> -    }
> >>> -
> >>>      if (!s->sas_addr) {
> >>>          s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
> >>>                         IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
> >>> diff --git a/hw/msix.c b/hw/msix.c
> >>> index aea340b..163ce4c 100644
> >>> --- a/hw/msix.c
> >>> +++ b/hw/msix.c
> >>> @@ -273,7 +273,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> >>>  
> >>>      dev->msix_table = g_malloc0(table_size);
> >>>      dev->msix_pba = g_malloc0(pba_size);
> >>> -    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
> >>>  
> >>>      msix_mask_all(dev, nentries);
> >>>  
> >>> @@ -326,16 +325,6 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
> >>>      return 0;
> >>>  }
> >>>  
> >>> -static void msix_free_irq_entries(PCIDevice *dev)
> >>> -{
> >>> -    int vector;
> >>> -
> >>> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> >>> -        dev->msix_entry_used[vector] = 0;
> >>> -        msix_clr_pending(dev, vector);
> >>> -    }
> >>> -}
> >>> -
> >>>  /* Clean up resources for the device. */
> >>>  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
> >>>  {
> >>> @@ -344,7 +333,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
> >>>      }
> >>>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
> >>>      dev->msix_cap = 0;
> >>> -    msix_free_irq_entries(dev);
> >>>      dev->msix_entries_nr = 0;
> >>>      memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
> >>>      memory_region_destroy(&dev->msix_pba_mmio);
> >>> @@ -354,8 +342,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
> >>>      memory_region_destroy(&dev->msix_table_mmio);
> >>>      g_free(dev->msix_table);
> >>>      dev->msix_table = NULL;
> >>> -    g_free(dev->msix_entry_used);
> >>> -    dev->msix_entry_used = NULL;
> >>>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
> >>>      return;
> >>>  }
> >>> @@ -390,7 +376,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
> >>>          return;
> >>>      }
> >>>  
> >>> -    msix_free_irq_entries(dev);
> >>>      qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> >>>      qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
> >>>      msix_update_function_masked(dev);
> >>> @@ -419,8 +404,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
> >>>  {
> >>>      MSIMessage msg;
> >>>  
> >>> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
> >>> +    if (vector >= dev->msix_entries_nr) {
> >>>          return;
> >>> +    }
> >>>      if (msix_is_masked(dev, vector)) {
> >>>          msix_set_pending(dev, vector);
> >>>          return;
> >>> @@ -436,7 +422,7 @@ void msix_reset(PCIDevice *dev)
> >>>      if (!msix_present(dev)) {
> >>>          return;
> >>>      }
> >>> -    msix_free_irq_entries(dev);
> >>> +    msix_clear_all_vectors(dev);
> >>>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
> >>>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
> >>>      memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> >>> @@ -444,41 +430,24 @@ void msix_reset(PCIDevice *dev)
> >>>      msix_mask_all(dev, dev->msix_entries_nr);
> >>>  }
> >>>  
> >>> -/* PCI spec suggests that devices make it possible for software to configure
> >>> - * less vectors than supported by the device, but does not specify a standard
> >>> - * mechanism for devices to do so.
> >>> - *
> >>> - * We support this by asking devices to declare vectors software is going to
> >>> - * actually use, and checking this on the notification path. Devices that
> >>> - * don't want to follow the spec suggestion can declare all vectors as used. */
> >>> -
> >>> -/* Mark vector as used. */
> >>> -int msix_vector_use(PCIDevice *dev, unsigned vector)
> >>> +/* Clear pending vector. */
> >>> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
> >>>  {
> >>> -    if (vector >= dev->msix_entries_nr)
> >>> -        return -EINVAL;
> >>> -    dev->msix_entry_used[vector]++;
> >>> -    return 0;
> >>> -}
> >>> -
> >>
> >> I keep thinking we should instead call vector notifier
> >> here, so that virtio can detect and handle cases where
> >> kvm_virtio_pci_vq_vector_use fails.
> >> ATM it just asserts.
> >>
> >> Maybe I am wrong.
> >>
> >> Let's delay this decision until 1.3.
> > 
> > Yep, that feature can be added on top of this patch in 1.3, but we
> > should still remove the usage tracking to fix its issues. There is no
> > feature regression related to this.
> 
> OK, to address that virtio requirement (in 1.3) we can simply
> preallocate an MSI route on VIRTIO_MSI_CONFIG/QUEUE_VECTOR and update
> that route (I have a patch for kvm_irqchip_update_msi_route here
> already) on vector_use.
> 
> The point is: That all is pure virtio business, nothing the generic
> MSI-X layer nor any normal MSI-X device or even pci-assign/vfio have to
> deal with.
> 
> Jan
> 

Yes, I agree with that last statement.
Cam Macdonell - Aug. 24, 2012, 5:47 p.m.
On Fri, Aug 24, 2012 at 12:19 AM, Jan Kiszka <jan.kiszka@web.de> wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> This optimization was once used in qemu-kvm to keep KVM route usage low.
> But now we solved that problem via lazy updates. It also tried to handle
> the case of vectors shared between different sources of the same device.
> However, this never really worked and will have to be addressed
> differently anyway. So drop this obsolete interface.
>
> We still need interfaces to clear pending vectors though. Provide
> msix_clear_vector and msix_clear_all_vectors for this.
>
> This also unbreaks MSI-X after reset for ivshmem and megasas as device
> models can't easily mark their vectors used afterward (megasas didn't
> even try).
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Tested-by: Cam Macdonell <cam@cs.ualberta.ca>
> ---
>
> This patch has been posted some moons again, and we had a discussion
> about the impact on the existing users. At that time, MSI-X refactoring
> for KVM support was not yet merged. Now it is, and it should be better
> much clearer that vector usage tracking has no business with that
> feature.
>
>  hw/ivshmem.c    |   20 -------------------
>  hw/megasas.c    |    4 ---
>  hw/msix.c       |   57 ++++++++++++------------------------------------------
>  hw/msix.h       |    5 +--
>  hw/pci.h        |    2 -
>  hw/virtio-pci.c |   20 +++++++-----------
>  6 files changed, 23 insertions(+), 85 deletions(-)
>
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> index 47f2a16..5ffff48 100644
> --- a/hw/ivshmem.c
> +++ b/hw/ivshmem.c
> @@ -523,28 +523,11 @@ static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
>      return;
>  }
>
> -/* Select the MSI-X vectors used by device.
> - * ivshmem maps events to vectors statically, so
> - * we just enable all vectors on init and after reset. */
> -static void ivshmem_use_msix(IVShmemState * s)
> -{
> -    int i;
> -
> -    if (!msix_present(&s->dev)) {
> -        return;
> -    }
> -
> -    for (i = 0; i < s->vectors; i++) {
> -        msix_vector_use(&s->dev, i);
> -    }
> -}
> -
>  static void ivshmem_reset(DeviceState *d)
>  {
>      IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
>
>      s->intrstatus = 0;
> -    ivshmem_use_msix(s);
>      return;
>  }
>
> @@ -586,8 +569,6 @@ static void ivshmem_setup_msi(IVShmemState * s)
>
>      /* allocate QEMU char devices for receiving interrupts */
>      s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
> -
> -    ivshmem_use_msix(s);
>  }
>
>  static void ivshmem_save(QEMUFile* f, void *opaque)
> @@ -629,7 +610,6 @@ static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
>
>      if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
>          msix_load(&proxy->dev, f);
> -       ivshmem_use_msix(proxy);
>      } else {
>          proxy->intrstatus = qemu_get_be32(f);
>          proxy->intrmask = qemu_get_be32(f);
> diff --git a/hw/megasas.c b/hw/megasas.c
> index c35a15d..4766117 100644
> --- a/hw/megasas.c
> +++ b/hw/megasas.c
> @@ -2121,10 +2121,6 @@ static int megasas_scsi_init(PCIDevice *dev)
>      pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
>      pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
>
> -    if (megasas_use_msix(s)) {
> -        msix_vector_use(&s->dev, 0);
> -    }
> -
>      if (!s->sas_addr) {
>          s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
>                         IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
> diff --git a/hw/msix.c b/hw/msix.c
> index aea340b..163ce4c 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -273,7 +273,6 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>
>      dev->msix_table = g_malloc0(table_size);
>      dev->msix_pba = g_malloc0(pba_size);
> -    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
>
>      msix_mask_all(dev, nentries);
>
> @@ -326,16 +325,6 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
>      return 0;
>  }
>
> -static void msix_free_irq_entries(PCIDevice *dev)
> -{
> -    int vector;
> -
> -    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> -        dev->msix_entry_used[vector] = 0;
> -        msix_clr_pending(dev, vector);
> -    }
> -}
> -
>  /* Clean up resources for the device. */
>  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>  {
> @@ -344,7 +333,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>      }
>      pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
>      dev->msix_cap = 0;
> -    msix_free_irq_entries(dev);
>      dev->msix_entries_nr = 0;
>      memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
>      memory_region_destroy(&dev->msix_pba_mmio);
> @@ -354,8 +342,6 @@ void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
>      memory_region_destroy(&dev->msix_table_mmio);
>      g_free(dev->msix_table);
>      dev->msix_table = NULL;
> -    g_free(dev->msix_entry_used);
> -    dev->msix_entry_used = NULL;
>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
>      return;
>  }
> @@ -390,7 +376,6 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
>          return;
>      }
>
> -    msix_free_irq_entries(dev);
>      qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
>      qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
>      msix_update_function_masked(dev);
> @@ -419,8 +404,9 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>  {
>      MSIMessage msg;
>
> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
> +    if (vector >= dev->msix_entries_nr) {
>          return;
> +    }
>      if (msix_is_masked(dev, vector)) {
>          msix_set_pending(dev, vector);
>          return;
> @@ -436,7 +422,7 @@ void msix_reset(PCIDevice *dev)
>      if (!msix_present(dev)) {
>          return;
>      }
> -    msix_free_irq_entries(dev);
> +    msix_clear_all_vectors(dev);
>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
>             ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
>      memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> @@ -444,41 +430,24 @@ void msix_reset(PCIDevice *dev)
>      msix_mask_all(dev, dev->msix_entries_nr);
>  }
>
> -/* PCI spec suggests that devices make it possible for software to configure
> - * less vectors than supported by the device, but does not specify a standard
> - * mechanism for devices to do so.
> - *
> - * We support this by asking devices to declare vectors software is going to
> - * actually use, and checking this on the notification path. Devices that
> - * don't want to follow the spec suggestion can declare all vectors as used. */
> -
> -/* Mark vector as used. */
> -int msix_vector_use(PCIDevice *dev, unsigned vector)
> +/* Clear pending vector. */
> +void msix_clear_vector(PCIDevice *dev, unsigned vector)
>  {
> -    if (vector >= dev->msix_entries_nr)
> -        return -EINVAL;
> -    dev->msix_entry_used[vector]++;
> -    return 0;
> -}
> -
> -/* Mark vector as unused. */
> -void msix_vector_unuse(PCIDevice *dev, unsigned vector)
> -{
> -    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
> -        return;
> -    }
> -    if (--dev->msix_entry_used[vector]) {
> -        return;
> +    if (msix_present(dev) && vector < dev->msix_entries_nr) {
> +        msix_clr_pending(dev, vector);
>      }
> -    msix_clr_pending(dev, vector);
>  }
>
> -void msix_unuse_all_vectors(PCIDevice *dev)
> +void msix_clear_all_vectors(PCIDevice *dev)
>  {
> +    unsigned int vector;
> +
>      if (!msix_present(dev)) {
>          return;
>      }
> -    msix_free_irq_entries(dev);
> +    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> +        msix_clr_pending(dev, vector);
> +    }
>  }
>
>  unsigned int msix_nr_vectors_allocated(const PCIDevice *dev)
> diff --git a/hw/msix.h b/hw/msix.h
> index 15211cb..5c2512a 100644
> --- a/hw/msix.h
> +++ b/hw/msix.h
> @@ -26,9 +26,8 @@ void msix_load(PCIDevice *dev, QEMUFile *f);
>  int msix_enabled(PCIDevice *dev);
>  int msix_present(PCIDevice *dev);
>
> -int msix_vector_use(PCIDevice *dev, unsigned vector);
> -void msix_vector_unuse(PCIDevice *dev, unsigned vector);
> -void msix_unuse_all_vectors(PCIDevice *dev);
> +void msix_clear_vector(PCIDevice *dev, unsigned vector);
> +void msix_clear_all_vectors(PCIDevice *dev);
>
>  void msix_notify(PCIDevice *dev, unsigned vector);
>
> diff --git a/hw/pci.h b/hw/pci.h
> index 4b6ab3d..1b07450 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -240,8 +240,6 @@ struct PCIDevice {
>      /* Memory Regions for MSIX table and pending bit entries. */
>      MemoryRegion msix_table_mmio;
>      MemoryRegion msix_pba_mmio;
> -    /* Reference-count for entries actually in use by driver. */
> -    unsigned *msix_entry_used;
>      /* MSIX function mask set or MSIX disabled */
>      bool msix_function_masked;
>      /* Version id needed for VMState */
> diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
> index 2a3d86f..f8ce01d 100644
> --- a/hw/virtio-pci.c
> +++ b/hw/virtio-pci.c
> @@ -137,9 +137,6 @@ static int virtio_pci_load_config(void * opaque, QEMUFile *f)
>      } else {
>          proxy->vdev->config_vector = VIRTIO_NO_VECTOR;
>      }
> -    if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) {
> -        return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector);
> -    }
>      return 0;
>  }
>
> @@ -153,9 +150,6 @@ static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
>          vector = VIRTIO_NO_VECTOR;
>      }
>      virtio_queue_set_vector(proxy->vdev, n, vector);
> -    if (vector != VIRTIO_NO_VECTOR) {
> -        return msix_vector_use(&proxy->pci_dev, vector);
> -    }
>      return 0;
>  }
>
> @@ -268,7 +262,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          if (pa == 0) {
>              virtio_pci_stop_ioeventfd(proxy);
>              virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> +            msix_clear_all_vectors(&proxy->pci_dev);
>          }
>          else
>              virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
> @@ -295,7 +289,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>
>          if (vdev->status == 0) {
>              virtio_reset(proxy->vdev);
> -            msix_unuse_all_vectors(&proxy->pci_dev);
> +            msix_clear_all_vectors(&proxy->pci_dev);
>          }
>
>          /* Linux before 2.6.34 sets the device as OK without enabling
> @@ -307,18 +301,20 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
>          }
>          break;
>      case VIRTIO_MSI_CONFIG_VECTOR:
> -        msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
> +        msix_clear_vector(&proxy->pci_dev, vdev->config_vector);
>          /* Make it possible for guest to discover an error took place. */
> -        if (msix_vector_use(&proxy->pci_dev, val) < 0)
> +        if (val >= vdev->nvectors) {
>              val = VIRTIO_NO_VECTOR;
> +        }
>          vdev->config_vector = val;
>          break;
>      case VIRTIO_MSI_QUEUE_VECTOR:
> -        msix_vector_unuse(&proxy->pci_dev,
> +        msix_clear_vector(&proxy->pci_dev,
>                            virtio_queue_vector(vdev, vdev->queue_sel));
>          /* Make it possible for guest to discover an error took place. */
> -        if (msix_vector_use(&proxy->pci_dev, val) < 0)
> +        if (val >= vdev->nvectors) {
>              val = VIRTIO_NO_VECTOR;
> +        }
>          virtio_queue_set_vector(vdev, vdev->queue_sel, val);
>          break;
>      default:
> --
> 1.7.3.4
>

Patch

diff --git a/hw/ivshmem.c b/hw/ivshmem.c
index 47f2a16..5ffff48 100644
--- a/hw/ivshmem.c
+++ b/hw/ivshmem.c
@@ -523,28 +523,11 @@  static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
     return;
 }
 
-/* Select the MSI-X vectors used by device.
- * ivshmem maps events to vectors statically, so
- * we just enable all vectors on init and after reset. */
-static void ivshmem_use_msix(IVShmemState * s)
-{
-    int i;
-
-    if (!msix_present(&s->dev)) {
-        return;
-    }
-
-    for (i = 0; i < s->vectors; i++) {
-        msix_vector_use(&s->dev, i);
-    }
-}
-
 static void ivshmem_reset(DeviceState *d)
 {
     IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
 
     s->intrstatus = 0;
-    ivshmem_use_msix(s);
     return;
 }
 
@@ -586,8 +569,6 @@  static void ivshmem_setup_msi(IVShmemState * s)
 
     /* allocate QEMU char devices for receiving interrupts */
     s->eventfd_table = g_malloc0(s->vectors * sizeof(EventfdEntry));
-
-    ivshmem_use_msix(s);
 }
 
 static void ivshmem_save(QEMUFile* f, void *opaque)
@@ -629,7 +610,6 @@  static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
 
     if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
         msix_load(&proxy->dev, f);
-	ivshmem_use_msix(proxy);
     } else {
         proxy->intrstatus = qemu_get_be32(f);
         proxy->intrmask = qemu_get_be32(f);
diff --git a/hw/megasas.c b/hw/megasas.c
index c35a15d..4766117 100644
--- a/hw/megasas.c
+++ b/hw/megasas.c
@@ -2121,10 +2121,6 @@  static int megasas_scsi_init(PCIDevice *dev)
     pci_register_bar(&s->dev, 2, PCI_BASE_ADDRESS_SPACE_IO, &s->port_io);
     pci_register_bar(&s->dev, 3, bar_type, &s->queue_io);
 
-    if (megasas_use_msix(s)) {
-        msix_vector_use(&s->dev, 0);
-    }
-
     if (!s->sas_addr) {
         s->sas_addr = ((NAA_LOCALLY_ASSIGNED_ID << 24) |
                        IEEE_COMPANY_LOCALLY_ASSIGNED) << 36;
diff --git a/hw/msix.c b/hw/msix.c
index aea340b..163ce4c 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -273,7 +273,6 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
 
     dev->msix_table = g_malloc0(table_size);
     dev->msix_pba = g_malloc0(pba_size);
-    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
 
     msix_mask_all(dev, nentries);
 
@@ -326,16 +325,6 @@  int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
     return 0;
 }
 
-static void msix_free_irq_entries(PCIDevice *dev)
-{
-    int vector;
-
-    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
-        dev->msix_entry_used[vector] = 0;
-        msix_clr_pending(dev, vector);
-    }
-}
-
 /* Clean up resources for the device. */
 void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
 {
@@ -344,7 +333,6 @@  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
     }
     pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
     dev->msix_cap = 0;
-    msix_free_irq_entries(dev);
     dev->msix_entries_nr = 0;
     memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
     memory_region_destroy(&dev->msix_pba_mmio);
@@ -354,8 +342,6 @@  void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
     memory_region_destroy(&dev->msix_table_mmio);
     g_free(dev->msix_table);
     dev->msix_table = NULL;
-    g_free(dev->msix_entry_used);
-    dev->msix_entry_used = NULL;
     dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
     return;
 }
@@ -390,7 +376,6 @@  void msix_load(PCIDevice *dev, QEMUFile *f)
         return;
     }
 
-    msix_free_irq_entries(dev);
     qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
     qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
     msix_update_function_masked(dev);
@@ -419,8 +404,9 @@  void msix_notify(PCIDevice *dev, unsigned vector)
 {
     MSIMessage msg;
 
-    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
+    if (vector >= dev->msix_entries_nr) {
         return;
+    }
     if (msix_is_masked(dev, vector)) {
         msix_set_pending(dev, vector);
         return;
@@ -436,7 +422,7 @@  void msix_reset(PCIDevice *dev)
     if (!msix_present(dev)) {
         return;
     }
-    msix_free_irq_entries(dev);
+    msix_clear_all_vectors(dev);
     dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
 	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
     memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
@@ -444,41 +430,24 @@  void msix_reset(PCIDevice *dev)
     msix_mask_all(dev, dev->msix_entries_nr);
 }
 
-/* PCI spec suggests that devices make it possible for software to configure
- * less vectors than supported by the device, but does not specify a standard
- * mechanism for devices to do so.
- *
- * We support this by asking devices to declare vectors software is going to
- * actually use, and checking this on the notification path. Devices that
- * don't want to follow the spec suggestion can declare all vectors as used. */
-
-/* Mark vector as used. */
-int msix_vector_use(PCIDevice *dev, unsigned vector)
+/* Clear pending vector. */
+void msix_clear_vector(PCIDevice *dev, unsigned vector)
 {
-    if (vector >= dev->msix_entries_nr)
-        return -EINVAL;
-    dev->msix_entry_used[vector]++;
-    return 0;
-}
-
-/* Mark vector as unused. */
-void msix_vector_unuse(PCIDevice *dev, unsigned vector)
-{
-    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
-        return;
-    }
-    if (--dev->msix_entry_used[vector]) {
-        return;
+    if (msix_present(dev) && vector < dev->msix_entries_nr) {
+        msix_clr_pending(dev, vector);
     }
-    msix_clr_pending(dev, vector);
 }
 
-void msix_unuse_all_vectors(PCIDevice *dev)
+void msix_clear_all_vectors(PCIDevice *dev)
 {
+    unsigned int vector;
+
     if (!msix_present(dev)) {
         return;
     }
-    msix_free_irq_entries(dev);
+    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
+        msix_clr_pending(dev, vector);
+    }
 }
 
 unsigned int msix_nr_vectors_allocated(const PCIDevice *dev)
diff --git a/hw/msix.h b/hw/msix.h
index 15211cb..5c2512a 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -26,9 +26,8 @@  void msix_load(PCIDevice *dev, QEMUFile *f);
 int msix_enabled(PCIDevice *dev);
 int msix_present(PCIDevice *dev);
 
-int msix_vector_use(PCIDevice *dev, unsigned vector);
-void msix_vector_unuse(PCIDevice *dev, unsigned vector);
-void msix_unuse_all_vectors(PCIDevice *dev);
+void msix_clear_vector(PCIDevice *dev, unsigned vector);
+void msix_clear_all_vectors(PCIDevice *dev);
 
 void msix_notify(PCIDevice *dev, unsigned vector);
 
diff --git a/hw/pci.h b/hw/pci.h
index 4b6ab3d..1b07450 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -240,8 +240,6 @@  struct PCIDevice {
     /* Memory Regions for MSIX table and pending bit entries. */
     MemoryRegion msix_table_mmio;
     MemoryRegion msix_pba_mmio;
-    /* Reference-count for entries actually in use by driver. */
-    unsigned *msix_entry_used;
     /* MSIX function mask set or MSIX disabled */
     bool msix_function_masked;
     /* Version id needed for VMState */
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 2a3d86f..f8ce01d 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -137,9 +137,6 @@  static int virtio_pci_load_config(void * opaque, QEMUFile *f)
     } else {
         proxy->vdev->config_vector = VIRTIO_NO_VECTOR;
     }
-    if (proxy->vdev->config_vector != VIRTIO_NO_VECTOR) {
-        return msix_vector_use(&proxy->pci_dev, proxy->vdev->config_vector);
-    }
     return 0;
 }
 
@@ -153,9 +150,6 @@  static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
         vector = VIRTIO_NO_VECTOR;
     }
     virtio_queue_set_vector(proxy->vdev, n, vector);
-    if (vector != VIRTIO_NO_VECTOR) {
-        return msix_vector_use(&proxy->pci_dev, vector);
-    }
     return 0;
 }
 
@@ -268,7 +262,7 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         if (pa == 0) {
             virtio_pci_stop_ioeventfd(proxy);
             virtio_reset(proxy->vdev);
-            msix_unuse_all_vectors(&proxy->pci_dev);
+            msix_clear_all_vectors(&proxy->pci_dev);
         }
         else
             virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
@@ -295,7 +289,7 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 
         if (vdev->status == 0) {
             virtio_reset(proxy->vdev);
-            msix_unuse_all_vectors(&proxy->pci_dev);
+            msix_clear_all_vectors(&proxy->pci_dev);
         }
 
         /* Linux before 2.6.34 sets the device as OK without enabling
@@ -307,18 +301,20 @@  static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
         }
         break;
     case VIRTIO_MSI_CONFIG_VECTOR:
-        msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
+        msix_clear_vector(&proxy->pci_dev, vdev->config_vector);
         /* Make it possible for guest to discover an error took place. */
-        if (msix_vector_use(&proxy->pci_dev, val) < 0)
+        if (val >= vdev->nvectors) {
             val = VIRTIO_NO_VECTOR;
+        }
         vdev->config_vector = val;
         break;
     case VIRTIO_MSI_QUEUE_VECTOR:
-        msix_vector_unuse(&proxy->pci_dev,
+        msix_clear_vector(&proxy->pci_dev,
                           virtio_queue_vector(vdev, vdev->queue_sel));
         /* Make it possible for guest to discover an error took place. */
-        if (msix_vector_use(&proxy->pci_dev, val) < 0)
+        if (val >= vdev->nvectors) {
             val = VIRTIO_NO_VECTOR;
+        }
         virtio_queue_set_vector(vdev, vdev->queue_sel, val);
         break;
     default: