Patchwork [v2,4/6] msix: Split PBA into it's own MemoryRegion

login
register
mail settings
Submitter Alex Williamson
Date June 14, 2012, 4:51 a.m.
Message ID <20120614045146.11034.95991.stgit@bling.home>
Download mbox | patch
Permalink /patch/164812/
State New
Headers show

Comments

Alex Williamson - June 14, 2012, 4:51 a.m.
These don't have to be contiguous.  Size them to only what
they need and use separate MemoryRegions for the vector
table and PBA.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/msix.c |  116 ++++++++++++++++++++++++++++++++++++++-----------------------
 hw/pci.h  |   15 ++++++--
 2 files changed, 83 insertions(+), 48 deletions(-)
Jan Kiszka - June 14, 2012, 6:13 a.m.
On 2012-06-14 06:51, Alex Williamson wrote:
> These don't have to be contiguous.  Size them to only what
> they need and use separate MemoryRegions for the vector
> table and PBA.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  hw/msix.c |  116 ++++++++++++++++++++++++++++++++++++++-----------------------
>  hw/pci.h  |   15 ++++++--
>  2 files changed, 83 insertions(+), 48 deletions(-)
> 
> diff --git a/hw/msix.c b/hw/msix.c
> index a4cdfb0..d476d07 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -37,7 +37,7 @@
>  
>  static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>  {
> -    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> +    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
>      MSIMessage msg;
>  
>      msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
> @@ -86,16 +86,6 @@ static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
>      return 0;
>  }
>  
> -static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
> -                               unsigned size)
> -{
> -    PCIDevice *dev = opaque;
> -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> -    void *page = dev->msix_table_page;
> -
> -    return pci_get_long(page + offset);
> -}
> -
>  static uint8_t msix_pending_mask(int vector)
>  {
>      return 1 << (vector % 8);
> @@ -103,7 +93,7 @@ static uint8_t msix_pending_mask(int vector)
>  
>  static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
>  {
> -    return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
> +    return dev->msix_pba + vector / 8;
>  }
>  
>  static int msix_is_pending(PCIDevice *dev, int vector)
> @@ -124,7 +114,7 @@ static void msix_clr_pending(PCIDevice *dev, int vector)
>  static bool msix_vector_masked(PCIDevice *dev, int vector, bool fmask)
>  {
>      unsigned offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
> -    return fmask || dev->msix_table_page[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
> +    return fmask || dev->msix_table[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
>  }
>  
>  static bool msix_is_masked(PCIDevice *dev, int vector)
> @@ -203,27 +193,46 @@ void msix_write_config(PCIDevice *dev, uint32_t addr,
>      }
>  }
>  
> -static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
> -                            uint64_t val, unsigned size)
> +static uint64_t msix_table_mmio_read(void *opaque, target_phys_addr_t addr,
> +                                     unsigned size)
>  {
>      PCIDevice *dev = opaque;
> -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> -    int vector = offset / PCI_MSIX_ENTRY_SIZE;
> -    bool was_masked;
>  
> -    /* MSI-X page includes a read-only PBA and a writeable Vector Control. */
> -    if (vector >= dev->msix_entries_nr) {
> -        return;
> -    }
> +    return pci_get_long(dev->msix_table + addr);
> +}
> +
> +static void msix_table_mmio_write(void *opaque, target_phys_addr_t addr,
> +                                  uint64_t val, unsigned size)
> +{
> +    PCIDevice *dev = opaque;
> +    int vector = addr / PCI_MSIX_ENTRY_SIZE;
> +    bool was_masked;
>  
>      was_masked = msix_is_masked(dev, vector);
> -    pci_set_long(dev->msix_table_page + offset, val);
> +    pci_set_long(dev->msix_table + addr, val);
>      msix_handle_mask_update(dev, vector, was_masked);
>  }
>  
> -static const MemoryRegionOps msix_mmio_ops = {
> -    .read = msix_mmio_read,
> -    .write = msix_mmio_write,
> +static const MemoryRegionOps msix_table_mmio_ops = {
> +    .read = msix_table_mmio_read,
> +    .write = msix_table_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 4,
> +    },
> +};
> +
> +static uint64_t msix_pba_mmio_read(void *opaque, target_phys_addr_t addr,
> +                                   unsigned size)
> +{
> +    PCIDevice *dev = opaque;
> +
> +    return pci_get_long(dev->msix_pba + addr);
> +}
> +
> +static const MemoryRegionOps msix_pba_mmio_ops = {
> +    .read = msix_pba_mmio_read,
>      .endianness = DEVICE_NATIVE_ENDIAN,
>      .valid = {
>          .min_access_size = 4,
> @@ -235,11 +244,14 @@ static void msix_mmio_setup(PCIDevice *d, MemoryRegion *bar)
>  {
>      uint8_t *config = d->config + d->msix_cap;
>      uint32_t table = pci_get_long(config + PCI_MSIX_TABLE);
> -    uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
> +    uint32_t table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> +    uint32_t pba = pci_get_long(config + PCI_MSIX_PBA);
> +    uint32_t pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
>      /* TODO: for assigned devices, we'll want to make it possible to map
>       * pending bits separately in case they are in a separate bar. */
>  
> -    memory_region_add_subregion(bar, offset, &d->msix_mmio);
> +    memory_region_add_subregion(bar, table_offset, &d->msix_table_mmio);
> +    memory_region_add_subregion(bar, pba_offset, &d->msix_pba_mmio);
>  }
>  
>  static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
> @@ -251,7 +263,7 @@ static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
>              vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
>          bool was_masked = msix_is_masked(dev, vector);
>  
> -        dev->msix_table_page[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
> +        dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
>          msix_handle_mask_update(dev, vector, was_masked);
>      }
>  }
> @@ -263,6 +275,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>                unsigned bar_nr, unsigned bar_size)
>  {
>      int ret;
> +    unsigned table_size, pba_size;
>  
>      /* Nothing to do if MSI is not supported by interrupt controller */
>      if (!msi_supported) {
> @@ -271,14 +284,20 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>      if (nentries > MSIX_MAX_ENTRIES)
>          return -EINVAL;
>  
> +    table_size = nentries * PCI_MSIX_ENTRY_SIZE;
> +    pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;
> +
>      dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
>                                          sizeof *dev->msix_entry_used);
>  
> -    dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
> +    dev->msix_table = g_malloc0(table_size);
> +    dev->msix_pba = g_malloc0(pba_size);
>      msix_mask_all(dev, nentries);
>  
> -    memory_region_init_io(&dev->msix_mmio, &msix_mmio_ops, dev,
> -                          "msix", MSIX_PAGE_SIZE);
> +    memory_region_init_io(&dev->msix_table_mmio, &msix_table_mmio_ops, dev,
> +                          "msix", table_size);

"msix-table" (vs. "msix-pba")? Only if you have to repost anyway.

> +    memory_region_init_io(&dev->msix_pba_mmio, &msix_pba_mmio_ops, dev,
> +                          "msix-pba", pba_size);
>  
>      dev->msix_entries_nr = nentries;
>      ret = msix_add_config(dev, nentries, bar_nr, bar_size);

Jan
Michael S. Tsirkin - June 14, 2012, 10:24 a.m.
On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> These don't have to be contiguous.  Size them to only what
> they need and use separate MemoryRegions for the vector
> table and PBA.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Why is this still using NATIVE?

> ---
> 
>  hw/msix.c |  116 ++++++++++++++++++++++++++++++++++++++-----------------------
>  hw/pci.h  |   15 ++++++--
>  2 files changed, 83 insertions(+), 48 deletions(-)
> 
> diff --git a/hw/msix.c b/hw/msix.c
> index a4cdfb0..d476d07 100644
> --- a/hw/msix.c
> +++ b/hw/msix.c
> @@ -37,7 +37,7 @@
>  
>  static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>  {
> -    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> +    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
>      MSIMessage msg;
>  
>      msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
> @@ -86,16 +86,6 @@ static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
>      return 0;
>  }
>  
> -static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
> -                               unsigned size)
> -{
> -    PCIDevice *dev = opaque;
> -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> -    void *page = dev->msix_table_page;
> -
> -    return pci_get_long(page + offset);
> -}
> -
>  static uint8_t msix_pending_mask(int vector)
>  {
>      return 1 << (vector % 8);
> @@ -103,7 +93,7 @@ static uint8_t msix_pending_mask(int vector)
>  
>  static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
>  {
> -    return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
> +    return dev->msix_pba + vector / 8;
>  }
>  
>  static int msix_is_pending(PCIDevice *dev, int vector)
> @@ -124,7 +114,7 @@ static void msix_clr_pending(PCIDevice *dev, int vector)
>  static bool msix_vector_masked(PCIDevice *dev, int vector, bool fmask)
>  {
>      unsigned offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
> -    return fmask || dev->msix_table_page[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
> +    return fmask || dev->msix_table[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
>  }
>  
>  static bool msix_is_masked(PCIDevice *dev, int vector)
> @@ -203,27 +193,46 @@ void msix_write_config(PCIDevice *dev, uint32_t addr,
>      }
>  }
>  
> -static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
> -                            uint64_t val, unsigned size)
> +static uint64_t msix_table_mmio_read(void *opaque, target_phys_addr_t addr,
> +                                     unsigned size)
>  {
>      PCIDevice *dev = opaque;
> -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> -    int vector = offset / PCI_MSIX_ENTRY_SIZE;
> -    bool was_masked;
>  
> -    /* MSI-X page includes a read-only PBA and a writeable Vector Control. */
> -    if (vector >= dev->msix_entries_nr) {
> -        return;
> -    }
> +    return pci_get_long(dev->msix_table + addr);
> +}
> +
> +static void msix_table_mmio_write(void *opaque, target_phys_addr_t addr,
> +                                  uint64_t val, unsigned size)
> +{
> +    PCIDevice *dev = opaque;
> +    int vector = addr / PCI_MSIX_ENTRY_SIZE;
> +    bool was_masked;
>  
>      was_masked = msix_is_masked(dev, vector);
> -    pci_set_long(dev->msix_table_page + offset, val);
> +    pci_set_long(dev->msix_table + addr, val);
>      msix_handle_mask_update(dev, vector, was_masked);
>  }
>  
> -static const MemoryRegionOps msix_mmio_ops = {
> -    .read = msix_mmio_read,
> -    .write = msix_mmio_write,
> +static const MemoryRegionOps msix_table_mmio_ops = {
> +    .read = msix_table_mmio_read,
> +    .write = msix_table_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 4,
> +    },
> +};
> +
> +static uint64_t msix_pba_mmio_read(void *opaque, target_phys_addr_t addr,
> +                                   unsigned size)
> +{
> +    PCIDevice *dev = opaque;
> +
> +    return pci_get_long(dev->msix_pba + addr);
> +}
> +
> +static const MemoryRegionOps msix_pba_mmio_ops = {
> +    .read = msix_pba_mmio_read,
>      .endianness = DEVICE_NATIVE_ENDIAN,
>      .valid = {
>          .min_access_size = 4,
> @@ -235,11 +244,14 @@ static void msix_mmio_setup(PCIDevice *d, MemoryRegion *bar)
>  {
>      uint8_t *config = d->config + d->msix_cap;
>      uint32_t table = pci_get_long(config + PCI_MSIX_TABLE);
> -    uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
> +    uint32_t table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> +    uint32_t pba = pci_get_long(config + PCI_MSIX_PBA);
> +    uint32_t pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
>      /* TODO: for assigned devices, we'll want to make it possible to map
>       * pending bits separately in case they are in a separate bar. */
>  
> -    memory_region_add_subregion(bar, offset, &d->msix_mmio);
> +    memory_region_add_subregion(bar, table_offset, &d->msix_table_mmio);
> +    memory_region_add_subregion(bar, pba_offset, &d->msix_pba_mmio);
>  }
>  
>  static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
> @@ -251,7 +263,7 @@ static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
>              vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
>          bool was_masked = msix_is_masked(dev, vector);
>  
> -        dev->msix_table_page[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
> +        dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
>          msix_handle_mask_update(dev, vector, was_masked);
>      }
>  }
> @@ -263,6 +275,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>                unsigned bar_nr, unsigned bar_size)
>  {
>      int ret;
> +    unsigned table_size, pba_size;
>  
>      /* Nothing to do if MSI is not supported by interrupt controller */
>      if (!msi_supported) {
> @@ -271,14 +284,20 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>      if (nentries > MSIX_MAX_ENTRIES)
>          return -EINVAL;
>  
> +    table_size = nentries * PCI_MSIX_ENTRY_SIZE;
> +    pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;
> +
>      dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
>                                          sizeof *dev->msix_entry_used);
>  
> -    dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
> +    dev->msix_table = g_malloc0(table_size);
> +    dev->msix_pba = g_malloc0(pba_size);
>      msix_mask_all(dev, nentries);
>  
> -    memory_region_init_io(&dev->msix_mmio, &msix_mmio_ops, dev,
> -                          "msix", MSIX_PAGE_SIZE);
> +    memory_region_init_io(&dev->msix_table_mmio, &msix_table_mmio_ops, dev,
> +                          "msix", table_size);
> +    memory_region_init_io(&dev->msix_pba_mmio, &msix_pba_mmio_ops, dev,
> +                          "msix-pba", pba_size);
>  
>      dev->msix_entries_nr = nentries;
>      ret = msix_add_config(dev, nentries, bar_nr, bar_size);
> @@ -291,9 +310,12 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>  
>  err_config:
>      dev->msix_entries_nr = 0;
> -    memory_region_destroy(&dev->msix_mmio);
> -    g_free(dev->msix_table_page);
> -    dev->msix_table_page = NULL;
> +    memory_region_destroy(&dev->msix_pba_mmio);
> +    g_free(dev->msix_pba);
> +    dev->msix_pba = NULL;
> +    memory_region_destroy(&dev->msix_table_mmio);
> +    g_free(dev->msix_table);
> +    dev->msix_table = NULL;
>      g_free(dev->msix_entry_used);
>      dev->msix_entry_used = NULL;
>      return ret;
> @@ -354,10 +376,14 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
>      dev->msix_cap = 0;
>      msix_free_irq_entries(dev);
>      dev->msix_entries_nr = 0;
> -    memory_region_del_subregion(bar, &dev->msix_mmio);
> -    memory_region_destroy(&dev->msix_mmio);
> -    g_free(dev->msix_table_page);
> -    dev->msix_table_page = NULL;
> +    memory_region_del_subregion(bar, &dev->msix_pba_mmio);
> +    memory_region_destroy(&dev->msix_pba_mmio);
> +    g_free(dev->msix_pba);
> +    dev->msix_pba = NULL;
> +    memory_region_del_subregion(bar, &dev->msix_table_mmio);
> +    memory_region_destroy(&dev->msix_table_mmio);
> +    g_free(dev->msix_table);
> +    dev->msix_table = NULL;
>      g_free(dev->msix_entry_used);
>      dev->msix_entry_used = NULL;
>      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
> @@ -380,8 +406,8 @@ void msix_save(PCIDevice *dev, QEMUFile *f)
>          return;
>      }
>  
> -    qemu_put_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
> -    qemu_put_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
> +    qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> +    qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8);
>  }
>  
>  /* Should be called after restoring the config space. */
> @@ -395,8 +421,8 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
>      }
>  
>      msix_free_irq_entries(dev);
> -    qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
> -    qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
> +    qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> +    qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
>      msix_update_function_masked(dev);
>  
>      for (vector = 0; vector < n; vector++) {
> @@ -443,7 +469,9 @@ void msix_reset(PCIDevice *dev)
>      msix_free_irq_entries(dev);
>      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
>  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
> -    memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
> +
> +    memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> +    memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8);
>      msix_mask_all(dev, dev->msix_entries_nr);
>  }
>  
> diff --git a/hw/pci.h b/hw/pci.h
> index d517a54..bd64445 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -224,16 +224,23 @@ struct PCIDevice {
>      /* MSI-X entries */
>      int msix_entries_nr;
>  
> -    /* Space to store MSIX table */
> -    uint8_t *msix_table_page;
> +    /* Space to store MSIX table & pending bit array */
> +    uint8_t *msix_table;
> +    uint8_t *msix_pba;
> +
>      /* MemoryRegion container for msix exclusive BAR setup */
>      MemoryRegion msix_exclusive_bar;
> -    /* MMIO index used to map MSIX table and pending bit entries. */
> -    MemoryRegion msix_mmio;
> +
> +    /* Memory Regions for MSIX table and pending bit entries. */
> +    MemoryRegion msix_table_mmio;
> +    MemoryRegion msix_pba_mmio;
> +
>      /* Reference-count for entries actually in use by driver. */
>      unsigned *msix_entry_used;
> +
>      /* MSIX function mask set or MSIX disabled */
>      bool msix_function_masked;
> +
>      /* Version id needed for VMState */
>      int32_t version_id;
>
Alex Williamson - June 14, 2012, 1:56 p.m.
On Thu, 2012-06-14 at 08:13 +0200, Jan Kiszka wrote:
> On 2012-06-14 06:51, Alex Williamson wrote:
> >  
> > -    memory_region_init_io(&dev->msix_mmio, &msix_mmio_ops, dev,
> > -                          "msix", MSIX_PAGE_SIZE);
> > +    memory_region_init_io(&dev->msix_table_mmio, &msix_table_mmio_ops, dev,
> > +                          "msix", table_size);
> 
> "msix-table" (vs. "msix-pba")? Only if you have to repost anyway.

Ok.  Thanks,

Alex
Alex Williamson - June 14, 2012, 2:21 p.m.
On Thu, 2012-06-14 at 13:24 +0300, Michael S. Tsirkin wrote:
> On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> > These don't have to be contiguous.  Size them to only what
> > they need and use separate MemoryRegions for the vector
> > table and PBA.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> 
> Why is this still using NATIVE?

Because the bug already exists, this patch doesn't make it worse, so at
best it's a tangentially related additional fix.  It may seem like a
s/NATIVE/LITTLE/ to you, but to me it's asking to completely scrub
msix.c for endian correctness.  Is this going to be the carrot you hold
out to accept the rest of the series?

Alex

> > ---
> > 
> >  hw/msix.c |  116 ++++++++++++++++++++++++++++++++++++++-----------------------
> >  hw/pci.h  |   15 ++++++--
> >  2 files changed, 83 insertions(+), 48 deletions(-)
> > 
> > diff --git a/hw/msix.c b/hw/msix.c
> > index a4cdfb0..d476d07 100644
> > --- a/hw/msix.c
> > +++ b/hw/msix.c
> > @@ -37,7 +37,7 @@
> >  
> >  static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
> >  {
> > -    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> > +    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
> >      MSIMessage msg;
> >  
> >      msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
> > @@ -86,16 +86,6 @@ static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
> >      return 0;
> >  }
> >  
> > -static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
> > -                               unsigned size)
> > -{
> > -    PCIDevice *dev = opaque;
> > -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> > -    void *page = dev->msix_table_page;
> > -
> > -    return pci_get_long(page + offset);
> > -}
> > -
> >  static uint8_t msix_pending_mask(int vector)
> >  {
> >      return 1 << (vector % 8);
> > @@ -103,7 +93,7 @@ static uint8_t msix_pending_mask(int vector)
> >  
> >  static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
> >  {
> > -    return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
> > +    return dev->msix_pba + vector / 8;
> >  }
> >  
> >  static int msix_is_pending(PCIDevice *dev, int vector)
> > @@ -124,7 +114,7 @@ static void msix_clr_pending(PCIDevice *dev, int vector)
> >  static bool msix_vector_masked(PCIDevice *dev, int vector, bool fmask)
> >  {
> >      unsigned offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > -    return fmask || dev->msix_table_page[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
> > +    return fmask || dev->msix_table[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
> >  }
> >  
> >  static bool msix_is_masked(PCIDevice *dev, int vector)
> > @@ -203,27 +193,46 @@ void msix_write_config(PCIDevice *dev, uint32_t addr,
> >      }
> >  }
> >  
> > -static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
> > -                            uint64_t val, unsigned size)
> > +static uint64_t msix_table_mmio_read(void *opaque, target_phys_addr_t addr,
> > +                                     unsigned size)
> >  {
> >      PCIDevice *dev = opaque;
> > -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> > -    int vector = offset / PCI_MSIX_ENTRY_SIZE;
> > -    bool was_masked;
> >  
> > -    /* MSI-X page includes a read-only PBA and a writeable Vector Control. */
> > -    if (vector >= dev->msix_entries_nr) {
> > -        return;
> > -    }
> > +    return pci_get_long(dev->msix_table + addr);
> > +}
> > +
> > +static void msix_table_mmio_write(void *opaque, target_phys_addr_t addr,
> > +                                  uint64_t val, unsigned size)
> > +{
> > +    PCIDevice *dev = opaque;
> > +    int vector = addr / PCI_MSIX_ENTRY_SIZE;
> > +    bool was_masked;
> >  
> >      was_masked = msix_is_masked(dev, vector);
> > -    pci_set_long(dev->msix_table_page + offset, val);
> > +    pci_set_long(dev->msix_table + addr, val);
> >      msix_handle_mask_update(dev, vector, was_masked);
> >  }
> >  
> > -static const MemoryRegionOps msix_mmio_ops = {
> > -    .read = msix_mmio_read,
> > -    .write = msix_mmio_write,
> > +static const MemoryRegionOps msix_table_mmio_ops = {
> > +    .read = msix_table_mmio_read,
> > +    .write = msix_table_mmio_write,
> > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > +    .valid = {
> > +        .min_access_size = 4,
> > +        .max_access_size = 4,
> > +    },
> > +};
> > +
> > +static uint64_t msix_pba_mmio_read(void *opaque, target_phys_addr_t addr,
> > +                                   unsigned size)
> > +{
> > +    PCIDevice *dev = opaque;
> > +
> > +    return pci_get_long(dev->msix_pba + addr);
> > +}
> > +
> > +static const MemoryRegionOps msix_pba_mmio_ops = {
> > +    .read = msix_pba_mmio_read,
> >      .endianness = DEVICE_NATIVE_ENDIAN,
> >      .valid = {
> >          .min_access_size = 4,
> > @@ -235,11 +244,14 @@ static void msix_mmio_setup(PCIDevice *d, MemoryRegion *bar)
> >  {
> >      uint8_t *config = d->config + d->msix_cap;
> >      uint32_t table = pci_get_long(config + PCI_MSIX_TABLE);
> > -    uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
> > +    uint32_t table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> > +    uint32_t pba = pci_get_long(config + PCI_MSIX_PBA);
> > +    uint32_t pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
> >      /* TODO: for assigned devices, we'll want to make it possible to map
> >       * pending bits separately in case they are in a separate bar. */
> >  
> > -    memory_region_add_subregion(bar, offset, &d->msix_mmio);
> > +    memory_region_add_subregion(bar, table_offset, &d->msix_table_mmio);
> > +    memory_region_add_subregion(bar, pba_offset, &d->msix_pba_mmio);
> >  }
> >  
> >  static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
> > @@ -251,7 +263,7 @@ static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
> >              vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
> >          bool was_masked = msix_is_masked(dev, vector);
> >  
> > -        dev->msix_table_page[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
> > +        dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
> >          msix_handle_mask_update(dev, vector, was_masked);
> >      }
> >  }
> > @@ -263,6 +275,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> >                unsigned bar_nr, unsigned bar_size)
> >  {
> >      int ret;
> > +    unsigned table_size, pba_size;
> >  
> >      /* Nothing to do if MSI is not supported by interrupt controller */
> >      if (!msi_supported) {
> > @@ -271,14 +284,20 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> >      if (nentries > MSIX_MAX_ENTRIES)
> >          return -EINVAL;
> >  
> > +    table_size = nentries * PCI_MSIX_ENTRY_SIZE;
> > +    pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;
> > +
> >      dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
> >                                          sizeof *dev->msix_entry_used);
> >  
> > -    dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
> > +    dev->msix_table = g_malloc0(table_size);
> > +    dev->msix_pba = g_malloc0(pba_size);
> >      msix_mask_all(dev, nentries);
> >  
> > -    memory_region_init_io(&dev->msix_mmio, &msix_mmio_ops, dev,
> > -                          "msix", MSIX_PAGE_SIZE);
> > +    memory_region_init_io(&dev->msix_table_mmio, &msix_table_mmio_ops, dev,
> > +                          "msix", table_size);
> > +    memory_region_init_io(&dev->msix_pba_mmio, &msix_pba_mmio_ops, dev,
> > +                          "msix-pba", pba_size);
> >  
> >      dev->msix_entries_nr = nentries;
> >      ret = msix_add_config(dev, nentries, bar_nr, bar_size);
> > @@ -291,9 +310,12 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> >  
> >  err_config:
> >      dev->msix_entries_nr = 0;
> > -    memory_region_destroy(&dev->msix_mmio);
> > -    g_free(dev->msix_table_page);
> > -    dev->msix_table_page = NULL;
> > +    memory_region_destroy(&dev->msix_pba_mmio);
> > +    g_free(dev->msix_pba);
> > +    dev->msix_pba = NULL;
> > +    memory_region_destroy(&dev->msix_table_mmio);
> > +    g_free(dev->msix_table);
> > +    dev->msix_table = NULL;
> >      g_free(dev->msix_entry_used);
> >      dev->msix_entry_used = NULL;
> >      return ret;
> > @@ -354,10 +376,14 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
> >      dev->msix_cap = 0;
> >      msix_free_irq_entries(dev);
> >      dev->msix_entries_nr = 0;
> > -    memory_region_del_subregion(bar, &dev->msix_mmio);
> > -    memory_region_destroy(&dev->msix_mmio);
> > -    g_free(dev->msix_table_page);
> > -    dev->msix_table_page = NULL;
> > +    memory_region_del_subregion(bar, &dev->msix_pba_mmio);
> > +    memory_region_destroy(&dev->msix_pba_mmio);
> > +    g_free(dev->msix_pba);
> > +    dev->msix_pba = NULL;
> > +    memory_region_del_subregion(bar, &dev->msix_table_mmio);
> > +    memory_region_destroy(&dev->msix_table_mmio);
> > +    g_free(dev->msix_table);
> > +    dev->msix_table = NULL;
> >      g_free(dev->msix_entry_used);
> >      dev->msix_entry_used = NULL;
> >      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
> > @@ -380,8 +406,8 @@ void msix_save(PCIDevice *dev, QEMUFile *f)
> >          return;
> >      }
> >  
> > -    qemu_put_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
> > -    qemu_put_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
> > +    qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> > +    qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8);
> >  }
> >  
> >  /* Should be called after restoring the config space. */
> > @@ -395,8 +421,8 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
> >      }
> >  
> >      msix_free_irq_entries(dev);
> > -    qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
> > -    qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
> > +    qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> > +    qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
> >      msix_update_function_masked(dev);
> >  
> >      for (vector = 0; vector < n; vector++) {
> > @@ -443,7 +469,9 @@ void msix_reset(PCIDevice *dev)
> >      msix_free_irq_entries(dev);
> >      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
> >  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
> > -    memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
> > +
> > +    memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> > +    memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8);
> >      msix_mask_all(dev, dev->msix_entries_nr);
> >  }
> >  
> > diff --git a/hw/pci.h b/hw/pci.h
> > index d517a54..bd64445 100644
> > --- a/hw/pci.h
> > +++ b/hw/pci.h
> > @@ -224,16 +224,23 @@ struct PCIDevice {
> >      /* MSI-X entries */
> >      int msix_entries_nr;
> >  
> > -    /* Space to store MSIX table */
> > -    uint8_t *msix_table_page;
> > +    /* Space to store MSIX table & pending bit array */
> > +    uint8_t *msix_table;
> > +    uint8_t *msix_pba;
> > +
> >      /* MemoryRegion container for msix exclusive BAR setup */
> >      MemoryRegion msix_exclusive_bar;
> > -    /* MMIO index used to map MSIX table and pending bit entries. */
> > -    MemoryRegion msix_mmio;
> > +
> > +    /* Memory Regions for MSIX table and pending bit entries. */
> > +    MemoryRegion msix_table_mmio;
> > +    MemoryRegion msix_pba_mmio;
> > +
> >      /* Reference-count for entries actually in use by driver. */
> >      unsigned *msix_entry_used;
> > +
> >      /* MSIX function mask set or MSIX disabled */
> >      bool msix_function_masked;
> > +
> >      /* Version id needed for VMState */
> >      int32_t version_id;
> >
Michael S. Tsirkin - June 14, 2012, 2:50 p.m.
On Thu, Jun 14, 2012 at 08:21:39AM -0600, Alex Williamson wrote:
> On Thu, 2012-06-14 at 13:24 +0300, Michael S. Tsirkin wrote:
> > On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> > > These don't have to be contiguous.  Size them to only what
> > > they need and use separate MemoryRegions for the vector
> > > table and PBA.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > 
> > Why is this still using NATIVE?
> 
> Because the bug already exists,

We have lots of broken code. The way progress happens here is
such code is in a kind of freeze until fixed. This way whoever needs new
features gets to fix the bugs too.

> this patch doesn't make it worse, so at best it's a tangentially related additional fix.
> It may seem like a s/NATIVE/LITTLE/ to you, but to me it's asking to completely scrub
> msix.c for endian correctness.  Is this going to be the carrot you hold
> out to accept the rest of the series?
> 
> Alex

Unfortunately no promises yet, and that is because you basically decided
to rewrite lots of code in your preferred style while also adding new functionality.
If changes were done in small steps, then I could apply things we can
agree on and defer the ones we don't.  Sometimes it's hard, but clearly
not in this case.

> > > ---
> > > 
> > >  hw/msix.c |  116 ++++++++++++++++++++++++++++++++++++++-----------------------
> > >  hw/pci.h  |   15 ++++++--
> > >  2 files changed, 83 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/hw/msix.c b/hw/msix.c
> > > index a4cdfb0..d476d07 100644
> > > --- a/hw/msix.c
> > > +++ b/hw/msix.c
> > > @@ -37,7 +37,7 @@
> > >  
> > >  static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
> > >  {
> > > -    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
> > > +    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
> > >      MSIMessage msg;
> > >  
> > >      msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
> > > @@ -86,16 +86,6 @@ static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
> > >      return 0;
> > >  }
> > >  
> > > -static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
> > > -                               unsigned size)
> > > -{
> > > -    PCIDevice *dev = opaque;
> > > -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> > > -    void *page = dev->msix_table_page;
> > > -
> > > -    return pci_get_long(page + offset);
> > > -}
> > > -
> > >  static uint8_t msix_pending_mask(int vector)
> > >  {
> > >      return 1 << (vector % 8);
> > > @@ -103,7 +93,7 @@ static uint8_t msix_pending_mask(int vector)
> > >  
> > >  static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
> > >  {
> > > -    return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
> > > +    return dev->msix_pba + vector / 8;
> > >  }
> > >  
> > >  static int msix_is_pending(PCIDevice *dev, int vector)
> > > @@ -124,7 +114,7 @@ static void msix_clr_pending(PCIDevice *dev, int vector)
> > >  static bool msix_vector_masked(PCIDevice *dev, int vector, bool fmask)
> > >  {
> > >      unsigned offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > > -    return fmask || dev->msix_table_page[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
> > > +    return fmask || dev->msix_table[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
> > >  }
> > >  
> > >  static bool msix_is_masked(PCIDevice *dev, int vector)
> > > @@ -203,27 +193,46 @@ void msix_write_config(PCIDevice *dev, uint32_t addr,
> > >      }
> > >  }
> > >  
> > > -static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
> > > -                            uint64_t val, unsigned size)
> > > +static uint64_t msix_table_mmio_read(void *opaque, target_phys_addr_t addr,
> > > +                                     unsigned size)
> > >  {
> > >      PCIDevice *dev = opaque;
> > > -    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
> > > -    int vector = offset / PCI_MSIX_ENTRY_SIZE;
> > > -    bool was_masked;
> > >  
> > > -    /* MSI-X page includes a read-only PBA and a writeable Vector Control. */
> > > -    if (vector >= dev->msix_entries_nr) {
> > > -        return;
> > > -    }
> > > +    return pci_get_long(dev->msix_table + addr);
> > > +}
> > > +
> > > +static void msix_table_mmio_write(void *opaque, target_phys_addr_t addr,
> > > +                                  uint64_t val, unsigned size)
> > > +{
> > > +    PCIDevice *dev = opaque;
> > > +    int vector = addr / PCI_MSIX_ENTRY_SIZE;
> > > +    bool was_masked;
> > >  
> > >      was_masked = msix_is_masked(dev, vector);
> > > -    pci_set_long(dev->msix_table_page + offset, val);
> > > +    pci_set_long(dev->msix_table + addr, val);
> > >      msix_handle_mask_update(dev, vector, was_masked);
> > >  }
> > >  
> > > -static const MemoryRegionOps msix_mmio_ops = {
> > > -    .read = msix_mmio_read,
> > > -    .write = msix_mmio_write,
> > > +static const MemoryRegionOps msix_table_mmio_ops = {
> > > +    .read = msix_table_mmio_read,
> > > +    .write = msix_table_mmio_write,
> > > +    .endianness = DEVICE_NATIVE_ENDIAN,
> > > +    .valid = {
> > > +        .min_access_size = 4,
> > > +        .max_access_size = 4,
> > > +    },
> > > +};
> > > +
> > > +static uint64_t msix_pba_mmio_read(void *opaque, target_phys_addr_t addr,
> > > +                                   unsigned size)
> > > +{
> > > +    PCIDevice *dev = opaque;
> > > +
> > > +    return pci_get_long(dev->msix_pba + addr);
> > > +}
> > > +
> > > +static const MemoryRegionOps msix_pba_mmio_ops = {
> > > +    .read = msix_pba_mmio_read,
> > >      .endianness = DEVICE_NATIVE_ENDIAN,
> > >      .valid = {
> > >          .min_access_size = 4,
> > > @@ -235,11 +244,14 @@ static void msix_mmio_setup(PCIDevice *d, MemoryRegion *bar)
> > >  {
> > >      uint8_t *config = d->config + d->msix_cap;
> > >      uint32_t table = pci_get_long(config + PCI_MSIX_TABLE);
> > > -    uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
> > > +    uint32_t table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> > > +    uint32_t pba = pci_get_long(config + PCI_MSIX_PBA);
> > > +    uint32_t pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
> > >      /* TODO: for assigned devices, we'll want to make it possible to map
> > >       * pending bits separately in case they are in a separate bar. */
> > >  
> > > -    memory_region_add_subregion(bar, offset, &d->msix_mmio);
> > > +    memory_region_add_subregion(bar, table_offset, &d->msix_table_mmio);
> > > +    memory_region_add_subregion(bar, pba_offset, &d->msix_pba_mmio);
> > >  }
> > >  
> > >  static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
> > > @@ -251,7 +263,7 @@ static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
> > >              vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > >          bool was_masked = msix_is_masked(dev, vector);
> > >  
> > > -        dev->msix_table_page[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
> > > +        dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
> > >          msix_handle_mask_update(dev, vector, was_masked);
> > >      }
> > >  }
> > > @@ -263,6 +275,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> > >                unsigned bar_nr, unsigned bar_size)
> > >  {
> > >      int ret;
> > > +    unsigned table_size, pba_size;
> > >  
> > >      /* Nothing to do if MSI is not supported by interrupt controller */
> > >      if (!msi_supported) {
> > > @@ -271,14 +284,20 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> > >      if (nentries > MSIX_MAX_ENTRIES)
> > >          return -EINVAL;
> > >  
> > > +    table_size = nentries * PCI_MSIX_ENTRY_SIZE;
> > > +    pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;
> > > +
> > >      dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
> > >                                          sizeof *dev->msix_entry_used);
> > >  
> > > -    dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
> > > +    dev->msix_table = g_malloc0(table_size);
> > > +    dev->msix_pba = g_malloc0(pba_size);
> > >      msix_mask_all(dev, nentries);
> > >  
> > > -    memory_region_init_io(&dev->msix_mmio, &msix_mmio_ops, dev,
> > > -                          "msix", MSIX_PAGE_SIZE);
> > > +    memory_region_init_io(&dev->msix_table_mmio, &msix_table_mmio_ops, dev,
> > > +                          "msix", table_size);
> > > +    memory_region_init_io(&dev->msix_pba_mmio, &msix_pba_mmio_ops, dev,
> > > +                          "msix-pba", pba_size);
> > >  
> > >      dev->msix_entries_nr = nentries;
> > >      ret = msix_add_config(dev, nentries, bar_nr, bar_size);
> > > @@ -291,9 +310,12 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
> > >  
> > >  err_config:
> > >      dev->msix_entries_nr = 0;
> > > -    memory_region_destroy(&dev->msix_mmio);
> > > -    g_free(dev->msix_table_page);
> > > -    dev->msix_table_page = NULL;
> > > +    memory_region_destroy(&dev->msix_pba_mmio);
> > > +    g_free(dev->msix_pba);
> > > +    dev->msix_pba = NULL;
> > > +    memory_region_destroy(&dev->msix_table_mmio);
> > > +    g_free(dev->msix_table);
> > > +    dev->msix_table = NULL;
> > >      g_free(dev->msix_entry_used);
> > >      dev->msix_entry_used = NULL;
> > >      return ret;
> > > @@ -354,10 +376,14 @@ int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
> > >      dev->msix_cap = 0;
> > >      msix_free_irq_entries(dev);
> > >      dev->msix_entries_nr = 0;
> > > -    memory_region_del_subregion(bar, &dev->msix_mmio);
> > > -    memory_region_destroy(&dev->msix_mmio);
> > > -    g_free(dev->msix_table_page);
> > > -    dev->msix_table_page = NULL;
> > > +    memory_region_del_subregion(bar, &dev->msix_pba_mmio);
> > > +    memory_region_destroy(&dev->msix_pba_mmio);
> > > +    g_free(dev->msix_pba);
> > > +    dev->msix_pba = NULL;
> > > +    memory_region_del_subregion(bar, &dev->msix_table_mmio);
> > > +    memory_region_destroy(&dev->msix_table_mmio);
> > > +    g_free(dev->msix_table);
> > > +    dev->msix_table = NULL;
> > >      g_free(dev->msix_entry_used);
> > >      dev->msix_entry_used = NULL;
> > >      dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
> > > @@ -380,8 +406,8 @@ void msix_save(PCIDevice *dev, QEMUFile *f)
> > >          return;
> > >      }
> > >  
> > > -    qemu_put_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
> > > -    qemu_put_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
> > > +    qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> > > +    qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8);
> > >  }
> > >  
> > >  /* Should be called after restoring the config space. */
> > > @@ -395,8 +421,8 @@ void msix_load(PCIDevice *dev, QEMUFile *f)
> > >      }
> > >  
> > >      msix_free_irq_entries(dev);
> > > -    qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
> > > -    qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
> > > +    qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
> > > +    qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
> > >      msix_update_function_masked(dev);
> > >  
> > >      for (vector = 0; vector < n; vector++) {
> > > @@ -443,7 +469,9 @@ void msix_reset(PCIDevice *dev)
> > >      msix_free_irq_entries(dev);
> > >      dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
> > >  	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
> > > -    memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
> > > +
> > > +    memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
> > > +    memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8);
> > >      msix_mask_all(dev, dev->msix_entries_nr);
> > >  }
> > >  
> > > diff --git a/hw/pci.h b/hw/pci.h
> > > index d517a54..bd64445 100644
> > > --- a/hw/pci.h
> > > +++ b/hw/pci.h
> > > @@ -224,16 +224,23 @@ struct PCIDevice {
> > >      /* MSI-X entries */
> > >      int msix_entries_nr;
> > >  
> > > -    /* Space to store MSIX table */
> > > -    uint8_t *msix_table_page;
> > > +    /* Space to store MSIX table & pending bit array */
> > > +    uint8_t *msix_table;
> > > +    uint8_t *msix_pba;
> > > +
> > >      /* MemoryRegion container for msix exclusive BAR setup */
> > >      MemoryRegion msix_exclusive_bar;
> > > -    /* MMIO index used to map MSIX table and pending bit entries. */
> > > -    MemoryRegion msix_mmio;
> > > +
> > > +    /* Memory Regions for MSIX table and pending bit entries. */
> > > +    MemoryRegion msix_table_mmio;
> > > +    MemoryRegion msix_pba_mmio;
> > > +
> > >      /* Reference-count for entries actually in use by driver. */
> > >      unsigned *msix_entry_used;
> > > +
> > >      /* MSIX function mask set or MSIX disabled */
> > >      bool msix_function_masked;
> > > +
> > >      /* Version id needed for VMState */
> > >      int32_t version_id;
> > >  
> 
>
Alex Williamson - June 14, 2012, 3:09 p.m.
On Thu, 2012-06-14 at 17:50 +0300, Michael S. Tsirkin wrote:
> On Thu, Jun 14, 2012 at 08:21:39AM -0600, Alex Williamson wrote:
> > On Thu, 2012-06-14 at 13:24 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> > > > These don't have to be contiguous.  Size them to only what
> > > > they need and use separate MemoryRegions for the vector
> > > > table and PBA.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > 
> > > Why is this still using NATIVE?
> > 
> > Because the bug already exists,
> 
> We have lots of broken code. The way progress happens here is
> such code is in a kind of freeze until fixed. This way whoever needs new
> features gets to fix the bugs too.

In other words, you impose a toll and inhibit forward progress until
someone fixes it?  I have no place telling you how to be a maintainer,
but I personally find that this style makes attempting to contribute
code to anything pci/msi/msix related a huge pain.  There are far too
many of these land mines in the code and simple fixes easily explode
into tangentially related changes off your todo list.

> > this patch doesn't make it worse, so at best it's a tangentially related additional fix.
> > It may seem like a s/NATIVE/LITTLE/ to you, but to me it's asking to completely scrub
> > msix.c for endian correctness.  Is this going to be the carrot you hold
> > out to accept the rest of the series?
> > 
> > Alex
> 
> Unfortunately no promises yet, and that is because you basically decided
> to rewrite lots of code in your preferred style while also adding new functionality.
> If changes were done in small steps, then I could apply things we can
> agree on and defer the ones we don't.  Sometimes it's hard, but clearly
> not in this case.

Patches can always be reduced into smaller changes, but at some point we
have to call it good enough.  I split one patch into 6 and thought that
did a pretty good job.  Should I remove everywhere that I've added a new
line to avoid imposing my style on the rest of the code?  The next
version will eliminate the add_config move thanks to Jan's constructive
suggestion, so I hope it meets your standards.  Thanks,

Alex
Michael S. Tsirkin - June 14, 2012, 3:45 p.m.
On Thu, Jun 14, 2012 at 09:09:47AM -0600, Alex Williamson wrote:
> On Thu, 2012-06-14 at 17:50 +0300, Michael S. Tsirkin wrote:
> > On Thu, Jun 14, 2012 at 08:21:39AM -0600, Alex Williamson wrote:
> > > On Thu, 2012-06-14 at 13:24 +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> > > > > These don't have to be contiguous.  Size them to only what
> > > > > they need and use separate MemoryRegions for the vector
> > > > > table and PBA.
> > > > > 
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > 
> > > > Why is this still using NATIVE?
> > > 
> > > Because the bug already exists,
> > 
> > We have lots of broken code. The way progress happens here is
> > such code is in a kind of freeze until fixed. This way whoever needs new
> > features gets to fix the bugs too.
> 
> In other words, you impose a toll and inhibit forward progress until
> someone fixes it?  I have no place telling you how to be a maintainer,
> but I personally find that this style makes attempting to contribute
> code to anything pci/msi/msix related a huge pain.  There are far too
> many of these land mines in the code and simple fixes easily explode
> into tangentially related changes off your todo list.

I try to pick simple fixes up straight away. Pls try to keep the fixes
simpler :)

> > > this patch doesn't make it worse, so at best it's a tangentially related additional fix.
> > > It may seem like a s/NATIVE/LITTLE/ to you, but to me it's asking to completely scrub
> > > msix.c for endian correctness.  Is this going to be the carrot you hold
> > > out to accept the rest of the series?
> > > 
> > > Alex
> > 
> > Unfortunately no promises yet, and that is because you basically decided
> > to rewrite lots of code in your preferred style while also adding new functionality.
> > If changes were done in small steps, then I could apply things we can
> > agree on and defer the ones we don't.  Sometimes it's hard, but clearly
> > not in this case.
> 
> Patches can always be reduced into smaller changes, but at some point we
> have to call it good enough.  I split one patch into 6 and thought that
> did a pretty good job.

It's not the mechanical splitting of patches that is needed.
In one case you actually added a new function in place X then moved it
to place Y. And the new order does not make sense: init then uninit looks cleaner.


> Should I remove everywhere that I've added a new
> line to avoid imposing my style on the rest of the code?

Each new line? No, that would be taking it to extreme because newlines are
easy to ignore normally. Though if someone sends me a patch with 1000
newlines tweaked and functional changes in the same patch, I won't apply
it.

> The next
> version will eliminate the add_config move thanks to Jan's constructive
> suggestion, so I hope it meets your standards.  Thanks,
> 
> Alex

Please try to address other comments too, like naming
constants. I would hate to get another revision that just ignores them.
Alex Williamson - June 14, 2012, 4:02 p.m.
On Thu, 2012-06-14 at 18:45 +0300, Michael S. Tsirkin wrote:
> On Thu, Jun 14, 2012 at 09:09:47AM -0600, Alex Williamson wrote:
> > On Thu, 2012-06-14 at 17:50 +0300, Michael S. Tsirkin wrote:
> > > On Thu, Jun 14, 2012 at 08:21:39AM -0600, Alex Williamson wrote:
> > > > On Thu, 2012-06-14 at 13:24 +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> > > > > > These don't have to be contiguous.  Size them to only what
> > > > > > they need and use separate MemoryRegions for the vector
> > > > > > table and PBA.
> > > > > > 
> > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > 
> > > > > Why is this still using NATIVE?
> > > > 
> > > > Because the bug already exists,
> > > 
> > > We have lots of broken code. The way progress happens here is
> > > such code is in a kind of freeze until fixed. This way whoever needs new
> > > features gets to fix the bugs too.
> > 
> > In other words, you impose a toll and inhibit forward progress until
> > someone fixes it?  I have no place telling you how to be a maintainer,
> > but I personally find that this style makes attempting to contribute
> > code to anything pci/msi/msix related a huge pain.  There are far too
> > many of these land mines in the code and simple fixes easily explode
> > into tangentially related changes off your todo list.
> 
> I try to pick simple fixes up straight away. Pls try to keep the fixes
> simpler :)

What does that have to do with shoving todo list items down the throats
of contributors?

> > > > this patch doesn't make it worse, so at best it's a tangentially related additional fix.
> > > > It may seem like a s/NATIVE/LITTLE/ to you, but to me it's asking to completely scrub
> > > > msix.c for endian correctness.  Is this going to be the carrot you hold
> > > > out to accept the rest of the series?
> > > > 
> > > > Alex
> > > 
> > > Unfortunately no promises yet, and that is because you basically decided
> > > to rewrite lots of code in your preferred style while also adding new functionality.
> > > If changes were done in small steps, then I could apply things we can
> > > agree on and defer the ones we don't.  Sometimes it's hard, but clearly
> > > not in this case.
> > 
> > Patches can always be reduced into smaller changes, but at some point we
> > have to call it good enough.  I split one patch into 6 and thought that
> > did a pretty good job.
> 
> It's not the mechanical splitting of patches that is needed.
> In one case you actually added a new function in place X then moved it
> to place Y. And the new order does not make sense: init then uninit looks cleaner.

uninit was moved because I was able to remove duplicate code by making
init call uninit on error.  Do you prefer a prototype to avoid code
moves in that case?  Doesn't matter now, it's fixed with Jan's
suggestion and I've already split the move of another tiny function to a
separate patch.

> > Should I remove everywhere that I've added a new
> > line to avoid imposing my style on the rest of the code?
> 
> Each new line? No, that would be taking it to extreme because newlines are
> easy to ignore normally. Though if someone sends me a patch with 1000
> newlines tweaked and functional changes in the same patch, I won't apply
> it.

Well then, I'm not sure what you mean by "you basically decided to
rewrite lots of code in your preferred style".

> > The next
> > version will eliminate the add_config move thanks to Jan's constructive
> > suggestion, so I hope it meets your standards.  Thanks,
> > 
> > Alex
> 
> Please try to address other comments too, like naming
> constants. I would hate to get another revision that just ignores them.

It will unless you counter my rebuttal to why I'm not using macros
there.  To repeat:

On Wed, 2012-06-13 at 17:05 -0600, Alex Williamson wrote:
On Wed, 2012-06-13 at 23:43 +0300, Michael S. Tsirkin wrote:
> > On Tue, Jun 12, 2012 at 02:03:26PM -0600, Alex Williamson wrote:
> > > +    /*
> > > +     * Migration compatibility dictates that this remains a 4k
> > > +     * BAR with the vector table in the lower half and PBA in
> > > +     * the upper half.
> > > +     */
> > > +    if (nentries * PCI_MSIX_ENTRY_SIZE > 2048) {
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    memory_region_init(bar, name, 4096);
> > > +
> > > +    ret = msix_init(pdev, nentries, bar, bar_nr, 0, bar, bar_nr, 2048, 0);
> > 
> > Lots of constants.
> > Current code uses macros for these, e.g.
> > MSIX_PAGE_PENDING, MSIX_PAGE_PENDING /2.
> > 
> > Let's keep it that way.
> 
> There is absolutely no valid use for them outside of this function.  I
> explain the size in the comment immediately above where they're used.
> Macro-izing these just risks someone assuming there's a standard or
> misusing it for something else (see device assignment imposing a 4k
> MSI-X table for example...)
Michael S. Tsirkin - June 14, 2012, 4:26 p.m.
On Thu, Jun 14, 2012 at 10:02:58AM -0600, Alex Williamson wrote:
> On Thu, 2012-06-14 at 18:45 +0300, Michael S. Tsirkin wrote:
> > On Thu, Jun 14, 2012 at 09:09:47AM -0600, Alex Williamson wrote:
> > > On Thu, 2012-06-14 at 17:50 +0300, Michael S. Tsirkin wrote:
> > > > On Thu, Jun 14, 2012 at 08:21:39AM -0600, Alex Williamson wrote:
> > > > > On Thu, 2012-06-14 at 13:24 +0300, Michael S. Tsirkin wrote:
> > > > > > On Wed, Jun 13, 2012 at 10:51:47PM -0600, Alex Williamson wrote:
> > > > > > > These don't have to be contiguous.  Size them to only what
> > > > > > > they need and use separate MemoryRegions for the vector
> > > > > > > table and PBA.
> > > > > > > 
> > > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > 
> > > > > > Why is this still using NATIVE?
> > > > > 
> > > > > Because the bug already exists,
> > > > 
> > > > We have lots of broken code. The way progress happens here is
> > > > such code is in a kind of freeze until fixed. This way whoever needs new
> > > > features gets to fix the bugs too.
> > > 
> > > In other words, you impose a toll and inhibit forward progress until
> > > someone fixes it?  I have no place telling you how to be a maintainer,
> > > but I personally find that this style makes attempting to contribute
> > > code to anything pci/msi/msix related a huge pain.  There are far too
> > > many of these land mines in the code and simple fixes easily explode
> > > into tangentially related changes off your todo list.
> > 
> > I try to pick simple fixes up straight away. Pls try to keep the fixes
> > simpler :)
> 
> What does that have to do with shoving todo list items down the throats
> of contributors?

If you write new code you do not get to use legacy interfaces.
But - if you fix bugs you are not required to fix all the bugs in one go.
If you mix bugfixes and features all is treated as new code.

> > > > > this patch doesn't make it worse, so at best it's a tangentially related additional fix.
> > > > > It may seem like a s/NATIVE/LITTLE/ to you, but to me it's asking to completely scrub
> > > > > msix.c for endian correctness.  Is this going to be the carrot you hold
> > > > > out to accept the rest of the series?
> > > > > 
> > > > > Alex
> > > > 
> > > > Unfortunately no promises yet, and that is because you basically decided
> > > > to rewrite lots of code in your preferred style while also adding new functionality.
> > > > If changes were done in small steps, then I could apply things we can
> > > > agree on and defer the ones we don't.  Sometimes it's hard, but clearly
> > > > not in this case.
> > > 
> > > Patches can always be reduced into smaller changes, but at some point we
> > > have to call it good enough.  I split one patch into 6 and thought that
> > > did a pretty good job.
> > 
> > It's not the mechanical splitting of patches that is needed.
> > In one case you actually added a new function in place X then moved it
> > to place Y. And the new order does not make sense: init then uninit looks cleaner.
> 
> uninit was moved because I was able to remove duplicate code by making
> init call uninit on error.  Do you prefer a prototype to avoid code
> moves in that case?

msix.h has a prototype already I think.

>  Doesn't matter now, it's fixed with Jan's
> suggestion and I've already split the move of another tiny function to a
> separate patch.

This does not matter. What matters is making things easy to review.
If you send me a patch moving functions around, I can put them
side by side and compare + and -. If you make a
small cosmetic change I can see it is equivalent.
If you add functionality I see how it works.

But if you mix these types of change it's very hard to review.

> > > Should I remove everywhere that I've added a new
> > > line to avoid imposing my style on the rest of the code?
> > 
> > Each new line? No, that would be taking it to extreme because newlines are
> > easy to ignore normally. Though if someone sends me a patch with 1000
> > newlines tweaked and functional changes in the same patch, I won't apply
> > it.
> 
> Well then, I'm not sure what you mean by "you basically decided to
> rewrite lots of code in your preferred style".

The diff was very large, is what I mean.

> > > The next
> > > version will eliminate the add_config move thanks to Jan's constructive
> > > suggestion, so I hope it meets your standards.  Thanks,
> > > 
> > > Alex
> > 
> > Please try to address other comments too, like naming
> > constants. I would hate to get another revision that just ignores them.
> 
> It will unless you counter my rebuttal to why I'm not using macros
> there.  To repeat:
> 
> On Wed, 2012-06-13 at 17:05 -0600, Alex Williamson wrote:
> On Wed, 2012-06-13 at 23:43 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Jun 12, 2012 at 02:03:26PM -0600, Alex Williamson wrote:
> > > > +    /*
> > > > +     * Migration compatibility dictates that this remains a 4k
> > > > +     * BAR with the vector table in the lower half and PBA in
> > > > +     * the upper half.
> > > > +     */
> > > > +    if (nentries * PCI_MSIX_ENTRY_SIZE > 2048) {
> > > > +        return -EINVAL;
> > > > +    }
> > > > +
> > > > +    memory_region_init(bar, name, 4096);
> > > > +
> > > > +    ret = msix_init(pdev, nentries, bar, bar_nr, 0, bar, bar_nr, 2048, 0);
> > > 
> > > Lots of constants.
> > > Current code uses macros for these, e.g.
> > > MSIX_PAGE_PENDING, MSIX_PAGE_PENDING /2.
> > > 
> > > Let's keep it that way.
> > 
> > There is absolutely no valid use for them outside of this function.


They still appear multiple times. And 2048 is middle of page
but PAGE/2 is clearer.

>  I
> > explain the size in the comment immediately above where they're used.
> > Macro-izing these just risks someone assuming there's a standard or
> > misusing it for something else (see device assignment imposing a 4k
> > MSI-X table for example...)


A valid concern, but won't help against people copying code :)
Since you now use it from the exclusive call only, rename it
MSIX_EXLUSIVE_BAR_SIZE, MSIX_EXLUSIVE_BAR_PENDING?
It's actually what it is.

Patch

diff --git a/hw/msix.c b/hw/msix.c
index a4cdfb0..d476d07 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -37,7 +37,7 @@ 
 
 static MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
 {
-    uint8_t *table_entry = dev->msix_table_page + vector * PCI_MSIX_ENTRY_SIZE;
+    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
     MSIMessage msg;
 
     msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
@@ -86,16 +86,6 @@  static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
     return 0;
 }
 
-static uint64_t msix_mmio_read(void *opaque, target_phys_addr_t addr,
-                               unsigned size)
-{
-    PCIDevice *dev = opaque;
-    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
-    void *page = dev->msix_table_page;
-
-    return pci_get_long(page + offset);
-}
-
 static uint8_t msix_pending_mask(int vector)
 {
     return 1 << (vector % 8);
@@ -103,7 +93,7 @@  static uint8_t msix_pending_mask(int vector)
 
 static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
 {
-    return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
+    return dev->msix_pba + vector / 8;
 }
 
 static int msix_is_pending(PCIDevice *dev, int vector)
@@ -124,7 +114,7 @@  static void msix_clr_pending(PCIDevice *dev, int vector)
 static bool msix_vector_masked(PCIDevice *dev, int vector, bool fmask)
 {
     unsigned offset = vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
-    return fmask || dev->msix_table_page[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
+    return fmask || dev->msix_table[offset] & PCI_MSIX_ENTRY_CTRL_MASKBIT;
 }
 
 static bool msix_is_masked(PCIDevice *dev, int vector)
@@ -203,27 +193,46 @@  void msix_write_config(PCIDevice *dev, uint32_t addr,
     }
 }
 
-static void msix_mmio_write(void *opaque, target_phys_addr_t addr,
-                            uint64_t val, unsigned size)
+static uint64_t msix_table_mmio_read(void *opaque, target_phys_addr_t addr,
+                                     unsigned size)
 {
     PCIDevice *dev = opaque;
-    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
-    int vector = offset / PCI_MSIX_ENTRY_SIZE;
-    bool was_masked;
 
-    /* MSI-X page includes a read-only PBA and a writeable Vector Control. */
-    if (vector >= dev->msix_entries_nr) {
-        return;
-    }
+    return pci_get_long(dev->msix_table + addr);
+}
+
+static void msix_table_mmio_write(void *opaque, target_phys_addr_t addr,
+                                  uint64_t val, unsigned size)
+{
+    PCIDevice *dev = opaque;
+    int vector = addr / PCI_MSIX_ENTRY_SIZE;
+    bool was_masked;
 
     was_masked = msix_is_masked(dev, vector);
-    pci_set_long(dev->msix_table_page + offset, val);
+    pci_set_long(dev->msix_table + addr, val);
     msix_handle_mask_update(dev, vector, was_masked);
 }
 
-static const MemoryRegionOps msix_mmio_ops = {
-    .read = msix_mmio_read,
-    .write = msix_mmio_write,
+static const MemoryRegionOps msix_table_mmio_ops = {
+    .read = msix_table_mmio_read,
+    .write = msix_table_mmio_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
+static uint64_t msix_pba_mmio_read(void *opaque, target_phys_addr_t addr,
+                                   unsigned size)
+{
+    PCIDevice *dev = opaque;
+
+    return pci_get_long(dev->msix_pba + addr);
+}
+
+static const MemoryRegionOps msix_pba_mmio_ops = {
+    .read = msix_pba_mmio_read,
     .endianness = DEVICE_NATIVE_ENDIAN,
     .valid = {
         .min_access_size = 4,
@@ -235,11 +244,14 @@  static void msix_mmio_setup(PCIDevice *d, MemoryRegion *bar)
 {
     uint8_t *config = d->config + d->msix_cap;
     uint32_t table = pci_get_long(config + PCI_MSIX_TABLE);
-    uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
+    uint32_t table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
+    uint32_t pba = pci_get_long(config + PCI_MSIX_PBA);
+    uint32_t pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
     /* TODO: for assigned devices, we'll want to make it possible to map
      * pending bits separately in case they are in a separate bar. */
 
-    memory_region_add_subregion(bar, offset, &d->msix_mmio);
+    memory_region_add_subregion(bar, table_offset, &d->msix_table_mmio);
+    memory_region_add_subregion(bar, pba_offset, &d->msix_pba_mmio);
 }
 
 static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
@@ -251,7 +263,7 @@  static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
             vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
         bool was_masked = msix_is_masked(dev, vector);
 
-        dev->msix_table_page[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
+        dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
         msix_handle_mask_update(dev, vector, was_masked);
     }
 }
@@ -263,6 +275,7 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
               unsigned bar_nr, unsigned bar_size)
 {
     int ret;
+    unsigned table_size, pba_size;
 
     /* Nothing to do if MSI is not supported by interrupt controller */
     if (!msi_supported) {
@@ -271,14 +284,20 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
     if (nentries > MSIX_MAX_ENTRIES)
         return -EINVAL;
 
+    table_size = nentries * PCI_MSIX_ENTRY_SIZE;
+    pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;
+
     dev->msix_entry_used = g_malloc0(MSIX_MAX_ENTRIES *
                                         sizeof *dev->msix_entry_used);
 
-    dev->msix_table_page = g_malloc0(MSIX_PAGE_SIZE);
+    dev->msix_table = g_malloc0(table_size);
+    dev->msix_pba = g_malloc0(pba_size);
     msix_mask_all(dev, nentries);
 
-    memory_region_init_io(&dev->msix_mmio, &msix_mmio_ops, dev,
-                          "msix", MSIX_PAGE_SIZE);
+    memory_region_init_io(&dev->msix_table_mmio, &msix_table_mmio_ops, dev,
+                          "msix", table_size);
+    memory_region_init_io(&dev->msix_pba_mmio, &msix_pba_mmio_ops, dev,
+                          "msix-pba", pba_size);
 
     dev->msix_entries_nr = nentries;
     ret = msix_add_config(dev, nentries, bar_nr, bar_size);
@@ -291,9 +310,12 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
 
 err_config:
     dev->msix_entries_nr = 0;
-    memory_region_destroy(&dev->msix_mmio);
-    g_free(dev->msix_table_page);
-    dev->msix_table_page = NULL;
+    memory_region_destroy(&dev->msix_pba_mmio);
+    g_free(dev->msix_pba);
+    dev->msix_pba = NULL;
+    memory_region_destroy(&dev->msix_table_mmio);
+    g_free(dev->msix_table);
+    dev->msix_table = NULL;
     g_free(dev->msix_entry_used);
     dev->msix_entry_used = NULL;
     return ret;
@@ -354,10 +376,14 @@  int msix_uninit(PCIDevice *dev, MemoryRegion *bar)
     dev->msix_cap = 0;
     msix_free_irq_entries(dev);
     dev->msix_entries_nr = 0;
-    memory_region_del_subregion(bar, &dev->msix_mmio);
-    memory_region_destroy(&dev->msix_mmio);
-    g_free(dev->msix_table_page);
-    dev->msix_table_page = NULL;
+    memory_region_del_subregion(bar, &dev->msix_pba_mmio);
+    memory_region_destroy(&dev->msix_pba_mmio);
+    g_free(dev->msix_pba);
+    dev->msix_pba = NULL;
+    memory_region_del_subregion(bar, &dev->msix_table_mmio);
+    memory_region_destroy(&dev->msix_table_mmio);
+    g_free(dev->msix_table);
+    dev->msix_table = NULL;
     g_free(dev->msix_entry_used);
     dev->msix_entry_used = NULL;
     dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
@@ -380,8 +406,8 @@  void msix_save(PCIDevice *dev, QEMUFile *f)
         return;
     }
 
-    qemu_put_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
-    qemu_put_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
+    qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
+    qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8);
 }
 
 /* Should be called after restoring the config space. */
@@ -395,8 +421,8 @@  void msix_load(PCIDevice *dev, QEMUFile *f)
     }
 
     msix_free_irq_entries(dev);
-    qemu_get_buffer(f, dev->msix_table_page, n * PCI_MSIX_ENTRY_SIZE);
-    qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
+    qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
+    qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
     msix_update_function_masked(dev);
 
     for (vector = 0; vector < n; vector++) {
@@ -443,7 +469,9 @@  void msix_reset(PCIDevice *dev)
     msix_free_irq_entries(dev);
     dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
 	    ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
-    memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
+
+    memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
+    memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8);
     msix_mask_all(dev, dev->msix_entries_nr);
 }
 
diff --git a/hw/pci.h b/hw/pci.h
index d517a54..bd64445 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -224,16 +224,23 @@  struct PCIDevice {
     /* MSI-X entries */
     int msix_entries_nr;
 
-    /* Space to store MSIX table */
-    uint8_t *msix_table_page;
+    /* Space to store MSIX table & pending bit array */
+    uint8_t *msix_table;
+    uint8_t *msix_pba;
+
     /* MemoryRegion container for msix exclusive BAR setup */
     MemoryRegion msix_exclusive_bar;
-    /* MMIO index used to map MSIX table and pending bit entries. */
-    MemoryRegion msix_mmio;
+
+    /* Memory Regions for MSIX table and pending bit entries. */
+    MemoryRegion msix_table_mmio;
+    MemoryRegion msix_pba_mmio;
+
     /* Reference-count for entries actually in use by driver. */
     unsigned *msix_entry_used;
+
     /* MSIX function mask set or MSIX disabled */
     bool msix_function_masked;
+
     /* Version id needed for VMState */
     int32_t version_id;