diff mbox

[2/2] pci-assign: Fix memory out of bound when MSI-X table not fit in a single page

Message ID 1396502304-7456-2-git-send-email-arei.gonglei@huawei.com
State New
Headers show

Commit Message

Gonglei (Arei) April 3, 2014, 5:18 a.m. UTC
From: Gonglei <arei.gonglei@huawei.com>

QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in
assigned_dev_register_msix_mmio(), meanwhile the set the one
page memmory to zero, so the rest memory will be random value
(maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio()
maybe occur the issue of entry_nr > 256, and the kmod reports
the EINVAL error.

Signed-off-by: Gonglei <arei.gonglei@huawei.com>
---
 hw/i386/kvm/pci-assign.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

Comments

Michael S. Tsirkin April 8, 2014, 3:32 p.m. UTC | #1
On Thu, Apr 03, 2014 at 01:18:24PM +0800, arei.gonglei@huawei.com wrote:
> From: Gonglei <arei.gonglei@huawei.com>
> 
> QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in
> assigned_dev_register_msix_mmio(), meanwhile the set the one
> page memmory to zero, so the rest memory will be random value
> (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio()
> maybe occur the issue of entry_nr > 256, and the kmod reports
> the EINVAL error.
> 
> Signed-off-by: Gonglei <arei.gonglei@huawei.com>

Okay so this kind of works because guest does not try
to use so many vectors.

But I think it's better not to give guest more entries
than we can actually support.

How about tweaking MSIX capability exposed to guest to limit table size?

> ---
>  hw/i386/kvm/pci-assign.c | 15 ++++++++++++---
>  1 file changed, 12 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
> index 570333f..d25a19e 100644
> --- a/hw/i386/kvm/pci-assign.c
> +++ b/hw/i386/kvm/pci-assign.c
> @@ -37,6 +37,7 @@
>  #include "hw/pci/pci.h"
>  #include "hw/pci/msi.h"
>  #include "kvm_i386.h"
> +#include "qemu/osdep.h"
>  
>  #define MSIX_PAGE_SIZE 0x1000
>  
> @@ -59,6 +60,9 @@
>  #define DEBUG(fmt, ...)
>  #endif
>  
> +/* the msix-table size readed from pci device config */
> +static int msix_table_size;
> +
>  typedef struct PCIRegion {
>      int type;           /* Memory or port I/O */
>      int valid;
> @@ -1604,7 +1608,12 @@ static void assigned_dev_msix_reset(AssignedDevice *dev)
>  
>  static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
>  {
> -    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
> +    msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct MSIXTableEntry),
> +                                MSIX_PAGE_SIZE);
> +
> +    DEBUG("msix_table_size: 0x%x\n", msix_table_size);
> +
> +    dev->msix_table = mmap(NULL, msix_table_size, PROT_READ|PROT_WRITE,
>                             MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
>      if (dev->msix_table == MAP_FAILED) {
>          error_report("fail allocate msix_table! %s", strerror(errno));
> @@ -1615,7 +1624,7 @@ static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
>      assigned_dev_msix_reset(dev);
>  
>      memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops,
> -                          dev, "assigned-dev-msix", MSIX_PAGE_SIZE);
> +                          dev, "assigned-dev-msix", msix_table_size);
>      return 0;
>  }
>  
> @@ -1627,7 +1636,7 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
>  
>      memory_region_destroy(&dev->mmio);
>  
> -    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
> +    if (munmap(dev->msix_table, msix_table_size) == -1) {
>          error_report("error unmapping msix_table! %s", strerror(errno));
>      }
>      dev->msix_table = NULL;
> -- 
> 1.7.12.4
> 
>
Gonglei (Arei) April 9, 2014, 10:56 a.m. UTC | #2
Hi,

> > QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in
> > assigned_dev_register_msix_mmio(), meanwhile the set the one
> > page memmory to zero, so the rest memory will be random value
> > (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio()
> > maybe occur the issue of entry_nr > 256, and the kmod reports
> > the EINVAL error.
> >
> > Signed-off-by: Gonglei <arei.gonglei@huawei.com>
> 
> Okay so this kind of works because guest does not try
> to use so many vectors.
> 
Yes. It's true.

> But I think it's better not to give guest more entries
> than we can actually support.
> 
> How about tweaking MSIX capability exposed to guest to limit table size?
> 
IMHO, the MSIX table entry size got form the physic NIC hardware. The software
layer shouldn't tweak the capability of real hardware, neither QEMU nor KVM. 


Best regards,
-Gonglei

> > ---
> >  hw/i386/kvm/pci-assign.c | 15 ++++++++++++---
> >  1 file changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
> > index 570333f..d25a19e 100644
> > --- a/hw/i386/kvm/pci-assign.c
> > +++ b/hw/i386/kvm/pci-assign.c
> > @@ -37,6 +37,7 @@
> >  #include "hw/pci/pci.h"
> >  #include "hw/pci/msi.h"
> >  #include "kvm_i386.h"
> > +#include "qemu/osdep.h"
> >
> >  #define MSIX_PAGE_SIZE 0x1000
> >
> > @@ -59,6 +60,9 @@
> >  #define DEBUG(fmt, ...)
> >  #endif
> >
> > +/* the msix-table size readed from pci device config */
> > +static int msix_table_size;
> > +
> >  typedef struct PCIRegion {
> >      int type;           /* Memory or port I/O */
> >      int valid;
> > @@ -1604,7 +1608,12 @@ static void
> assigned_dev_msix_reset(AssignedDevice *dev)
> >
> >  static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
> >  {
> > -    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE,
> PROT_READ|PROT_WRITE,
> > +    msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct
> MSIXTableEntry),
> > +                                MSIX_PAGE_SIZE);
> > +
> > +    DEBUG("msix_table_size: 0x%x\n", msix_table_size);
> > +
> > +    dev->msix_table = mmap(NULL, msix_table_size,
> PROT_READ|PROT_WRITE,
> >                             MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
> >      if (dev->msix_table == MAP_FAILED) {
> >          error_report("fail allocate msix_table! %s", strerror(errno));
> > @@ -1615,7 +1624,7 @@ static int
> assigned_dev_register_msix_mmio(AssignedDevice *dev)
> >      assigned_dev_msix_reset(dev);
> >
> >      memory_region_init_io(&dev->mmio, OBJECT(dev),
> &assigned_dev_msix_mmio_ops,
> > -                          dev, "assigned-dev-msix",
> MSIX_PAGE_SIZE);
> > +                          dev, "assigned-dev-msix", msix_table_size);
> >      return 0;
> >  }
> >
> > @@ -1627,7 +1636,7 @@ static void
> assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
> >
> >      memory_region_destroy(&dev->mmio);
> >
> > -    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
> > +    if (munmap(dev->msix_table, msix_table_size) == -1) {
> >          error_report("error unmapping msix_table! %s", strerror(errno));
> >      }
> >      dev->msix_table = NULL;
> > --
> > 1.7.12.4
> >
> >
Laszlo Ersek April 9, 2014, 2:12 p.m. UTC | #3
On 04/03/14 07:18, arei.gonglei@huawei.com wrote:
> From: Gonglei <arei.gonglei@huawei.com>
> 
> QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in
> assigned_dev_register_msix_mmio(), meanwhile the set the one
> page memmory to zero, so the rest memory will be random value
> (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio()
> maybe occur the issue of entry_nr > 256, and the kmod reports
> the EINVAL error.
> 
> Signed-off-by: Gonglei <arei.gonglei@huawei.com>
> ---
>  hw/i386/kvm/pci-assign.c | 15 ++++++++++++---
>  1 file changed, 12 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
> index 570333f..d25a19e 100644
> --- a/hw/i386/kvm/pci-assign.c
> +++ b/hw/i386/kvm/pci-assign.c
> @@ -37,6 +37,7 @@
>  #include "hw/pci/pci.h"
>  #include "hw/pci/msi.h"
>  #include "kvm_i386.h"
> +#include "qemu/osdep.h"
>  
>  #define MSIX_PAGE_SIZE 0x1000
>  
> @@ -59,6 +60,9 @@
>  #define DEBUG(fmt, ...)
>  #endif
>  
> +/* the msix-table size readed from pci device config */
> +static int msix_table_size;
> +
>  typedef struct PCIRegion {
>      int type;           /* Memory or port I/O */
>      int valid;
> @@ -1604,7 +1608,12 @@ static void assigned_dev_msix_reset(AssignedDevice *dev)
>  
>  static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
>  {
> -    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
> +    msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct MSIXTableEntry),
> +                                MSIX_PAGE_SIZE);
> +
> +    DEBUG("msix_table_size: 0x%x\n", msix_table_size);
> +
> +    dev->msix_table = mmap(NULL, msix_table_size, PROT_READ|PROT_WRITE,
>                             MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
>      if (dev->msix_table == MAP_FAILED) {
>          error_report("fail allocate msix_table! %s", strerror(errno));
> @@ -1615,7 +1624,7 @@ static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
>      assigned_dev_msix_reset(dev);
>  
>      memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops,
> -                          dev, "assigned-dev-msix", MSIX_PAGE_SIZE);
> +                          dev, "assigned-dev-msix", msix_table_size);
>      return 0;
>  }
>  
> @@ -1627,7 +1636,7 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
>  
>      memory_region_destroy(&dev->mmio);
>  
> -    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
> +    if (munmap(dev->msix_table, msix_table_size) == -1) {
>          error_report("error unmapping msix_table! %s", strerror(errno));
>      }
>      dev->msix_table = NULL;
> 

My only interest in this patch is RHBZ 616415. Namely, I have to improve
the propagation of human-readable errors in PCI device assignment
(through QMP as well). This patchset has hunks that look capable of
conflicting with my work (not started yet), so I think maybe I should
delay my work until this patchset is hashed out and applied.

Anyway, I just don't want to wait, so here are some review comments (I'm
not a PCI assignment expert by any means!):

(1) The patch introduces "msix_table_size" as an object with static
storage duration (ie. as a "global variable"). This is wrong; different
passthru devices will have different MSI-X table sizes. The patch causes
breakage as soon as two devices with different (rounded up) table sizes
are assigned, and then at least one of them is unassigned (because
assigned_dev_unregister_msix_mmio() will unmap either too much or too
little).

There are two ways to fix this:
- Make "msix_table_size" a *sibling* field of "msix_table" and
"msix_max", in struct AssignedDevice. Or,
- Recompute the value of "msix_table_size" (to be introduced as a local
variable in all relevant functions) from "dev->msix_max" each time you
need it.

(2) The other problem is that not all uses of MSIX_PAGE_SIZE are
updated. Namely, assigned_dev_msix_reset() clears the first page, then
overwrites the first dev->msix_table entries (masking them). However, if
we've allocated at least two pages due to *inexact* division (ie.
rounding up), then the trailing portion of the last page continues to
contain garbage.

Now, this
(a) either matters to KVM (because it wants to have sensible values in
*all* entries that fit into the pages that it sees), or
(b) it doesn't (because KVM complies with "entries_nr" in
assigned_dev_update_msix_mmio(), which is bounded by "adev->msix_max").

I don't know which one of (a) or (b) is the case. But, the code seems
incorrect (or at least misleading) in both cases:

In case (a), the memset() in assigned_dev_msix_reset() should be updated
so that it covers the entire allocation (all full pages, including the
last one).

In case (b), the memset() should be dropped from
assigned_dev_msix_reset(), because the loop just beneath it will
populate the entire set that KVM will look at.

Thanks
Laszlo
Gonglei (Arei) April 10, 2014, 2:05 a.m. UTC | #4
> On 04/03/14 07:18, arei.gonglei@huawei.com wrote:
> > From: Gonglei <arei.gonglei@huawei.com>
> >
> > QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in
> > assigned_dev_register_msix_mmio(), meanwhile the set the one
> > page memmory to zero, so the rest memory will be random value
> > (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio()
> > maybe occur the issue of entry_nr > 256, and the kmod reports
> > the EINVAL error.
> >
> > Signed-off-by: Gonglei <arei.gonglei@huawei.com>
> > ---
> >  hw/i386/kvm/pci-assign.c | 15 ++++++++++++---
> >  1 file changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
> > index 570333f..d25a19e 100644
> > --- a/hw/i386/kvm/pci-assign.c
> > +++ b/hw/i386/kvm/pci-assign.c
> > @@ -37,6 +37,7 @@
> >  #include "hw/pci/pci.h"
> >  #include "hw/pci/msi.h"
> >  #include "kvm_i386.h"
> > +#include "qemu/osdep.h"
> >
> >  #define MSIX_PAGE_SIZE 0x1000
> >
> > @@ -59,6 +60,9 @@
> >  #define DEBUG(fmt, ...)
> >  #endif
> >
> > +/* the msix-table size readed from pci device config */
> > +static int msix_table_size;
> > +
> >  typedef struct PCIRegion {
> >      int type;           /* Memory or port I/O */
> >      int valid;
> > @@ -1604,7 +1608,12 @@ static void
> assigned_dev_msix_reset(AssignedDevice *dev)
> >
> >  static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
> >  {
> > -    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE,
> PROT_READ|PROT_WRITE,
> > +    msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct
> MSIXTableEntry),
> > +                                MSIX_PAGE_SIZE);
> > +
> > +    DEBUG("msix_table_size: 0x%x\n", msix_table_size);
> > +
> > +    dev->msix_table = mmap(NULL, msix_table_size,
> PROT_READ|PROT_WRITE,
> >                             MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
> >      if (dev->msix_table == MAP_FAILED) {
> >          error_report("fail allocate msix_table! %s", strerror(errno));
> > @@ -1615,7 +1624,7 @@ static int
> assigned_dev_register_msix_mmio(AssignedDevice *dev)
> >      assigned_dev_msix_reset(dev);
> >
> >      memory_region_init_io(&dev->mmio, OBJECT(dev),
> &assigned_dev_msix_mmio_ops,
> > -                          dev, "assigned-dev-msix",
> MSIX_PAGE_SIZE);
> > +                          dev, "assigned-dev-msix", msix_table_size);
> >      return 0;
> >  }
> >
> > @@ -1627,7 +1636,7 @@ static void
> assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
> >
> >      memory_region_destroy(&dev->mmio);
> >
> > -    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
> > +    if (munmap(dev->msix_table, msix_table_size) == -1) {
> >          error_report("error unmapping msix_table! %s", strerror(errno));
> >      }
> >      dev->msix_table = NULL;
> >
> 
> My only interest in this patch is RHBZ 616415. Namely, I have to improve
> the propagation of human-readable errors in PCI device assignment
> (through QMP as well). This patchset has hunks that look capable of
> conflicting with my work (not started yet), so I think maybe I should
> delay my work until this patchset is hashed out and applied.
> 
> Anyway, I just don't want to wait, so here are some review comments (I'm
> not a PCI assignment expert by any means!):
> 
> (1) The patch introduces "msix_table_size" as an object with static
> storage duration (ie. as a "global variable"). This is wrong; different
> passthru devices will have different MSI-X table sizes. The patch causes
> breakage as soon as two devices with different (rounded up) table sizes
> are assigned, and then at least one of them is unassigned (because
> assigned_dev_unregister_msix_mmio() will unmap either too much or too
> little).
> 
Good catch. Thanks!

> There are two ways to fix this:
> - Make "msix_table_size" a *sibling* field of "msix_table" and
> "msix_max", in struct AssignedDevice. Or,
> - Recompute the value of "msix_table_size" (to be introduced as a local
> variable in all relevant functions) from "dev->msix_max" each time you
> need it.
> 
IMHO, the second way is better.

> (2) The other problem is that not all uses of MSIX_PAGE_SIZE are
> updated. Namely, assigned_dev_msix_reset() clears the first page, then
> overwrites the first dev->msix_table entries (masking them). However, if
> we've allocated at least two pages due to *inexact* division (ie.
> rounding up), then the trailing portion of the last page continues to
> contain garbage.
> 
Yep, this is my fault.

> Now, this
> (a) either matters to KVM (because it wants to have sensible values in
> *all* entries that fit into the pages that it sees), or
> (b) it doesn't (because KVM complies with "entries_nr" in
> assigned_dev_update_msix_mmio(), which is bounded by "adev->msix_max").
> 
(b) is not correct when the msix table size > MSIX_PAGE_SIZE. The "entries_nr"
maybe greater than 256 for the last garbage page.

> I don't know which one of (a) or (b) is the case. But, the code seems
> incorrect (or at least misleading) in both cases:
> 
> In case (a), the memset() in assigned_dev_msix_reset() should be updated
> so that it covers the entire allocation (all full pages, including the
> last one).
> 
Agreed.

> In case (b), the memset() should be dropped from
> assigned_dev_msix_reset(), because the loop just beneath it will
> populate the entire set that KVM will look at.
> 
> Thanks
> Laszlo

Best regards,
-Gonglei
diff mbox

Patch

diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
index 570333f..d25a19e 100644
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -37,6 +37,7 @@ 
 #include "hw/pci/pci.h"
 #include "hw/pci/msi.h"
 #include "kvm_i386.h"
+#include "qemu/osdep.h"
 
 #define MSIX_PAGE_SIZE 0x1000
 
@@ -59,6 +60,9 @@ 
 #define DEBUG(fmt, ...)
 #endif
 
+/* the msix-table size readed from pci device config */
+static int msix_table_size;
+
 typedef struct PCIRegion {
     int type;           /* Memory or port I/O */
     int valid;
@@ -1604,7 +1608,12 @@  static void assigned_dev_msix_reset(AssignedDevice *dev)
 
 static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
 {
-    dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE,
+    msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct MSIXTableEntry),
+                                MSIX_PAGE_SIZE);
+
+    DEBUG("msix_table_size: 0x%x\n", msix_table_size);
+
+    dev->msix_table = mmap(NULL, msix_table_size, PROT_READ|PROT_WRITE,
                            MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
     if (dev->msix_table == MAP_FAILED) {
         error_report("fail allocate msix_table! %s", strerror(errno));
@@ -1615,7 +1624,7 @@  static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
     assigned_dev_msix_reset(dev);
 
     memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops,
-                          dev, "assigned-dev-msix", MSIX_PAGE_SIZE);
+                          dev, "assigned-dev-msix", msix_table_size);
     return 0;
 }
 
@@ -1627,7 +1636,7 @@  static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
 
     memory_region_destroy(&dev->mmio);
 
-    if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) {
+    if (munmap(dev->msix_table, msix_table_size) == -1) {
         error_report("error unmapping msix_table! %s", strerror(errno));
     }
     dev->msix_table = NULL;