Message ID | 1396502304-7456-2-git-send-email-arei.gonglei@huawei.com |
---|---|
State | New |
Headers | show |
On Thu, Apr 03, 2014 at 01:18:24PM +0800, arei.gonglei@huawei.com wrote: > From: Gonglei <arei.gonglei@huawei.com> > > QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in > assigned_dev_register_msix_mmio(), meanwhile the set the one > page memmory to zero, so the rest memory will be random value > (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio() > maybe occur the issue of entry_nr > 256, and the kmod reports > the EINVAL error. > > Signed-off-by: Gonglei <arei.gonglei@huawei.com> Okay so this kind of works because guest does not try to use so many vectors. But I think it's better not to give guest more entries than we can actually support. How about tweaking MSIX capability exposed to guest to limit table size? > --- > hw/i386/kvm/pci-assign.c | 15 ++++++++++++--- > 1 file changed, 12 insertions(+), 3 deletions(-) > > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c > index 570333f..d25a19e 100644 > --- a/hw/i386/kvm/pci-assign.c > +++ b/hw/i386/kvm/pci-assign.c > @@ -37,6 +37,7 @@ > #include "hw/pci/pci.h" > #include "hw/pci/msi.h" > #include "kvm_i386.h" > +#include "qemu/osdep.h" > > #define MSIX_PAGE_SIZE 0x1000 > > @@ -59,6 +60,9 @@ > #define DEBUG(fmt, ...) > #endif > > +/* the msix-table size readed from pci device config */ > +static int msix_table_size; > + > typedef struct PCIRegion { > int type; /* Memory or port I/O */ > int valid; > @@ -1604,7 +1608,12 @@ static void assigned_dev_msix_reset(AssignedDevice *dev) > > static int assigned_dev_register_msix_mmio(AssignedDevice *dev) > { > - dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE, > + msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct MSIXTableEntry), > + MSIX_PAGE_SIZE); > + > + DEBUG("msix_table_size: 0x%x\n", msix_table_size); > + > + dev->msix_table = mmap(NULL, msix_table_size, PROT_READ|PROT_WRITE, > MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); > if (dev->msix_table == MAP_FAILED) { > error_report("fail allocate msix_table! %s", strerror(errno)); > @@ -1615,7 +1624,7 @@ static int assigned_dev_register_msix_mmio(AssignedDevice *dev) > assigned_dev_msix_reset(dev); > > memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops, > - dev, "assigned-dev-msix", MSIX_PAGE_SIZE); > + dev, "assigned-dev-msix", msix_table_size); > return 0; > } > > @@ -1627,7 +1636,7 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) > > memory_region_destroy(&dev->mmio); > > - if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) { > + if (munmap(dev->msix_table, msix_table_size) == -1) { > error_report("error unmapping msix_table! %s", strerror(errno)); > } > dev->msix_table = NULL; > -- > 1.7.12.4 > >
Hi, > > QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in > > assigned_dev_register_msix_mmio(), meanwhile the set the one > > page memmory to zero, so the rest memory will be random value > > (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio() > > maybe occur the issue of entry_nr > 256, and the kmod reports > > the EINVAL error. > > > > Signed-off-by: Gonglei <arei.gonglei@huawei.com> > > Okay so this kind of works because guest does not try > to use so many vectors. > Yes. It's true. > But I think it's better not to give guest more entries > than we can actually support. > > How about tweaking MSIX capability exposed to guest to limit table size? > IMHO, the MSIX table entry size got form the physic NIC hardware. The software layer shouldn't tweak the capability of real hardware, neither QEMU nor KVM. Best regards, -Gonglei > > --- > > hw/i386/kvm/pci-assign.c | 15 ++++++++++++--- > > 1 file changed, 12 insertions(+), 3 deletions(-) > > > > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c > > index 570333f..d25a19e 100644 > > --- a/hw/i386/kvm/pci-assign.c > > +++ b/hw/i386/kvm/pci-assign.c > > @@ -37,6 +37,7 @@ > > #include "hw/pci/pci.h" > > #include "hw/pci/msi.h" > > #include "kvm_i386.h" > > +#include "qemu/osdep.h" > > > > #define MSIX_PAGE_SIZE 0x1000 > > > > @@ -59,6 +60,9 @@ > > #define DEBUG(fmt, ...) > > #endif > > > > +/* the msix-table size readed from pci device config */ > > +static int msix_table_size; > > + > > typedef struct PCIRegion { > > int type; /* Memory or port I/O */ > > int valid; > > @@ -1604,7 +1608,12 @@ static void > assigned_dev_msix_reset(AssignedDevice *dev) > > > > static int assigned_dev_register_msix_mmio(AssignedDevice *dev) > > { > > - dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, > PROT_READ|PROT_WRITE, > > + msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct > MSIXTableEntry), > > + MSIX_PAGE_SIZE); > > + > > + DEBUG("msix_table_size: 0x%x\n", msix_table_size); > > + > > + dev->msix_table = mmap(NULL, msix_table_size, > PROT_READ|PROT_WRITE, > > MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); > > if (dev->msix_table == MAP_FAILED) { > > error_report("fail allocate msix_table! %s", strerror(errno)); > > @@ -1615,7 +1624,7 @@ static int > assigned_dev_register_msix_mmio(AssignedDevice *dev) > > assigned_dev_msix_reset(dev); > > > > memory_region_init_io(&dev->mmio, OBJECT(dev), > &assigned_dev_msix_mmio_ops, > > - dev, "assigned-dev-msix", > MSIX_PAGE_SIZE); > > + dev, "assigned-dev-msix", msix_table_size); > > return 0; > > } > > > > @@ -1627,7 +1636,7 @@ static void > assigned_dev_unregister_msix_mmio(AssignedDevice *dev) > > > > memory_region_destroy(&dev->mmio); > > > > - if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) { > > + if (munmap(dev->msix_table, msix_table_size) == -1) { > > error_report("error unmapping msix_table! %s", strerror(errno)); > > } > > dev->msix_table = NULL; > > -- > > 1.7.12.4 > > > >
On 04/03/14 07:18, arei.gonglei@huawei.com wrote: > From: Gonglei <arei.gonglei@huawei.com> > > QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in > assigned_dev_register_msix_mmio(), meanwhile the set the one > page memmory to zero, so the rest memory will be random value > (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio() > maybe occur the issue of entry_nr > 256, and the kmod reports > the EINVAL error. > > Signed-off-by: Gonglei <arei.gonglei@huawei.com> > --- > hw/i386/kvm/pci-assign.c | 15 ++++++++++++--- > 1 file changed, 12 insertions(+), 3 deletions(-) > > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c > index 570333f..d25a19e 100644 > --- a/hw/i386/kvm/pci-assign.c > +++ b/hw/i386/kvm/pci-assign.c > @@ -37,6 +37,7 @@ > #include "hw/pci/pci.h" > #include "hw/pci/msi.h" > #include "kvm_i386.h" > +#include "qemu/osdep.h" > > #define MSIX_PAGE_SIZE 0x1000 > > @@ -59,6 +60,9 @@ > #define DEBUG(fmt, ...) > #endif > > +/* the msix-table size readed from pci device config */ > +static int msix_table_size; > + > typedef struct PCIRegion { > int type; /* Memory or port I/O */ > int valid; > @@ -1604,7 +1608,12 @@ static void assigned_dev_msix_reset(AssignedDevice *dev) > > static int assigned_dev_register_msix_mmio(AssignedDevice *dev) > { > - dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE, > + msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct MSIXTableEntry), > + MSIX_PAGE_SIZE); > + > + DEBUG("msix_table_size: 0x%x\n", msix_table_size); > + > + dev->msix_table = mmap(NULL, msix_table_size, PROT_READ|PROT_WRITE, > MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); > if (dev->msix_table == MAP_FAILED) { > error_report("fail allocate msix_table! %s", strerror(errno)); > @@ -1615,7 +1624,7 @@ static int assigned_dev_register_msix_mmio(AssignedDevice *dev) > assigned_dev_msix_reset(dev); > > memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops, > - dev, "assigned-dev-msix", MSIX_PAGE_SIZE); > + dev, "assigned-dev-msix", msix_table_size); > return 0; > } > > @@ -1627,7 +1636,7 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) > > memory_region_destroy(&dev->mmio); > > - if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) { > + if (munmap(dev->msix_table, msix_table_size) == -1) { > error_report("error unmapping msix_table! %s", strerror(errno)); > } > dev->msix_table = NULL; > My only interest in this patch is RHBZ 616415. Namely, I have to improve the propagation of human-readable errors in PCI device assignment (through QMP as well). This patchset has hunks that look capable of conflicting with my work (not started yet), so I think maybe I should delay my work until this patchset is hashed out and applied. Anyway, I just don't want to wait, so here are some review comments (I'm not a PCI assignment expert by any means!): (1) The patch introduces "msix_table_size" as an object with static storage duration (ie. as a "global variable"). This is wrong; different passthru devices will have different MSI-X table sizes. The patch causes breakage as soon as two devices with different (rounded up) table sizes are assigned, and then at least one of them is unassigned (because assigned_dev_unregister_msix_mmio() will unmap either too much or too little). There are two ways to fix this: - Make "msix_table_size" a *sibling* field of "msix_table" and "msix_max", in struct AssignedDevice. Or, - Recompute the value of "msix_table_size" (to be introduced as a local variable in all relevant functions) from "dev->msix_max" each time you need it. (2) The other problem is that not all uses of MSIX_PAGE_SIZE are updated. Namely, assigned_dev_msix_reset() clears the first page, then overwrites the first dev->msix_table entries (masking them). However, if we've allocated at least two pages due to *inexact* division (ie. rounding up), then the trailing portion of the last page continues to contain garbage. Now, this (a) either matters to KVM (because it wants to have sensible values in *all* entries that fit into the pages that it sees), or (b) it doesn't (because KVM complies with "entries_nr" in assigned_dev_update_msix_mmio(), which is bounded by "adev->msix_max"). I don't know which one of (a) or (b) is the case. But, the code seems incorrect (or at least misleading) in both cases: In case (a), the memset() in assigned_dev_msix_reset() should be updated so that it covers the entire allocation (all full pages, including the last one). In case (b), the memset() should be dropped from assigned_dev_msix_reset(), because the loop just beneath it will populate the entire set that KVM will look at. Thanks Laszlo
> On 04/03/14 07:18, arei.gonglei@huawei.com wrote: > > From: Gonglei <arei.gonglei@huawei.com> > > > > QEMU only mmap MSIX_PAGE_SIZE memory for all pci devices in > > assigned_dev_register_msix_mmio(), meanwhile the set the one > > page memmory to zero, so the rest memory will be random value > > (maybe etnry.data is not 0). In the assigned_dev_update_msix_mmio() > > maybe occur the issue of entry_nr > 256, and the kmod reports > > the EINVAL error. > > > > Signed-off-by: Gonglei <arei.gonglei@huawei.com> > > --- > > hw/i386/kvm/pci-assign.c | 15 ++++++++++++--- > > 1 file changed, 12 insertions(+), 3 deletions(-) > > > > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c > > index 570333f..d25a19e 100644 > > --- a/hw/i386/kvm/pci-assign.c > > +++ b/hw/i386/kvm/pci-assign.c > > @@ -37,6 +37,7 @@ > > #include "hw/pci/pci.h" > > #include "hw/pci/msi.h" > > #include "kvm_i386.h" > > +#include "qemu/osdep.h" > > > > #define MSIX_PAGE_SIZE 0x1000 > > > > @@ -59,6 +60,9 @@ > > #define DEBUG(fmt, ...) > > #endif > > > > +/* the msix-table size readed from pci device config */ > > +static int msix_table_size; > > + > > typedef struct PCIRegion { > > int type; /* Memory or port I/O */ > > int valid; > > @@ -1604,7 +1608,12 @@ static void > assigned_dev_msix_reset(AssignedDevice *dev) > > > > static int assigned_dev_register_msix_mmio(AssignedDevice *dev) > > { > > - dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, > PROT_READ|PROT_WRITE, > > + msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct > MSIXTableEntry), > > + MSIX_PAGE_SIZE); > > + > > + DEBUG("msix_table_size: 0x%x\n", msix_table_size); > > + > > + dev->msix_table = mmap(NULL, msix_table_size, > PROT_READ|PROT_WRITE, > > MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); > > if (dev->msix_table == MAP_FAILED) { > > error_report("fail allocate msix_table! %s", strerror(errno)); > > @@ -1615,7 +1624,7 @@ static int > assigned_dev_register_msix_mmio(AssignedDevice *dev) > > assigned_dev_msix_reset(dev); > > > > memory_region_init_io(&dev->mmio, OBJECT(dev), > &assigned_dev_msix_mmio_ops, > > - dev, "assigned-dev-msix", > MSIX_PAGE_SIZE); > > + dev, "assigned-dev-msix", msix_table_size); > > return 0; > > } > > > > @@ -1627,7 +1636,7 @@ static void > assigned_dev_unregister_msix_mmio(AssignedDevice *dev) > > > > memory_region_destroy(&dev->mmio); > > > > - if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) { > > + if (munmap(dev->msix_table, msix_table_size) == -1) { > > error_report("error unmapping msix_table! %s", strerror(errno)); > > } > > dev->msix_table = NULL; > > > > My only interest in this patch is RHBZ 616415. Namely, I have to improve > the propagation of human-readable errors in PCI device assignment > (through QMP as well). This patchset has hunks that look capable of > conflicting with my work (not started yet), so I think maybe I should > delay my work until this patchset is hashed out and applied. > > Anyway, I just don't want to wait, so here are some review comments (I'm > not a PCI assignment expert by any means!): > > (1) The patch introduces "msix_table_size" as an object with static > storage duration (ie. as a "global variable"). This is wrong; different > passthru devices will have different MSI-X table sizes. The patch causes > breakage as soon as two devices with different (rounded up) table sizes > are assigned, and then at least one of them is unassigned (because > assigned_dev_unregister_msix_mmio() will unmap either too much or too > little). > Good catch. Thanks! > There are two ways to fix this: > - Make "msix_table_size" a *sibling* field of "msix_table" and > "msix_max", in struct AssignedDevice. Or, > - Recompute the value of "msix_table_size" (to be introduced as a local > variable in all relevant functions) from "dev->msix_max" each time you > need it. > IMHO, the second way is better. > (2) The other problem is that not all uses of MSIX_PAGE_SIZE are > updated. Namely, assigned_dev_msix_reset() clears the first page, then > overwrites the first dev->msix_table entries (masking them). However, if > we've allocated at least two pages due to *inexact* division (ie. > rounding up), then the trailing portion of the last page continues to > contain garbage. > Yep, this is my fault. > Now, this > (a) either matters to KVM (because it wants to have sensible values in > *all* entries that fit into the pages that it sees), or > (b) it doesn't (because KVM complies with "entries_nr" in > assigned_dev_update_msix_mmio(), which is bounded by "adev->msix_max"). > (b) is not correct when the msix table size > MSIX_PAGE_SIZE. The "entries_nr" maybe greater than 256 for the last garbage page. > I don't know which one of (a) or (b) is the case. But, the code seems > incorrect (or at least misleading) in both cases: > > In case (a), the memset() in assigned_dev_msix_reset() should be updated > so that it covers the entire allocation (all full pages, including the > last one). > Agreed. > In case (b), the memset() should be dropped from > assigned_dev_msix_reset(), because the loop just beneath it will > populate the entire set that KVM will look at. > > Thanks > Laszlo Best regards, -Gonglei
diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c index 570333f..d25a19e 100644 --- a/hw/i386/kvm/pci-assign.c +++ b/hw/i386/kvm/pci-assign.c @@ -37,6 +37,7 @@ #include "hw/pci/pci.h" #include "hw/pci/msi.h" #include "kvm_i386.h" +#include "qemu/osdep.h" #define MSIX_PAGE_SIZE 0x1000 @@ -59,6 +60,9 @@ #define DEBUG(fmt, ...) #endif +/* the msix-table size readed from pci device config */ +static int msix_table_size; + typedef struct PCIRegion { int type; /* Memory or port I/O */ int valid; @@ -1604,7 +1608,12 @@ static void assigned_dev_msix_reset(AssignedDevice *dev) static int assigned_dev_register_msix_mmio(AssignedDevice *dev) { - dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE, + msix_table_size = ROUND_UP(dev->msix_max * sizeof(struct MSIXTableEntry), + MSIX_PAGE_SIZE); + + DEBUG("msix_table_size: 0x%x\n", msix_table_size); + + dev->msix_table = mmap(NULL, msix_table_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); if (dev->msix_table == MAP_FAILED) { error_report("fail allocate msix_table! %s", strerror(errno)); @@ -1615,7 +1624,7 @@ static int assigned_dev_register_msix_mmio(AssignedDevice *dev) assigned_dev_msix_reset(dev); memory_region_init_io(&dev->mmio, OBJECT(dev), &assigned_dev_msix_mmio_ops, - dev, "assigned-dev-msix", MSIX_PAGE_SIZE); + dev, "assigned-dev-msix", msix_table_size); return 0; } @@ -1627,7 +1636,7 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) memory_region_destroy(&dev->mmio); - if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) { + if (munmap(dev->msix_table, msix_table_size) == -1) { error_report("error unmapping msix_table! %s", strerror(errno)); } dev->msix_table = NULL;