
[v4,4/7] pc: fix QEMU crashing when more than ~50 memory DIMMs are hotplugged

Message ID 1436442444-132020-5-git-send-email-imammedo@redhat.com
State New

Commit Message

Igor Mammedov July 9, 2015, 11:47 a.m. UTC
QEMU asserts in vhost due to hitting vhost backend limit
on number of supported memory regions.

Describe all hotplugged memory as one contiguous range
to vhost with linear 1:1 HVA->GPA mapping in backend.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/virtio/vhost.c         | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 include/hw/virtio/vhost.h |  1 +
 2 files changed, 45 insertions(+), 3 deletions(-)
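
For illustration (all addresses and names below are invented): with this change
the vhost backend no longer sees one vhost_memory_region per hotplugged DIMM
but a single region covering the whole hotplug-memory area, which works because
that area is backed by one contiguous HVA reservation with a linear 1:1
HVA->GPA mapping:

#include <linux/vhost.h>    /* struct vhost_memory_region */

/* hypothetical base HVA of the hotplug-memory area's single big mapping */
#define HOTPLUG_BASE_HVA  0x7f0000000000ULL
/* hypothetical GPA where the hotplug-memory area starts */
#define HOTPLUG_BASE_GPA  0x100000000ULL

/* before: one entry per hotplugged DIMM, which quickly hits the backend's
 * region limit once ~50 DIMMs are plugged in */
static const struct vhost_memory_region per_dimm[] = {
    { .guest_phys_addr = HOTPLUG_BASE_GPA + 0x00000000ULL,
      .memory_size     = 256ULL << 20,
      .userspace_addr  = HOTPLUG_BASE_HVA + 0x00000000ULL },
    { .guest_phys_addr = HOTPLUG_BASE_GPA + 0x10000000ULL,
      .memory_size     = 256ULL << 20,
      .userspace_addr  = HOTPLUG_BASE_HVA + 0x10000000ULL },
    /* ... dozens more ... */
};

/* after: the whole hotplug-memory area as one region; for any GPA inside it,
 * HVA = HOTPLUG_BASE_HVA + (GPA - HOTPLUG_BASE_GPA) */
static const struct vhost_memory_region collapsed = {
    .guest_phys_addr = HOTPLUG_BASE_GPA,
    .memory_size     = 64ULL << 30,        /* e.g. maxmem minus initial RAM */
    .userspace_addr  = HOTPLUG_BASE_HVA,
};

Only the table handed to the backend is collapsed like this; QEMU's internal
per-section bookkeeping keeps the exact layout.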

Comments

Michael S. Tsirkin July 9, 2015, 1:06 p.m. UTC | #1
On Thu, Jul 09, 2015 at 01:47:21PM +0200, Igor Mammedov wrote:
> QEMU asserts in vhost due to hitting vhost backend limit
> on number of supported memory regions.
> 
> Describe all hotplugged memory as one contiguous range
> to vhost with linear 1:1 HVA->GPA mapping in backend.
> 
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>

Hmm - a bunch of work here to recombine MRs that memory listener
interface breaks up.  In particular KVM could benefit from this too (on
workloads that change the table a lot).  Can't we teach memory core to
pass hva range as a single continuous range to memory listeners?

> ---
>  hw/virtio/vhost.c         | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>  include/hw/virtio/vhost.h |  1 +
>  2 files changed, 45 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 2712c6f..7bc27f0 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -432,6 +432,10 @@ static void vhost_set_memory(MemoryListener *listener,
>  
>      assert(size);
>  
> +    if (!dev->rsvd_hva.mr) {
> +        dev->rsvd_hva = memory_region_find_hva_range(section->mr);
> +    }
> +
>      /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
>      ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
>      if (add) {
> @@ -472,6 +476,42 @@ static void vhost_begin(MemoryListener *listener)
>      dev->mem_changed_start_addr = -1;
>  }
>  
> +static int vhost_set_mem_table(struct vhost_dev *dev)
> +{
> +    hwaddr start_addr = 0;
> +    ram_addr_t size = 0;
> +    struct vhost_memory *mem;
> +    int r, i;
> +
> +    /* drop memory ranges from the contiguous HVA range */
> +    mem = g_memdup(dev->mem, offsetof(struct vhost_memory, regions) +
> +                       dev->mem->nregions * sizeof dev->mem->regions[0]);
> +    start_addr = dev->rsvd_hva.offset_within_address_space;
> +    size = int128_get64(dev->rsvd_hva.size);
> +    for (i = 0; i < mem->nregions; i++) {
> +        if (mem->regions[i].guest_phys_addr >= start_addr &&
> +            mem->regions[i].guest_phys_addr < start_addr + size) {
> +            mem->nregions--;
> +            memmove(&mem->regions[i], &mem->regions[i + 1],
> +                    (mem->nregions - i) * sizeof mem->regions[0]);
> +        }
> +    }
> +    /* add one contiguous HVA entry if memory ranges from it are present */
> +    if (dev->mem->nregions > mem->nregions) {
> +        struct vhost_memory_region *reg = &mem->regions[mem->nregions];
> +
> +        reg->guest_phys_addr = start_addr;
> +        reg->memory_size = size;
> +        reg->userspace_addr =
> +            (__u64)memory_region_get_ram_ptr(dev->rsvd_hva.mr);
> +        mem->nregions++;
> +    }
> +
> +    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, mem);
> +    g_free(mem);
> +    return r;
> +}
> +
>  static void vhost_commit(MemoryListener *listener)
>  {
>      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
> @@ -500,7 +540,7 @@ static void vhost_commit(MemoryListener *listener)
>      }
>  
>      if (!dev->log_enabled) {
> -        r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
> +        r = vhost_set_mem_table(dev);
>          assert(r >= 0);
>          dev->memory_changed = false;
>          return;
> @@ -513,7 +553,7 @@ static void vhost_commit(MemoryListener *listener)
>      if (dev->log_size < log_size) {
>          vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
>      }
> -    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
> +    r = vhost_set_mem_table(dev);
>      assert(r >= 0);
>      /* To log less, can only decrease log size after table update. */
>      if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
> @@ -956,6 +996,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>          migrate_add_blocker(hdev->migration_blocker);
>      }
>      hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
> +    memset(&hdev->rsvd_hva, 0, sizeof hdev->rsvd_hva);
>      hdev->n_mem_sections = 0;
>      hdev->mem_sections = NULL;
>      hdev->log = NULL;
> @@ -1119,7 +1160,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
>      if (r < 0) {
>          goto fail_features;
>      }
> -    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem);
> +    r = vhost_set_mem_table(hdev);
>      if (r < 0) {
>          r = -errno;
>          goto fail_mem;
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index dd51050..d41bf2f 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -40,6 +40,7 @@ struct vhost_dev {
>      struct vhost_memory *mem;
>      int n_mem_sections;
>      MemoryRegionSection *mem_sections;
> +    MemoryRegionSection rsvd_hva;
>      struct vhost_virtqueue *vqs;
>      int nvqs;
>      /* the first virtqueue which would be used by this vhost dev */
> -- 
> 1.8.3.1
Paolo Bonzini July 9, 2015, 1:43 p.m. UTC | #2
On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > QEMU asserts in vhost due to hitting vhost backend limit
> > on number of supported memory regions.
> > 
> > Describe all hotplugged memory as one contiguous range
> > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > 
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
>
> Hmm - a bunch of work here to recombine MRs that memory listener
> interface breaks up.  In particular KVM could benefit from this too (on
> workloads that change the table a lot).  Can't we teach memory core to
> pass hva range as a single continuous range to memory listeners?

Memory listeners are based on memory regions, not HVA ranges.

Paolo
Michael S. Tsirkin July 9, 2015, 1:46 p.m. UTC | #3
On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> 
> 
> On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > QEMU asserts in vhost due to hitting vhost backend limit
> > > on number of supported memory regions.
> > > 
> > > Describe all hotplugged memory as one contiguous range
> > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > 
> > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> >
> > Hmm - a bunch of work here to recombine MRs that memory listener
> > interface breaks up.  In particular KVM could benefit from this too (on
> > workloads that change the table a lot).  Can't we teach memory core to
> > pass hva range as a single continuous range to memory listeners?
> 
> Memory listeners are based on memory regions, not HVA ranges.
> 
> Paolo

Many listeners care about HVA ranges. I know KVM and vhost do.
I guess we could create dummy MRs to fill in the holes left by
memory hotplug? vhost already has logic to recombine
consecutive chunks created by memory core.
Igor Mammedov July 10, 2015, 10:12 a.m. UTC | #4
On Thu, 9 Jul 2015 16:46:43 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> > 
> > 
> > On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > > QEMU asserts in vhost due to hitting vhost backend limit
> > > > on number of supported memory regions.
> > > > 
> > > > Describe all hotplugged memory as one contiguous range
> > > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > > 
> > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > >
> > > Hmm - a bunch of work here to recombine MRs that memory listener
> > > interface breaks up.  In particular KVM could benefit from this too (on
> > > workloads that change the table a lot).  Can't we teach memory core to
> > > pass hva range as a single continuous range to memory listeners?
> > 
> > Memory listeners are based on memory regions, not HVA ranges.
> > 
> > Paolo
> 
> Many listeners care about HVA ranges. I know KVM and vhost do.
I'm not sure about KVM; it works just fine with fragmented memory regions,
and the same will apply to vhost once the module parameter to increase the
limit is merged.

but changing the generic memory listener interface to replace HVA-mapped
regions with an HVA container would lead to a case where listeners
won't see the exact layout that they might need.

In addition, vhost itself will suffer from working with a big HVA range,
since it allocates the log depending on the size of memory => bigger log.
That's one of the reasons why, in this patch, the HVA ranges in the
memory map are compacted only for backend consumption, while
QEMU's side of vhost uses the exact map for internal purposes.
The other reason is that I don't know vhost well enough to rewrite it
to use the big HVA range for everything.

> I guess we could create dummy MRs to fill in the holes left by
> memory hotplug?
it looks like a nice thing from vhost's pov but complicates the other side,
hence I dislike the idea of inventing dummy MRs for vhost's convenience.


> vhost already has logic to recombine
> consecutive chunks created by memory core.
which looks a bit complicated and I was thinking about simplifying
it some time in the future.
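
For reference, the recombination in question boils down to merging sections
that are adjacent in both guest-physical and host-virtual space. A simplified
sketch with invented names (the real logic in hw/virtio/vhost.c additionally
handles overlapping sections and cascading merges):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct mem_chunk {
    uint64_t gpa;    /* guest-physical start */
    uint64_t hva;    /* host-virtual start   */
    uint64_t size;   /* length in bytes      */
};

/* true if 'b' starts exactly where 'a' ends, in both GPA and HVA space */
static bool chunks_adjacent(const struct mem_chunk *a, const struct mem_chunk *b)
{
    return a->gpa + a->size == b->gpa && a->hva + a->size == b->hva;
}

/* Add 'c' to tbl[0..n), gluing it onto an adjacent entry when possible, and
 * return the new entry count.  Caller guarantees room for one more entry.
 * (Simplified: a grown entry is not re-checked against the other entries.) */
static size_t add_chunk(struct mem_chunk *tbl, size_t n, struct mem_chunk c)
{
    for (size_t i = 0; i < n; i++) {
        if (chunks_adjacent(&tbl[i], &c)) {       /* append to tbl[i]  */
            tbl[i].size += c.size;
            return n;
        }
        if (chunks_adjacent(&c, &tbl[i])) {       /* prepend to tbl[i] */
            tbl[i].gpa = c.gpa;
            tbl[i].hva = c.hva;
            tbl[i].size += c.size;
            return n;
        }
    }
    tbl[n] = c;                                   /* no neighbour found */
    return n + 1;
}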
Michael S. Tsirkin July 13, 2015, 6:55 a.m. UTC | #5
On Fri, Jul 10, 2015 at 12:12:36PM +0200, Igor Mammedov wrote:
> On Thu, 9 Jul 2015 16:46:43 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> > > 
> > > 
> > > On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > > > QEMU asserts in vhost due to hitting vhost backend limit
> > > > > on number of supported memory regions.
> > > > > 
> > > > > Describe all hotplugged memory as one contiguous range
> > > > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > > > 
> > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > >
> > > > Hmm - a bunch of work here to recombine MRs that memory listener
> > > > interface breaks up.  In particular KVM could benefit from this too (on
> > > > workloads that change the table a lot).  Can't we teach memory core to
> > > > pass hva range as a single continuous range to memory listeners?
> > > 
> > > Memory listeners are based on memory regions, not HVA ranges.
> > > 
> > > Paolo
> > 
> > Many listeners care about HVA ranges. I know KVM and vhost do.
> I'm not sure about KVM, it works just fine with fragmented memory regions,
> the same will apply to vhost once module parameter to increase limit
> is merged.
> 
> but changing generic memory listener interface to replace HVA mapped
> regions with HVA container would lead to a case when listeners
> won't see exact layout that they might need.

I don't think they care, really.

> In addition vhost itself will suffer from working with big HVA
> since it allocates log depending on size of memory => bigger log.

Not really - it allocates the log depending on the PA range.
Leaving unused holes doesn't reduce its size.


> That's one of the reasons that in this patch HVA ranges in
> memory map are compacted only for backend consumption,
> QEMU's side of vhost uses exact map for internal purposes.
> And the other reason is I don't know vhost enough to rewrite it
> to use big HVA for everything.
> 
> > I guess we could create dummy MRs to fill in the holes left by
> > memory hotplug?
> it looks like a nice thing from vhost's pov but complicates the other side,

What other side do you have in mind?

> hence I dislike the idea of inventing dummy MRs for vhost's convenience.
> 
> 
> > vhost already has logic to recombine
> > consecutive chunks created by memory core.
> which looks a bit complicated and I was thinking about simplifying
> it some time in the future.
Igor Mammedov July 13, 2015, 6:55 p.m. UTC | #6
On Mon, 13 Jul 2015 09:55:18 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Fri, Jul 10, 2015 at 12:12:36PM +0200, Igor Mammedov wrote:
> > On Thu, 9 Jul 2015 16:46:43 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > > On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> > > > 
> > > > 
> > > > On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > > > > QEMU asserts in vhost due to hitting vhost backend limit
> > > > > > on number of supported memory regions.
> > > > > > 
> > > > > > Describe all hotplugged memory as one contiguous range
> > > > > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > > > > 
> > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > >
> > > > > Hmm - a bunch of work here to recombine MRs that memory
> > > > > listener interface breaks up.  In particular KVM could
> > > > > benefit from this too (on workloads that change the table a
> > > > > lot).  Can't we teach memory core to pass hva range as a
> > > > > single continuous range to memory listeners?
> > > > 
> > > > Memory listeners are based on memory regions, not HVA ranges.
> > > > 
> > > > Paolo
> > > 
> > > Many listeners care about HVA ranges. I know KVM and vhost do.
> > I'm not sure about KVM, it works just fine with fragmented memory
> > regions, the same will apply to vhost once module parameter to
> > increase limit is merged.
> > 
> > but changing generic memory listener interface to replace HVA mapped
> > regions with HVA container would lead to a case when listeners
> > won't see exact layout that they might need.
> 
> I don't think they care, really.
> 
> > In addition vhost itself will suffer from working with big HVA
> > since it allocates log depending on size of memory => bigger log.
> 
> Not really - it allocates the log depending on the PA range.
> Leaving unused holes doesn't reduce its size.
if it used the HVA container instead, then it would always allocate a
log for the max possible GPA, meaning that -m 1024,maxmem=1T would waste
a lot of memory, and more so for bigger maxmem.
It's still possible to induce the worst case by plugging a pc-dimm at the end
of the hotplug-memory area by specifying its address explicitly.
That problem has existed since memory hot-add was introduced; I just
hadn't noticed it back then.

It's perfectly fine to allocate the log by the last GPA as long as
memory is nearly contiguous, but memory hot-add makes it possible to
have a sparse layout with huge gaps between guest-mapped RAM,
which makes the current log handling inefficient.

I wonder how hard it would be to make log_size depend on the present RAM
size rather than the max present GPA, so it wouldn't allocate excess
memory for the log.
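
Roughly, the two sizing policies differ as sketched below (not vhost's actual
code; VHOST_LOG_PAGE and the bitmap layout are approximations):

#include <stdint.h>
#include <linux/vhost.h>            /* struct vhost_memory{,_region} */

#define VHOST_LOG_PAGE 0x1000ULL    /* guest bytes covered by one dirty bit */

/* current policy, approximately: the log covers everything up to the
 * highest guest-physical address of any region */
static uint64_t log_size_by_max_gpa(const struct vhost_memory *mem)
{
    uint64_t last = 0;
    for (uint32_t i = 0; i < mem->nregions; i++) {
        uint64_t end = mem->regions[i].guest_phys_addr +
                       mem->regions[i].memory_size;
        if (end > last) {
            last = end;
        }
    }
    return (last / VHOST_LOG_PAGE + 63) / 64 * sizeof(uint64_t);
}

/* the alternative: size the log by how much RAM is actually present */
static uint64_t log_size_by_present_ram(const struct vhost_memory *mem)
{
    uint64_t total = 0;
    for (uint32_t i = 0; i < mem->nregions; i++) {
        total += mem->regions[i].memory_size;
    }
    return (total / VHOST_LOG_PAGE + 63) / 64 * sizeof(uint64_t);
}

Sizing by present RAM would also need a GPA-to-log-offset mapping for the
sparse case, so it is more than a one-line change.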


> 
> 
> > That's one of the reasons that in this patch HVA ranges in
> > memory map are compacted only for backend consumption,
> > QEMU's side of vhost uses exact map for internal purposes.
> > And the other reason is I don't know vhost enough to rewrite it
> > to use big HVA for everything.
> > 
> > > I guess we could create dummy MRs to fill in the holes left by
> > > memory hotplug?
> > it looks like a nice thing from vhost's pov but complicates the other side,
> 
> What other side do you have in mind?
> 
> > hence I dislike the idea of inventing dummy MRs for vhost's convenience.
memory core, but let's see what Paolo thinks about it.

> > 
> > 
> > > vhost already has logic to recombine
> > > consecutive chunks created by memory core.
> > which looks a bit complicated and I was thinking about simplifying
> > it some time in the future.
>
Michael S. Tsirkin July 13, 2015, 8:14 p.m. UTC | #7
On Mon, Jul 13, 2015 at 08:55:13PM +0200, Igor Mammedov wrote:
> On Mon, 13 Jul 2015 09:55:18 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Fri, Jul 10, 2015 at 12:12:36PM +0200, Igor Mammedov wrote:
> > > On Thu, 9 Jul 2015 16:46:43 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > 
> > > > On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> > > > > 
> > > > > 
> > > > > On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > > > > > QEMU asserts in vhost due to hitting vhost backend limit
> > > > > > > on number of supported memory regions.
> > > > > > > 
> > > > > > > Describe all hotplugged memory as one contiguous range
> > > > > > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > > > > > 
> > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > >
> > > > > > Hmm - a bunch of work here to recombine MRs that memory
> > > > > > listener interface breaks up.  In particular KVM could
> > > > > > benefit from this too (on workloads that change the table a
> > > > > > lot).  Can't we teach memory core to pass hva range as a
> > > > > > single continuous range to memory listeners?
> > > > > 
> > > > > Memory listeners are based on memory regions, not HVA ranges.
> > > > > 
> > > > > Paolo
> > > > 
> > > > Many listeners care about HVA ranges. I know KVM and vhost do.
> > > I'm not sure about KVM, it works just fine with fragmented memory
> > > regions, the same will apply to vhost once module parameter to
> > > increase limit is merged.
> > > 
> > > but changing generic memory listener interface to replace HVA mapped
> > > regions with HVA container would lead to a case when listeners
> > > won't see exact layout that they might need.
> > 
> > I don't think they care, really.
> > 
> > > In addition vhost itself will suffer from working with big HVA
> > > since it allocates log depending on size of memory => bigger log.
> > 
> > Not really - it allocates the log depending on the PA range.
> > Leaving unused holes doesn't reduce its size.
> if it would use HVA container instead then it will always allocate
> log for max possible GPA, meaning that -m 1024,maxmem=1T will waste
> a lot of memory and more so for bigger maxmem.
> It's still possible to induce worst case by plugging pc-dimm at the end
> of hotplug-memory area by specifying address for it explicitly.
> That problem has existed since memory hot-add was introduced; I just
> hadn't noticed it back then.

There you are then. Depending on maxmem seems cleaner as it's more
predictable.

> It's perfectly fine to allocate log by last GPA as far as
> memory is nearly continuous but memory hot-add makes it possible to
> have a sparse layout with huge gaps between guest-mapped RAM
> which makes current log handling inefficient.
> 
> I wonder how hard it would be to make log_size depend on present RAM
> size rather than max present GPA so it wouldn't allocate excess 
> memory for log.

We can simply map the unused parts of the log RESERVED.

That can be a natural continuation of this series, but
I don't think it needs to block it.

> 
> > 
> > 
> > > That's one of the reasons that in this patch HVA ranges in
> > > memory map are compacted only for backend consumption,
> > > QEMU's side of vhost uses exact map for internal purposes.
> > > And the other reason is I don't know vhost enough to rewrite it
> > > to use big HVA for everything.
> > > 
> > > > I guess we could create dummy MRs to fill in the holes left by
> > > > memory hotplug?
> > > it looks like a nice thing from vhost's pov but complicates the other side,
> > 
> > What other side do you have in mind?
> > 
> > > hence I dislike the idea of inventing dummy MRs for vhost's convenience.
> memory core, but let's see what Paolo thinks about it.
> 
> > > 
> > > 
> > > > vhost already has logic to recombine
> > > > consecutive chunks created by memory core.
> > > which looks a bit complicated and I was thinking about simplifying
> > > it some time in the future.
> >
Igor Mammedov July 14, 2015, 1:02 p.m. UTC | #8
On Mon, 13 Jul 2015 23:14:37 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Jul 13, 2015 at 08:55:13PM +0200, Igor Mammedov wrote:
> > On Mon, 13 Jul 2015 09:55:18 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > > On Fri, Jul 10, 2015 at 12:12:36PM +0200, Igor Mammedov wrote:
> > > > On Thu, 9 Jul 2015 16:46:43 +0300
> > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > 
> > > > > On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> > > > > > 
> > > > > > 
> > > > > > On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > > > > > > QEMU asserts in vhost due to hitting vhost backend limit
> > > > > > > > on number of supported memory regions.
> > > > > > > > 
> > > > > > > > Describe all hotplugged memory as one contiguous range
> > > > > > > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > > > > > > 
> > > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > >
> > > > > > > Hmm - a bunch of work here to recombine MRs that memory
> > > > > > > listener interface breaks up.  In particular KVM could
> > > > > > > benefit from this too (on workloads that change the table a
> > > > > > > lot).  Can't we teach memory core to pass hva range as a
> > > > > > > single continuous range to memory listeners?
> > > > > > 
> > > > > > Memory listeners are based on memory regions, not HVA ranges.
> > > > > > 
> > > > > > Paolo
> > > > > 
> > > > > Many listeners care about HVA ranges. I know KVM and vhost do.
> > > > I'm not sure about KVM, it works just fine with fragmented memory
> > > > regions, the same will apply to vhost once module parameter to
> > > > increase limit is merged.
> > > > 
> > > > but changing generic memory listener interface to replace HVA mapped
> > > > regions with HVA container would lead to a case when listeners
> > > > won't see exact layout that they might need.
> > > 
> > > I don't think they care, really.
> > > 
> > > > In addition vhost itself will suffer from working with big HVA
> > > > since it allocates log depending on size of memory => bigger log.
> > > 
> > > Not really - it allocates the log depending on the PA range.
> > > Leaving unused holes doesn't reduce its size.
> > if it would use HVA container instead then it will always allocate
> > log for max possible GPA, meaning that -m 1024,maxmem=1T will waste
> > a lot of memory and more so for bigger maxmem.
> > It's still possible to induce worst case by plugging pc-dimm at the end
> > of hotplug-memory area by specifying address for it explicitly.
> > That problem has existed since memory hot-add was introduced; I just
> > hadn't noticed it back then.
> 
> There you are then. Depending on maxmem seems cleaner as it's more
> predictable.
> 
> > It's perfectly fine to allocate log by last GPA as far as
> > memory is nearly continuous but memory hot-add makes it possible to
> > have a sparse layout with huge gaps between guest-mapped RAM
> > which makes current log handling inefficient.
> > 
> > I wonder how hard it would be to make log_size depend on present RAM
> > size rather than max present GPA so it wouldn't allocate excess 
> > memory for log.
> 
> We can simply map the unused parts of the log RESERVED.
meaning that the vhost listener should get RAM regions so it would know
which parts of the log it has to mmap(NORESERVE|DONTNEED)

it would also require a custom allocator for the log, which could manage
punching/unpunching holes in the log depending on the RAM layout.
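
A minimal sketch of that idea, assuming the log is an anonymous, page-aligned
mmap and that one log bit covers VHOST_LOG_PAGE bytes of guest memory (names
and constants are approximations):

#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>

#define VHOST_LOG_PAGE 0x1000ULL    /* guest bytes covered by one dirty bit */

/* Release the part of the log bitmap that covers a GPA hole so the kernel
 * can drop its backing pages; 'log' is assumed to come from an anonymous
 * mmap(..., MAP_NORESERVE, ...) covering the whole GPA range. */
static int log_punch_hole(uint8_t *log, uint64_t gpa_start, uint64_t gpa_len)
{
    long pagesz = sysconf(_SC_PAGESIZE);
    uint64_t byte_start = gpa_start / VHOST_LOG_PAGE / 8;   /* 8 bits per byte */
    uint64_t byte_end   = (gpa_start + gpa_len) / VHOST_LOG_PAGE / 8;

    /* round inwards to whole host pages so no live bits are discarded */
    uint64_t lo = (byte_start + pagesz - 1) & ~(uint64_t)(pagesz - 1);
    uint64_t hi = byte_end & ~(uint64_t)(pagesz - 1);

    if (hi <= lo) {
        return 0;   /* the hole doesn't cover even one full page of the log */
    }
    return madvise(log + lo, hi - lo, MADV_DONTNEED);
}

Re-populating a hole when a DIMM is later plugged into it comes for free: the
pages fault back in as zero-filled on the first write.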

btw is it possible for the guest to force the vhost module to access a
NORESERVE area, and what would happen in that case?


> 
> That can be a natural continuation of this series, but
> I don't think it needs to block it.
> 
> > 
> > > 
> > > 
> > > > That's one of the reasons that in this patch HVA ranges in
> > > > memory map are compacted only for backend consumption,
> > > > QEMU's side of vhost uses exact map for internal purposes.
> > > > And the other reason is I don't know vhost enough to rewrite it
> > > > to use big HVA for everything.
> > > > 
> > > > > I guess we could create dummy MRs to fill in the holes left by
> > > > > memory hotplug?
> > > > it looks like a nice thing from vhost's pov but complicates the other side,
> > > 
> > > What other side do you have in mind?
> > > 
> > > > hence I dislike the idea of inventing dummy MRs for vhost's convenience.
> > memory core, but let's see what Paolo thinks about it.
> > 
> > > > 
> > > > 
> > > > > vhost already has logic to recombine
> > > > > consecutive chunks created by memory core.
> > > > which looks a bit complicated and I was thinking about simplifying
> > > > it some time in the future.
> > > 
>
Michael S. Tsirkin July 14, 2015, 1:14 p.m. UTC | #9
On Tue, Jul 14, 2015 at 03:02:44PM +0200, Igor Mammedov wrote:
> On Mon, 13 Jul 2015 23:14:37 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Mon, Jul 13, 2015 at 08:55:13PM +0200, Igor Mammedov wrote:
> > > On Mon, 13 Jul 2015 09:55:18 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > 
> > > > On Fri, Jul 10, 2015 at 12:12:36PM +0200, Igor Mammedov wrote:
> > > > > On Thu, 9 Jul 2015 16:46:43 +0300
> > > > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > 
> > > > > > On Thu, Jul 09, 2015 at 03:43:01PM +0200, Paolo Bonzini wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > On 09/07/2015 15:06, Michael S. Tsirkin wrote:
> > > > > > > > > QEMU asserts in vhost due to hitting vhost backend limit
> > > > > > > > > on number of supported memory regions.
> > > > > > > > > 
> > > > > > > > > Describe all hotplugged memory as one contiguous range
> > > > > > > > > to vhost with linear 1:1 HVA->GPA mapping in backend.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > > > > > > >
> > > > > > > > Hmm - a bunch of work here to recombine MRs that memory
> > > > > > > > listener interface breaks up.  In particular KVM could
> > > > > > > > benefit from this too (on workloads that change the table a
> > > > > > > > lot).  Can't we teach memory core to pass hva range as a
> > > > > > > > single continuous range to memory listeners?
> > > > > > > 
> > > > > > > Memory listeners are based on memory regions, not HVA ranges.
> > > > > > > 
> > > > > > > Paolo
> > > > > > 
> > > > > > Many listeners care about HVA ranges. I know KVM and vhost do.
> > > > > I'm not sure about KVM, it works just fine with fragmented memory
> > > > > regions, the same will apply to vhost once module parameter to
> > > > > increase limit is merged.
> > > > > 
> > > > > but changing generic memory listener interface to replace HVA mapped
> > > > > regions with HVA container would lead to a case when listeners
> > > > > won't see exact layout that they might need.
> > > > 
> > > > I don't think they care, really.
> > > > 
> > > > > In addition vhost itself will suffer from working with big HVA
> > > > > since it allocates log depending on size of memory => bigger log.
> > > > 
> > > > Not really - it allocates the log depending on the PA range.
> > > > Leaving unused holes doesn't reduce its size.
> > > if it would use HVA container instead then it will always allocate
> > > log for max possible GPA, meaning that -m 1024,maxmem=1T will waste
> > > a lot of memory and more so for bigger maxmem.
> > > It's still possible to induce worst case by plugging pc-dimm at the end
> > > of hotplug-memory area by specifying address for it explicitly.
> > > That problem has existed since memory hot-add was introduced; I just
> > > hadn't noticed it back then.
> > 
> > There you are then. Depending on maxmem seems cleaner as it's more
> > predictable.
> > 
> > > It's perfectly fine to allocate log by last GPA as far as
> > > memory is nearly continuous but memory hot-add makes it possible to
> > > have a sparse layout with huge gaps between guest-mapped RAM
> > > which makes current log handling inefficient.
> > > 
> > > I wonder how hard it would be to make log_size depend on present RAM
> > > size rather than max present GPA so it wouldn't allocate excess 
> > > memory for log.
> > 
> > We can simply map the unused parts of the log RESERVED.
> meaning that vhost listener should get RAM regions so it would know
> which parts of log it has to mmap(NORESERVE|DONTNEED)
> 
> it would also require custom allocator for log, that could manage
> punching/unpunching holes in log depending on RAM layout.

Yea. Anyway, this isn't urgent I think.

> btw is it possible for guest to force vhost module access
> NORESERVE area and what would happen in that case?

Sure.  I think you'll get EFAULT, vhost will stop processing the ring then.

> 
> > 
> > That can be a natural continuation of this series, but
> > I don't think it needs to block it.
> > 
> > > 
> > > > 
> > > > 
> > > > > That's one of the reasons that in this patch HVA ranges in
> > > > > memory map are compacted only for backend consumption,
> > > > > QEMU's side of vhost uses exact map for internal purposes.
> > > > > And the other reason is I don't know vhost enough to rewrite it
> > > > > to use big HVA for everything.
> > > > > 
> > > > > > I guess we could create dummy MRs to fill in the holes left by
> > > > > > memory hotplug?
> > > > > it looks like a nice thing from vhost's pov but complicates the other side,
> > > > 
> > > > What other side do you have in mind?
> > > > 
> > > > > hence I dislike the idea of inventing dummy MRs for vhost's convenience.
> > > memory core, but let's see what Paolo thinks about it.
> > > 
> > > > > 
> > > > > 
> > > > > > vhost already has logic to recombine
> > > > > > consecutive chunks created by memory core.
> > > > > which looks a bit complicated and I was thinking about simplifying
> > > > > it some time in the future.
> > > > 
> >

Patch

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2712c6f..7bc27f0 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -432,6 +432,10 @@  static void vhost_set_memory(MemoryListener *listener,
 
     assert(size);
 
+    if (!dev->rsvd_hva.mr) {
+        dev->rsvd_hva = memory_region_find_hva_range(section->mr);
+    }
+
     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
     if (add) {
@@ -472,6 +476,42 @@  static void vhost_begin(MemoryListener *listener)
     dev->mem_changed_start_addr = -1;
 }
 
+static int vhost_set_mem_table(struct vhost_dev *dev)
+{
+    hwaddr start_addr = 0;
+    ram_addr_t size = 0;
+    struct vhost_memory *mem;
+    int r, i;
+
+    /* drop memory ranges from the contiguous HVA range */
+    mem = g_memdup(dev->mem, offsetof(struct vhost_memory, regions) +
+                       dev->mem->nregions * sizeof dev->mem->regions[0]);
+    start_addr = dev->rsvd_hva.offset_within_address_space;
+    size = int128_get64(dev->rsvd_hva.size);
+    for (i = 0; i < mem->nregions; i++) {
+        if (mem->regions[i].guest_phys_addr >= start_addr &&
+            mem->regions[i].guest_phys_addr < start_addr + size) {
+            mem->nregions--;
+            memmove(&mem->regions[i], &mem->regions[i + 1],
+                    (mem->nregions - i) * sizeof mem->regions[0]);
+        }
+    }
+    /* add one contiguous HVA entry if memory ranges from it are present */
+    if (dev->mem->nregions > mem->nregions) {
+        struct vhost_memory_region *reg = &mem->regions[mem->nregions];
+
+        reg->guest_phys_addr = start_addr;
+        reg->memory_size = size;
+        reg->userspace_addr =
+            (__u64)memory_region_get_ram_ptr(dev->rsvd_hva.mr);
+        mem->nregions++;
+    }
+
+    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, mem);
+    g_free(mem);
+    return r;
+}
+
 static void vhost_commit(MemoryListener *listener)
 {
     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
@@ -500,7 +540,7 @@  static void vhost_commit(MemoryListener *listener)
     }
 
     if (!dev->log_enabled) {
-        r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
+        r = vhost_set_mem_table(dev);
         assert(r >= 0);
         dev->memory_changed = false;
         return;
@@ -513,7 +553,7 @@  static void vhost_commit(MemoryListener *listener)
     if (dev->log_size < log_size) {
         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
     }
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
+    r = vhost_set_mem_table(dev);
     assert(r >= 0);
     /* To log less, can only decrease log size after table update. */
     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
@@ -956,6 +996,7 @@  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         migrate_add_blocker(hdev->migration_blocker);
     }
     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
+    memset(&hdev->rsvd_hva, 0, sizeof hdev->rsvd_hva);
     hdev->n_mem_sections = 0;
     hdev->mem_sections = NULL;
     hdev->log = NULL;
@@ -1119,7 +1160,7 @@  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
     if (r < 0) {
         goto fail_features;
     }
-    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem);
+    r = vhost_set_mem_table(hdev);
     if (r < 0) {
         r = -errno;
         goto fail_mem;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index dd51050..d41bf2f 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -40,6 +40,7 @@  struct vhost_dev {
     struct vhost_memory *mem;
     int n_mem_sections;
     MemoryRegionSection *mem_sections;
+    MemoryRegionSection rsvd_hva;
     struct vhost_virtqueue *vqs;
     int nvqs;
     /* the first virtqueue which would be used by this vhost dev */