diff mbox

[RFC,2/2] pc: fix QEMU crashing when more than ~50 memory hotplugged

Message ID 1433334157-37665-3-git-send-email-imammedo@redhat.com
State New
Headers show

Commit Message

Igor Mammedov June 3, 2015, 12:22 p.m. UTC
QEMU assert in vhost due to hitting vhost bachend limit
on number of supported memory regions.

Instead of increasing limit in backends, describe all hotlugged
memory as one continuos range to vhost that implements linear
1:1 HVA->GPA mapping in backend.
It allows to avoid increasing VHOST_MEMORY_MAX_NREGIONS limit
in kernel and refactoring current region lookup algorithm
to a faster/scalable datastructure. The same applies to
vhost user which has even lower limit.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
 hw/i386/pc.c      |  4 ++--
 hw/virtio/vhost.c | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

Comments

Paolo Bonzini June 3, 2015, 12:48 p.m. UTC | #1
On 03/06/2015 14:22, Igor Mammedov wrote:
> QEMU assert in vhost due to hitting vhost bachend limit
> on number of supported memory regions.
> 
> Instead of increasing limit in backends, describe all hotlugged
> memory as one continuos range to vhost that implements linear
> 1:1 HVA->GPA mapping in backend.
> It allows to avoid increasing VHOST_MEMORY_MAX_NREGIONS limit
> in kernel and refactoring current region lookup algorithm
> to a faster/scalable datastructure. The same applies to
> vhost user which has even lower limit.
> 
> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
>  hw/i386/pc.c      |  4 ++--
>  hw/virtio/vhost.c | 15 ++++++++++++---
>  2 files changed, 14 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 1eb1db0..c722339 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1306,8 +1306,8 @@ FWCfgState *pc_memory_init(MachineState *machine,
>              exit(EXIT_FAILURE);
>          }
>  
> -        memory_region_init(&pcms->hotplug_memory, OBJECT(pcms),
> -                           "hotplug-memory", hotplug_mem_size);
> +        memory_region_init_rsvd_hva(&pcms->hotplug_memory, OBJECT(pcms),
> +                                    "hotplug-memory", hotplug_mem_size);
>          memory_region_add_subregion(system_memory, pcms->hotplug_memory_base,
>                                      &pcms->hotplug_memory);
>      }
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 54851b7..44cfaab 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -380,6 +380,7 @@ static void vhost_set_memory(MemoryListener *listener,
>      bool log_dirty = memory_region_is_logging(section->mr);
>      int s = offsetof(struct vhost_memory, regions) +
>          (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
> +    MemoryRegionSection rsvd_hva;
>      void *ram;
>  
>      dev->mem = g_realloc(dev->mem, s);
> @@ -388,17 +389,25 @@ static void vhost_set_memory(MemoryListener *listener,
>          add = false;
>      }
>  
> +    rsvd_hva = memory_region_find_rsvd_hva(section->mr);
> +    if (rsvd_hva.mr) {
> +        start_addr = rsvd_hva.offset_within_address_space;
> +        size = int128_get64(rsvd_hva.size);
> +        ram = memory_region_get_ram_ptr(rsvd_hva.mr);
> +    } else {
> +        ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
> +    }

I don't think this is needed.

What _could_ be useful is to merge adjacent ranges even if they are
partly unmapped, but your patch doesn't do that.

But converting to a "reserved HVA" range should have been done already
in memory_region_add_subregion_common.

Am I missing something?  (And I see now that my request about
memory_region_get_ram_ptr is linked to this bit of your patch).

Paolo

>      assert(size);
>  
>      /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
> -    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
>      if (add) {
> -        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
> +        if (!rsvd_hva.mr && !vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
>              /* Region exists with same address. Nothing to do. */
>              return;
>          }
>      } else {
> -        if (!vhost_dev_find_reg(dev, start_addr, size)) {
> +        if (!rsvd_hva.mr && !vhost_dev_find_reg(dev, start_addr, size)) {
>              /* Removing region that we don't access. Nothing to do. */
>              return;
>          }
>
Igor Mammedov June 3, 2015, 2:05 p.m. UTC | #2
On Wed, 03 Jun 2015 14:48:46 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:

> 
> 
> On 03/06/2015 14:22, Igor Mammedov wrote:
> > QEMU assert in vhost due to hitting vhost bachend limit
> > on number of supported memory regions.
> > 
> > Instead of increasing limit in backends, describe all hotlugged
> > memory as one continuos range to vhost that implements linear
> > 1:1 HVA->GPA mapping in backend.
> > It allows to avoid increasing VHOST_MEMORY_MAX_NREGIONS limit
> > in kernel and refactoring current region lookup algorithm
> > to a faster/scalable datastructure. The same applies to
> > vhost user which has even lower limit.
> > 
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> >  hw/i386/pc.c      |  4 ++--
> >  hw/virtio/vhost.c | 15 ++++++++++++---
> >  2 files changed, 14 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 1eb1db0..c722339 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1306,8 +1306,8 @@ FWCfgState *pc_memory_init(MachineState *machine,
> >              exit(EXIT_FAILURE);
> >          }
> >  
> > -        memory_region_init(&pcms->hotplug_memory, OBJECT(pcms),
> > -                           "hotplug-memory", hotplug_mem_size);
> > +        memory_region_init_rsvd_hva(&pcms->hotplug_memory, OBJECT(pcms),
> > +                                    "hotplug-memory", hotplug_mem_size);
> >          memory_region_add_subregion(system_memory, pcms->hotplug_memory_base,
> >                                      &pcms->hotplug_memory);
> >      }
> > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > index 54851b7..44cfaab 100644
> > --- a/hw/virtio/vhost.c
> > +++ b/hw/virtio/vhost.c
> > @@ -380,6 +380,7 @@ static void vhost_set_memory(MemoryListener *listener,
> >      bool log_dirty = memory_region_is_logging(section->mr);
> >      int s = offsetof(struct vhost_memory, regions) +
> >          (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
> > +    MemoryRegionSection rsvd_hva;
> >      void *ram;
> >  
> >      dev->mem = g_realloc(dev->mem, s);
> > @@ -388,17 +389,25 @@ static void vhost_set_memory(MemoryListener *listener,
> >          add = false;
> >      }
> >  
> > +    rsvd_hva = memory_region_find_rsvd_hva(section->mr);
> > +    if (rsvd_hva.mr) {
> > +        start_addr = rsvd_hva.offset_within_address_space;
> > +        size = int128_get64(rsvd_hva.size);
> > +        ram = memory_region_get_ram_ptr(rsvd_hva.mr);
> > +    } else {
> > +        ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
> > +    }
> 
> I don't think this is needed.
> 
> What _could_ be useful is to merge adjacent ranges even if they are
> partly unmapped, but your patch doesn't do that.
merging/splitting for adjacent regions is done at following
vhost_dev_(un)assign_memory() but it doesn't cover cases with
gaps in between.

Trying to make merging/splitting work with gaps might be more
complicated (I haven't tried though), than just passing known
in advance whole rsvd_hva range.

More over if/when initial memory also converted to rsvd_hva
(aliasing stopped me there for now), we could throw away all
this merging and just keep a single rsvd_hva range for all RAM here.

> 
> But converting to a "reserved HVA" range should have been done already
> in memory_region_add_subregion_common.
> 
> Am I missing something?  (And I see now that my request about
> memory_region_get_ram_ptr is linked to this bit of your patch).


> 
> Paolo
> 
> >      assert(size);
> >  
> >      /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
> > -    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
> >      if (add) {
> > -        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
> > +        if (!rsvd_hva.mr && !vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
> >              /* Region exists with same address. Nothing to do. */
> >              return;
> >          }
> >      } else {
> > -        if (!vhost_dev_find_reg(dev, start_addr, size)) {
> > +        if (!rsvd_hva.mr && !vhost_dev_find_reg(dev, start_addr, size)) {
> >              /* Removing region that we don't access. Nothing to do. */
> >              return;
> >          }
> >
Paolo Bonzini June 3, 2015, 3:08 p.m. UTC | #3
On 03/06/2015 16:05, Igor Mammedov wrote:
>>> > > +    rsvd_hva = memory_region_find_rsvd_hva(section->mr);
>>> > > +    if (rsvd_hva.mr) {
>>> > > +        start_addr = rsvd_hva.offset_within_address_space;
>>> > > +        size = int128_get64(rsvd_hva.size);
>>> > > +        ram = memory_region_get_ram_ptr(rsvd_hva.mr);
>>> > > +    } else {
>>> > > +        ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
>>> > > +    }
>> > 
>> > I don't think this is needed.
>> > 
>> > What _could_ be useful is to merge adjacent ranges even if they are
>> > partly unmapped, but your patch doesn't do that.
> merging/splitting for adjacent regions is done at following
> vhost_dev_(un)assign_memory() but it doesn't cover cases with
> gaps in between.
> 
> Trying to make merging/splitting work with gaps might be more
> complicated (I haven't tried though), than just passing known
> in advance whole rsvd_hva range.
> 
> More over if/when initial memory also converted to rsvd_hva
> (aliasing stopped me there for now), we could throw away all
> this merging and just keep a single rsvd_hva range for all RAM here.

Understood now.  This still should be a separate patch.  I'm much more
confident with the other two (e.g. what happens if a malicious guest
writes to memory that is still MAP_NORESERVE), so feel free to post
those without RFC tag.  But the vhost one really needs mst's eyes.

Paolo
Igor Mammedov June 3, 2015, 3:23 p.m. UTC | #4
On Wed, 03 Jun 2015 17:08:00 +0200
Paolo Bonzini <pbonzini@redhat.com> wrote:

> 
> 
> On 03/06/2015 16:05, Igor Mammedov wrote:
> >>> > > +    rsvd_hva = memory_region_find_rsvd_hva(section->mr);
> >>> > > +    if (rsvd_hva.mr) {
> >>> > > +        start_addr = rsvd_hva.offset_within_address_space;
> >>> > > +        size = int128_get64(rsvd_hva.size);
> >>> > > +        ram = memory_region_get_ram_ptr(rsvd_hva.mr);
> >>> > > +    } else {
> >>> > > +        ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
> >>> > > +    }
> >> > 
> >> > I don't think this is needed.
> >> > 
> >> > What _could_ be useful is to merge adjacent ranges even if they are
> >> > partly unmapped, but your patch doesn't do that.
> > merging/splitting for adjacent regions is done at following
> > vhost_dev_(un)assign_memory() but it doesn't cover cases with
> > gaps in between.
> > 
> > Trying to make merging/splitting work with gaps might be more
> > complicated (I haven't tried though), than just passing known
> > in advance whole rsvd_hva range.
> > 
> > More over if/when initial memory also converted to rsvd_hva
> > (aliasing stopped me there for now), we could throw away all
> > this merging and just keep a single rsvd_hva range for all RAM here.
> 
> Understood now.  This still should be a separate patch.  I'm much more
> confident with the other two (e.g. what happens if a malicious guest
> writes to memory that is still MAP_NORESERVE),
it should get SIGSEVG due to access to PROT_NONE.

> so feel free to post
> those without RFC tag.  But the vhost one really needs mst's eyes.
ok, I'll split it out.

> 
> Paolo
Paolo Bonzini June 3, 2015, 4:11 p.m. UTC | #5
On 03/06/2015 17:23, Igor Mammedov wrote:
>> > Understood now.  This still should be a separate patch.  I'm much more
>> > confident with the other two (e.g. what happens if a malicious guest
>> > writes to memory that is still MAP_NORESERVE),
> it should get SIGSEVG due to access to PROT_NONE.

QEMU doesn't get the SEGV if you do address_space_rw or
address_space_map to unallocated space, because the empty area in the
container is treated as MMIO.

But what does vhost do if you tell it to treat the whole block as a
single huge lump?

Paolo

>> > so feel free to post
>> > those without RFC tag.  But the vhost one really needs mst's eyes.
> ok, I'll split it out.
>
Michael S. Tsirkin June 3, 2015, 4:30 p.m. UTC | #6
On Wed, Jun 03, 2015 at 06:11:29PM +0200, Paolo Bonzini wrote:
> 
> 
> On 03/06/2015 17:23, Igor Mammedov wrote:
> >> > Understood now.  This still should be a separate patch.  I'm much more
> >> > confident with the other two (e.g. what happens if a malicious guest
> >> > writes to memory that is still MAP_NORESERVE),
> > it should get SIGSEVG due to access to PROT_NONE.
> 
> QEMU doesn't get the SEGV if you do address_space_rw or
> address_space_map to unallocated space, because the empty area in the
> container is treated as MMIO.
> 
> But what does vhost do if you tell it to treat the whole block as a
> single huge lump?
> 
> Paolo

Guest can make vhost attempt reading or writing it.
vhost will do copy from/to user.


> >> > so feel free to post
> >> > those without RFC tag.  But the vhost one really needs mst's eyes.
> > ok, I'll split it out.
> >
diff mbox

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 1eb1db0..c722339 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1306,8 +1306,8 @@  FWCfgState *pc_memory_init(MachineState *machine,
             exit(EXIT_FAILURE);
         }
 
-        memory_region_init(&pcms->hotplug_memory, OBJECT(pcms),
-                           "hotplug-memory", hotplug_mem_size);
+        memory_region_init_rsvd_hva(&pcms->hotplug_memory, OBJECT(pcms),
+                                    "hotplug-memory", hotplug_mem_size);
         memory_region_add_subregion(system_memory, pcms->hotplug_memory_base,
                                     &pcms->hotplug_memory);
     }
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 54851b7..44cfaab 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -380,6 +380,7 @@  static void vhost_set_memory(MemoryListener *listener,
     bool log_dirty = memory_region_is_logging(section->mr);
     int s = offsetof(struct vhost_memory, regions) +
         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
+    MemoryRegionSection rsvd_hva;
     void *ram;
 
     dev->mem = g_realloc(dev->mem, s);
@@ -388,17 +389,25 @@  static void vhost_set_memory(MemoryListener *listener,
         add = false;
     }
 
+    rsvd_hva = memory_region_find_rsvd_hva(section->mr);
+    if (rsvd_hva.mr) {
+        start_addr = rsvd_hva.offset_within_address_space;
+        size = int128_get64(rsvd_hva.size);
+        ram = memory_region_get_ram_ptr(rsvd_hva.mr);
+    } else {
+        ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
+    }
+
     assert(size);
 
     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
-    ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
     if (add) {
-        if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
+        if (!rsvd_hva.mr && !vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
             /* Region exists with same address. Nothing to do. */
             return;
         }
     } else {
-        if (!vhost_dev_find_reg(dev, start_addr, size)) {
+        if (!rsvd_hva.mr && !vhost_dev_find_reg(dev, start_addr, size)) {
             /* Removing region that we don't access. Nothing to do. */
             return;
         }