diff mbox

[RFC] pc: align gpa<->hpa on 1GB boundary by splitting RAM on several regions

Message ID 1383070729-19427-1-git-send-email-imammedo@redhat.com
State New
Headers show

Commit Message

Igor Mammedov Oct. 29, 2013, 6:18 p.m. UTC
Otherwise 1GB TLBs cannot be cached for the range.

PS:
as side effect we are not wasting ~1Gb of memory if
1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
---
PS2:
As RFC it's yet without compatibility changes noted by Paolo

---
 exec.c                    |    8 ++++++-
 hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
 include/exec/cpu-common.h |    1 +
 3 files changed, 40 insertions(+), 19 deletions(-)

Comments

Marcelo Tosatti Oct. 29, 2013, 9:38 p.m. UTC | #1
On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> Otherwise 1GB TLBs cannot be cached for the range.

This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
pages.

Since hugetlbfs allocation is static, it requires the user to inform
different 1GB and 2MB sized hugetlbfs mount points (with proper number
of corresponding hugetlbfs pages allocated). This is incompatible with
the current command line, and i'd like to see this problem handled in a
way that is command line backwards compatible.

Also, if the argument for one-to-one mapping between dimms and linear host
virtual address sections holds, it means virtual DIMMs must be
partitioned into whatever hugepage alignment is necessary (and in
that case, why they can't be partitioned similarly with the memory
region aliases?).

> PS:
> as side effect we are not wasting ~1Gb of memory if
> 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"

This is how hugetlbfs works. You waste 1GB hugepage if an extra
byte is requested.

> Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> ---
> PS2:
> As RFC it's yet without compatibility changes noted by Paolo
> 
> ---
>  exec.c                    |    8 ++++++-
>  hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
>  include/exec/cpu-common.h |    1 +
>  3 files changed, 40 insertions(+), 19 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 9b6ea50..a4e5c80 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -882,7 +882,7 @@ void qemu_mutex_unlock_ramlist(void)
>  
>  #define HUGETLBFS_MAGIC       0x958458f6
>  
> -static long gethugepagesize(const char *path)
> +long gethugepagesize(const char *path)
>  {
>      struct statfs fs;
>      int ret;
> @@ -925,6 +925,12 @@ static void *file_ram_alloc(RAMBlock *block,
>          return NULL;
>      }
>  
> +    /* refuse to use huge pages if requested size isn't page aligned
> +     * to avoid wasting memory */
> +    if (memory != (memory & ~(hpagesize-1))) {
> +        return NULL;
> +    }
> +
>      if (kvm_enabled() && !kvm_has_sync_mmu()) {
>          fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
>          return NULL;
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0c313fe..1611fa7 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1116,32 +1116,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
>      FWCfgState *fw_cfg;
> +    unsigned long hpagesize = gethugepagesize(mem_path);
> +    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> -    /* Allocate RAM.  We allocate it as a single memory region and use
> -     * aliases to address portions of it, mostly for backwards compatibility
> -     * with older qemus that used qemu_ram_alloc().
> -     */
> +    *ram_memory = g_malloc(sizeof(**ram_memory));
> +    memory_region_init(*ram_memory, NULL, "pc.ram",
> +                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
> +    memory_region_add_subregion(system_memory, 0, *ram_memory);
> +
> +    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
> +    memory_region_add_subregion(*ram_memory, 0, ram);
>      vmstate_register_ram_global(ram);
> -    *ram_memory = ram;
> -    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> -    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> -                             0, below_4g_mem_size);
> -    memory_region_add_subregion(system_memory, 0, ram_below_4g);
> -    if (above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
> +
> +    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
> +    if (below_4g_mem_size_tail) {
> +        ram = g_malloc(sizeof(*ram));
> +        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
> +        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
> +        vmstate_register_ram_global(ram);
>      }
>  
> +    if (above_4g_mem_size > 0) {
> +        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
> +        ram = g_malloc(sizeof(*ram));
> +        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
> +        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
> +        vmstate_register_ram_global(ram);
> +
> +        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
> +        if (above_4g_mem_size_tail) {
> +            ram = g_malloc(sizeof(*ram));
> +            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
> +            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
> +            vmstate_register_ram_global(ram);
> +	}
> +    }
>  
>      /* Initialize PC system firmware */
>      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 40e15e4..f89a37c 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -57,6 +57,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
>  #ifdef __linux__
>  uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
>  #endif
> +long gethugepagesize(const char *path);
>  
>  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
>                              int len, int is_write);
> -- 
> 1.7.1
Igor Mammedov Oct. 30, 2013, 4:49 p.m. UTC | #2
On Tue, 29 Oct 2013 19:38:44 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > Otherwise 1GB TLBs cannot be cached for the range.
> 
> This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> pages.
With current command line only one hugetlbfs mount point is possible, so it
will back with whatever alignment specified hugetlbfs mount point has.
Anything that doesn't fit into a page-aligned region goes to the tail, using the
non-hugepage-backed phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.

> 
> Since hugetlbfs allocation is static, it requires the user to inform
> different 1GB and 2MB sized hugetlbfs mount points (with proper number
> of corresponding hugetlbfs pages allocated). This is incompatible with
> the current command line, and i'd like to see this problem handled in a
> way that is command line backwards compatible.
patch doesn't change that, it uses provided hugetlbfs and falls back (hunk 2)
to phys_mem_alloc if the requested memory region is not hugepage size aligned.
So there is no CLI change, only a memory leak fix.

> Also, if the argument for one-to-one mapping between dimms and linear host
> virtual address sections holds, it means virtual DIMMs must be
> partitioned into whatever hugepage alignment is necessary (and in
> that case, why they can't be partitioned similarly with the memory
> region aliases?).
Because during hotplug a new memory region of desired size is allocated
and it could be mapped directly without any aliasing. And if some day we
convert adhoc initial memory allocation to dimm devices there is no reason to
alloc one huge block and then invent means how to alias hole somewhere else,
we could just reuse memdev/dimm and allocate several memory regions
with desired properties each represented by a memdev/dimm pair.

one-one mapping simplifies design and interface with ACPI part during memory
hotplug.

for hotplug case flow could look like:
 memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
 #memdev could enforce size to be backend aligned
 device_add dimm,id=y1,backend=x1,addr=xxxxxx
 #dimm could get alignment from associated memdev or fail if addr
 #doesn't meet alignment of memdev backend

 memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
 device_add dimm,id=y2,backend=x2,addr=yyyyyyy

 memdev_add id=x3,size=1mb
 device_add dimm,id=y3,backend=x3,addr=xxxxxxx

linear memory block is allocated at runtime (user has to make sure that enough
hugepages are available) by each memdev_add command and that RAM memory region
is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
aliasing.

Now back to initial memory and the bright future we are looking forward to (i.e.
the ability to create a machine from a configuration file without ad hoc coding
like pc_memory_init()):

legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
translated into:

-memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
-memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
-memdev id=x3,size=512m -device dimm,backend=x3,addr=5g

or user could drop legacy CLI and assume fine grained control over memory
configuration:

-memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
-memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
-memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g

So if we are going to break migration compatibility for new machine type
lets do a way that could painlessly changed to memdev/device in future.

> > PS:
> > as side effect we are not wasting ~1Gb of memory if
> > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> 
> This is how hugetlbfs works. You waste 1GB hugepage if an extra
> byte is requested.
it looks more like a bug than a feature;
why do it if the leak could be avoided as shown below?

> 
> > Signed-off-by: Igor Mammedov <imammedo@redhat.com>
> > ---
> > PS2:
> > As RFC it's yet without compatibility changes noted by Paolo
> > 
> > ---
> >  exec.c                    |    8 ++++++-
> >  hw/i386/pc.c              |   50 ++++++++++++++++++++++++++++----------------
> >  include/exec/cpu-common.h |    1 +
> >  3 files changed, 40 insertions(+), 19 deletions(-)
> > 
> > diff --git a/exec.c b/exec.c
> > index 9b6ea50..a4e5c80 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -882,7 +882,7 @@ void qemu_mutex_unlock_ramlist(void)
> >  
> >  #define HUGETLBFS_MAGIC       0x958458f6
> >  
> > -static long gethugepagesize(const char *path)
> > +long gethugepagesize(const char *path)
> >  {
> >      struct statfs fs;
> >      int ret;
> > @@ -925,6 +925,12 @@ static void *file_ram_alloc(RAMBlock *block,
> >          return NULL;
> >      }
> >  
> > +    /* refuse to use huge pages if requested size isn't page aligned
> > +     * to avoid wasting memory */
> > +    if (memory != (memory & ~(hpagesize-1))) {
> > +        return NULL;
> > +    }
> > +
> >      if (kvm_enabled() && !kvm_has_sync_mmu()) {
> >          fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
> >          return NULL;
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 0c313fe..1611fa7 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1116,32 +1116,46 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >  {
> >      int linux_boot, i;
> >      MemoryRegion *ram, *option_rom_mr;
> > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> >      FWCfgState *fw_cfg;
> > +    unsigned long hpagesize = gethugepagesize(mem_path);
> > +    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
> >  
> >      linux_boot = (kernel_filename != NULL);
> >  
> > -    /* Allocate RAM.  We allocate it as a single memory region and use
> > -     * aliases to address portions of it, mostly for backwards compatibility
> > -     * with older qemus that used qemu_ram_alloc().
> > -     */
> > +    *ram_memory = g_malloc(sizeof(**ram_memory));
> > +    memory_region_init(*ram_memory, NULL, "pc.ram",
> > +                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
> > +    memory_region_add_subregion(system_memory, 0, *ram_memory);
> > +
> > +    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
> >      ram = g_malloc(sizeof(*ram));
> > -    memory_region_init_ram(ram, NULL, "pc.ram",
> > -                           below_4g_mem_size + above_4g_mem_size);
> > +    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
> > +    memory_region_add_subregion(*ram_memory, 0, ram);
> >      vmstate_register_ram_global(ram);
> > -    *ram_memory = ram;
> > -    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> > -    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> > -                             0, below_4g_mem_size);
> > -    memory_region_add_subregion(system_memory, 0, ram_below_4g);
> > -    if (above_4g_mem_size > 0) {
> > -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > -                                 below_4g_mem_size, above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > -                                    ram_above_4g);
> > +
> > +    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
> > +    if (below_4g_mem_size_tail) {
> > +        ram = g_malloc(sizeof(*ram));
> > +        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
> > +        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
> > +        vmstate_register_ram_global(ram);
> >      }
> >  
> > +    if (above_4g_mem_size > 0) {
> > +        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
> > +        ram = g_malloc(sizeof(*ram));
> > +        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
> > +        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
> > +        vmstate_register_ram_global(ram);
> > +
> > +        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
> > +        if (above_4g_mem_size_tail) {
> > +            ram = g_malloc(sizeof(*ram));
> > +            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
> > +            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
> > +            vmstate_register_ram_global(ram);
> > +	}
> > +    }
> >  
> >      /* Initialize PC system firmware */
> >      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
> > diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> > index 40e15e4..f89a37c 100644
> > --- a/include/exec/cpu-common.h
> > +++ b/include/exec/cpu-common.h
> > @@ -57,6 +57,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
> >  #ifdef __linux__
> >  uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
> >  #endif
> > +long gethugepagesize(const char *path);
> >  
> >  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
> >                              int len, int is_write);
> > -- 
> > 1.7.1
Marcelo Tosatti Oct. 30, 2013, 6:51 p.m. UTC | #3
On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> On Tue, 29 Oct 2013 19:38:44 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > pages.
> With current command line only one hugetlbfs mount point is possible, so it
> will back with whatever alignment specified hugetlbfs mount point has.
> Anything that doesn't fit into page aligned region goes to tail using
> non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.

The patch you propose allocates the non-1GB aligned tail of RAM with 4k
pages. As mentioned, this is not acceptable (2MB pages should be used
whenever 1GB alignment is not possible).

I believe its easier for the user to allocate enough 1GB pages to back
all of guest RAM, since allocation is static, than for him to allocate
mixed 1GB/2MB pages in hugetlbfs.

> > Since hugetlbfs allocation is static, it requires the user to inform
> > different 1GB and 2MB sized hugetlbfs mount points (with proper number
> > of corresponding hugetlbfs pages allocated). This is incompatible with
> > the current command line, and i'd like to see this problem handled in a
> > way that is command line backwards compatible.
> patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
> to phys_mem_alloc if requested memory region is not hugepage size aligned.
> So there is no any CLI change, only fixing memory leak.
> 
> > Also, if the argument for one-to-one mapping between dimms and linear host
> > virtual address sections holds, it means virtual DIMMs must be
> > partitioned into whatever hugepage alignment is necessary (and in
> > that case, why they can't be partitioned similarly with the memory
> > region aliases?).
> Because during hotplug a new memory region of desired size is allocated
> and it could be mapped directly without any aliasing. And if some day we
> convert adhoc initial memory allocation to dimm devices there is no reason to
> alloc one huge block and then invent means how to alias hole somewhere else,
> we could just reuse memdev/dimm and allocate several memory regions
> with desired properties each represented by a memdev/dimm pair.
> 
> one-one mapping simplifies design and interface with ACPI part during memory
> hotplug.
> 
> for hotplug case flow could look like:
>  memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
>  #memdev could enforce size to be backend aligned
>  device_add dimm,id=y1,backend=x1,addr=xxxxxx
>  #dimm could get alignment from associated memdev or fail if addr
>  #doesn't meet alignment of memdev backend
> 
>  memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
>  device_add dimm,id=y2,backend=x2,addr=yyyyyyy
> 
>  memdev_add id=x3,size=1mb
>  device_add dimm,id=y3,backend=x3,addr=xxxxxxx
> 
> linear memory block is allocated at runtime (user has to make sure that enough
> hugepages are available) by each memdev_add command and that RAM memory region
> is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
> aliasing.
> 
> Now back to intial memory and bright future we are looking forward to (i.e.
> ability to create machine from configuration file without adhoc codding
> like(pc_memory_init)):
> 
> legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
> translated into:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m -device dimm,backend=x3,addr=5g
> 
> or user could drop legacy CLI and assume fine grained control over memory
> configuration:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g
> 
> So if we are going to break migration compatibility for new machine type
> lets do a way that could painlessly changed to memdev/device in future.

Ok then please improve your proposal to allow for multiple hugetlbfs
mount points.

> > > PS:
> > > as side effect we are not wasting ~1Gb of memory if
> > > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> > 
> > This is how hugetlbfs works. You waste 1GB hugepage if an extra
> > byte is requested.
> it looks more a bug than feature,
> why do it if leak could be avoided as shown below?

Because IMO it is confusing for the user, since hugetlbfs allocation is
static. But if you have a necessity for the one-to-one relationship, 
feel free to support mixed hugetlbfs page sizes.
Marcelo Tosatti Oct. 30, 2013, 7:03 p.m. UTC | #4
On Wed, Oct 30, 2013 at 04:51:29PM -0200, Marcelo Tosatti wrote:
> On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> > On Tue, 29 Oct 2013 19:38:44 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> > > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > 
> > > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > > pages.
> > With current command line only one hugetlbfs mount point is possible, so it
> > will back with whatever alignment specified hugetlbfs mount point has.
> > Anything that doesn't fit into page aligned region goes to tail using
> > non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.
> 
> The patch you propose allocates the non-1GB aligned tail of RAM with 4k
> pages. As mentioned, this is not acceptable (2MB pages should be used
> whenever 1GB alignment is not possible).

Tail and 3-4GB region.
Marcelo Tosatti Oct. 30, 2013, 7:31 p.m. UTC | #5
On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> On Tue, 29 Oct 2013 19:38:44 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > pages.
> With current command line only one hugetlbfs mount point is possible, so it
> will back with whatever alignment specified hugetlbfs mount point has.
> Anything that doesn't fit into page aligned region goes to tail using
> non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.

Thats why it fails to back non-1GB-aligned gpas with 2MB large pages.

> > 
> > Since hugetlbfs allocation is static, it requires the user to inform
> > different 1GB and 2MB sized hugetlbfs mount points (with proper number
> > of corresponding hugetlbfs pages allocated). This is incompatible with
> > the current command line, and i'd like to see this problem handled in a
> > way that is command line backwards compatible.
> patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
> to phys_mem_alloc if requested memory region is not hugepage size aligned.
> So there is no any CLI change, only fixing memory leak.
> 
> > Also, if the argument for one-to-one mapping between dimms and linear host
> > virtual address sections holds, it means virtual DIMMs must be
> > partitioned into whatever hugepage alignment is necessary (and in
> > that case, why they can't be partitioned similarly with the memory
> > region aliases?).
> Because during hotplug a new memory region of desired size is allocated
> and it could be mapped directly without any aliasing. And if some day we
> convert adhoc initial memory allocation to dimm devices there is no reason to
> alloc one huge block and then invent means how to alias hole somewhere else,
> we could just reuse memdev/dimm and allocate several memory regions
> with desired properties each represented by a memdev/dimm pair.
> 
> one-one mapping simplifies design and interface with ACPI part during memory
> hotplug.
> 
> for hotplug case flow could look like:
>  memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
>  #memdev could enforce size to be backend aligned
>  device_add dimm,id=y1,backend=x1,addr=xxxxxx
>  #dimm could get alignment from associated memdev or fail if addr
>  #doesn't meet alignment of memdev backend
> 
>  memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
>  device_add dimm,id=y2,backend=x2,addr=yyyyyyy
> 
>  memdev_add id=x3,size=1mb
>  device_add dimm,id=y3,backend=x3,addr=xxxxxxx
> 
> linear memory block is allocated at runtime (user has to make sure that enough
> hugepages are available) by each memdev_add command and that RAM memory region
> is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
> aliasing.

Actually, it is awkward to require the user to supply mixed 1GB/2MB huge
page sizes. The number of 1GB pages and 2MB pages must be exact, because
they are preallocated, so for different machine types you have different
numbers.

Also, dealing with the inability to migrate between machine types is a
matter of reinitialization of the guest.

I do not believe that the inability to deal with memory aliases in the
future ability to create a machine from configuration is a strong enough
reason to justify pushing mixed 1GB and 2MB page sizes onto the user (1GB
pages are allocated on the host kernel command line).

> Now back to intial memory and bright future we are looking forward to (i.e.
> ability to create machine from configuration file without adhoc codding
> like(pc_memory_init)):
> 
> legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
> translated into:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m -device dimm,backend=x3,addr=5g
> 
> or user could drop legacy CLI and assume fine grained control over memory
> configuration:
> 
> -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> -memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g
> 
> So if we are going to break migration compatibility for new machine type
> lets do a way that could painlessly changed to memdev/device in future.

You can just separate backing device and dimm device creation into two 
commands with offsets.

Which might be a good idea anyway, BTW.

> > > PS:
> > > as side effect we are not wasting ~1Gb of memory if
> > > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> > 
> > This is how hugetlbfs works. You waste 1GB hugepage if an extra
> > byte is requested.
> it looks more a bug than feature,
> why do it if leak could be avoided as shown below?
Igor Mammedov Oct. 30, 2013, 7:56 p.m. UTC | #6
On Wed, 30 Oct 2013 16:51:29 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> > On Tue, 29 Oct 2013 19:38:44 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
> > > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > 
> > > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > > pages.
> > With current command line only one hugetlbfs mount point is possible, so it
> > will back with whatever alignment specified hugetlbfs mount point has.
> > Anything that doesn't fit into page aligned region goes to tail using
> > non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.
> 
> The patch you propose allocates the non-1GB aligned tail of RAM with 4k
> pages. As mentioned, this is not acceptable (2MB pages should be used
> whenever 1GB alignment is not possible).
> 
> I believe its easier for the user to allocate enough 1GB pages to back
> all of guest RAM, since allocation is static, than for him to allocate
> mixed 1GB/2MB pages in hugetlbfs.
the tail, if it's present at all, is always less than 1GB.
why is a tail allocated with 4K pages not acceptable?

btw: now if QEMU can't allocate hugepages for whole guest size it will fallback
to 4k pages anyway for whole guest size, with warning that isn't visible if
user doesn't start QEMU manually.

> 
> > > Since hugetlbfs allocation is static, it requires the user to inform
> > > different 1GB and 2MB sized hugetlbfs mount points (with proper number
> > > of corresponding hugetlbfs pages allocated). This is incompatible with
> > > the current command line, and i'd like to see this problem handled in a
> > > way that is command line backwards compatible.
> > patch doesn't change that, it uses provided hugetlbfs and fallbacks (hunk 2)
> > to phys_mem_alloc if requested memory region is not hugepage size aligned.
> > So there is no any CLI change, only fixing memory leak.
> > 
> > > Also, if the argument for one-to-one mapping between dimms and linear host
> > > virtual address sections holds, it means virtual DIMMs must be
> > > partitioned into whatever hugepage alignment is necessary (and in
> > > that case, why they can't be partitioned similarly with the memory
> > > region aliases?).
> > Because during hotplug a new memory region of desired size is allocated
> > and it could be mapped directly without any aliasing. And if some day we
> > convert adhoc initial memory allocation to dimm devices there is no reason to
> > alloc one huge block and then invent means how to alias hole somewhere else,
> > we could just reuse memdev/dimm and allocate several memory regions
> > with desired properties each represented by a memdev/dimm pair.
> > 
> > one-one mapping simplifies design and interface with ACPI part during memory
> > hotplug.
> > 
> > for hotplug case flow could look like:
> >  memdev_add id=x1,size=1Gb,mem-path=/hugetlbfs/1gb,other-host-related-stuff-options
> >  #memdev could enforce size to be backend aligned
> >  device_add dimm,id=y1,backend=x1,addr=xxxxxx
> >  #dimm could get alignment from associated memdev or fail if addr
> >  #doesn't meet alignment of memdev backend
> > 
> >  memdev_add id=x2,size=2mb,mem-path=/hugetlbfs/2mb
> >  device_add dimm,id=y2,backend=x2,addr=yyyyyyy
> > 
> >  memdev_add id=x3,size=1mb
> >  device_add dimm,id=y3,backend=x3,addr=xxxxxxx
> > 
> > linear memory block is allocated at runtime (user has to make sure that enough
> > hugepages are available) by each memdev_add command and that RAM memory region
> > is mapped into GPA by virtual DIMM as is, there wouldn't be any need for
> > aliasing.
> > 
> > Now back to intial memory and bright future we are looking forward to (i.e.
> > ability to create machine from configuration file without adhoc codding
> > like(pc_memory_init)):
> > 
> > legacy cmdline "-m 4512 -mem-path /hugetlbfs/1gb" could be automatically
> > translated into:
> > 
> > -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> > -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> > -memdev id=x3,size=512m -device dimm,backend=x3,addr=5g
> > 
> > or user could drop legacy CLI and assume fine grained control over memory
> > configuration:
> > 
> > -memdev id=x1,size=3g,mem-path=/hugetlbfs/1gb -device dimm,backend=x1,addr=0
> > -memdev id=x2,size=1g,mem-path=/hugetlbfs/1gb -device dimm,backend=x2,addr=4g
> > -memdev id=x3,size=512m,mem-path=/hugetlbfs/2mb -device dimm,backend=x3,addr=5g
> > 
> > So if we are going to break migration compatibility for new machine type
> > lets do a way that could painlessly changed to memdev/device in future.
> 
> Ok then please improve your proposal to allow for multiple hugetlbfs
> mount points.
> 
> > > > PS:
> > > > as side effect we are not wasting ~1Gb of memory if
> > > > 1Gb hugepages are used and -m "hpagesize(in Mb)*n + 1"
> > > 
> > > This is how hugetlbfs works. You waste 1GB hugepage if an extra
> > > byte is requested.
> > it looks more a bug than feature,
> > why do it if leak could be avoided as shown below?
> 
> Because IMO it is confusing for the user, since hugetlbfs allocation is
> static. But if you have a necessity for the one-to-one relationship, 
> feel free to support mixed hugetlbfs page sizes.
> 
>
Paolo Bonzini Oct. 30, 2013, 8:28 p.m. UTC | #7
Il 30/10/2013 20:31, Marcelo Tosatti ha scritto:
> 
> I do not believe that unability to deal with memory aliases on the
> future ability to create machine from configuration is strong enough
> reason to justify pushing mixed 1GB and 2MB pages size to the user (1GB
> pages are allocated on the host kernel command line).

I agree.  The memory allocation in your patch is a bit awkward, but from
the user's point of view it is by far the cleanest.

Paolo
Marcelo Tosatti Oct. 30, 2013, 11:44 p.m. UTC | #8
On Wed, Oct 30, 2013 at 08:56:17PM +0100, Igor Mammedov wrote:
> On Wed, 30 Oct 2013 16:51:29 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > On Wed, Oct 30, 2013 at 05:49:49PM +0100, Igor Mammedov wrote:
> > > On Tue, 29 Oct 2013 19:38:44 -0200
> > > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > 
> > > > On Tue, Oct 29, 2013 at 07:18:49PM +0100, Igor Mammedov wrote:
> > > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > > 
> > > > This fails to back non-1GB-aligned gpas, but 2MB aligned, with 2MB large
> > > > pages.
> > > With current command line only one hugetlbfs mount point is possible, so it
> > > will back with whatever alignment specified hugetlbfs mount point has.
> > > Anything that doesn't fit into page aligned region goes to tail using
> > > non hugepage baked phys_mem_set_alloc()=qemu_anon_ram_alloc() allocator.
> > 
> > The patch you propose allocates the non-1GB aligned tail of RAM with 4k
> > pages. As mentioned, this is not acceptable (2MB pages should be used
> > whenever 1GB alignment is not possible).
> > 
> > I believe its easier for the user to allocate enough 1GB pages to back
> > all of guest RAM, since allocation is static, than for him to allocate
> > mixed 1GB/2MB pages in hugetlbfs.
> tail, if it's present at all, is always less than 1Gb.
> why tail allocated with 4K pages is not acceptable?

Depends on workload.

> btw: now if QEMU can't allocate hugepages for whole guest size it will fallback
> to 4k pages anyway for whole guest size, with warning that isn't visible if
> user doesn't start QEMU manually.

Not with -mem-prealloc.
Igor Mammedov Nov. 7, 2013, 3:25 p.m. UTC | #9
On Wed, 30 Oct 2013 21:44:12 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Wed, Oct 30, 2013 at 08:56:17PM +0100, Igor Mammedov wrote:
> > On Wed, 30 Oct 2013 16:51:29 -0200
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > 
[...]
> 
> > btw: now if QEMU can't allocate hugepages for whole guest size it will fallback
> > to 4k pages anyway for whole guest size, with warning that isn't visible if
> > user doesn't start QEMU manually.
> 
> Not with -mem-prealloc.
tested with it; it does not do much, just prints an error message to stderr
and then falls back to the default allocator, continuing guest boot.

> 
>
diff mbox

Patch

diff --git a/exec.c b/exec.c
index 9b6ea50..a4e5c80 100644
--- a/exec.c
+++ b/exec.c
@@ -882,7 +882,7 @@  void qemu_mutex_unlock_ramlist(void)
 
 #define HUGETLBFS_MAGIC       0x958458f6
 
-static long gethugepagesize(const char *path)
+long gethugepagesize(const char *path)
 {
     struct statfs fs;
     int ret;
@@ -925,6 +925,12 @@  static void *file_ram_alloc(RAMBlock *block,
         return NULL;
     }
 
+    /* refuse to use huge pages if requested size isn't page aligned
+     * to avoid wasting memory */
+    if (memory != (memory & ~(hpagesize-1))) {
+        return NULL;
+    }
+
     if (kvm_enabled() && !kvm_has_sync_mmu()) {
         fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
         return NULL;
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0c313fe..1611fa7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1116,32 +1116,46 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
     FWCfgState *fw_cfg;
+    unsigned long hpagesize = gethugepagesize(mem_path);
+    ram_addr_t below_4g_mem_size_alined, below_4g_mem_size_tail, above_4g_mem_size_alined, above_4g_mem_size_tail;
 
     linux_boot = (kernel_filename != NULL);
 
-    /* Allocate RAM.  We allocate it as a single memory region and use
-     * aliases to address portions of it, mostly for backwards compatibility
-     * with older qemus that used qemu_ram_alloc().
-     */
+    *ram_memory = g_malloc(sizeof(**ram_memory));
+    memory_region_init(*ram_memory, NULL, "pc.ram",
+                       above_4g_mem_size == 0 ? below_4g_mem_size: 0x100000000ULL + above_4g_mem_size);
+    memory_region_add_subregion(system_memory, 0, *ram_memory);
+
+    below_4g_mem_size_alined = below_4g_mem_size & ~(hpagesize - 1);
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+    memory_region_init_ram(ram, NULL, "pc.ram.low.aligned", below_4g_mem_size_alined);
+    memory_region_add_subregion(*ram_memory, 0, ram);
     vmstate_register_ram_global(ram);
-    *ram_memory = ram;
-    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
-    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
-                             0, below_4g_mem_size);
-    memory_region_add_subregion(system_memory, 0, ram_below_4g);
-    if (above_4g_mem_size > 0) {
-        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
-                                    ram_above_4g);
+
+    below_4g_mem_size_tail = below_4g_mem_size - below_4g_mem_size_alined;
+    if (below_4g_mem_size_tail) {
+        ram = g_malloc(sizeof(*ram));
+        memory_region_init_ram(ram, NULL, "pc.ram.low.unaligned", below_4g_mem_size_tail);
+        memory_region_add_subregion(*ram_memory, below_4g_mem_size_alined, ram);
+        vmstate_register_ram_global(ram);
     }
 
+    if (above_4g_mem_size > 0) {
+        above_4g_mem_size_alined = above_4g_mem_size & ~(hpagesize - 1);
+        ram = g_malloc(sizeof(*ram));
+        memory_region_init_ram(ram, NULL, "pc.ram.high.aligned", above_4g_mem_size_alined);
+        memory_region_add_subregion(*ram_memory, 0x100000000ULL, ram);
+        vmstate_register_ram_global(ram);
+
+        above_4g_mem_size_tail = above_4g_mem_size - above_4g_mem_size_alined;
+        if (above_4g_mem_size_tail) {
+            ram = g_malloc(sizeof(*ram));
+            memory_region_init_ram(ram, NULL, "pc.ram.high.unaligned", above_4g_mem_size_tail);
+            memory_region_add_subregion(*ram_memory, 0x100000000ULL + above_4g_mem_size_alined, ram);
+            vmstate_register_ram_global(ram);
+	}
+    }
 
     /* Initialize PC system firmware */
     pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 40e15e4..f89a37c 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -57,6 +57,7 @@  void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
 #ifdef __linux__
 uint32_t qemu_get_ram_hpagesize(ram_addr_t addr);
 #endif
+long gethugepagesize(const char *path);
 
 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                             int len, int is_write);