diff mbox

[rebased,for-1.8] i386: pc: align gpa<->hpa on 1GB boundary (v6)

Message ID 1385401393-14291-1-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Nov. 25, 2013, 5:43 p.m. UTC
v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog
v4: rebase
v5: ensure alignment of piecetwo on 2MB GPA (Igor)
    do not register zero-sized piece-one    (Igor)
v6: fix memory leak                         (Igor)
    fix integer overflow                    (Igor)

----

Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
[Reorganize code, keep same logic. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/pc.c         |   67 +++++++++++++++++++++++++++++++++++++++++++------
 hw/i386/pc_piix.c    |    3 ++
 hw/i386/pc_q35.c     |    3 ++
 include/hw/i386/pc.h |    1 +
 4 files changed, 65 insertions(+), 9 deletions(-)

Comments

Laszlo Ersek Nov. 25, 2013, 8:58 p.m. UTC | #1
On 11/25/13 18:43, Paolo Bonzini wrote:
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
>     do not register zero-sized piece-one    (Igor)
> v6: fix memory leak                         (Igor)
>     fix integer overflow                    (Igor)
> 
> ----
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> [Reorganize code, keep same logic. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  hw/i386/pc.c         |   67 +++++++++++++++++++++++++++++++++++++++++++------
>  hw/i386/pc_piix.c    |    3 ++
>  hw/i386/pc_q35.c     |    3 ++
>  include/hw/i386/pc.h |    1 +
>  4 files changed, 65 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 6c82ada..485b44d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1148,8 +1148,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
> +    uint64_t holesize, pieceonesize, piecetwosize;
> +    uint64_t memsize, align_offset;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> @@ -1157,26 +1159,73 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>       * aliases to address portions of it, mostly for backwards compatibility
>       * with older qemus that used qemu_ram_alloc().
>       */
> +    memsize = below_4g_mem_size + above_4g_mem_size;
> +    holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +    /* If 1GB hugepages are used to back guest RAM, we want the
> +     * physical address 4GB to map to 4GB in the RAM, so that
> +     * memory beyond 4GB is aligned on a 1GB boundary, at the
> +     * host physical address space.  Thus, the ram block range
> +     * [holestart, 4GB] is mapped to the last holesize bytes of RAM:
> +     *
> +     *                      0      h     4G     memsize-holesize
> +     *
> +     * contiguous-ram-block [xxxxxx][yyy][zzzzz]
> +     *                                '-----------.
> +     * guest-addr-space     [xxxxxx]     [zzzzz][yyy]
> +     *
> +     * This is only done in new-enough machine types, and of course
> +     * it  is only possible if the [zzzzz] block exists at all.
> +     */
> +    if (guest_info->gb_align && above_4g_mem_size > holesize) {
> +        /* Round the allocation up to 2 MB to make [zzzzz]'s size
> +         * aligned, removing the extra from the [yyy] piece.
> +         */
> +        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
> +        piecetwosize = holesize - align_offset;
> +    } else {
> +        /* There's no [zzzzz] piece, all memory above 4G starts
> +         * at below_4g_mem_size in the RAM block.  Also no need
> +         * to align anything.
> +         */
> +        align_offset = 0;
> +        piecetwosize = above_4g_mem_size;
> +    }
> +
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
>      vmstate_register_ram_global(ram);
>      *ram_memory = ram;
> +
>      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
>      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>                               0, below_4g_mem_size);
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> +
> +    pieceonesize = above_4g_mem_size - piecetwosize;
> +    if (pieceonesize) {
> +        ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone));
> +        memory_region_init_alias(ram_above_4g_pieceone, NULL,
> +                                 "ram-above-4g-pieceone", ram,
> +                                 0x100000000ULL, pieceonesize);
> +        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g_pieceone);
> +    }
> +    if (piecetwosize) {
> +        ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +        memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                 "ram-above-4g-piecetwo", ram,
> +                                 below_4g_mem_size, piecetwosize);
> +        memory_region_add_subregion(system_memory,
> +                                    0x100000000ULL + pieceonesize,
> +                                    ram_above_4g_piecetwo);
> +    }
> +
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> -
>      /* Initialize PC system firmware */
>      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 36f2495..ca9bd2e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -62,6 +62,7 @@ static bool has_pvpanic;
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
>  static bool smbios_type1_defaults = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -130,6 +131,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      if (smbios_type1_defaults) {
>          /* These values are guest ABI, do not change */
> @@ -249,6 +251,7 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>  static void pc_compat_1_7(QEMUMachineInitArgs *args)
>  {
>      smbios_type1_defaults = false;
> +    gb_align = false;
>  }
>  
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 50ca458..89c7720 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -52,6 +52,7 @@ static bool has_pvpanic;
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
>  static bool smbios_type1_defaults = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -115,6 +116,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      if (smbios_type1_defaults) {
>          /* These values are guest ABI, do not change */
> @@ -233,6 +235,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>  static void pc_compat_1_7(QEMUMachineInitArgs *args)
>  {
>      smbios_type1_defaults = false;
> +    gb_align = false;
>  }
>  
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 9af09d3..8047e82 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */
> 

I reviewed this before and I trust your reorganization.

Hopefully Marcelo can run a dump-guest-memory test with a biggie guest
(paging=false, checking the vmcore with "readelf -W -a" and "crash".) I
think there should be no problems.

Reviewed-by: Laszlo Ersek <lersek@redhat.com>

Thanks
Laszlo
Michael S. Tsirkin Nov. 25, 2013, 9:05 p.m. UTC | #2
On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
>     do not register zero-sized piece-one    (Igor)
> v6: fix memory leak                         (Igor)
>     fix integer overflow                    (Igor)
> 
> ----
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> [Reorganize code, keep same logic. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

BTW how about a unit-test for this?
Can be something along the lines of the acpi tests:
run BIOS, probe what it reported.

> ---
>  hw/i386/pc.c         |   67 +++++++++++++++++++++++++++++++++++++++++++------
>  hw/i386/pc_piix.c    |    3 ++
>  hw/i386/pc_q35.c     |    3 ++
>  include/hw/i386/pc.h |    1 +
>  4 files changed, 65 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 6c82ada..485b44d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1148,8 +1148,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
> +    uint64_t holesize, pieceonesize, piecetwosize;
> +    uint64_t memsize, align_offset;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> @@ -1157,26 +1159,73 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>       * aliases to address portions of it, mostly for backwards compatibility
>       * with older qemus that used qemu_ram_alloc().
>       */
> +    memsize = below_4g_mem_size + above_4g_mem_size;
> +    holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +    /* If 1GB hugepages are used to back guest RAM, we want the
> +     * physical address 4GB to map to 4GB in the RAM, so that
> +     * memory beyond 4GB is aligned on a 1GB boundary, at the
> +     * host physical address space.  Thus, the ram block range
> +     * [holestart, 4GB] is mapped to the last holesize bytes of RAM:
> +     *
> +     *                      0      h     4G     memsize-holesize
> +     *
> +     * contiguous-ram-block [xxxxxx][yyy][zzzzz]
> +     *                                '-----------.
> +     * guest-addr-space     [xxxxxx]     [zzzzz][yyy]
> +     *
> +     * This is only done in new-enough machine types, and of course
> +     * it  is only possible if the [zzzzz] block exists at all.
> +     */
> +    if (guest_info->gb_align && above_4g_mem_size > holesize) {
> +        /* Round the allocation up to 2 MB to make [zzzzz]'s size
> +         * aligned, removing the extra from the [yyy] piece.
> +         */
> +        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
> +        piecetwosize = holesize - align_offset;
> +    } else {
> +        /* There's no [zzzzz] piece, all memory above 4G starts
> +         * at below_4g_mem_size in the RAM block.  Also no need
> +         * to align anything.
> +         */
> +        align_offset = 0;
> +        piecetwosize = above_4g_mem_size;
> +    }
> +
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
>      vmstate_register_ram_global(ram);
>      *ram_memory = ram;
> +
>      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
>      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>                               0, below_4g_mem_size);
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> +
> +    pieceonesize = above_4g_mem_size - piecetwosize;
> +    if (pieceonesize) {
> +        ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone));
> +        memory_region_init_alias(ram_above_4g_pieceone, NULL,
> +                                 "ram-above-4g-pieceone", ram,
> +                                 0x100000000ULL, pieceonesize);
> +        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g_pieceone);
> +    }
> +    if (piecetwosize) {
> +        ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +        memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                 "ram-above-4g-piecetwo", ram,
> +                                 below_4g_mem_size, piecetwosize);
> +        memory_region_add_subregion(system_memory,
> +                                    0x100000000ULL + pieceonesize,
> +                                    ram_above_4g_piecetwo);
> +    }
> +
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> -
>      /* Initialize PC system firmware */
>      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 36f2495..ca9bd2e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -62,6 +62,7 @@ static bool has_pvpanic;
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
>  static bool smbios_type1_defaults = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -130,6 +131,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      if (smbios_type1_defaults) {
>          /* These values are guest ABI, do not change */
> @@ -249,6 +251,7 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>  static void pc_compat_1_7(QEMUMachineInitArgs *args)
>  {
>      smbios_type1_defaults = false;
> +    gb_align = false;
>  }
>  
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 50ca458..89c7720 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -52,6 +52,7 @@ static bool has_pvpanic;
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
>  static bool smbios_type1_defaults = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -115,6 +116,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      if (smbios_type1_defaults) {
>          /* These values are guest ABI, do not change */
> @@ -233,6 +235,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>  static void pc_compat_1_7(QEMUMachineInitArgs *args)
>  {
>      smbios_type1_defaults = false;
> +    gb_align = false;
>  }
>  
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 9af09d3..8047e82 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */
> -- 
> 1.7.1
Marcelo Tosatti Nov. 25, 2013, 11:09 p.m. UTC | #3
On Mon, Nov 25, 2013 at 11:05:10PM +0200, Michael S. Tsirkin wrote:
> On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> > v2: condition enablement of new mapping to new machine types (Paolo)
> > v3: fix changelog
> > v4: rebase
> > v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> >     do not register zero-sized piece-one    (Igor)
> > v6: fix memory leak                         (Igor)
> >     fix integer overflow                    (Igor)
> > 
> > ----
> > 
> > Align guest physical address and host physical address
> > beyond guest 4GB on a 1GB boundary.
> > 
> > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > [Reorganize code, keep same logic. - Paolo]
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> BTW how about a unit-test for this?
> Can be something along the lines of the acpi tests:
> run BIOS, probe what it reported.

There is no guest visible difference.
Paolo Bonzini Nov. 26, 2013, 9:04 a.m. UTC | #4
Il 25/11/2013 22:05, Michael S. Tsirkin ha scritto:
>> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>> > [Reorganize code, keep same logic. - Paolo]
>> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> BTW how about a unit-test for this?
> Can be something along the lines of the acpi tests:
> run BIOS, probe what it reported.
> 

No, it can't because the change is not guest visible.  It is not enabled
for machine types <=1.8 due to migration compatibility (the migration
stream uses offsets within the RAM block, and these change), but there
is no difference from the guest point of view.

Paolo
Michael S. Tsirkin Nov. 28, 2013, 10:26 a.m. UTC | #5
On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
>     do not register zero-sized piece-one    (Igor)
> v6: fix memory leak                         (Igor)
>     fix integer overflow                    (Igor)
> 
> ----
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> [Reorganize code, keep same logic. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

Applied, thanks.

>  hw/i386/pc.c         |   67 +++++++++++++++++++++++++++++++++++++++++++------
>  hw/i386/pc_piix.c    |    3 ++
>  hw/i386/pc_q35.c     |    3 ++
>  include/hw/i386/pc.h |    1 +
>  4 files changed, 65 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 6c82ada..485b44d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1148,8 +1148,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
> +    uint64_t holesize, pieceonesize, piecetwosize;
> +    uint64_t memsize, align_offset;
>  
>      linux_boot = (kernel_filename != NULL);
>  
> @@ -1157,26 +1159,73 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>       * aliases to address portions of it, mostly for backwards compatibility
>       * with older qemus that used qemu_ram_alloc().
>       */
> +    memsize = below_4g_mem_size + above_4g_mem_size;
> +    holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +    /* If 1GB hugepages are used to back guest RAM, we want the
> +     * physical address 4GB to map to 4GB in the RAM, so that
> +     * memory beyond 4GB is aligned on a 1GB boundary, at the
> +     * host physical address space.  Thus, the ram block range
> +     * [holestart, 4GB] is mapped to the last holesize bytes of RAM:
> +     *
> +     *                      0      h     4G     memsize-holesize
> +     *
> +     * contiguous-ram-block [xxxxxx][yyy][zzzzz]
> +     *                                '-----------.
> +     * guest-addr-space     [xxxxxx]     [zzzzz][yyy]
> +     *
> +     * This is only done in new-enough machine types, and of course
> +     * it  is only possible if the [zzzzz] block exists at all.
> +     */
> +    if (guest_info->gb_align && above_4g_mem_size > holesize) {
> +        /* Round the allocation up to 2 MB to make [zzzzz]'s size
> +         * aligned, removing the extra from the [yyy] piece.
> +         */
> +        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
> +        piecetwosize = holesize - align_offset;
> +    } else {
> +        /* There's no [zzzzz] piece, all memory above 4G starts
> +         * at below_4g_mem_size in the RAM block.  Also no need
> +         * to align anything.
> +         */
> +        align_offset = 0;
> +        piecetwosize = above_4g_mem_size;
> +    }
> +
>      ram = g_malloc(sizeof(*ram));
> -    memory_region_init_ram(ram, NULL, "pc.ram",
> -                           below_4g_mem_size + above_4g_mem_size);
> +    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
>      vmstate_register_ram_global(ram);
>      *ram_memory = ram;
> +
>      ram_below_4g = g_malloc(sizeof(*ram_below_4g));
>      memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>                               0, below_4g_mem_size);
>      memory_region_add_subregion(system_memory, 0, ram_below_4g);
> +
> +    pieceonesize = above_4g_mem_size - piecetwosize;
> +    if (pieceonesize) {
> +        ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone));
> +        memory_region_init_alias(ram_above_4g_pieceone, NULL,
> +                                 "ram-above-4g-pieceone", ram,
> +                                 0x100000000ULL, pieceonesize);
> +        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g_pieceone);
> +    }
> +    if (piecetwosize) {
> +        ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +        memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                 "ram-above-4g-piecetwo", ram,
> +                                 below_4g_mem_size, piecetwosize);
> +        memory_region_add_subregion(system_memory,
> +                                    0x100000000ULL + pieceonesize,
> +                                    ram_above_4g_piecetwo);
> +    }
> +
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
> -        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> -                                    ram_above_4g);
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> -
>      /* Initialize PC system firmware */
>      pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 36f2495..ca9bd2e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -62,6 +62,7 @@ static bool has_pvpanic;
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
>  static bool smbios_type1_defaults = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -130,6 +131,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      if (smbios_type1_defaults) {
>          /* These values are guest ABI, do not change */
> @@ -249,6 +251,7 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>  static void pc_compat_1_7(QEMUMachineInitArgs *args)
>  {
>      smbios_type1_defaults = false;
> +    gb_align = false;
>  }
>  
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 50ca458..89c7720 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -52,6 +52,7 @@ static bool has_pvpanic;
>  static bool has_pci_info;
>  static bool has_acpi_build = true;
>  static bool smbios_type1_defaults = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -115,6 +116,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      if (smbios_type1_defaults) {
>          /* These values are guest ABI, do not change */
> @@ -233,6 +235,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>  static void pc_compat_1_7(QEMUMachineInitArgs *args)
>  {
>      smbios_type1_defaults = false;
> +    gb_align = false;
>  }
>  
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 9af09d3..8047e82 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */
> -- 
> 1.7.1
Paolo Bonzini Dec. 10, 2013, 1:18 p.m. UTC | #6
Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
>> v2: condition enablement of new mapping to new machine types (Paolo)
>> v3: fix changelog
>> v4: rebase
>> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
>>     do not register zero-sized piece-one    (Igor)
>> v6: fix memory leak                         (Igor)
>>     fix integer overflow                    (Igor)
>>
>> ----
>>
>> Align guest physical address and host physical address
>> beyond guest 4GB on a 1GB boundary.
>>
>> Otherwise 1GB TLBs cannot be cached for the range.
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>> [Reorganize code, keep same logic. - Paolo]
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
> 
> Applied, thanks.

As discussed offlist, I'm not sure anymore that this is the right
approach to the problem.  No doubt it is very clever, in that it is
absolutely transparent to the guest.  However, the non-contiguous
mapping of ram_addr_t makes it more complex to associate the right NUMA
policy to the ranges.

If we could make a small guset visible change, it would be simpler to
always make the PCI hole 1GB in size; it is currently 256MB for i440FX
and 1.25GB for q35.  We can take a look as soon as the SeaBIOS patches
are in to use QEMU-built ACPI tables.

Paolo
Gerd Hoffmann Dec. 10, 2013, 2:53 p.m. UTC | #7
Hi,

> If we could make a small guset visible change, it would be simpler to
> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> and 1.25GB for q35.

Easy for i440fx.

Tricky for q35 as the firmware knows qemu will not map ram above
0xb000000 and places the mmconfig bar @ 0xb0000000.  Making the window
smaller (1.25GB -> 1GB) will create a conflict there.  Making it larger
(2G) will work.  It's done this way to keep 0xc0000000+ free for pci
bars, and we can map up to 512MB-sized bars there.

>   We can take a look as soon as the SeaBIOS patches
> are in to use QEMU-built ACPI tables.

This is in master already.

cheers,
  Gerd
Paolo Bonzini Dec. 10, 2013, 2:58 p.m. UTC | #8
Il 10/12/2013 15:53, Gerd Hoffmann ha scritto:
>   Hi,
> 
>> If we could make a small guset visible change, it would be simpler to
>> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
>> and 1.25GB for q35.
> 
> Easy for i440fx.
> 
> Tricky for q35 as the firmware knows qemu will not map ram above
> 0xb000000 and places the mmconfig bar @ 0xb0000000.  Making the window
> smaller (1.25GB -> 1GB) will create a conflict there.  Making it larger
> (2G) will work.  It's done this way to keep 0xc0000000+ free for pci
> bars, and we can map up to 512MB-sized bars there.

0xc0000000-0xfebfffff is almost 1GB, so there is room for 1 512MB-size
BAR.  mmconfig could move to 0xf0000000-0xf7ffffff; firmware is not a
problem because this would be only for new machine types.

Unless I'm missing something of course.

Paolo

>>   We can take a look as soon as the SeaBIOS patches
>> are in to use QEMU-built ACPI tables.
> 
> This is in master already.
> 
> cheers,
>   Gerd
> 
>
Marcelo Tosatti Dec. 10, 2013, 3:05 p.m. UTC | #9
On Tue, Dec 10, 2013 at 02:18:36PM +0100, Paolo Bonzini wrote:
> Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> > On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> >> v2: condition enablement of new mapping to new machine types (Paolo)
> >> v3: fix changelog
> >> v4: rebase
> >> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> >>     do not register zero-sized piece-one    (Igor)
> >> v6: fix memory leak                         (Igor)
> >>     fix integer overflow                    (Igor)
> >>
> >> ----
> >>
> >> Align guest physical address and host physical address
> >> beyond guest 4GB on a 1GB boundary.
> >>
> >> Otherwise 1GB TLBs cannot be cached for the range.
> >>
> >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >> [Reorganize code, keep same logic. - Paolo]
> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> ---
> > 
> > Applied, thanks.
> 
> As discussed offlist, I'm not sure anymore that this is the right
> approach to the problem.  No doubt it is very clever, in that it is
> absolutely transparent to the guest.  However, the non-contiguous
> mapping of ram_addr_t makes it more complex to associate the right NUMA
> policy to the ranges.

Please explain what is the difference, and why the complexity does not
exist with non-contiguous mapping of ram_addr_t.

> If we could make a small guset visible change, it would be simpler to
> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> and 1.25GB for q35.  We can take a look as soon as the SeaBIOS patches
> are in to use QEMU-built ACPI tables.
> 
> Paolo
Gerd Hoffmann Dec. 10, 2013, 3:36 p.m. UTC | #10
On Di, 2013-12-10 at 15:58 +0100, Paolo Bonzini wrote:
> Il 10/12/2013 15:53, Gerd Hoffmann ha scritto:
> >   Hi,
> > 
> >> If we could make a small guset visible change, it would be simpler to
> >> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> >> and 1.25GB for q35.
> > 
> > Easy for i440fx.
> > 
> > Tricky for q35 as the firmware knows qemu will not map ram above
> > 0xb000000 and places the mmconfig bar @ 0xb0000000.  Making the window
> > smaller (1.25GB -> 1GB) will create a conflict there.  Making it larger
> > (2G) will work.  It's done this way to keep 0xc0000000+ free for pci
> > bars, and we can map up to 512MB-sized bars there.
> 
> 0xc0000000-0xfebfffff is almost 1GB, so there is room for 1 512MB-size
> BAR.  mmconfig could move to 0xf0000000-0xf7ffffff;

Reduces number of pci busses from 256 to 128.  Not that this is a
problem now, but something to consider to make sure things are
future-proof.

> firmware is not a
> problem because this would be only for new machine types.

Even on qemu 2.0 it is the firmware which decides where to place the
mmconfig bar.  qemu will look at northbridge xbar register to generate a
matching mcfg acpi table.

cheers,
  Gerd
Laszlo Ersek Dec. 10, 2013, 3:47 p.m. UTC | #11
On 12/10/13 15:53, Gerd Hoffmann wrote:
>   Hi,
> 
>> If we could make a small guset visible change, it would be simpler to
>> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
>> and 1.25GB for q35.
> 
> Easy for i440fx.

I think it's going to break OVMF again.

Also, I don't understand the reference to "it is currently 256MB for
i440FX"... My understanding was that the PCI hole didn't shrink below
512MB in size for i440fx -- sorry if I missed something.

Thanks
Laszlo
Gerd Hoffmann Dec. 10, 2013, 3:53 p.m. UTC | #12
On Di, 2013-12-10 at 16:47 +0100, Laszlo Ersek wrote:
> On 12/10/13 15:53, Gerd Hoffmann wrote:
> >   Hi,
> > 
> >> If we could make a small guset visible change, it would be simpler to
> >> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> >> and 1.25GB for q35.
> > 
> > Easy for i440fx.
> 
> I think it's going to break OVMF again.

Can't see a reason why it should.

> Also, I don't understand the reference to "it is currently 256MB for
> i440FX"... My understanding was that the PCI hole didn't shrink below
> 512MB in size for i440fx -- sorry if I missed something.

512MB is correct (0xe0000000+).

cheers,
  Gerd
Michael S. Tsirkin Dec. 10, 2013, 4:52 p.m. UTC | #13
On Tue, Dec 10, 2013 at 02:18:36PM +0100, Paolo Bonzini wrote:
> Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> > On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> >> v2: condition enablement of new mapping to new machine types (Paolo)
> >> v3: fix changelog
> >> v4: rebase
> >> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> >>     do not register zero-sized piece-one    (Igor)
> >> v6: fix memory leak                         (Igor)
> >>     fix integer overflow                    (Igor)
> >>
> >> ----
> >>
> >> Align guest physical address and host physical address
> >> beyond guest 4GB on a 1GB boundary.
> >>
> >> Otherwise 1GB TLBs cannot be cached for the range.
> >>
> >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >> [Reorganize code, keep same logic. - Paolo]
> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> ---
> > 
> > Applied, thanks.
> 
> As discussed offlist, I'm not sure anymore that this is the right
> approach to the problem.  No doubt it is very clever, in that it is
> absolutely transparent to the guest.  However, the non-contiguous
> mapping of ram_addr_t makes it more complex to associate the right NUMA
> policy to the ranges.
> 
> If we could make a small guset visible change, it would be simpler to
> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> and 1.25GB for q35.  We can take a look as soon as the SeaBIOS patches
> are in to use QEMU-built ACPI tables.
> 
> Paolo

I think giving us the flexibility in selecting size for memory below 4G
is nice.

So let's do both: apply Marcel's patch for this specific problem,
for huge pages, I don't oppose making the guest visible change as well.
Marcelo Tosatti Dec. 10, 2013, 5:21 p.m. UTC | #14
On Tue, Dec 10, 2013 at 01:05:42PM -0200, Marcelo Tosatti wrote:
> On Tue, Dec 10, 2013 at 02:18:36PM +0100, Paolo Bonzini wrote:
> > Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> > > On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> > >> v2: condition enablement of new mapping to new machine types (Paolo)
> > >> v3: fix changelog
> > >> v4: rebase
> > >> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> > >>     do not register zero-sized piece-one    (Igor)
> > >> v6: fix memory leak                         (Igor)
> > >>     fix integer overflow                    (Igor)
> > >>
> > >> ----
> > >>
> > >> Align guest physical address and host physical address
> > >> beyond guest 4GB on a 1GB boundary.
> > >>
> > >> Otherwise 1GB TLBs cannot be cached for the range.
> > >>
> > >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > >> [Reorganize code, keep same logic. - Paolo]
> > >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > >> ---
> > > 
> > > Applied, thanks.
> > 
> > As discussed offlist, I'm not sure anymore that this is the right
> > approach to the problem.  No doubt it is very clever, in that it is
> > absolutely transparent to the guest.  However, the non-contiguous
> > mapping of ram_addr_t makes it more complex to associate the right NUMA
> > policy to the ranges.
> 
> Please explain what is the difference, and why the complexity does not
> exist with non-contiguous mapping of ram_addr_t.

You are right - it forces the 1GB page which contains the hole 
to be on the same NUMA node as the tail 1GB page - otherwise 
incorrect NUMA assignment is not possible.

Agreed.

> > If we could make a small guset visible change, it would be simpler to
> > always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> > and 1.25GB for q35.  We can take a look as soon as the SeaBIOS patches
> > are in to use QEMU-built ACPI tables.
> > 
> > Paolo
Laszlo Ersek Dec. 10, 2013, 5:46 p.m. UTC | #15
On 12/10/13 16:53, Gerd Hoffmann wrote:
> On Di, 2013-12-10 at 16:47 +0100, Laszlo Ersek wrote:
>> On 12/10/13 15:53, Gerd Hoffmann wrote:
>>>   Hi,
>>>
>>>> If we could make a small guset visible change, it would be simpler to
>>>> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
>>>> and 1.25GB for q35.
>>>
>>> Easy for i440fx.
>>
>> I think it's going to break OVMF again.
> 
> Can't see a reason why it should.

PCI enumeration in OVMF assigns resources from a window that starts
exactly above the end of below-4gb-memory. For example, in case of a
2.5GB guest, the frame buffer bar of cirrus can be somewhere just above
2.5GB.

If you change the PCI hole in qemu so that it will start at 3GB, always,
then the ACPI tables exported by qemu will also advertise the big mmio
range starting at 3GB. OVMF will pass those tables through to the OS.
Accordingly, the OS will try to access the framebuffer above 3GB, but
OVMF has configured that bar between 2.5GB and 3GB.

IOW, I think this proposal would undo your [PATCH v2] piix: fix 32bit
pci hole.

I can of course live with whatever PCI hole as long as it is made
available to OVMF through an easy-to-parse fw_cfg file. Then I can sync
the OVMF enumeration to qemu's preference.

The fw_cfg file "etc/pci-info" allowed me to do exactly that. But it has
been killed. I'd like it to be resurrected, even if SeaBIOS ignores it.

Thanks,
Laszlo
Paolo Bonzini Dec. 10, 2013, 5:48 p.m. UTC | #16
Il 10/12/2013 18:46, Laszlo Ersek ha scritto:
> On 12/10/13 16:53, Gerd Hoffmann wrote:
>> On Di, 2013-12-10 at 16:47 +0100, Laszlo Ersek wrote:
>>> On 12/10/13 15:53, Gerd Hoffmann wrote:
>>>>   Hi,
>>>>
>>>>> If we could make a small guset visible change, it would be simpler to
>>>>> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
>>>>> and 1.25GB for q35.
>>>>
>>>> Easy for i440fx.
>>>
>>> I think it's going to break OVMF again.
>>
>> Can't see a reason why it should.
> 
> PCI enumeration in OVMF assigns resources from a window that starts
> exactly above the end of below-4gb-memory. For example, in case of a
> 2.5GB guest, the frame buffer bar of cirrus can be somewhere just above
> 2.5GB.
> 
> If you change the PCI hole in qemu so that it will start at 3GB, always,
> then the ACPI tables exported by qemu will also advertise the big mmio
> range starting at 3GB. OVMF will pass those tables through to the OS.
> Accordingly, the OS will try to access the framebuffer above 3GB, but
> OVMF has configured that bar between 2.5GB and 3GB.

Right, you explained that in the past.  No need to worry, this would be
just for >3GB guests.

Paolo

> IOW, I think this proposal would undo your [PATCH v2] piix: fix 32bit
> pci hole.
> 
> I can of course live with whatever PCI hole as long as it is made
> available to OVMF through an easy-to-parse fw_cfg file. Then I can sync
> the OVMF enumeration to qemu's preference.
> 
> The fw_cfg file "etc/pci-info" allowed me to do exactly that. But it has
> been killed. I'd like it to be resurrected, even if SeaBIOS ignores it.
> 
> Thanks,
> Laszlo
>
Michael S. Tsirkin Dec. 10, 2013, 9 p.m. UTC | #17
On Tue, Dec 10, 2013 at 06:46:12PM +0100, Laszlo Ersek wrote:
> On 12/10/13 16:53, Gerd Hoffmann wrote:
> > On Di, 2013-12-10 at 16:47 +0100, Laszlo Ersek wrote:
> >> On 12/10/13 15:53, Gerd Hoffmann wrote:
> >>>   Hi,
> >>>
> >>>> If we could make a small guset visible change, it would be simpler to
> >>>> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> >>>> and 1.25GB for q35.
> >>>
> >>> Easy for i440fx.
> >>
> >> I think it's going to break OVMF again.
> > 
> > Can't see a reason why it should.
> 
> PCI enumeration in OVMF assigns resources from a window that starts
> exactly above the end of below-4gb-memory. For example, in case of a
> 2.5GB guest, the frame buffer bar of cirrus can be somewhere just above
> 2.5GB.
> 
> If you change the PCI hole in qemu so that it will start at 3GB, always,
> then the ACPI tables exported by qemu will also advertise the big mmio
> range starting at 3GB. OVMF will pass those tables through to the OS.
> Accordingly, the OS will try to access the framebuffer above 3GB, but
> OVMF has configured that bar between 2.5GB and 3GB.
> 
> IOW, I think this proposal would undo your [PATCH v2] piix: fix 32bit
> pci hole.
> 
> I can of course live with whatever PCI hole as long as it is made
> available to OVMF through an easy-to-parse fw_cfg file. Then I can sync
> the OVMF enumeration to qemu's preference.
> 
> The fw_cfg file "etc/pci-info" allowed me to do exactly that. But it has
> been killed. I'd like it to be resurrected, even if SeaBIOS ignores it.
> 
> Thanks,
> Laszlo

There's no PCI hole in QEMU.
All there is, is RAM split in two chunks: below and above 4G.
Michael S. Tsirkin Dec. 10, 2013, 9:02 p.m. UTC | #18
On Tue, Dec 10, 2013 at 03:21:44PM -0200, Marcelo Tosatti wrote:
> On Tue, Dec 10, 2013 at 01:05:42PM -0200, Marcelo Tosatti wrote:
> > On Tue, Dec 10, 2013 at 02:18:36PM +0100, Paolo Bonzini wrote:
> > > Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> > > > On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> > > >> v2: condition enablement of new mapping to new machine types (Paolo)
> > > >> v3: fix changelog
> > > >> v4: rebase
> > > >> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> > > >>     do not register zero-sized piece-one    (Igor)
> > > >> v6: fix memory leak                         (Igor)
> > > >>     fix integer overflow                    (Igor)
> > > >>
> > > >> ----
> > > >>
> > > >> Align guest physical address and host physical address
> > > >> beyond guest 4GB on a 1GB boundary.
> > > >>
> > > >> Otherwise 1GB TLBs cannot be cached for the range.
> > > >>
> > > >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > >> [Reorganize code, keep same logic. - Paolo]
> > > >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > > >> ---
> > > > 
> > > > Applied, thanks.
> > > 
> > > As discussed offlist, I'm not sure anymore that this is the right
> > > approach to the problem.  No doubt it is very clever, in that it is
> > > absolutely transparent to the guest.  However, the non-contiguous
> > > mapping of ram_addr_t makes it more complex to associate the right NUMA
> > > policy to the ranges.
> > 
> > Please explain what is the difference, and why the complexity does not
> > exist with non-contiguous mapping of ram_addr_t.
> 
> You are right - it forces the 1GB page which contains the hole 
> to be on the same NUMA node as the tail 1GB page - otherwise 
> incorrect NUMA assignment is not possible.

What does this phrase mean?

Are we all in agreement that we want this patch, in addition to
resizing below 4g memory?

> 
> Agreed.
> 
> > > If we could make a small guset visible change, it would be simpler to
> > > always make the PCI hole 1GB in size; it is currently 256MB for i440FX
> > > and 1.25GB for q35.  We can take a look as soon as the SeaBIOS patches
> > > are in to use QEMU-built ACPI tables.
> > > 
> > > Paolo
Laszlo Ersek Dec. 10, 2013, 10:13 p.m. UTC | #19
On 12/10/13 22:00, Michael S. Tsirkin wrote:
> On Tue, Dec 10, 2013 at 06:46:12PM +0100, Laszlo Ersek wrote:
>> On 12/10/13 16:53, Gerd Hoffmann wrote:
>>> On Di, 2013-12-10 at 16:47 +0100, Laszlo Ersek wrote:
>>>> On 12/10/13 15:53, Gerd Hoffmann wrote:
>>>>>   Hi,
>>>>>
>>>>>> If we could make a small guset visible change, it would be simpler to
>>>>>> always make the PCI hole 1GB in size; it is currently 256MB for i440FX
>>>>>> and 1.25GB for q35.
>>>>>
>>>>> Easy for i440fx.
>>>>
>>>> I think it's going to break OVMF again.
>>>
>>> Can't see a reason why it should.
>>
>> PCI enumeration in OVMF assigns resources from a window that starts
>> exactly above the end of below-4gb-memory. For example, in case of a
>> 2.5GB guest, the frame buffer bar of cirrus can be somewhere just above
>> 2.5GB.
>>
>> If you change the PCI hole in qemu so that it will start at 3GB, always,
>> then the ACPI tables exported by qemu will also advertise the big mmio
>> range starting at 3GB. OVMF will pass those tables through to the OS.
>> Accordingly, the OS will try to access the framebuffer above 3GB, but
>> OVMF has configured that bar between 2.5GB and 3GB.
>>
>> IOW, I think this proposal would undo your [PATCH v2] piix: fix 32bit
>> pci hole.
>>
>> I can of course live with whatever PCI hole as long as it is made
>> available to OVMF through an easy-to-parse fw_cfg file. Then I can sync
>> the OVMF enumeration to qemu's preference.
>>
>> The fw_cfg file "etc/pci-info" allowed me to do exactly that. But it has
>> been killed. I'd like it to be resurrected, even if SeaBIOS ignores it.
>>
>> Thanks,
>> Laszlo
> 
> There's no PCI hole in QEMU.
> All there is, is RAM split in two chunks: below and above 4G.

I don't mind reformulating (but we've been around this track a couple of
times):

Please let OVMF know about the big MMIO range (the "split" you mention)
- that is going to be advertised via ACPI to the OS,
- in an easy to parse format over fw_cfg.

Gerd's patch that I mentioned above doesn't do this. It restores peace
between the ACPI advertisment produced by qemu and between OVMF's
enumeration by duplicating logic that OVMF does ahead of enumeration. If
you change that qemu code, then OVMF will either have to duplicate the
new qemu code, or it will need to understand the boundaries of the split
from an easy-to-parse fw_cfg file.

Thanks
Laszlo
Laszlo Ersek Dec. 10, 2013, 10:15 p.m. UTC | #20
On 12/10/13 23:13, Laszlo Ersek wrote:

> Please let OVMF know about the big MMIO range (the "split" you mention)
> - that is going to be advertised via ACPI to the OS,
> - in an easy to parse format over fw_cfg.

Anyway just ignore me this time. I test OVMF regularly on fresh qemu
builds and if something breaks I'll complain. (NB the current status
*is* broken, I just haven't been complaining recently because I keep
rebasing Gerd's patch.)

Thanks
Laszlo
Michael S. Tsirkin Dec. 10, 2013, 11:17 p.m. UTC | #21
On Tue, Dec 10, 2013 at 11:15:40PM +0100, Laszlo Ersek wrote:
> On 12/10/13 23:13, Laszlo Ersek wrote:
> 
> > Please let OVMF know about the big MMIO range (the "split" you mention)
> > - that is going to be advertised via ACPI to the OS,
> > - in an easy to parse format over fw_cfg.
> 
> Anyway just ignore me this time. I test OVMF regularly on fresh qemu
> builds and if something breaks I'll complain. (NB the current status
> *is* broken, I just haven't been complaining recently because I keep
> rebasing Gerd's patch.)
> 
> Thanks
> Laszlo

I'd like to merge Gerd's patch, and a similar patch for q35.
For q35 we need to get rid of MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT.

However, I think we have to limit this change to pc-2.0 and newer:
qemu 1.8 still has the "pci hole" concept, so guest can't put
devices outside the specific ranges.

Anyone plans to look into this?
Gerd Hoffmann Dec. 11, 2013, 8 a.m. UTC | #22
Hi,

> I'd like to merge Gerd's patch, and a similar patch for q35.
> For q35 we need to get rid of MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT.

Missing q35 patch is no reason to delay the i440fx fix though.

> However, I think we have to limit this change to pc-2.0 and newer:
> qemu 1.8 still has the "pci hole" concept, so guest can't put
> devices outside the specific ranges.

--verbose please.  I fail to see the problem here.

cheers,
  Gerd
Marcelo Tosatti Dec. 11, 2013, 1:41 p.m. UTC | #23
On Tue, Dec 10, 2013 at 11:02:41PM +0200, Michael S. Tsirkin wrote:
> On Tue, Dec 10, 2013 at 03:21:44PM -0200, Marcelo Tosatti wrote:
> > On Tue, Dec 10, 2013 at 01:05:42PM -0200, Marcelo Tosatti wrote:
> > > On Tue, Dec 10, 2013 at 02:18:36PM +0100, Paolo Bonzini wrote:
> > > > Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> > > > > On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> > > > >> v2: condition enablement of new mapping to new machine types (Paolo)
> > > > >> v3: fix changelog
> > > > >> v4: rebase
> > > > >> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> > > > >>     do not register zero-sized piece-one    (Igor)
> > > > >> v6: fix memory leak                         (Igor)
> > > > >>     fix integer overflow                    (Igor)
> > > > >>
> > > > >> ----
> > > > >>
> > > > >> Align guest physical address and host physical address
> > > > >> beyond guest 4GB on a 1GB boundary.
> > > > >>
> > > > >> Otherwise 1GB TLBs cannot be cached for the range.
> > > > >>
> > > > >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > >> [Reorganize code, keep same logic. - Paolo]
> > > > >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > > > >> ---
> > > > > 
> > > > > Applied, thanks.
> > > > 
> > > > As discussed offlist, I'm not sure anymore that this is the right
> > > > approach to the problem.  No doubt it is very clever, in that it is
> > > > absolutely transparent to the guest.  However, the non-contiguous
> > > > mapping of ram_addr_t makes it more complex to associate the right NUMA
> > > > policy to the ranges.
> > > 
> > > Please explain what is the difference, and why the complexity does not
> > > exist with non-contiguous mapping of ram_addr_t.
> > 
> > You are right - it forces the 1GB page which contains the hole 
> > to be on the same NUMA node as the tail 1GB page - otherwise 
> > incorrect NUMA assignment is not possible.
> 
> What does this phrase mean?
> 
> Are we all in agreement that we want this patch, in addition to
> resizing below 4g memory?

It means that its necessary to expose that 3-4GB physical memory region
in QEMU belongs to the same node (that is, guest must be aware that
3-3.75GB and the tail of RAM are on the same node).

So the problem Paolo mentions is fixable.
Michael S. Tsirkin Dec. 11, 2013, 2:20 p.m. UTC | #24
On Wed, Dec 11, 2013 at 11:41:18AM -0200, Marcelo Tosatti wrote:
> On Tue, Dec 10, 2013 at 11:02:41PM +0200, Michael S. Tsirkin wrote:
> > On Tue, Dec 10, 2013 at 03:21:44PM -0200, Marcelo Tosatti wrote:
> > > On Tue, Dec 10, 2013 at 01:05:42PM -0200, Marcelo Tosatti wrote:
> > > > On Tue, Dec 10, 2013 at 02:18:36PM +0100, Paolo Bonzini wrote:
> > > > > Il 28/11/2013 11:26, Michael S. Tsirkin ha scritto:
> > > > > > On Mon, Nov 25, 2013 at 06:43:13PM +0100, Paolo Bonzini wrote:
> > > > > >> v2: condition enablement of new mapping to new machine types (Paolo)
> > > > > >> v3: fix changelog
> > > > > >> v4: rebase
> > > > > >> v5: ensure alignment of piecetwo on 2MB GPA (Igor)
> > > > > >>     do not register zero-sized piece-one    (Igor)
> > > > > >> v6: fix memory leak                         (Igor)
> > > > > >>     fix integer overflow                    (Igor)
> > > > > >>
> > > > > >> ----
> > > > > >>
> > > > > >> Align guest physical address and host physical address
> > > > > >> beyond guest 4GB on a 1GB boundary.
> > > > > >>
> > > > > >> Otherwise 1GB TLBs cannot be cached for the range.
> > > > > >>
> > > > > >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > >> [Reorganize code, keep same logic. - Paolo]
> > > > > >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > > > > >> ---
> > > > > > 
> > > > > > Applied, thanks.
> > > > > 
> > > > > As discussed offlist, I'm not sure anymore that this is the right
> > > > > approach to the problem.  No doubt it is very clever, in that it is
> > > > > absolutely transparent to the guest.  However, the non-contiguous
> > > > > mapping of ram_addr_t makes it more complex to associate the right NUMA
> > > > > policy to the ranges.
> > > > 
> > > > Please explain what is the difference, and why the complexity does not
> > > > exist with non-contiguous mapping of ram_addr_t.
> > > 
> > > You are right - it forces the 1GB page which contains the hole 
> > > to be on the same NUMA node as the tail 1GB page - otherwise 
> > > incorrect NUMA assignment is not possible.
> > 
> > What does this phrase mean?
> > 
> > Are we all in agreement that we want this patch, in addition to
> > resizing below 4g memory?
> 
> It means that its necessary to expose that 3-4GB physical memory region
> in QEMU belongs to the same node (that is, guest must be aware that
> 3-3.75GB and the tail of RAM are on the same node).
> 
> So the problem Paolo mentions is fixable.

Okay so
Marcelo - do you ack this patch for 2.0?
Paolo - do you re-ack this patch for 2.0?

Thanks,
Paolo Bonzini Dec. 11, 2013, 2:45 p.m. UTC | #25
Il 11/12/2013 15:20, Michael S. Tsirkin ha scritto:
> > It means that its necessary to expose that 3-4GB physical memory region
> > in QEMU belongs to the same node (that is, guest must be aware that
> > 3-3.75GB and the tail of RAM are on the same node).
> > 
> > So the problem Paolo mentions is fixable.

I'm not sure if it is fixable.  You need a 2M mountpoint to bind the 3G-4G
range correctly, a 1G mountpoint for everything else, and QEMU only allows
to specify one path.

Without Marcelo's patch there is a workaround; if you know the size of the 4G
hole and configure the first two nodes with unequal sizes.  For example

   -m 8192 \
   -object memory-ram,id=ram-node0,size=3840M,hostnode=0 -numa node,memdev=ram-node0 \
   -object memory-ram,id=ram-node1,size=4352M,hostnode=1 -numa node,memdev=ram-node1

   RAM address        Host virtual address low bits      Guest physical addresses
   0M-3840M           0                                  0M-3840M
   3840M-8192M        0                                  4096M-8448M

Then you'll waste 1GB of RAM (you'll use 9 hugepages instead of 8), but
everything will be aligned.  Or you just make your guest 7680M and not waste
the memory.

But with Marcelo's patch, ram-node1 will be split in two.  QEMU will try
to realign the second part of ram-node1, but the result is that the second
part is misaligned and only the first 256M (the tail of guest physical
memory) stays aligned:

   RAM address        Host virtual address low bits      Guest physical addresses
   0M-3840M           0                                  0M-3840M
   4096M-8192M        256M                               4096M-8192M
   3840M-4096M        0                                  8192M-8448M

So you still waste memory, _and_ get incorrect alignment.

> Okay so
> Marcelo - do you ack this patch for 2.0?
> Paolo - do you re-ack this patch for 2.0?

I very much prefer Gerd's approach.  2GB low memory for q35 is a bit wasteful,
but we have some time to fix that before release.

Paolo
Michael S. Tsirkin Dec. 11, 2013, 3:39 p.m. UTC | #26
On Wed, Dec 11, 2013 at 03:45:29PM +0100, Paolo Bonzini wrote:
> Il 11/12/2013 15:20, Michael S. Tsirkin ha scritto:
> > > It means that its necessary to expose that 3-4GB physical memory region
> > > in QEMU belongs to the same node (that is, guest must be aware that
> > > 3-3.75GB and the tail of RAM are on the same node).
> > > 
> > > So the problem Paolo mentions is fixable.
> 
> I'm not sure if it is fixable.  You need a 2M mountpoint to bind the 3G-4G
> range correctly, a 1G mountpoint for everything else, and QEMU only allows
> to specify one path.
> 
> Without Marcelo's patch there is a workaround; if you know the size of the 4G
> hole and configure the first two nodes with unequal sizes.  For example
> 
>    -m 8192 \
>    -object memory-ram,id=ram-node0,size=3840M,hostnode=0 -numa node,memdev=ram-node0 \
>    -object memory-ram,id=ram-node1,size=4352M,hostnode=1 -numa node,memdev=ram-node1
> 
>    RAM address        Host virtual address low bits      Guest physical addresses
>    0M-3840M           0                                  0M-3840M
>    3840M-8192M        0                                  4096M-8448M
> 
> Then you'll waste 1GB of RAM (you'll use 9 hugepages instead of 8), but
> everything will be aligned.  Or you just make your guest 7680M and not waste
> the memory.
> 
> But with Marcelo's patch, ram-node1 will be split in two.  QEMU will try
> to realign the second part of ram-node1, but the result is that the second
> part is misaligned and only the first 256M (the tail of guest physical
> memory) stays aligned:
> 
>    RAM address        Host virtual address low bits      Guest physical addresses
>    0M-3840M           0                                  0M-3840M
>    4096M-8192M        256M                               4096M-8192M
>    3840M-4096M        0                                  8192M-8448M
> 
> So you still waste memory, _and_ get incorrect alignment.
> 
> > Okay so
> > Marcelo - do you ack this patch for 2.0?
> > Paolo - do you re-ack this patch for 2.0?
> 
> I very much prefer Gerd's approach.

Thanks, I will drop this from my tree for now.
Please re-submit if you reconsider.

> 2GB low memory for q35 is a bit wasteful,
> but we have some time to fix that before release.
> 
> Paolo

How would you fix that?
Paolo Bonzini Dec. 11, 2013, 3:41 p.m. UTC | #27
Il 11/12/2013 16:39, Michael S. Tsirkin ha scritto:
> > 2GB low memory for q35 is a bit wasteful,
> > but we have some time to fix that before release.
> 
> How would you fix that?

Just use 1GB, coupled with reducing the default MMCONFIG size to 128
buses.  That's already ~40 PCIe devices if you count upstream/downstream
ports in addition to the actual device, and you can also use PCIe-to-PCI
bridges if you need more room.

Paolo
Igor Mammedov Dec. 11, 2013, 3:45 p.m. UTC | #28
On Wed, 11 Dec 2013 15:45:29 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 11/12/2013 15:20, Michael S. Tsirkin ha scritto:
> > > It means that its necessary to expose that 3-4GB physical memory region
> > > in QEMU belongs to the same node (that is, guest must be aware that
> > > 3-3.75GB and the tail of RAM are on the same node).
> > > 
> > > So the problem Paolo mentions is fixable.
> 
> I'm not sure if it is fixable.  You need a 2M mountpoint to bind the 3G-4G
> range correctly, a 1G mountpoint for everything else, and QEMU only allows
> to specify one path.
we could do it with hugepage memdev backend.
something like:

-object hugepage-ram,id=mem1gb,size=3G,host-node=0,mem-path=/1gb-hugepage-fs
-device dimm,id=hp1g,memdev=mem1gb,node=0
-object hugepage-ram,id=mem2mb,size=500Mb,host-node=1,mem-path=/2mb-hugepage-fs
-device dimm,id=hp2mb,memdev=mem2mb,node=1

that basically would allow to distribute initial memory in any way user would
like.

> 
> Without Marcelo's patch there is a workaround; if you know the size of the 4G
> hole and configure the first two nodes with unequal sizes.  For example
> 
>    -m 8192 \
>    -object memory-ram,id=ram-node0,size=3840M,hostnode=0 -numa node,memdev=ram-node0 \
>    -object memory-ram,id=ram-node1,size=4352M,hostnode=1 -numa node,memdev=ram-node1
> 
>    RAM address        Host virtual address low bits      Guest physical addresses
>    0M-3840M           0                                  0M-3840M
>    3840M-8192M        0                                  4096M-8448M
> 
> Then you'll waste 1GB of RAM (you'll use 9 hugepages instead of 8), but
> everything will be aligned.  Or you just make your guest 7680M and not waste
> the memory.
> 
> But with Marcelo's patch, ram-node1 will be split in two.  QEMU will try
> to realign the second part of ram-node1, but the result is that the second
> part is misaligned and only the first 256M (the tail of guest physical
> memory) stays aligned:
> 
>    RAM address        Host virtual address low bits      Guest physical addresses
>    0M-3840M           0                                  0M-3840M
>    4096M-8192M        256M                               4096M-8192M
>    3840M-4096M        0                                  8192M-8448M
> 
> So you still waste memory, _and_ get incorrect alignment.
> 
> > Okay so
> > Marcelo - do you ack this patch for 2.0?
> > Paolo - do you re-ack this patch for 2.0?
> 
> I very much prefer Gerd's approach.  2GB low memory for q35 is a bit wasteful,
> but we have some time to fix that before release.
> 
> Paolo
>
Michael S. Tsirkin Dec. 11, 2013, 3:51 p.m. UTC | #29
On Wed, Dec 11, 2013 at 04:41:08PM +0100, Paolo Bonzini wrote:
> Il 11/12/2013 16:39, Michael S. Tsirkin ha scritto:
> > > 2GB low memory for q35 is a bit wasteful,
> > > but we have some time to fix that before release.
> > 
> > How would you fix that?
> 
> Just use 1GB, coupled with reducing the default MMCONFIG size to 128
> buses.
> That's already ~40 PCIe devices

This is not really enough, people requested more than 40 devices.
We will likely need multiroot down the road because some people want
more than 256 devices ...

> if you count upstream/downstream
> ports in addition to the actual device, and you can also use PCIe-to-PCI
> bridges if you need more room.
> 
> Paolo


We don't want people to use PCI down the road.
Everything should be PCIE so we can use things like native hotplug.
Paolo Bonzini Dec. 11, 2013, 3:56 p.m. UTC | #30
Il 11/12/2013 16:45, Igor Mammedov ha scritto:
>> > I'm not sure if it is fixable.  You need a 2M mountpoint to bind the 3G-4G
>> > range correctly, a 1G mountpoint for everything else, and QEMU only allows
>> > to specify one path.
> we could do it with hugepage memdev backend.
> something like:
> 
> -object hugepage-ram,id=mem1gb,size=3G,host-node=0,mem-path=/1gb-hugepage-fs
> -device dimm,id=hp1g,memdev=mem1gb,node=0
> -object hugepage-ram,id=mem2mb,size=500Mb,host-node=1,mem-path=/2mb-hugepage-fs
> -device dimm,id=hp2mb,memdev=mem2mb,node=1
> 
> that basically would allow to distribute initial memory in any way user would
> like.

If you allow for DIMMs, you can just use a small initial amount of
memory (2GB), and cold-plug DIMMs at 4GB.  Then you get exactly the same
result as Gerd's patch. :)

But the beauty of Marcelo's idea was that the user didn't need to do
anything, and the guest did not see anything.  It's a great approach for
backwards-compatibility, no doubt about that.

Paolo
Marcelo Tosatti Dec. 11, 2013, 5:26 p.m. UTC | #31
On Wed, Dec 11, 2013 at 03:45:29PM +0100, Paolo Bonzini wrote:
> Il 11/12/2013 15:20, Michael S. Tsirkin ha scritto:
> > > It means that its necessary to expose that 3-4GB physical memory region
> > > in QEMU belongs to the same node (that is, guest must be aware that
> > > 3-3.75GB and the tail of RAM are on the same node).
> > > 
> > > So the problem Paolo mentions is fixable.
> 
> I'm not sure if it is fixable.  You need a 2M mountpoint to bind the 3G-4G
> range correctly, a 1G mountpoint for everything else, and QEMU only allows
> to specify one path.
> 
> Without Marcelo's patch there is a workaround; if you know the size of the 4G
> hole and configure the first two nodes with unequal sizes.  For example
> 
>    -m 8192 \
>    -object memory-ram,id=ram-node0,size=3840M,hostnode=0 -numa node,memdev=ram-node0 \
>    -object memory-ram,id=ram-node1,size=4352M,hostnode=1 -numa node,memdev=ram-node1
> 
>    RAM address        Host virtual address low bits      Guest physical addresses
>    0M-3840M           0                                  0M-3840M
>    3840M-8192M        0                                  4096M-8448M
> 
> Then you'll waste 1GB of RAM (you'll use 9 hugepages instead of 8), but
> everything will be aligned.  Or you just make your guest 7680M and not waste
> the memory.
> 
> But with Marcelo's patch, ram-node1 will be split in two.  QEMU will try
> to realign the second part of ram-node1, but the result is that the second
> part is misaligned and only the first 256M (the tail of guest physical
> memory) stays aligned:
> 
>    RAM address        Host virtual address low bits      Guest physical addresses
>    0M-3840M           0                                  0M-3840M
>    4096M-8192M        256M                               4096M-8192M
>    3840M-4096M        0                                  8192M-8448M
> 
> So you still waste memory, _and_ get incorrect alignment.

You are adding a new aspect to the problem (that host memory is created
as separate devices). $subject alignment code aligns on top of one
hosts virtually contiguous address range.

As mentioned in the earlier thread with Igor, this is fixable as long 
as two memory devices map to a single host virtually contiguous range.
But if you think that aligning to 1GB guest hole alignment, don't have 
a problem with that.

For the NUMA problem, as long as its possible to specify multiple
physical address ranges for a single node (which is true), the problem
you raise is fixable. All is necessary is to expose to the guest 
which non physically contiguous 1GB ranges ("representing" 1GB pages in
the host), reside in what node.

> > Okay so
> > Marcelo - do you ack this patch for 2.0?
> > Paolo - do you re-ack this patch for 2.0?
> 
> I very much prefer Gerd's approach.  2GB low memory for q35 is a bit wasteful,
> but we have some time to fix that before release.
> 
> Paolo

OK.
diff mbox

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 6c82ada..485b44d 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1148,8 +1148,10 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
+    uint64_t holesize, pieceonesize, piecetwosize;
+    uint64_t memsize, align_offset;
 
     linux_boot = (kernel_filename != NULL);
 
@@ -1157,26 +1159,73 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
      * aliases to address portions of it, mostly for backwards compatibility
      * with older qemus that used qemu_ram_alloc().
      */
+    memsize = below_4g_mem_size + above_4g_mem_size;
+    holesize = 0x100000000ULL - below_4g_mem_size;
+
+    /* If 1GB hugepages are used to back guest RAM, we want the
+     * physical address 4GB to map to 4GB in the RAM, so that
+     * memory beyond 4GB is aligned on a 1GB boundary, at the
+     * host physical address space.  Thus, the ram block range
+     * [holestart, 4GB] is mapped to the last holesize bytes of RAM:
+     *
+     *                      0      h     4G     memsize-holesize
+     *
+     * contiguous-ram-block [xxxxxx][yyy][zzzzz]
+     *                                '-----------.
+     * guest-addr-space     [xxxxxx]     [zzzzz][yyy]
+     *
+     * This is only done in new-enough machine types, and of course
+     * it  is only possible if the [zzzzz] block exists at all.
+     */
+    if (guest_info->gb_align && above_4g_mem_size > holesize) {
+        /* Round the allocation up to 2 MB to make [zzzzz]'s size
+         * aligned, removing the extra from the [yyy] piece.
+         */
+        align_offset = ROUND_UP(memsize, 1UL << 21) - memsize;
+        piecetwosize = holesize - align_offset;
+    } else {
+        /* There's no [zzzzz] piece, all memory above 4G starts
+         * at below_4g_mem_size in the RAM block.  Also no need
+         * to align anything.
+         */
+        align_offset = 0;
+        piecetwosize = above_4g_mem_size;
+    }
+
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+    memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset);
     vmstate_register_ram_global(ram);
     *ram_memory = ram;
+
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
                              0, below_4g_mem_size);
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
+
+    pieceonesize = above_4g_mem_size - piecetwosize;
+    if (pieceonesize) {
+        ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone));
+        memory_region_init_alias(ram_above_4g_pieceone, NULL,
+                                 "ram-above-4g-pieceone", ram,
+                                 0x100000000ULL, pieceonesize);
+        memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g_pieceone);
+    }
+    if (piecetwosize) {
+        ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+        memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                 "ram-above-4g-piecetwo", ram,
+                                 below_4g_mem_size, piecetwosize);
+        memory_region_add_subregion(system_memory,
+                                    0x100000000ULL + pieceonesize,
+                                    ram_above_4g_piecetwo);
+    }
+
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
-        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
-                                    ram_above_4g);
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
-
     /* Initialize PC system firmware */
     pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw);
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 36f2495..ca9bd2e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -62,6 +62,7 @@  static bool has_pvpanic;
 static bool has_pci_info;
 static bool has_acpi_build = true;
 static bool smbios_type1_defaults = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -130,6 +131,7 @@  static void pc_init1(QEMUMachineInitArgs *args,
 
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     if (smbios_type1_defaults) {
         /* These values are guest ABI, do not change */
@@ -249,6 +251,7 @@  static void pc_init_pci(QEMUMachineInitArgs *args)
 static void pc_compat_1_7(QEMUMachineInitArgs *args)
 {
     smbios_type1_defaults = false;
+    gb_align = false;
 }
 
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 50ca458..89c7720 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -52,6 +52,7 @@  static bool has_pvpanic;
 static bool has_pci_info;
 static bool has_acpi_build = true;
 static bool smbios_type1_defaults = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -115,6 +116,7 @@  static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
     guest_info->has_acpi_build = has_acpi_build;
+    guest_info->gb_align = gb_align;
 
     if (smbios_type1_defaults) {
         /* These values are guest ABI, do not change */
@@ -233,6 +235,7 @@  static void pc_q35_init(QEMUMachineInitArgs *args)
 static void pc_compat_1_7(QEMUMachineInitArgs *args)
 {
     smbios_type1_defaults = false;
+    gb_align = false;
 }
 
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 9af09d3..8047e82 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@  struct PcGuestInfo {
     uint64_t *node_cpu;
     FWCfgState *fw_cfg;
     bool has_acpi_build;
+    bool gb_align;
 };
 
 /* parallel.c */