diff mbox

i386: pc: align gpa<->hpa on 1GB boundary (v4)

Message ID 20131106213119.GA15543@amt.cnet
State New
Headers show

Commit Message

Marcelo Tosatti Nov. 6, 2013, 9:31 p.m. UTC
v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog
v4: rebase

-----


Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLBs cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Comments

Michael S. Tsirkin Nov. 6, 2013, 9:40 p.m. UTC | #1
On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> 
> -----
> 
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Um. This will conflict with:
    pc: map PCI address space as catchall region for not mapped addresses

I think we really should stop using the hacked hole thing
and just use priorities like that patch does.
Do you agree? If yes I'm afraid your patch will have to be
rebased on top of that yet again, sorry to give you a
run-around like that :(


Also - do you think this is 1.7 material?

> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 12c436e..d29196d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
>  
>      linux_boot = (kernel_filename != NULL);
> @@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, holesize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - holesize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 4fdb7b6..686736e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>      pc_init1(args, 1, 1);
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +    gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
>      disable_kvm_pv_eoi();
>  }
>  
> +static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_init_pci(args);
> +}
> +
>  static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (i440FX + PIIX, 1996)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> +static QEMUMachine pc_i440fx_machine_v1_8 = {
> +    PC_I440FX_1_8_MACHINE_OPTIONS,
> +    .name = "pc-i440fx-1.8",
> +    .alias = "pc",
> +    .init = pc_init_pci,
> +    .is_default = 1,
> +};
> +
>  #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
>  static QEMUMachine pc_i440fx_machine_v1_7 = {
>      PC_I440FX_1_7_MACHINE_OPTIONS,
>      .name = "pc-i440fx-1.7",
>      .alias = "pc",
> -    .init = pc_init_pci,
> -    .is_default = 1,
> +    .init = pc_init_pci_1_7,
>  };
>  
>  #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> @@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
>  
>  static void pc_machine_init(void)
>  {
> +    qemu_register_machine(&pc_i440fx_machine_v1_8);
>      qemu_register_machine(&pc_i440fx_machine_v1_7);
>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>      qemu_register_machine(&pc_i440fx_machine_v1_5);
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 4c191d3..c2eb568 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -50,6 +50,7 @@
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      }
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +   gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
>      x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
>  }
>  
> +static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_q35_init(args);
> +}
> +
>  static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (Q35 + ICH9, 2009)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> +
> +static QEMUMachine pc_q35_machine_v1_8 = {
> +    PC_Q35_1_8_MACHINE_OPTIONS,
> +    .name = "pc-q35-1.8",
> +    .alias = "q35",
> +    .init = pc_q35_init,
> +};
> +
>  #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
>  
>  static QEMUMachine pc_q35_machine_v1_7 = {
>      PC_Q35_1_7_MACHINE_OPTIONS,
>      .name = "pc-q35-1.7",
>      .alias = "q35",
> -    .init = pc_q35_init,
> +    .init = pc_q35_init_1_7,
>  };
>  
>  #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> @@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>  
>  static void pc_q35_machine_init(void)
>  {
> +    qemu_register_machine(&pc_q35_machine_v1_8);
>      qemu_register_machine(&pc_q35_machine_v1_7);
>      qemu_register_machine(&pc_q35_machine_v1_6);
>      qemu_register_machine(&pc_q35_machine_v1_5);
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 03cc0ba..35a6885 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */
Marcelo Tosatti Nov. 6, 2013, 9:53 p.m. UTC | #2
On Wed, Nov 06, 2013 at 11:40:34PM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> > 
> > v2: condition enablement of new mapping to new machine types (Paolo)
> > v3: fix changelog
> > v4: rebase
> > 
> > -----
> > 
> > 
> > Align guest physical address and host physical address
> > beyond guest 4GB on a 1GB boundary.
> > 
> > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Um. This will conflict with:
>     pc: map PCI address space as catchall region for not mapped addresses
> 
> I think we really should stop using the hacked hole thing
> and just use priorities like that patch does.

Sorry hacked in what way?

This patch is necessary to enable 1GB hugepages beyond 4GB of RAM on the
current machine types.

> Do you agree? If yes I'm afraid your patch will have to be
> rebased on top of that yet again, sorry to give you a
> run-around like that :(

I don't see what exactly is the suggestion (or why the proposed 
patch should conflict with "pc: map PCI address space as catchall region
for not mapped addresses").

> Also - do you think this is 1.7 material?

No. Paolo mentioned you have a tree with 1.8 material, correct?
Michael S. Tsirkin Nov. 6, 2013, 10:15 p.m. UTC | #3
On Wed, Nov 06, 2013 at 07:53:51PM -0200, Marcelo Tosatti wrote:
> On Wed, Nov 06, 2013 at 11:40:34PM +0200, Michael S. Tsirkin wrote:
> > On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> > > 
> > > v2: condition enablement of new mapping to new machine types (Paolo)
> > > v3: fix changelog
> > > v4: rebase
> > > 
> > > -----
> > > 
> > > 
> > > Align guest physical address and host physical address
> > > beyond guest 4GB on a 1GB boundary.
> > > 
> > > Otherwise 1GB TLBs cannot be cached for the range.
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > Um. This will conflict with:
> >     pc: map PCI address space as catchall region for not mapped addresses
> > 
> > I think we really should stop using the hacked hole thing
> > and just use priorities like that patch does.
> 
> Sorry hacked in what way?
> This patch is necessary to enable 1GB hugepages beyond 4GB of RAM on the
> current machine types.


Sorry if I wasn't clear. when I said "hacked" I was talking about the
pci hole concept generally in upstream qemu, not about your patch.

Its hacked because there's no "pci hole" on PIIX.
pci hole is where pci was hiding some ram behind it
on some systems. AFAIK this is not what is happens on piix though.
What happens really is that everything not covered by RAM memory is PCI.

We implemented this using two aliases of RAM but
the natural thing is really just making PCI lower
priority than RAM and let it overlap.

> > Do you agree? If yes I'm afraid your patch will have to be
> > rebased on top of that yet again, sorry to give you a
> > run-around like that :(
> 
> I don't see what exactly is the suggestion (or why the proposed 
> patch should conflict with "pc: map PCI address space as catchall region
> for not mapped addresses").

It seemed to me that they will conflict but it's after midnight
so maybe I'm confused.
You are saying you apply yours on top and there's no conflict?
In that case I'll recheck, sorry.

> > Also - do you think this is 1.7 material?
> 
> No. Paolo mentioned you have a tree with 1.8 material, correct?

Yes

git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git pci
Marcelo Tosatti Nov. 6, 2013, 10:24 p.m. UTC | #4
On Thu, Nov 07, 2013 at 12:15:59AM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 06, 2013 at 07:53:51PM -0200, Marcelo Tosatti wrote:
> > On Wed, Nov 06, 2013 at 11:40:34PM +0200, Michael S. Tsirkin wrote:
> > > On Wed, Nov 06, 2013 at 07:31:19PM -0200, Marcelo Tosatti wrote:
> > > > 
> > > > v2: condition enablement of new mapping to new machine types (Paolo)
> > > > v3: fix changelog
> > > > v4: rebase
> > > > 
> > > > -----
> > > > 
> > > > 
> > > > Align guest physical address and host physical address
> > > > beyond guest 4GB on a 1GB boundary.
> > > > 
> > > > Otherwise 1GB TLBs cannot be cached for the range.
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > Um. This will conflict with:
> > >     pc: map PCI address space as catchall region for not mapped addresses
> > > 
> > > I think we really should stop using the hacked hole thing
> > > and just use priorities like that patch does.
> > 
> > Sorry hacked in what way?
> > This patch is necessary to enable 1GB hugepages beyond 4GB of RAM on the
> > current machine types.
> 
> 
> Sorry if I wasn't clear. when I said "hacked" I was talking about the
> pci hole concept generally in upstream qemu, not about your patch.
> 
> Its hacked because there's no "pci hole" on PIIX.
> pci hole is where pci was hiding some ram behind it
> on some systems. AFAIK this is not what is happens on piix though.
> What happens really is that everything not covered by RAM memory is PCI.
> 
> We implemented this using two aliases of RAM but
> the natural thing is really just making PCI lower
> priority than RAM and let it overlap.
>
> > > Do you agree? If yes I'm afraid your patch will have to be
> > > rebased on top of that yet again, sorry to give you a
> > > run-around like that :(
> > 
> > I don't see what exactly is the suggestion (or why the proposed 
> > patch should conflict with "pc: map PCI address space as catchall region
> > for not mapped addresses").
> 
> It seemed to me that they will conflict but it's after midnight
> so maybe I'm confused.
> You are saying you apply yours on top and there's no conflict?
> In that case I'll recheck, sorry.

No conflict between "pc: map PCI address space as catchall region" 
and the proposed patch.

> > > Also - do you think this is 1.7 material?
> > 
> > No. Paolo mentioned you have a tree with 1.8 material, correct?
> 
> Yes
> 
> git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git pci
Igor Mammedov Nov. 7, 2013, 3:24 p.m. UTC | #5
On Wed, 6 Nov 2013 19:31:19 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> 
> v2: condition enablement of new mapping to new machine types (Paolo)
> v3: fix changelog
> v4: rebase
> 
> -----
> 
> 
> Align guest physical address and host physical address
> beyond guest 4GB on a 1GB boundary.
> 
> Otherwise 1GB TLBs cannot be cached for the range.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 12c436e..d29196d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>  {
>      int linux_boot, i;
>      MemoryRegion *ram, *option_rom_mr;
> -    MemoryRegion *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
>      FWCfgState *fw_cfg;
>  
>      linux_boot = (kernel_filename != NULL);
> @@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
I did some corner cases testing and there is alias overlapping in case 
  -m 4096 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB

  0000000100000000-000000011fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff
  0000000100000000-0000000100000000 (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-0000000100000000

perhaps zero sized ram-above-4g shouldn't be created at all?

in addition ram-above-4g-piecetwo starts at half page offset 00000000e0000000 but guest sees it 4Gb offset,
wouldn't it cause the same issue patch tries to solve but only for ram-above-4g-piecetwo tail sync host/guest offsets
are not 1Gb aligned?

there is more misalignment with:
 -m 4097 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB

  0000000100000000-00000001000fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001000fffff
  0000000100100000-00000001200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff

where ram-above-4g-piecetwo is aligned with 1Gb+1Mb GPA offset, in addition to 500Mb offset of HPA.
which would cause the same issue as above prehaps?

> +        if (guest_info->gb_align) {
> +            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
> +
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    0x100000000ULL,
> +                                    above_4g_mem_size - holesize);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, holesize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - holesize,
> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
>  
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 4fdb7b6..686736e 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -60,6 +60,7 @@ static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_init1(QEMUMachineInitArgs *args,
> @@ -128,6 +129,7 @@ static void pc_init1(QEMUMachineInitArgs *args,
>  
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = !pci_enabled;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -240,8 +242,14 @@ static void pc_init_pci(QEMUMachineInitArgs *args)
>      pc_init1(args, 1, 1);
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +    gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -274,6 +282,12 @@ static void pc_compat_1_2(QEMUMachineInitArgs *args)
>      disable_kvm_pv_eoi();
>  }
>  
> +static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_init_pci(args);
> +}
> +
>  static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -346,13 +360,21 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (i440FX + PIIX, 1996)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> +static QEMUMachine pc_i440fx_machine_v1_8 = {
> +    PC_I440FX_1_8_MACHINE_OPTIONS,
> +    .name = "pc-i440fx-1.8",
> +    .alias = "pc",
> +    .init = pc_init_pci,
> +    .is_default = 1,
> +};
> +
>  #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
>  static QEMUMachine pc_i440fx_machine_v1_7 = {
>      PC_I440FX_1_7_MACHINE_OPTIONS,
>      .name = "pc-i440fx-1.7",
>      .alias = "pc",
> -    .init = pc_init_pci,
> -    .is_default = 1,
> +    .init = pc_init_pci_1_7,
>  };
>  
>  #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
> @@ -754,6 +776,7 @@ static QEMUMachine xenfv_machine = {
>  
>  static void pc_machine_init(void)
>  {
> +    qemu_register_machine(&pc_i440fx_machine_v1_8);
>      qemu_register_machine(&pc_i440fx_machine_v1_7);
>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>      qemu_register_machine(&pc_i440fx_machine_v1_5);
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 4c191d3..c2eb568 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -50,6 +50,7 @@
>  static bool has_pvpanic;
>  static bool has_pci_info = true;
>  static bool has_acpi_build = true;
> +static bool gb_align = true;
>  
>  /* PC hardware initialisation */
>  static void pc_q35_init(QEMUMachineInitArgs *args)
> @@ -113,6 +114,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      guest_info->has_pci_info = has_pci_info;
>      guest_info->isapc_ram_fw = false;
>      guest_info->has_acpi_build = has_acpi_build;
> +    guest_info->gb_align = gb_align;
>  
>      /* allocate ram and load rom/bios */
>      if (!xen_enabled()) {
> @@ -222,8 +224,14 @@ static void pc_q35_init(QEMUMachineInitArgs *args)
>      }
>  }
>  
> +static void pc_compat_1_7(QEMUMachineInitArgs *args)
> +{
> +   gb_align = false;
> +}
> +
>  static void pc_compat_1_6(QEMUMachineInitArgs *args)
>  {
> +    pc_compat_1_7(args);
>      has_pci_info = false;
>      rom_file_in_ram = false;
>      has_acpi_build = false;
> @@ -243,6 +251,12 @@ static void pc_compat_1_4(QEMUMachineInitArgs *args)
>      x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
>  }
>  
> +static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
> +{
> +    pc_compat_1_7(args);
> +    pc_q35_init(args);
> +}
> +
>  static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
>  {
>      pc_compat_1_6(args);
> @@ -266,13 +280,22 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
>      .desc = "Standard PC (Q35 + ICH9, 2009)", \
>      .hot_add_cpu = pc_hot_add_cpu
>  
> +#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> +
> +static QEMUMachine pc_q35_machine_v1_8 = {
> +    PC_Q35_1_8_MACHINE_OPTIONS,
> +    .name = "pc-q35-1.8",
> +    .alias = "q35",
> +    .init = pc_q35_init,
> +};
> +
>  #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
>  
>  static QEMUMachine pc_q35_machine_v1_7 = {
>      PC_Q35_1_7_MACHINE_OPTIONS,
>      .name = "pc-q35-1.7",
>      .alias = "q35",
> -    .init = pc_q35_init,
> +    .init = pc_q35_init_1_7,
>  };
>  
>  #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
> @@ -313,6 +336,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>  
>  static void pc_q35_machine_init(void)
>  {
> +    qemu_register_machine(&pc_q35_machine_v1_8);
>      qemu_register_machine(&pc_q35_machine_v1_7);
>      qemu_register_machine(&pc_q35_machine_v1_6);
>      qemu_register_machine(&pc_q35_machine_v1_5);
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 03cc0ba..35a6885 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -41,6 +41,7 @@ struct PcGuestInfo {
>      uint64_t *node_cpu;
>      FWCfgState *fw_cfg;
>      bool has_acpi_build;
> +    bool gb_align;
>  };
>  
>  /* parallel.c */
Marcelo Tosatti Nov. 7, 2013, 9:53 p.m. UTC | #6
On Thu, Nov 07, 2013 at 04:24:59PM +0100, Igor Mammedov wrote:
> On Wed, 6 Nov 2013 19:31:19 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > 
> > v2: condition enablement of new mapping to new machine types (Paolo)
> > v3: fix changelog
> > v4: rebase
> > 
> > -----
> > 
> > 
> > Align guest physical address and host physical address
> > beyond guest 4GB on a 1GB boundary.
> > 
> > Otherwise 1GB TLBs cannot be cached for the range.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 12c436e..d29196d 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1156,7 +1156,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >  {
> >      int linux_boot, i;
> >      MemoryRegion *ram, *option_rom_mr;
> > -    MemoryRegion *ram_below_4g, *ram_above_4g;
> > +    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
> >      FWCfgState *fw_cfg;
> >  
> >      linux_boot = (kernel_filename != NULL);
> > @@ -1177,10 +1177,45 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >      e820_add_entry(0, below_4g_mem_size, E820_RAM);
> >      if (above_4g_mem_size > 0) {
> >          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > -                                 below_4g_mem_size, above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +        /*
> > +         *
> > +         * If 1GB hugepages are used to back guest RAM, map guest address
> > +         * space in the range [ramsize,ramsize+holesize] to the ram block
> > +         * range [holestart, 4GB]
> > +         *
> > +         *                      0      h     4G     [ramsize,ramsize+holesize]
> > +         *
> > +         * guest-addr-space     [      ]     [      ][xxx]
> > +         *                                /----------/
> > +         * contiguous-ram-block [      ][xxx][     ]
> > +         *
> > +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> > +         * at the host physical address space.
> > +         *
> > +         */
> I did some corner cases testing and there is alias overlapping in case 
>   -m 4096 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB
> 
>   0000000100000000-000000011fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff
>   0000000100000000-0000000100000000 (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-0000000100000000
> 
> perhaps zero sized ram-above-4g shouldn't be created at all?

Right.

> in addition ram-above-4g-piecetwo starts at half page offset 00000000e0000000 but guest sees it 4Gb offset,
> wouldn't it cause the same issue patch tries to solve but only for ram-above-4g-piecetwo tail sync host/guest offsets
> are not 1Gb aligned?

Piece 1 is aligned. Piece 2 maps from tail of RAM (gpa) to start of hole
(ramblock).

> there is more misalignment with:
>  -m 4097 -mem-path /var/lib/hugetlbfs/global/pagesize-1GB
> 
>   0000000100000000-00000001000fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001000fffff

Piece 1 is aligned.

>   0000000100100000-00000001200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff

Piece 2 is not. Should allocate one extra MB of RAM to align that. I'll
resend, thanks.

> where ram-above-4g-piecetwo is aligned with 1Gb+1Mb GPA offset, in addition to 500Mb offset of HPA.
> which would cause the same issue as above prehaps?
diff mbox

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 12c436e..d29196d 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1156,7 +1156,7 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
 
     linux_boot = (kernel_filename != NULL);
@@ -1177,10 +1177,45 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            unsigned long holesize = 0x100000000ULL - below_4g_mem_size;
+
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    0x100000000ULL,
+                                    above_4g_mem_size - holesize);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, holesize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - holesize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                    ram_above_4g);
+        }
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4fdb7b6..686736e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -60,6 +60,7 @@  static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -128,6 +129,7 @@  static void pc_init1(QEMUMachineInitArgs *args,
 
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -240,8 +242,14 @@  static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -274,6 +282,12 @@  static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -346,13 +360,21 @@  static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -754,6 +776,7 @@  static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4c191d3..c2eb568 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -50,6 +50,7 @@ 
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -113,6 +114,7 @@  static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
     guest_info->has_acpi_build = has_acpi_build;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -222,8 +224,14 @@  static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -243,6 +251,12 @@  static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -266,13 +280,22 @@  static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -313,6 +336,7 @@  static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 03cc0ba..35a6885 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@  struct PcGuestInfo {
     uint64_t *node_cpu;
     FWCfgState *fw_cfg;
     bool has_acpi_build;
+    bool gb_align;
 };
 
 /* parallel.c */