diff mbox

i386: pc: align gpa<->hpa on 1GB boundary (v5)

Message ID 20131110204753.GA11389@amt.cnet
State New
Headers show

Commit Message

Marcelo Tosatti Nov. 10, 2013, 8:47 p.m. UTC
v2: condition enablement of new mapping to new machine types (Paolo)
v3: fix changelog
v4: rebase
v5: ensure alignment of piecetwo on 2MB GPA (Igor)
    do not register zero-sized piece-one    (Igor)

-----

Align guest physical address and host physical address
beyond guest 4GB on a 1GB boundary.

Otherwise 1GB TLB entries cannot be cached for the range.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Comments

Igor Mammedov Nov. 12, 2013, 12:45 p.m. UTC | #1
On Sun, 10 Nov 2013 18:47:53 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

[...]

> @@ -1177,10 +1182,50 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
>      e820_add_entry(0, below_4g_mem_size, E820_RAM);
>      if (above_4g_mem_size > 0) {
>          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
this allocation is leaked when "ram-above-4g" is not created

> -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> -                                 below_4g_mem_size, above_4g_mem_size);
> -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +        /*
> +         *
> +         * If 1GB hugepages are used to back guest RAM, map guest address
> +         * space in the range [ramsize,ramsize+holesize] to the ram block
> +         * range [holestart, 4GB]
> +         *
> +         *                      0      h     4G     [ramsize,ramsize+holesize]
> +         *
> +         * guest-addr-space     [      ]     [      ][xxx]
> +         *                                /----------/
> +         * contiguous-ram-block [      ][xxx][     ]
> +         *
> +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> +         * at the host physical address space.
> +         *
> +         */
> +        if (guest_info->gb_align) {
> +            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
> +            uint64_t piecetwosize = holesize - align_offset;
> +
> +            assert(piecetwosize <= holesize);
> +
> +            if ((above_4g_mem_size - piecetwosize) > 0) {
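there is an integer overflow here (the unsigned subtraction wraps around
when piecetwosize is larger than above_4g_mem_size),
reproducible with:  -mem-path /var/lib/hugetlbfs/global/pagesize-1GB -m 3600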


> +                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> +                                         ram, 0x100000000ULL,
> +                                         above_4g_mem_size - piecetwosize);
> +                memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                         ram_above_4g);
> +            }
> +
> +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> +                                     "ram-above-4g-piecetwo", ram,
> +                                     0x100000000ULL - holesize, piecetwosize);
> +            memory_region_add_subregion(system_memory,
> +                                        0x100000000ULL +
> +                                        above_4g_mem_size - piecetwosize,
Is there a guarantee that "ram-above-4g-piecetwo" will be mapped immediately
after "ram-above-4g" without any gap?

If there is not, then you might need to change how e820_add_entry() for high RAM
is handled, and possibly the CMOS value as well.

> +                                        ram_above_4g_piecetwo);
> +        } else {
> +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                    below_4g_mem_size, above_4g_mem_size);
> +            memory_region_add_subregion(system_memory, 0x100000000ULL,
>                                      ram_above_4g);
> +        }
>          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
>      }
[...]
Marcelo Tosatti Nov. 12, 2013, 8:32 p.m. UTC | #2
On Tue, Nov 12, 2013 at 01:45:51PM +0100, Igor Mammedov wrote:
> On Sun, 10 Nov 2013 18:47:53 -0200
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> [...]
> 
> > @@ -1177,10 +1182,50 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
> >      e820_add_entry(0, below_4g_mem_size, E820_RAM);
> >      if (above_4g_mem_size > 0) {
> >          ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> this allocation is leaked when "ram-above-4g" is not created
> 
> > -        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > -                                 below_4g_mem_size, above_4g_mem_size);
> > -        memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +        /*
> > +         *
> > +         * If 1GB hugepages are used to back guest RAM, map guest address
> > +         * space in the range [ramsize,ramsize+holesize] to the ram block
> > +         * range [holestart, 4GB]
> > +         *
> > +         *                      0      h     4G     [ramsize,ramsize+holesize]
> > +         *
> > +         * guest-addr-space     [      ]     [      ][xxx]
> > +         *                                /----------/
> > +         * contiguous-ram-block [      ][xxx][     ]
> > +         *
> > +         * So that memory beyond 4GB is aligned on a 1GB boundary,
> > +         * at the host physical address space.
> > +         *
> > +         */
> > +        if (guest_info->gb_align) {
> > +            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
> > +            uint64_t piecetwosize = holesize - align_offset;
> > +
> > +            assert(piecetwosize <= holesize);
> > +
> > +            if ((above_4g_mem_size - piecetwosize) > 0) {
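> there is an integer overflow here (the unsigned subtraction wraps around
> when piecetwosize is larger than above_4g_mem_size),
> reproducible with:  -mem-path /var/lib/hugetlbfs/global/pagesize-1GB -m 3600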

Gah.

> 
> 
> > +                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
> > +                                         ram, 0x100000000ULL,
> > +                                         above_4g_mem_size - piecetwosize);
> > +                memory_region_add_subregion(system_memory, 0x100000000ULL,
> > +                                         ram_above_4g);
> > +            }
> > +
> > +            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
> > +            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
> > +                                     "ram-above-4g-piecetwo", ram,
> > +                                     0x100000000ULL - holesize, piecetwosize);
> > +            memory_region_add_subregion(system_memory,
> > +                                        0x100000000ULL +
> > +                                        above_4g_mem_size - piecetwosize,
> Is there a guarantee that "ram-above-4g-piecetwo" will be mapped immediately
> after "ram-above-4g" without any gap?

Yes:

Length of piece one alias: above_4g_mem_size - piecetwosize
Start address of piecetwo: 0x100000000ULL + above_4g_mem_size - piecetwosize

> If there is not, then you might need to change how e820_add_entry() for high RAM
> is handled, and possibly the CMOS value as well.

> 
> > +                                        ram_above_4g_piecetwo);
> > +        } else {
> > +            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> > +                                    below_4g_mem_size, above_4g_mem_size);
> > +            memory_region_add_subregion(system_memory, 0x100000000ULL,
> >                                      ram_above_4g);
> > +        }
> >          e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
> >      }
> [...]
> -- 
> Regards,
>   Igor
diff mbox

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 12c436e..abd6b81 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1156,8 +1156,9 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
 {
     int linux_boot, i;
     MemoryRegion *ram, *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g, *ram_above_4g, *ram_above_4g_piecetwo;
     FWCfgState *fw_cfg;
+    uint64_t memsize, align_offset;
 
     linux_boot = (kernel_filename != NULL);
 
@@ -1166,8 +1167,12 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
      * with older qemus that used qemu_ram_alloc().
      */
     ram = g_malloc(sizeof(*ram));
-    memory_region_init_ram(ram, NULL, "pc.ram",
-                           below_4g_mem_size + above_4g_mem_size);
+
+    memsize = ROUND_UP(below_4g_mem_size + above_4g_mem_size, 1UL << 21);
+    align_offset = memsize - (below_4g_mem_size + above_4g_mem_size);
+
+    memory_region_init_ram(ram, NULL, "pc.ram", memsize);
+
     vmstate_register_ram_global(ram);
     *ram_memory = ram;
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
@@ -1177,10 +1182,50 @@  FWCfgState *pc_memory_init(MemoryRegion *system_memory,
     e820_add_entry(0, below_4g_mem_size, E820_RAM);
     if (above_4g_mem_size > 0) {
         ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
-                                 below_4g_mem_size, above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
+        /*
+         *
+         * If 1GB hugepages are used to back guest RAM, map guest address
+         * space in the range [ramsize,ramsize+holesize] to the ram block
+         * range [holestart, 4GB]
+         *
+         *                      0      h     4G     [ramsize,ramsize+holesize]
+         *
+         * guest-addr-space     [      ]     [      ][xxx]
+         *                                /----------/
+         * contiguous-ram-block [      ][xxx][     ]
+         *
+         * So that memory beyond 4GB is aligned on a 1GB boundary,
+         * at the host physical address space.
+         *
+         */
+        if (guest_info->gb_align) {
+            uint64_t holesize = 0x100000000ULL - below_4g_mem_size;
+            uint64_t piecetwosize = holesize - align_offset;
+
+            assert(piecetwosize <= holesize);
+
+            if ((above_4g_mem_size - piecetwosize) > 0) {
+                memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
+                                         ram, 0x100000000ULL,
+                                         above_4g_mem_size - piecetwosize);
+                memory_region_add_subregion(system_memory, 0x100000000ULL,
+                                         ram_above_4g);
+            }
+
+            ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo));
+            memory_region_init_alias(ram_above_4g_piecetwo, NULL,
+                                     "ram-above-4g-piecetwo", ram,
+                                     0x100000000ULL - holesize, piecetwosize);
+            memory_region_add_subregion(system_memory,
+                                        0x100000000ULL +
+                                        above_4g_mem_size - piecetwosize,
+                                        ram_above_4g_piecetwo);
+        } else {
+            memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
+                                    below_4g_mem_size, above_4g_mem_size);
+            memory_region_add_subregion(system_memory, 0x100000000ULL,
                                     ram_above_4g);
+        }
         e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
     }
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4fdb7b6..686736e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -60,6 +60,7 @@  static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_init1(QEMUMachineInitArgs *args,
@@ -128,6 +129,7 @@  static void pc_init1(QEMUMachineInitArgs *args,
 
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = !pci_enabled;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -240,8 +242,14 @@  static void pc_init_pci(QEMUMachineInitArgs *args)
     pc_init1(args, 1, 1);
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+    gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -274,6 +282,12 @@  static void pc_compat_1_2(QEMUMachineInitArgs *args)
     disable_kvm_pv_eoi();
 }
 
+static void pc_init_pci_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_init_pci(args);
+}
+
 static void pc_init_pci_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -346,13 +360,21 @@  static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
     .desc = "Standard PC (i440FX + PIIX, 1996)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_I440FX_1_8_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
+static QEMUMachine pc_i440fx_machine_v1_8 = {
+    PC_I440FX_1_8_MACHINE_OPTIONS,
+    .name = "pc-i440fx-1.8",
+    .alias = "pc",
+    .init = pc_init_pci,
+    .is_default = 1,
+};
+
 #define PC_I440FX_1_7_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
 static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .alias = "pc",
-    .init = pc_init_pci,
-    .is_default = 1,
+    .init = pc_init_pci_1_7,
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
@@ -754,6 +776,7 @@  static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_i440fx_machine_v1_8);
     qemu_register_machine(&pc_i440fx_machine_v1_7);
     qemu_register_machine(&pc_i440fx_machine_v1_6);
     qemu_register_machine(&pc_i440fx_machine_v1_5);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4c191d3..c2eb568 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -50,6 +50,7 @@ 
 static bool has_pvpanic;
 static bool has_pci_info = true;
 static bool has_acpi_build = true;
+static bool gb_align = true;
 
 /* PC hardware initialisation */
 static void pc_q35_init(QEMUMachineInitArgs *args)
@@ -113,6 +114,7 @@  static void pc_q35_init(QEMUMachineInitArgs *args)
     guest_info->has_pci_info = has_pci_info;
     guest_info->isapc_ram_fw = false;
     guest_info->has_acpi_build = has_acpi_build;
+    guest_info->gb_align = gb_align;
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
@@ -222,8 +224,14 @@  static void pc_q35_init(QEMUMachineInitArgs *args)
     }
 }
 
+static void pc_compat_1_7(QEMUMachineInitArgs *args)
+{
+   gb_align = false;
+}
+
 static void pc_compat_1_6(QEMUMachineInitArgs *args)
 {
+    pc_compat_1_7(args);
     has_pci_info = false;
     rom_file_in_ram = false;
     has_acpi_build = false;
@@ -243,6 +251,12 @@  static void pc_compat_1_4(QEMUMachineInitArgs *args)
     x86_cpu_compat_set_features("Westmere", FEAT_1_ECX, 0, CPUID_EXT_PCLMULQDQ);
 }
 
+static void pc_q35_init_1_7(QEMUMachineInitArgs *args)
+{
+    pc_compat_1_7(args);
+    pc_q35_init(args);
+}
+
 static void pc_q35_init_1_6(QEMUMachineInitArgs *args)
 {
     pc_compat_1_6(args);
@@ -266,13 +280,22 @@  static void pc_q35_init_1_4(QEMUMachineInitArgs *args)
     .desc = "Standard PC (Q35 + ICH9, 2009)", \
     .hot_add_cpu = pc_hot_add_cpu
 
+#define PC_Q35_1_8_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
+
+static QEMUMachine pc_q35_machine_v1_8 = {
+    PC_Q35_1_8_MACHINE_OPTIONS,
+    .name = "pc-q35-1.8",
+    .alias = "q35",
+    .init = pc_q35_init,
+};
+
 #define PC_Q35_1_7_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
 
 static QEMUMachine pc_q35_machine_v1_7 = {
     PC_Q35_1_7_MACHINE_OPTIONS,
     .name = "pc-q35-1.7",
     .alias = "q35",
-    .init = pc_q35_init,
+    .init = pc_q35_init_1_7,
 };
 
 #define PC_Q35_1_6_MACHINE_OPTIONS PC_Q35_MACHINE_OPTIONS
@@ -313,6 +336,7 @@  static QEMUMachine pc_q35_machine_v1_4 = {
 
 static void pc_q35_machine_init(void)
 {
+    qemu_register_machine(&pc_q35_machine_v1_8);
     qemu_register_machine(&pc_q35_machine_v1_7);
     qemu_register_machine(&pc_q35_machine_v1_6);
     qemu_register_machine(&pc_q35_machine_v1_5);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 03cc0ba..35a6885 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@  struct PcGuestInfo {
     uint64_t *node_cpu;
     FWCfgState *fw_cfg;
     bool has_acpi_build;
+    bool gb_align;
 };
 
 /* parallel.c */