Patchwork [RFC,6/9] pc: pass paravirt info for hotplug memory slots to BIOS

login
register
mail settings
Submitter Vasilis Liaskovitis
Date April 19, 2012, 2:08 p.m.
Message ID <1334844527-18869-7-git-send-email-vasilis.liaskovitis@profitbricks.com>
Download mbox | patch
Permalink /patch/153837/
State New
Headers show

Comments

Vasilis Liaskovitis - April 19, 2012, 2:08 p.m.
The numa_fw_cfg paravirt interface is extended to include SRAT information for
 all hotplug-able memslots. There are 3 words for each hotplug-able memory slot,
 denoting start address, size and node proximity. nb_numa_nodes is set to 1 by
 default (not 0), so that we always pass srat info to SeaBIOS.

 This information is used by SeaBIOS to build hotplug memory device objects at runtime.

 Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
---
 hw/pc.c |   59 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 vl.c    |    4 +++-
 2 files changed, 56 insertions(+), 7 deletions(-)
Igor Mammedov - April 20, 2012, 10:33 a.m.
On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote:
>   The numa_fw_cfg paravirt interface is extended to include SRAT information for
>   all hotplug-able memslots. There are 3 words for each hotplug-able memory slot,
>   denoting start address, size and node proximity. nb_numa_nodes is set to 1 by
>   default (not 0), so that we always pass srat info to SeaBIOS.
>
>   This information is used by Seabios to build hotplug memory device objects at runtime.
>
>   Signed-off-by: Vasilis Liaskovitis<vasilis.liaskovitis@profitbricks.com>
> ---
>   hw/pc.c |   59 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
>   vl.c    |    4 +++-
>   2 files changed, 56 insertions(+), 7 deletions(-)
>
> diff --git a/hw/pc.c b/hw/pc.c
> index 67f0479..f1f550a 100644
> --- a/hw/pc.c
> +++ b/hw/pc.c
> @@ -46,6 +46,7 @@
>   #include "ui/qemu-spice.h"
>   #include "memory.h"
>   #include "exec-memory.h"
> +#include "memslot.h"
>
>   /* output Bochs bios info messages */
>   //#define DEBUG_BIOS
> @@ -592,12 +593,15 @@ int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
>       return index;
>   }
>
> +static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots);
> +
>   static void *bochs_bios_init(void)
>   {
>       void *fw_cfg;
>       uint8_t *smbios_table;
>       size_t smbios_len;
>       uint64_t *numa_fw_cfg;
> +    uint64_t *hp_memslots_fw_cfg;
>       int i, j;
>
>       register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
> @@ -630,28 +634,71 @@ static void *bochs_bios_init(void)
>       fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, (uint8_t *)&hpet_cfg,
>                        sizeof(struct hpet_fw_config));
>       /* allocate memory for the NUMA channel: one (64bit) word for the number
> -     * of nodes, one word for each VCPU->node and one word for each node to
> -     * hold the amount of memory.
> +     * of nodes, one word for the number of hotplug memory slots, one word
> +     * for each VCPU->node, one word for each node to hold the amount of memory.
> +     * Finally three words for each hotplug memory slot, denoting start address,
> +     * size and node proximity.
>        */
> -    numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
> +    numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);
>       numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
> +    numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots);
This will break compatibility if a guest is migrated from old->new qemu:
on reboot it will then use the old BIOS, which expects numa_fw_cfg[1] to be something else.
Could the memslots info be moved to the end of the existing interface?

> +
>       for (i = 0; i<  max_cpus; i++) {
>           for (j = 0; j<  nb_numa_nodes; j++) {
>               if (node_cpumask[j]&  (1<<  i)) {
> -                numa_fw_cfg[i + 1] = cpu_to_le64(j);
> +                numa_fw_cfg[i + 2] = cpu_to_le64(j);
>                   break;
>               }
>           }
>       }
>       for (i = 0; i<  nb_numa_nodes; i++) {
> -        numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
> +        numa_fw_cfg[max_cpus + 2 + i] = cpu_to_le64(node_mem[i]);
>       }
> +
> +    hp_memslots_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
> +    if (nb_hp_memslots)
> +        bochs_bios_setup_hp_memslots(hp_memslots_fw_cfg);
> +
>       fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
> -                     (1 + max_cpus + nb_numa_nodes) * 8);
> +                     (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);
>
>       return fw_cfg;
>   }
>
> +static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots)
> +{
> +    int i = 0;
> +    Error *err = NULL;
> +    DeviceState *dev;
> +    MemSlotState *slot;
> +    char *type;
> +    BusState *bus = sysbus_get_default();
> +
> +    QTAILQ_FOREACH(dev,&bus->children, sibling) {
> +        type = object_property_get_str(OBJECT(dev), "type",&err);
> +        if (err) {
> +            error_free(err);
> +            fprintf(stderr, "error getting device type\n");
> +            exit(1);
> +        }
> +
> +        if (!strcmp(type, "memslot")) {
> +            if (!dev->id) {
> +                error_free(err);
> +                fprintf(stderr, "error getting memslot device id\n");
> +                exit(1);
> +            }
> +            if (!strcmp(dev->id, "initialslot")) continue;
> +            slot = MEMSLOT(dev);
> +            fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
> +            fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
> +            fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
> +            i++;
> +        }
> +    }
> +    assert(i == nb_hp_memslots);
> +}
> +
>   static long get_file_size(FILE *f)
>   {
>       long where, size;
> diff --git a/vl.c b/vl.c
> index ae91a8a..50df453 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -3428,8 +3428,10 @@ int main(int argc, char **argv, char **envp)
>
>       register_savevm_live(NULL, "ram", 0, 4, NULL, ram_save_live, NULL,
>                            ram_load, NULL);
> +    if (!nb_numa_nodes)
> +        nb_numa_nodes = 1;
>
> -    if (nb_numa_nodes>  0) {
> +    {
>           int i;
>
>           if (nb_numa_nodes>  MAX_NODES) {
Vasilis Liaskovitis - April 20, 2012, 4:35 p.m.
On Fri, Apr 20, 2012 at 12:33:57PM +0200, Igor Mammedov wrote:
> On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote:
> >-    numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
> >+    numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);
> >      numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
> >+    numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots);
> > This will break compatibility if a guest is migrated from old->new qemu:
> > on reboot it will then use the old BIOS, which expects numa_fw_cfg[1] to be something else.
> > Could the memslots info be moved to the end of the existing interface?

Right. The number of memslots can be placed at word index 1 + max_cpus + nb_numa_nodes
of numa_fw_cfg, instead of right after the number of nodes. This way the old layout is
preserved, and all memslot info comes at the end. I will rewrite the patch accordingly.

thanks,
- Vasilis

Patch

diff --git a/hw/pc.c b/hw/pc.c
index 67f0479..f1f550a 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -46,6 +46,7 @@ 
 #include "ui/qemu-spice.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "memslot.h"
 
 /* output Bochs bios info messages */
 //#define DEBUG_BIOS
@@ -592,12 +593,15 @@  int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
     return index;
 }
 
+static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots);
+
 static void *bochs_bios_init(void)
 {
     void *fw_cfg;
     uint8_t *smbios_table;
     size_t smbios_len;
     uint64_t *numa_fw_cfg;
+    uint64_t *hp_memslots_fw_cfg;
     int i, j;
 
     register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
@@ -630,28 +634,71 @@  static void *bochs_bios_init(void)
     fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, (uint8_t *)&hpet_cfg,
                      sizeof(struct hpet_fw_config));
     /* allocate memory for the NUMA channel: one (64bit) word for the number
-     * of nodes, one word for each VCPU->node and one word for each node to
-     * hold the amount of memory.
+     * of nodes, one word for the number of hotplug memory slots, one word
+     * for each VCPU->node, one word for each node to hold the amount of memory.
+     * Finally three words for each hotplug memory slot, denoting start address,
+     * size and node proximity.
      */
-    numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
+    numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);
     numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
+    numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots);
+
     for (i = 0; i < max_cpus; i++) {
         for (j = 0; j < nb_numa_nodes; j++) {
             if (node_cpumask[j] & (1 << i)) {
-                numa_fw_cfg[i + 1] = cpu_to_le64(j);
+                numa_fw_cfg[i + 2] = cpu_to_le64(j);
                 break;
             }
         }
     }
     for (i = 0; i < nb_numa_nodes; i++) {
-        numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+        numa_fw_cfg[max_cpus + 2 + i] = cpu_to_le64(node_mem[i]);
     }
+
+    hp_memslots_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
+    if (nb_hp_memslots)
+        bochs_bios_setup_hp_memslots(hp_memslots_fw_cfg);
+
     fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
-                     (1 + max_cpus + nb_numa_nodes) * 8);
+                     (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);
 
     return fw_cfg;
 }
 
+static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots)
+{
+    int i = 0;
+    Error *err = NULL;
+    DeviceState *dev;
+    MemSlotState *slot;
+    char *type;
+    BusState *bus = sysbus_get_default();
+
+    QTAILQ_FOREACH(dev, &bus->children, sibling) {
+        type = object_property_get_str(OBJECT(dev), "type", &err);
+        if (err) {
+            error_free(err);
+            fprintf(stderr, "error getting device type\n");
+            exit(1);
+        }
+
+        if (!strcmp(type, "memslot")) {
+            if (!dev->id) {
+                error_free(err);
+                fprintf(stderr, "error getting memslot device id\n");
+                exit(1);
+            }
+            if (!strcmp(dev->id, "initialslot")) continue;
+            slot = MEMSLOT(dev);
+            fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
+            fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
+            fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
+            i++;
+        }
+    }
+    assert(i == nb_hp_memslots);
+}
+
 static long get_file_size(FILE *f)
 {
     long where, size;
diff --git a/vl.c b/vl.c
index ae91a8a..50df453 100644
--- a/vl.c
+++ b/vl.c
@@ -3428,8 +3428,10 @@  int main(int argc, char **argv, char **envp)
 
     register_savevm_live(NULL, "ram", 0, 4, NULL, ram_save_live, NULL,
                          ram_load, NULL);
+    if (!nb_numa_nodes)
+        nb_numa_nodes = 1;
 
-    if (nb_numa_nodes > 0) {
+    {
         int i;
 
         if (nb_numa_nodes > MAX_NODES) {