diff mbox

[RFC] numa: enable sparse node numbering

Message ID 20140623193310.GD4323@linux.vnet.ibm.com
State New
Headers show

Commit Message

Nishanth Aravamudan June 23, 2014, 7:33 p.m. UTC
Add a present field to NodeInfo which indicates if a given nodeid was
present on the command-line or not. Current code relies on a memory
value being passed for a node to indicate presence, which is
insufficient in the presence of memoryless nodes or sparse numbering.
Adjust the iteration of various NUMA loops to use the maximum known NUMA
ID rather than the number of NUMA nodes.

numa.c::set_numa_nodes() has become a bit more convoluted for
round-robin'ing the CPUs over known nodes when not specified by the
user.

Note that architecture-specific code still needs changes (forthcoming)
for both sparse node numbering and memoryless nodes.

Examples:

(1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
node,nodeid=2 -smp 16

Before:

node 0 cpus: 0 2 4 6 8 10 12 14
node 0 size: 2048 MB
node 1 cpus: 1 3 5 7 9 11 13 15
node 1 size: 2048 MB

After:

node 2 cpus: 0 2 4 6 8 10 12 14                                               |
node 2 size: 2048 MB                                                          |
node 3 cpus: 1 3 5 7 9 11 13 15                                               |
node 3 size: 2048 MB

(2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16

Before:

node 0 cpus: 0 4 8 12                                                         |
node 0 size: 1024 MB                                                          |
node 1 cpus: 1 5 9 13                                                         |
node 1 size: 1024 MB                                                          |
node 2 cpus: 2 6 10 14                                                        |
node 2 size: 1024 MB                                                          |
node 3 cpus: 3 7 11 15                                                        |
node 3 size: 1024 MB

After:

node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
    
---
Based off mst's for_upstream tag, which has the NodeInfo changes.

Comments

Igor Mammedov June 24, 2014, 7:33 a.m. UTC | #1
On Mon, 23 Jun 2014 12:33:10 -0700
Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:

> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
Why would you need sparse numbering?
What task exactly are you trying to solve?

> 
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.
> 
> Note that architecture-specific code still needs changes (forthcoming)
> for both sparse node numbering and memoryless nodes.
> 
> Examples:
> 
> (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
> node,nodeid=2 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 2 4 6 8 10 12 14
> node 0 size: 2048 MB
> node 1 cpus: 1 3 5 7 9 11 13 15
> node 1 size: 2048 MB
> 
> After:
> 
> node 2 cpus: 0 2 4 6 8 10 12 14                                               |
> node 2 size: 2048 MB                                                          |
> node 3 cpus: 1 3 5 7 9 11 13 15                                               |
> node 3 size: 2048 MB
> 
> (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
> node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 4 8 12                                                         |
> node 0 size: 1024 MB                                                          |
> node 1 cpus: 1 5 9 13                                                         |
> node 1 size: 1024 MB                                                          |
> node 2 cpus: 2 6 10 14                                                        |
> node 2 size: 1024 MB                                                          |
> node 3 cpus: 3 7 11 15                                                        |
> node 3 size: 1024 MB
> 
> After:
> 
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
>     
> ---
> Based off mst's for_upstream tag, which has the NodeInfo changes.
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 277230d..b90bf66 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -145,11 +145,13 @@ extern int mem_prealloc;
>   */
>  #define MAX_CPUMASK_BITS 255
>  
> -extern int nb_numa_nodes;
> +extern int nb_numa_nodes; /* Number of NUMA nodes */
> +extern int max_numa_node; /* Highest specified NUMA node ID */
>  typedef struct node_info {
>      uint64_t node_mem;
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
> +    bool present;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void set_numa_nodes(void);
> diff --git a/monitor.c b/monitor.c
> index c7f8797..4721996 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
>      CPUState *cpu;
>  
>      monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
> -    for (i = 0; i < nb_numa_nodes; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
> +        if (!numa_info[i].present) {
> +            continue;
> +        }
>          monitor_printf(mon, "node %d cpus:", i);
>          CPU_FOREACH(cpu) {
>              if (cpu->numa_node == i) {
> diff --git a/numa.c b/numa.c
> index e471afe..c31857d 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -106,6 +106,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>          numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>      }
> +    numa_info[nodenr].present = true;
> +    if (nodenr >= max_numa_node) {
> +        max_numa_node = nodenr + 1;
> +    }
>  }
>  
>  int numa_init_func(QemuOpts *opts, void *opaque)
> @@ -155,7 +159,7 @@ void set_numa_nodes(void)
>  {
>      if (nb_numa_nodes > 0) {
>          uint64_t numa_total;
> -        int i;
> +        int i, j, last_j = -1;
>  
>          if (nb_numa_nodes > MAX_NODES) {
>              nb_numa_nodes = MAX_NODES;
> @@ -164,27 +168,29 @@ void set_numa_nodes(void)
>          /* If no memory size if given for any node, assume the default case
>           * and distribute the available memory equally across all nodes
>           */
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (numa_info[i].node_mem != 0) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present && numa_info[i].node_mem != 0) {
>                  break;
>              }
>          }
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              uint64_t usedmem = 0;
>  
>              /* On Linux, the each node's border has to be 8MB aligned,
>               * the final node gets the rest.
>               */
> -            for (i = 0; i < nb_numa_nodes - 1; i++) {
> -                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> -                                        ~((1 << 23UL) - 1);
> -                usedmem += numa_info[i].node_mem;
> +            for (i = 0; i < max_numa_node - 1; i++) {
> +                if (numa_info[i].present) {
> +                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> +                                            ~((1 << 23UL) - 1);
> +                    usedmem += numa_info[i].node_mem;
> +                }
>              }
>              numa_info[i].node_mem = ram_size - usedmem;
>          }
>  
>          numa_total = 0;
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              numa_total += numa_info[i].node_mem;
>          }
>          if (numa_total != ram_size) {
> @@ -194,8 +200,9 @@ void set_numa_nodes(void)
>              exit(1);
>          }
>  
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
>          }
> @@ -203,9 +210,19 @@ void set_numa_nodes(void)
>           * must cope with this anyway, because there are BIOSes out there in
>           * real machines which also use this scheme.
>           */
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                for (j = last_j + 1;
> +                     j != (last_j == 0 ? max_numa_node : last_j - 1); j++) {
> +                    if (j >= max_numa_node) {
> +                        j = 0;
> +                    }
> +                    if (numa_info[j].present) {
> +                        break;
> +                    }
> +                }
> +                last_j = j;
> +                set_bit(i, numa_info[j].node_cpu);
>              }
>          }
>      }
> @@ -217,7 +234,7 @@ void set_numa_modes(void)
>      int i;
>  
>      CPU_FOREACH(cpu) {
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
>                  cpu->numa_node = i;
>              }
> diff --git a/vl.c b/vl.c
> index 54b4627..e1a6ab8 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>      QTAILQ_HEAD_INITIALIZER(fw_boot_order);
>  
>  int nb_numa_nodes;
> +int max_numa_node;
>  NodeInfo numa_info[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          numa_info[i].node_mem = 0;
> +        numa_info[i].present = false;
>          bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
>      }
>  
>      nb_numa_nodes = 0;
> +    max_numa_node = -1;
>      nb_nics = 0;
>  
>      bdrv_init_with_whitelist();
> 
>
Alexey Kardashevskiy June 24, 2014, 8:58 a.m. UTC | #2
On 06/24/2014 05:33 PM, Igor Mammedov wrote:
> On Mon, 23 Jun 2014 12:33:10 -0700
> Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> 
>> Add a present field to NodeInfo which indicates if a given nodeid was
>> present on the command-line or not. Current code relies on a memory
>> value being passed for a node to indicate presence, which is
>> insufficient in the presence of memoryless nodes or sparse numbering.
>> Adjust the iteration of various NUMA loops to use the maximum known NUMA
>> ID rather than the number of NUMA nodes.
> Why would you need sparse numbering?
> What task exactly are you trying to solve?


This is to reproduce+debug the real POWERPC hardware behavior. No practical
use as far as I can tell.
Nishanth Aravamudan June 24, 2014, 4:39 p.m. UTC | #3
On 24.06.2014 [09:33:48 +0200], Igor Mammedov wrote:
> On Mon, 23 Jun 2014 12:33:10 -0700
> Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> 
> > Add a present field to NodeInfo which indicates if a given nodeid was
> > present on the command-line or not. Current code relies on a memory
> > value being passed for a node to indicate presence, which is
> > insufficient in the presence of memoryless nodes or sparse numbering.
> > Adjust the iteration of various NUMA loops to use the maximum known NUMA
> > ID rather than the number of NUMA nodes.
> Why would you need sparse numbering?

We already see this in practice under PowerVM on ppc. This would allow
qemu/KVM to emulate the same behaviors (which often lead to bugs/testing
of underused code-paths in the kernel).

Also, sparse numbering is explicitly allowed by qemu already but doesn't
work at all as expected, at least to me. That is, if I specify various
node IDs, the nodes are allocated linearly in qemu and the NUMA node IDs
in Linux don't correspond to the command-line. Further, the location of
CPUs/memory doesn't correspond to the nodes requested by the user.

> What task exactly are you trying to solve?

I would like to be able to leverage qemu's command-line to reproduce
the NUMA topology seen under PowerVM.

I'll send v3 with a better changelog and a few changes from Hu Tao.

Thanks,
Nish
diff mbox

Patch

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 277230d..b90bf66 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -145,11 +145,13 @@  extern int mem_prealloc;
  */
 #define MAX_CPUMASK_BITS 255
 
-extern int nb_numa_nodes;
+extern int nb_numa_nodes; /* Number of NUMA nodes */
+extern int max_numa_node; /* Highest specified NUMA node ID */
 typedef struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
+    bool present;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void set_numa_nodes(void);
diff --git a/monitor.c b/monitor.c
index c7f8797..4721996 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2003,7 +2003,10 @@  static void do_info_numa(Monitor *mon, const QDict *qdict)
     CPUState *cpu;
 
     monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
-    for (i = 0; i < nb_numa_nodes; i++) {
+    for (i = 0; i < max_numa_node; i++) {
+        if (!numa_info[i].present) {
+            continue;
+        }
         monitor_printf(mon, "node %d cpus:", i);
         CPU_FOREACH(cpu) {
             if (cpu->numa_node == i) {
diff --git a/numa.c b/numa.c
index e471afe..c31857d 100644
--- a/numa.c
+++ b/numa.c
@@ -106,6 +106,10 @@  static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+    numa_info[nodenr].present = true;
+    if (nodenr >= max_numa_node) {
+        max_numa_node = nodenr + 1;
+    }
 }
 
 int numa_init_func(QemuOpts *opts, void *opaque)
@@ -155,7 +159,7 @@  void set_numa_nodes(void)
 {
     if (nb_numa_nodes > 0) {
         uint64_t numa_total;
-        int i;
+        int i, j, last_j = -1;
 
         if (nb_numa_nodes > MAX_NODES) {
             nb_numa_nodes = MAX_NODES;
@@ -164,27 +168,29 @@  void set_numa_nodes(void)
         /* If no memory size if given for any node, assume the default case
          * and distribute the available memory equally across all nodes
          */
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (numa_info[i].node_mem != 0) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present && numa_info[i].node_mem != 0) {
                 break;
             }
         }
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             uint64_t usedmem = 0;
 
             /* On Linux, the each node's border has to be 8MB aligned,
              * the final node gets the rest.
              */
-            for (i = 0; i < nb_numa_nodes - 1; i++) {
-                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
-                                        ~((1 << 23UL) - 1);
-                usedmem += numa_info[i].node_mem;
+            for (i = 0; i < max_numa_node - 1; i++) {
+                if (numa_info[i].present) {
+                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
+                                            ~((1 << 23UL) - 1);
+                    usedmem += numa_info[i].node_mem;
+                }
             }
             numa_info[i].node_mem = ram_size - usedmem;
         }
 
         numa_total = 0;
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             numa_total += numa_info[i].node_mem;
         }
         if (numa_total != ram_size) {
@@ -194,8 +200,9 @@  void set_numa_nodes(void)
             exit(1);
         }
 
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present &&
+                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
                 break;
             }
         }
@@ -203,9 +210,19 @@  void set_numa_nodes(void)
          * must cope with this anyway, because there are BIOSes out there in
          * real machines which also use this scheme.
          */
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                for (j = last_j + 1;
+                     j != (last_j == 0 ? max_numa_node : last_j - 1); j++) {
+                    if (j >= max_numa_node) {
+                        j = 0;
+                    }
+                    if (numa_info[j].present) {
+                        break;
+                    }
+                }
+                last_j = j;
+                set_bit(i, numa_info[j].node_cpu);
             }
         }
     }
@@ -217,7 +234,7 @@  void set_numa_modes(void)
     int i;
 
     CPU_FOREACH(cpu) {
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
                 cpu->numa_node = i;
             }
diff --git a/vl.c b/vl.c
index 54b4627..e1a6ab8 100644
--- a/vl.c
+++ b/vl.c
@@ -195,6 +195,7 @@  static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
     QTAILQ_HEAD_INITIALIZER(fw_boot_order);
 
 int nb_numa_nodes;
+int max_numa_node;
 NodeInfo numa_info[MAX_NODES];
 
 uint8_t qemu_uuid[16];
@@ -2967,10 +2968,12 @@  int main(int argc, char **argv, char **envp)
 
     for (i = 0; i < MAX_NODES; i++) {
         numa_info[i].node_mem = 0;
+        numa_info[i].present = false;
         bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
     }
 
     nb_numa_nodes = 0;
+    max_numa_node = -1;
     nb_nics = 0;
 
     bdrv_init_with_whitelist();