diff mbox

[v4,1/2] hw/acpi-build: Fix SRAT memory building when there is no memory in node0

Message ID 1503372250-5092-2-git-send-email-douly.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

Dou Liyang Aug. 22, 2017, 3:24 a.m. UTC
Currently, Using the fisrt node without memory on the machine makes
QEMU unhappy. With this example command line:
  ... \
  -m 1024M,slots=4,maxmem=32G \
  -numa node,nodeid=0 \
  -numa node,mem=1024M,nodeid=1 \
  -numa node,nodeid=2 \
  -numa node,nodeid=3 \
Guest reports "No NUMA configuration found" and the NUMA topology is
wrong.

This is because when QEMU builds ACPI SRAT, it regards node0 as the
default node to deal with the memory hole(640K-1M). this means the
node0 must have some memory(>1M), but, actually it can have no
memory.

Fix this problem by replace the node0 with the first node which has
memory on it. Add a new function for each node. Also do some cleanup.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
---
 hw/i386/acpi-build.c | 78 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 28 deletions(-)

Comments

Eduardo Habkost Aug. 30, 2017, 8:55 p.m. UTC | #1
On Tue, Aug 22, 2017 at 11:24:09AM +0800, Dou Liyang wrote:
> Currently, Using the fisrt node without memory on the machine makes
> QEMU unhappy. With this example command line:
>   ... \
>   -m 1024M,slots=4,maxmem=32G \
>   -numa node,nodeid=0 \
>   -numa node,mem=1024M,nodeid=1 \
>   -numa node,nodeid=2 \
>   -numa node,nodeid=3 \
> Guest reports "No NUMA configuration found" and the NUMA topology is
> wrong.
> 
> This is because when QEMU builds ACPI SRAT, it regards node0 as the
> default node to deal with the memory hole(640K-1M). this means the
> node0 must have some memory(>1M), but, actually it can have no
> memory.
> 
> Fix this problem by replace the node0 with the first node which has
> memory on it. Add a new function for each node. Also do some cleanup.
> 
> Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
> ---
>  hw/i386/acpi-build.c | 78 +++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 50 insertions(+), 28 deletions(-)
> 
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 98dd424..f93d712 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -2318,15 +2318,43 @@ build_tpm2(GArray *table_data, BIOSLinker *linker)
>                   (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL);
>  }
>  
> +static uint64_t
> +build_srat_node_entry(GArray *table_data, PCMachineState *pcms,
> +                                int i, uint64_t mem_base, uint64_t mem_len)
> +{
> +    AcpiSratMemoryAffinity *numamem;
> +    uint64_t next_base;
> +
> +    next_base = mem_base + mem_len;
> +
> +    /* Cut out the ACPI_PCI hole */
> +    if (mem_base <= pcms->below_4g_mem_size &&
> +        next_base > pcms->below_4g_mem_size) {
> +        mem_len -= next_base - pcms->below_4g_mem_size;
> +        if (mem_len > 0) {
> +            numamem = acpi_data_push(table_data, sizeof *numamem);
> +            build_srat_memory(numamem, mem_base, mem_len, i,
> +                              MEM_AFFINITY_ENABLED);
> +        }
> +        mem_base = 1ULL << 32;
> +        mem_len = next_base - pcms->below_4g_mem_size;
> +        next_base += (1ULL << 32) - pcms->below_4g_mem_size;
> +    }
> +    numamem = acpi_data_push(table_data, sizeof *numamem);
> +    build_srat_memory(numamem, mem_base, mem_len, i,
> +                      MEM_AFFINITY_ENABLED);
> +    return next_base;
> +}
> +
>  static void
>  build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>  {
>      AcpiSystemResourceAffinityTable *srat;
>      AcpiSratMemoryAffinity *numamem;
>  
> -    int i;
> +    int i, node;
>      int srat_start, numa_start, slots;
> -    uint64_t mem_len, mem_base, next_base;
> +    uint64_t mem_len, mem_base;
>      MachineClass *mc = MACHINE_GET_CLASS(machine);
>      const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine);
>      PCMachineState *pcms = PC_MACHINE(machine);
> @@ -2370,36 +2398,30 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>      /* the memory map is a bit tricky, it contains at least one hole
>       * from 640k-1M and possibly another one from 3.5G-4G.
>       */
> -    next_base = 0;
> +
>      numa_start = table_data->len;
>  
> -    numamem = acpi_data_push(table_data, sizeof *numamem);
> -    build_srat_memory(numamem, 0, 640 * 1024, 0, MEM_AFFINITY_ENABLED);
> -    next_base = 1024 * 1024;
> -    for (i = 1; i < pcms->numa_nodes + 1; ++i) {
> -        mem_base = next_base;
> -        mem_len = pcms->node_mem[i - 1];
> -        if (i == 1) {
> -            mem_len -= 1024 * 1024;
> +    /* get the first node which has memory and map the hole from 640K-1M */
> +    for (node = 0; node < pcms->numa_nodes; node++) {
> +        if (pcms->node_mem[node] != 0) {
> +            break;
>          }
> -        next_base = mem_base + mem_len;
> -
> -        /* Cut out the ACPI_PCI hole */
> -        if (mem_base <= pcms->below_4g_mem_size &&
> -            next_base > pcms->below_4g_mem_size) {
> -            mem_len -= next_base - pcms->below_4g_mem_size;
> -            if (mem_len > 0) {
> -                numamem = acpi_data_push(table_data, sizeof *numamem);
> -                build_srat_memory(numamem, mem_base, mem_len, i - 1,
> -                                  MEM_AFFINITY_ENABLED);
> -            }
> -            mem_base = 1ULL << 32;
> -            mem_len = next_base - pcms->below_4g_mem_size;
> -            next_base += (1ULL << 32) - pcms->below_4g_mem_size;
> +    }
> +    numamem = acpi_data_push(table_data, sizeof *numamem);
> +    build_srat_memory(numamem, 0, 640 * 1024, node, MEM_AFFINITY_ENABLED);
> +
> +    /* map the rest of memory from 1M */
> +    mem_base = 1024 * 1024;
> +    mem_len = pcms->node_mem[node] - mem_base;
> +    mem_base = build_srat_node_entry(table_data, pcms, node,
> +                                            mem_base, mem_len);
> +
> +    for (i = 0; i < pcms->numa_nodes; i++) {
> +        if (i == node) {
> +            continue;
>          }

Now the nodes will be out of order, if node 0 has no RAM.  Why?

Why not implement this in the same way the PCI 4GB hole is
already implemented.  e.g.:

#define HOLE_640K_START  (640 * 1024)
#define HOLE_640K_END   (1024 * 1024)

    for (node = 0; node < pcms->numa_nodes; node++) {
        mem_base = next_base;
        mem_len = pcms->node_mem[node];
        next_base = mem_base + mem_len;

        /* Cut out the 640K hole */
        if (mem_base <= HOLE_640K_START &&
            next_base > HOLE_640K_START) {
            mem_len -= next_base - HOLE_640K;
            if (mem_len > 0) {
                numamem = acpi_data_push(table_data, sizeof *numamem);
                build_srat_memory(numamem, mem_base, mem_len, node,
                                  MEM_AFFINITY_ENABLED);
            }
            mem_base = HOLE_640K_END;
            /* ... */
        }
        /* Cut out the ACPI_PCI hole */
        if (mem_base <= pcms->below_4g_mem_size &&
            next_base > pcms->below_4g_mem_size) {
            mem_len -= next_base - pcms->below_4g_mem_size;
            /* ... *]
        }
        numamem = acpi_data_push(table_data, sizeof *numamem);
        build_srat_memory(numamem, mem_base, mem_len, node,
                          MEM_AFFINITY_ENABLED);
        /* ... */
    }

> -        numamem = acpi_data_push(table_data, sizeof *numamem);
> -        build_srat_memory(numamem, mem_base, mem_len, i - 1,
> -                          MEM_AFFINITY_ENABLED);
> +        mem_base = build_srat_node_entry(table_data, pcms, i,
> +                                            mem_base, pcms->node_mem[i]);
>      }
>      slots = (table_data->len - numa_start) / sizeof *numamem;
>      for (; slots < pcms->numa_nodes + 2; slots++) {
> -- 
> 2.5.5
> 
> 
> 
>
Dou Liyang Aug. 31, 2017, 10:38 a.m. UTC | #2
Hi Eduardo,

[...]
>> +            continue;
>>          }
>
> Now the nodes will be out of order, if node 0 has no RAM.  Why?
>

Because the code parsed the other node with RAM first, then parsed the 
node 0.

> Why not implement this in the same way the PCI 4GB hole is
> already implemented.  e.g.:
>

Indeed, it is better and more refined. I will do it in the next version.

> #define HOLE_640K_START  (640 * 1024)
> #define HOLE_640K_END   (1024 * 1024)
>
>     for (node = 0; node < pcms->numa_nodes; node++) {
>         mem_base = next_base;
>         mem_len = pcms->node_mem[node];
>         next_base = mem_base + mem_len;
>
>         /* Cut out the 640K hole */
>         if (mem_base <= HOLE_640K_START &&
>             next_base > HOLE_640K_START) {
>             mem_len -= next_base - HOLE_640K;
>             if (mem_len > 0) {
>                 numamem = acpi_data_push(table_data, sizeof *numamem);
>                 build_srat_memory(numamem, mem_base, mem_len, node,
>                                   MEM_AFFINITY_ENABLED);
>             }
>             mem_base = HOLE_640K_END;
>             /* ... */

BTW, in case

    0
    |-----------|-------|--------|-------
    |           |       |        |
mem_base      640K   next_base   1M


  I wanna add a check here,

             /* Check for the rare case: 640K < RAM < 1M */
             if (next_base <= HOLE_640K_END) {
                 next_base = HOLE_640K_END;
                 continue;
             }
             mem_base = HOLE_640K_END;
             mem_len = next_base - HOLE_640K_END;

I guess no one would set a node with this less RAM,
So, Is that necessary?

Thanks,
	dou.

>         }
>         /* Cut out the ACPI_PCI hole */
>         if (mem_base <= pcms->below_4g_mem_size &&
>             next_base > pcms->below_4g_mem_size) {
>             mem_len -= next_base - pcms->below_4g_mem_size;
>             /* ... *]
>         }
>         numamem = acpi_data_push(table_data, sizeof *numamem);
>         build_srat_memory(numamem, mem_base, mem_len, node,
>                           MEM_AFFINITY_ENABLED);
>         /* ... */
>     }
>
>> -        numamem = acpi_data_push(table_data, sizeof *numamem);
>> -        build_srat_memory(numamem, mem_base, mem_len, i - 1,
>> -                          MEM_AFFINITY_ENABLED);
>> +        mem_base = build_srat_node_entry(table_data, pcms, i,
>> +                                            mem_base, pcms->node_mem[i]);
>>      }
>>      slots = (table_data->len - numa_start) / sizeof *numamem;
>>      for (; slots < pcms->numa_nodes + 2; slots++) {
>> --
>> 2.5.5
>>
>>
>>
>>
>
Dou Liyang Aug. 31, 2017, 12:09 p.m. UTC | #3
Hi Eduardo,

At 08/31/2017 06:38 PM, Dou Liyang wrote:
> Hi Eduardo,
>
> [...]
>>> +            continue;
>>>          }
>>
>> Now the nodes will be out of order, if node 0 has no RAM.  Why?
>>
>
> Because the code parsed the other node with RAM first, then parsed the
> node 0.
>
>> Why not implement this in the same way the PCI 4GB hole is
>> already implemented.  e.g.:
>>
>
> Indeed, it is better and more refined. I will do it in the next version.
>
>> #define HOLE_640K_START  (640 * 1024)
>> #define HOLE_640K_END   (1024 * 1024)
>>
>>     for (node = 0; node < pcms->numa_nodes; node++) {
>>         mem_base = next_base;
>>         mem_len = pcms->node_mem[node];
>>         next_base = mem_base + mem_len;
>>
>>         /* Cut out the 640K hole */
>>         if (mem_base <= HOLE_640K_START &&
>>             next_base > HOLE_640K_START) {
>>             mem_len -= next_base - HOLE_640K;
>>             if (mem_len > 0) {
>>                 numamem = acpi_data_push(table_data, sizeof *numamem);
>>                 build_srat_memory(numamem, mem_base, mem_len, node,
>>                                   MEM_AFFINITY_ENABLED);
>>             }
>>             mem_base = HOLE_640K_END;
>>             /* ... */
>
> BTW, in case
>
>    0
>    |-----------|-------|--------|-------
>    |           |       |        |
> mem_base      640K   next_base   1M
>
>
>  I wanna add a check here,
>
>             /* Check for the rare case: 640K < RAM < 1M */
>             if (next_base <= HOLE_640K_END) {
>                 next_base = HOLE_640K_END;
>                 continue;
>             }
>             mem_base = HOLE_640K_END;
>             mem_len = next_base - HOLE_640K_END;
>
> I guess no one would set a node with this less RAM,
> So, Is that necessary?
>

I post the V5 patches, and use your code.

It may be not very clearly, for the detail, Please see the new version.

Thanks,
	dou.

> Thanks,
>     dou.
>
>>         }
>>         /* Cut out the ACPI_PCI hole */
>>         if (mem_base <= pcms->below_4g_mem_size &&
>>             next_base > pcms->below_4g_mem_size) {
>>             mem_len -= next_base - pcms->below_4g_mem_size;
>>             /* ... *]
>>         }
>>         numamem = acpi_data_push(table_data, sizeof *numamem);
>>         build_srat_memory(numamem, mem_base, mem_len, node,
>>                           MEM_AFFINITY_ENABLED);
>>         /* ... */
>>     }
>>
>>> -        numamem = acpi_data_push(table_data, sizeof *numamem);
>>> -        build_srat_memory(numamem, mem_base, mem_len, i - 1,
>>> -                          MEM_AFFINITY_ENABLED);
>>> +        mem_base = build_srat_node_entry(table_data, pcms, i,
>>> +                                            mem_base,
>>> pcms->node_mem[i]);
>>>      }
>>>      slots = (table_data->len - numa_start) / sizeof *numamem;
>>>      for (; slots < pcms->numa_nodes + 2; slots++) {
>>> --
>>> 2.5.5
>>>
>>>
>>>
>>>
>>
diff mbox

Patch

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 98dd424..f93d712 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2318,15 +2318,43 @@  build_tpm2(GArray *table_data, BIOSLinker *linker)
                  (void *)tpm2_ptr, "TPM2", sizeof(*tpm2_ptr), 4, NULL, NULL);
 }
 
+static uint64_t
+build_srat_node_entry(GArray *table_data, PCMachineState *pcms,
+                                int i, uint64_t mem_base, uint64_t mem_len)
+{
+    AcpiSratMemoryAffinity *numamem;
+    uint64_t next_base;
+
+    next_base = mem_base + mem_len;
+
+    /* Cut out the ACPI_PCI hole */
+    if (mem_base <= pcms->below_4g_mem_size &&
+        next_base > pcms->below_4g_mem_size) {
+        mem_len -= next_base - pcms->below_4g_mem_size;
+        if (mem_len > 0) {
+            numamem = acpi_data_push(table_data, sizeof *numamem);
+            build_srat_memory(numamem, mem_base, mem_len, i,
+                              MEM_AFFINITY_ENABLED);
+        }
+        mem_base = 1ULL << 32;
+        mem_len = next_base - pcms->below_4g_mem_size;
+        next_base += (1ULL << 32) - pcms->below_4g_mem_size;
+    }
+    numamem = acpi_data_push(table_data, sizeof *numamem);
+    build_srat_memory(numamem, mem_base, mem_len, i,
+                      MEM_AFFINITY_ENABLED);
+    return next_base;
+}
+
 static void
 build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 {
     AcpiSystemResourceAffinityTable *srat;
     AcpiSratMemoryAffinity *numamem;
 
-    int i;
+    int i, node;
     int srat_start, numa_start, slots;
-    uint64_t mem_len, mem_base, next_base;
+    uint64_t mem_len, mem_base;
     MachineClass *mc = MACHINE_GET_CLASS(machine);
     const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(machine);
     PCMachineState *pcms = PC_MACHINE(machine);
@@ -2370,36 +2398,30 @@  build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
     /* the memory map is a bit tricky, it contains at least one hole
      * from 640k-1M and possibly another one from 3.5G-4G.
      */
-    next_base = 0;
+
     numa_start = table_data->len;
 
-    numamem = acpi_data_push(table_data, sizeof *numamem);
-    build_srat_memory(numamem, 0, 640 * 1024, 0, MEM_AFFINITY_ENABLED);
-    next_base = 1024 * 1024;
-    for (i = 1; i < pcms->numa_nodes + 1; ++i) {
-        mem_base = next_base;
-        mem_len = pcms->node_mem[i - 1];
-        if (i == 1) {
-            mem_len -= 1024 * 1024;
+    /* get the first node which has memory and map the hole from 640K-1M */
+    for (node = 0; node < pcms->numa_nodes; node++) {
+        if (pcms->node_mem[node] != 0) {
+            break;
         }
-        next_base = mem_base + mem_len;
-
-        /* Cut out the ACPI_PCI hole */
-        if (mem_base <= pcms->below_4g_mem_size &&
-            next_base > pcms->below_4g_mem_size) {
-            mem_len -= next_base - pcms->below_4g_mem_size;
-            if (mem_len > 0) {
-                numamem = acpi_data_push(table_data, sizeof *numamem);
-                build_srat_memory(numamem, mem_base, mem_len, i - 1,
-                                  MEM_AFFINITY_ENABLED);
-            }
-            mem_base = 1ULL << 32;
-            mem_len = next_base - pcms->below_4g_mem_size;
-            next_base += (1ULL << 32) - pcms->below_4g_mem_size;
+    }
+    numamem = acpi_data_push(table_data, sizeof *numamem);
+    build_srat_memory(numamem, 0, 640 * 1024, node, MEM_AFFINITY_ENABLED);
+
+    /* map the rest of memory from 1M */
+    mem_base = 1024 * 1024;
+    mem_len = pcms->node_mem[node] - mem_base;
+    mem_base = build_srat_node_entry(table_data, pcms, node,
+                                            mem_base, mem_len);
+
+    for (i = 0; i < pcms->numa_nodes; i++) {
+        if (i == node) {
+            continue;
         }
-        numamem = acpi_data_push(table_data, sizeof *numamem);
-        build_srat_memory(numamem, mem_base, mem_len, i - 1,
-                          MEM_AFFINITY_ENABLED);
+        mem_base = build_srat_node_entry(table_data, pcms, i,
+                                            mem_base, pcms->node_mem[i]);
     }
     slots = (table_data->len - numa_start) / sizeof *numamem;
     for (; slots < pcms->numa_nodes + 2; slots++) {