diff mbox series

[v2,2/3] hw/acpi-build: account for NVDIMM numa nodes in SRAT

Message ID 20200528054807.21278-3-vishal.l.verma@intel.com
State New
Headers show
Series account for NVDIMM nodes during SRAT generation | expand

Commit Message

Verma, Vishal L May 28, 2020, 5:48 a.m. UTC
NVDIMMs can belong to their own proximity domains, as described by the
NFIT. In such cases, the SRAT needs to have Memory Affinity structures
in the SRAT for these NVDIMMs, otherwise Linux doesn't populate node
data structures properly during NUMA initialization. See the following
for an example failure case.

https://lore.kernel.org/linux-nvdimm/20200416225438.15208-1-vishal.l.verma@intel.com/

Fix this by adding device address range and node information from
NVDIMMs to the SRAT in build_srat().

The relevant command line options to exercise this are below. Nodes 0-1
contain CPUs and regular memory, and nodes 2-3 are the NVDIMM address
space.

  -numa node,nodeid=0,mem=2048M,
  -numa node,nodeid=1,mem=2048M,
  -numa node,nodeid=2,mem=0,
  -object memory-backend-file,id=nvmem0,share,mem-path=nvdimm-0,size=16384M,align=128M
  -device nvdimm,memdev=nvmem0,id=nv0,label-size=2M,node=2
  -numa node,nodeid=3,mem=0,
  -object memory-backend-file,id=nvmem1,share,mem-path=nvdimm-1,size=16384M,align=128M
  -device nvdimm,memdev=nvmem1,id=nv1,label-size=2M,node=3

Cc: Jingqi Liu <jingqi.liu@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 hw/acpi/nvdimm.c        | 26 ++++++++++++++++++++++++++
 hw/i386/acpi-build.c    | 10 ++++++++++
 include/hw/mem/nvdimm.h |  1 +
 3 files changed, 37 insertions(+)

Comments

Igor Mammedov May 28, 2020, 11:19 a.m. UTC | #1
On Wed, 27 May 2020 23:48:06 -0600
Vishal Verma <vishal.l.verma@intel.com> wrote:

> NVDIMMs can belong to their own proximity domains, as described by the
> NFIT. In such cases, the SRAT needs to have Memory Affinity structures
> in the SRAT for these NVDIMMs, otherwise Linux doesn't populate node
> data structures properly during NUMA initialization. See the following
> for an example failure case.
> 
> https://lore.kernel.org/linux-nvdimm/20200416225438.15208-1-vishal.l.verma@intel.com/
> 
> Fix this by adding device address range and node information from
> NVDIMMs to the SRAT in build_srat().
> 
> The relevant command line options to exercise this are below. Nodes 0-1
> contain CPUs and regular memory, and nodes 2-3 are the NVDIMM address
> space.
> 
>   -numa node,nodeid=0,mem=2048M,
>   -numa node,nodeid=1,mem=2048M,
>   -numa node,nodeid=2,mem=0,
>   -object memory-backend-file,id=nvmem0,share,mem-path=nvdimm-0,size=16384M,align=128M
>   -device nvdimm,memdev=nvmem0,id=nv0,label-size=2M,node=2
>   -numa node,nodeid=3,mem=0,
>   -object memory-backend-file,id=nvmem1,share,mem-path=nvdimm-1,size=16384M,align=128M
>   -device nvdimm,memdev=nvmem1,id=nv1,label-size=2M,node=3
> 
> Cc: Jingqi Liu <jingqi.liu@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Reviewed-by: Jingqi Liu <jingqi.liu@intel.com>
> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> ---
>  hw/acpi/nvdimm.c        | 26 ++++++++++++++++++++++++++
>  hw/i386/acpi-build.c    | 10 ++++++++++
>  include/hw/mem/nvdimm.h |  1 +
>  3 files changed, 37 insertions(+)
> 
> diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> index 9316d12b70..d322c6a7a7 100644
> --- a/hw/acpi/nvdimm.c
> +++ b/hw/acpi/nvdimm.c
> @@ -28,6 +28,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/uuid.h"
> +#include "qapi/error.h"
>  #include "hw/acpi/acpi.h"
>  #include "hw/acpi/aml-build.h"
>  #include "hw/acpi/bios-linker-loader.h"
> @@ -1334,6 +1335,31 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
>      free_aml_allocator();
>  }
>  
> +void *nvdimm_build_srat(GArray *table_data)
> +{
> +    AcpiSratMemoryAffinity *numamem = NULL;
> +    GSList *device_list = nvdimm_get_device_list();
> +
> +    for (; device_list; device_list = device_list->next) {
> +        DeviceState *dev = device_list->data;
I'd use Object here with OBJECT() cast and drop casts beolw in property getters

> +        uint64_t addr, size;
> +        int node;
> +
> +        node = object_property_get_int(OBJECT(dev), PC_DIMM_NODE_PROP,
> +                                       &error_abort);
> +        addr = object_property_get_uint(OBJECT(dev), PC_DIMM_ADDR_PROP,
> +                                        &error_abort);
> +        size = object_property_get_uint(OBJECT(dev), PC_DIMM_SIZE_PROP,
> +                                        &error_abort);
> +
> +        numamem = acpi_data_push(table_data, sizeof *numamem);
> +        build_srat_memory(numamem, addr, size, node,
> +                          MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE);
> +    }
> +    g_slist_free(device_list);
> +    return numamem;
> +}
> +
>  void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
>                         BIOSLinker *linker, NVDIMMState *state,
>                         uint32_t ram_slots)
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 2e15f6848e..1461d8a718 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -2428,6 +2428,16 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>                                MEM_AFFINITY_ENABLED);
>          }
>      }
> +
> +    if (machine->nvdimms_state->is_enabled) {
> +        void *ret;
> +
> +        ret = nvdimm_build_srat(table_data);
> +        if (ret != NULL) {
> +            numamem = ret;
> +        }

why do we need return value here and a test condition and assign 'ret' to numamem?

> +    }
> +
>      slots = (table_data->len - numa_start) / sizeof *numamem;
>      for (; slots < pcms->numa_nodes + 2; slots++) {
>          numamem = acpi_data_push(table_data, sizeof *numamem);
> diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
> index a3c08955e8..fbe56509b8 100644
> --- a/include/hw/mem/nvdimm.h
> +++ b/include/hw/mem/nvdimm.h
> @@ -155,6 +155,7 @@ typedef struct NVDIMMState NVDIMMState;
>  void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io,
>                              struct AcpiGenericAddress dsm_io,
>                              FWCfgState *fw_cfg, Object *owner);
> +void *nvdimm_build_srat(GArray *table_data);
>  void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
>                         BIOSLinker *linker, NVDIMMState *state,
>                         uint32_t ram_slots);
Verma, Vishal L May 28, 2020, 4:05 p.m. UTC | #2
On Thu, 2020-05-28 at 13:19 +0200, Igor Mammedov wrote:

[..]
> > @@ -1334,6 +1335,31 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
> >      free_aml_allocator();
> >  }
> >  
> > +void *nvdimm_build_srat(GArray *table_data)
> > +{
> > +    AcpiSratMemoryAffinity *numamem = NULL;
> > +    GSList *device_list = nvdimm_get_device_list();
> > +
> > +    for (; device_list; device_list = device_list->next) {
> > +        DeviceState *dev = device_list->data;
> I'd use Object here with OBJECT() cast and drop casts beolw in property getters
> 
Done, that makes it much cleaner.

> > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > index 2e15f6848e..1461d8a718 100644
> > --- a/hw/i386/acpi-build.c
> > +++ b/hw/i386/acpi-build.c
> > @@ -2428,6 +2428,16 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
> >                                MEM_AFFINITY_ENABLED);
> >          }
> >      }
> > +
> > +    if (machine->nvdimms_state->is_enabled) {
> > +        void *ret;
> > +
> > +        ret = nvdimm_build_srat(table_data);
> > +        if (ret != NULL) {
> > +            numamem = ret;
> > +        }
> 
> why do we need return value here and a test condition and assign 'ret' to numamem?

Ah I thought numamem was propagated through the different parts of the
build_srat flow, but I misread. You're right it is not needed, removing.
diff mbox series

Patch

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9316d12b70..d322c6a7a7 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -28,6 +28,7 @@ 
 
 #include "qemu/osdep.h"
 #include "qemu/uuid.h"
+#include "qapi/error.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/aml-build.h"
 #include "hw/acpi/bios-linker-loader.h"
@@ -1334,6 +1335,31 @@  static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data,
     free_aml_allocator();
 }
 
+void *nvdimm_build_srat(GArray *table_data)
+{
+    AcpiSratMemoryAffinity *numamem = NULL;
+    GSList *device_list = nvdimm_get_device_list();
+
+    for (; device_list; device_list = device_list->next) {
+        DeviceState *dev = device_list->data;
+        uint64_t addr, size;
+        int node;
+
+        node = object_property_get_int(OBJECT(dev), PC_DIMM_NODE_PROP,
+                                       &error_abort);
+        addr = object_property_get_uint(OBJECT(dev), PC_DIMM_ADDR_PROP,
+                                        &error_abort);
+        size = object_property_get_uint(OBJECT(dev), PC_DIMM_SIZE_PROP,
+                                        &error_abort);
+
+        numamem = acpi_data_push(table_data, sizeof *numamem);
+        build_srat_memory(numamem, addr, size, node,
+                          MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE);
+    }
+    g_slist_free(device_list);
+    return numamem;
+}
+
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
                        BIOSLinker *linker, NVDIMMState *state,
                        uint32_t ram_slots)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2e15f6848e..1461d8a718 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2428,6 +2428,16 @@  build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
                               MEM_AFFINITY_ENABLED);
         }
     }
+
+    if (machine->nvdimms_state->is_enabled) {
+        void *ret;
+
+        ret = nvdimm_build_srat(table_data);
+        if (ret != NULL) {
+            numamem = ret;
+        }
+    }
+
     slots = (table_data->len - numa_start) / sizeof *numamem;
     for (; slots < pcms->numa_nodes + 2; slots++) {
         numamem = acpi_data_push(table_data, sizeof *numamem);
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index a3c08955e8..fbe56509b8 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -155,6 +155,7 @@  typedef struct NVDIMMState NVDIMMState;
 void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io,
                             struct AcpiGenericAddress dsm_io,
                             FWCfgState *fw_cfg, Object *owner);
+void *nvdimm_build_srat(GArray *table_data);
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
                        BIOSLinker *linker, NVDIMMState *state,
                        uint32_t ram_slots);