Message ID | 20200528054807.21278-3-vishal.l.verma@intel.com |
---|---|
State | New |
Headers | show |
Series | account for NVDIMM nodes during SRAT generation | expand |
On Wed, 27 May 2020 23:48:06 -0600 Vishal Verma <vishal.l.verma@intel.com> wrote: > NVDIMMs can belong to their own proximity domains, as described by the > NFIT. In such cases, the SRAT needs to have Memory Affinity structures > in the SRAT for these NVDIMMs, otherwise Linux doesn't populate node > data structures properly during NUMA initialization. See the following > for an example failure case. > > https://lore.kernel.org/linux-nvdimm/20200416225438.15208-1-vishal.l.verma@intel.com/ > > Fix this by adding device address range and node information from > NVDIMMs to the SRAT in build_srat(). > > The relevant command line options to exercise this are below. Nodes 0-1 > contain CPUs and regular memory, and nodes 2-3 are the NVDIMM address > space. > > -numa node,nodeid=0,mem=2048M, > -numa node,nodeid=1,mem=2048M, > -numa node,nodeid=2,mem=0, > -object memory-backend-file,id=nvmem0,share,mem-path=nvdimm-0,size=16384M,align=128M > -device nvdimm,memdev=nvmem0,id=nv0,label-size=2M,node=2 > -numa node,nodeid=3,mem=0, > -object memory-backend-file,id=nvmem1,share,mem-path=nvdimm-1,size=16384M,align=128M > -device nvdimm,memdev=nvmem1,id=nv1,label-size=2M,node=3 > > Cc: Jingqi Liu <jingqi.liu@intel.com> > Cc: Michael S. 
Tsirkin <mst@redhat.com> > Reviewed-by: Jingqi Liu <jingqi.liu@intel.com> > Signed-off-by: Vishal Verma <vishal.l.verma@intel.com> > --- > hw/acpi/nvdimm.c | 26 ++++++++++++++++++++++++++ > hw/i386/acpi-build.c | 10 ++++++++++ > include/hw/mem/nvdimm.h | 1 + > 3 files changed, 37 insertions(+) > > diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c > index 9316d12b70..d322c6a7a7 100644 > --- a/hw/acpi/nvdimm.c > +++ b/hw/acpi/nvdimm.c > @@ -28,6 +28,7 @@ > > #include "qemu/osdep.h" > #include "qemu/uuid.h" > +#include "qapi/error.h" > #include "hw/acpi/acpi.h" > #include "hw/acpi/aml-build.h" > #include "hw/acpi/bios-linker-loader.h" > @@ -1334,6 +1335,31 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data, > free_aml_allocator(); > } > > +void *nvdimm_build_srat(GArray *table_data) > +{ > + AcpiSratMemoryAffinity *numamem = NULL; > + GSList *device_list = nvdimm_get_device_list(); > + > + for (; device_list; device_list = device_list->next) { > + DeviceState *dev = device_list->data; I'd use Object here with OBJECT() cast and drop casts below in property getters > + uint64_t addr, size; > + int node; > + > + node = object_property_get_int(OBJECT(dev), PC_DIMM_NODE_PROP, > + &error_abort); > + addr = object_property_get_uint(OBJECT(dev), PC_DIMM_ADDR_PROP, > + &error_abort); > + size = object_property_get_uint(OBJECT(dev), PC_DIMM_SIZE_PROP, > + &error_abort); > + > + numamem = acpi_data_push(table_data, sizeof *numamem); > + build_srat_memory(numamem, addr, size, node, > + MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE); > + } > + g_slist_free(device_list); > + return numamem; > +} > + > void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, > BIOSLinker *linker, NVDIMMState *state, > uint32_t ram_slots) > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > index 2e15f6848e..1461d8a718 100644 > --- a/hw/i386/acpi-build.c > +++ b/hw/i386/acpi-build.c > @@ -2428,6 +2428,16 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
MachineState *machine) > MEM_AFFINITY_ENABLED); > } > } > + > + if (machine->nvdimms_state->is_enabled) { > + void *ret; > + > + ret = nvdimm_build_srat(table_data); > + if (ret != NULL) { > + numamem = ret; > + } why do we need return value here and a test condition and assign 'ret' to numamem? > + } > + > slots = (table_data->len - numa_start) / sizeof *numamem; > for (; slots < pcms->numa_nodes + 2; slots++) { > numamem = acpi_data_push(table_data, sizeof *numamem); > diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h > index a3c08955e8..fbe56509b8 100644 > --- a/include/hw/mem/nvdimm.h > +++ b/include/hw/mem/nvdimm.h > @@ -155,6 +155,7 @@ typedef struct NVDIMMState NVDIMMState; > void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io, > struct AcpiGenericAddress dsm_io, > FWCfgState *fw_cfg, Object *owner); > +void *nvdimm_build_srat(GArray *table_data); > void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, > BIOSLinker *linker, NVDIMMState *state, > uint32_t ram_slots);
On Thu, 2020-05-28 at 13:19 +0200, Igor Mammedov wrote: [..] > > @@ -1334,6 +1335,31 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data, > > free_aml_allocator(); > > } > > > > +void *nvdimm_build_srat(GArray *table_data) > > +{ > > + AcpiSratMemoryAffinity *numamem = NULL; > > + GSList *device_list = nvdimm_get_device_list(); > > + > > + for (; device_list; device_list = device_list->next) { > > + DeviceState *dev = device_list->data; > I'd use Object here with OBJECT() cast and drop casts below in property getters > Done, that makes it much cleaner. > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > > index 2e15f6848e..1461d8a718 100644 > > --- a/hw/i386/acpi-build.c > > +++ b/hw/i386/acpi-build.c > > @@ -2428,6 +2428,16 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) > > MEM_AFFINITY_ENABLED); > > } > > } > > + > > + if (machine->nvdimms_state->is_enabled) { > > + void *ret; > > + > > + ret = nvdimm_build_srat(table_data); > > + if (ret != NULL) { > > + numamem = ret; > > + } > > why do we need return value here and a test condition and assign 'ret' to numamem? Ah I thought numamem was propagated through the different parts of the build_srat flow, but I misread. You're right it is not needed, removing.
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 9316d12b70..d322c6a7a7 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -28,6 +28,7 @@ #include "qemu/osdep.h" #include "qemu/uuid.h" +#include "qapi/error.h" #include "hw/acpi/acpi.h" #include "hw/acpi/aml-build.h" #include "hw/acpi/bios-linker-loader.h" @@ -1334,6 +1335,31 @@ static void nvdimm_build_ssdt(GArray *table_offsets, GArray *table_data, free_aml_allocator(); } +void *nvdimm_build_srat(GArray *table_data) +{ + AcpiSratMemoryAffinity *numamem = NULL; + GSList *device_list = nvdimm_get_device_list(); + + for (; device_list; device_list = device_list->next) { + DeviceState *dev = device_list->data; + uint64_t addr, size; + int node; + + node = object_property_get_int(OBJECT(dev), PC_DIMM_NODE_PROP, + &error_abort); + addr = object_property_get_uint(OBJECT(dev), PC_DIMM_ADDR_PROP, + &error_abort); + size = object_property_get_uint(OBJECT(dev), PC_DIMM_SIZE_PROP, + &error_abort); + + numamem = acpi_data_push(table_data, sizeof *numamem); + build_srat_memory(numamem, addr, size, node, + MEM_AFFINITY_ENABLED | MEM_AFFINITY_NON_VOLATILE); + } + g_slist_free(device_list); + return numamem; +} + void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, NVDIMMState *state, uint32_t ram_slots) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 2e15f6848e..1461d8a718 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2428,6 +2428,16 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) MEM_AFFINITY_ENABLED); } } + + if (machine->nvdimms_state->is_enabled) { + void *ret; + + ret = nvdimm_build_srat(table_data); + if (ret != NULL) { + numamem = ret; + } + } + slots = (table_data->len - numa_start) / sizeof *numamem; for (; slots < pcms->numa_nodes + 2; slots++) { numamem = acpi_data_push(table_data, sizeof *numamem); diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index a3c08955e8..fbe56509b8 100644 --- 
a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -155,6 +155,7 @@ typedef struct NVDIMMState NVDIMMState; void nvdimm_init_acpi_state(NVDIMMState *state, MemoryRegion *io, struct AcpiGenericAddress dsm_io, FWCfgState *fw_cfg, Object *owner); +void *nvdimm_build_srat(GArray *table_data); void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, NVDIMMState *state, uint32_t ram_slots);