diff mbox series

[v4,3/3] nvdimm: Enable sync-dax device property for nvdimm

Message ID 161966813983.652.5749368609701495826.stgit@17be908f7c1c
State New
Headers show
Series nvdimm: Enable sync-dax property for nvdimm | expand

Commit Message

Shivaprasad G Bhat April 29, 2021, 3:49 a.m. UTC
The patch adds the 'sync-dax' property to the nvdimm device.

When sync-dax is 'direct', it indicates that the backend is synchronous DAX
capable and no explicit flush requests are required. When the mode is
set to 'writeback', it indicates that the backend is not synchronous DAX
capable and explicit flushes to the hypervisor are required.

On PPC, where flush requests from the guest can be honoured by QEMU,
the 'writeback' mode is supported and set as the default. The device
tree property "ibm,hcall-flush-required" is added to the nvdimm node, which
makes the guest issue H_SCM_FLUSH hcalls to request flushes
explicitly. This is the default behaviour when the sync-dax
property is not set for the nvdimm device. For old pSeries machines, the
default is 'unsafe'.

For non-PPC platforms, the mode is set to 'unsafe' as the default.

Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
---
 hw/arm/virt.c           |   28 +++++++++++++++++++++++--
 hw/i386/pc.c            |   28 +++++++++++++++++++++++--
 hw/mem/nvdimm.c         |   52 +++++++++++++++++++++++++++++++++++++++++++----
 hw/ppc/spapr.c          |   10 +++++++++
 hw/ppc/spapr_nvdimm.c   |   39 +++++++++++++++++++++++++++++++++++
 include/hw/mem/nvdimm.h |   11 ++++++++++
 include/hw/ppc/spapr.h  |    1 +
 qapi/common.json        |   20 ++++++++++++++++++
 8 files changed, 179 insertions(+), 10 deletions(-)

Comments

Eric Blake May 3, 2021, 6:27 p.m. UTC | #1
On 4/28/21 10:49 PM, Shivaprasad G Bhat wrote:
> The patch adds the 'sync-dax' property to the nvdimm device.
> 
> When the sync-dax is 'direct' indicates the backend is synchronous DAX
> capable and no explicit flush requests are required. When the mode is
> set to 'writeback' it indicates the backend is not synhronous DAX

synchronous

> capable and explicit flushes to Hypervisor are required.
> 
> On PPC where the flush requests from guest can be honoured by the qemu,

s/the qemu/qemu/

> the 'writeback' mode is supported and set as the default. The device
> tree property "hcall-flush-required" is added to the nvdimm node which
> makes the guest to issue H_SCM_FLUSH hcalls to request for flushes

s/to issue/issue/
s/request for/request/

> explicitly. This would be the default behaviour without sync-dax
> property set for the nvdimm device. For old pSeries machine, the
> default is 'unsafe'.
> 
> For non-PPC platforms, the mode is set to 'unsafe' as the default.
> 
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> ---

> +++ b/qapi/common.json
> @@ -197,3 +197,23 @@
>  { 'enum': 'GrabToggleKeys',
>    'data': [ 'ctrl-ctrl', 'alt-alt', 'shift-shift','meta-meta', 'scrolllock',
>              'ctrl-scrolllock' ] }
> +
> +##
> +# @NvdimmSyncModes:
> +#
> +# Indicates the mode of flush to be used to ensure persistence in case
> +# of power failures.
> +#
> +# @unsafe: This is to indicate, the data on the backend device not be
> +#          consistent in power failure scenarios.

s/This is to indicate, the/This indicates that/
s/device not/device might not/

> +# @direct: This is to indicate the backend device supports synchronous DAX
> +#          and no explicit flush requests from the guest is required.

This indicates the backend device supports synchronous DAX, and no
explicit flush requests from the guest are required.

> +# @writeback: To be used when the backend device doesn't support synchronous
> +#             DAX. The hypervisor issues flushes to the disk when requested
> +#             by the guest.
> +# Since: 6.0

6.1

> +#
> +##
> +{ 'enum': 'NvdimmSyncModes',
> +  'data': [ 'unsafe', 'writeback',
> +            { 'name': 'direct', 'if': 'defined(CONFIG_LIBPMEM)' } ] }
> 
>
diff mbox series

Patch

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 9f01d9041b..f32e3e4010 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2358,6 +2358,27 @@  static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms)
     return ms->possible_cpus;
 }
 
+/*
+ * Check an NVDIMM before it is plugged into the virt machine.
+ *
+ * Returns true when the device may be plugged. Returns false with @errp set
+ * when NVDIMM support is not enabled in the machine's nvdimms_state, or when
+ * the device's sync-dax property is 'writeback'.
+ */
+static bool virt_nvdimm_validate(const MachineState *ms, NVDIMMDevice *nvdimm,
+                                 Error **errp)
+{
+    NvdimmSyncModes sync;
+
+    if (!ms->nvdimms_state->is_enabled) {
+        error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'");
+        return false;
+    }
+
+    sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+                                    "NvdimmSyncModes", &error_abort);
+    if (sync == NVDIMM_SYNC_MODES_WRITEBACK) {
+        error_setg(errp, "NVDIMM device " NVDIMM_SYNC_DAX_PROP
+                         "=%s mode unsupported", NvdimmSyncModes_str(sync));
+        return false;
+    }
+
+    return true;
+}
+
 static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
                                  Error **errp)
 {
@@ -2376,9 +2397,10 @@  static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
         return;
     }
 
-    if (is_nvdimm && !ms->nvdimms_state->is_enabled) {
-        error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'");
-        return;
+    if (is_nvdimm) {
+        if (!virt_nvdimm_validate(ms, NVDIMM(dev), errp)) {
+            return;
+        }
     }
 
     pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 8a84b25a03..2d5151462c 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1211,6 +1211,34 @@  void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs)
     g_free(i8259);
 }
 
+/*
+ * Check an NVDIMM before it is plugged into a PC machine.
+ *
+ * Returns true when the device may be plugged. Returns false with @errp set
+ * when NVDIMM support is not enabled in the machine's nvdimms_state, or when
+ * the device's sync-dax property is 'writeback'.
+ */
+static bool pc_nvdimm_validate(const MachineState *ms, NVDIMMDevice *nvdimm,
+                               Error **errp)
+{
+    NvdimmSyncModes sync;
+
+    if (!ms->nvdimms_state->is_enabled) {
+        error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'");
+        return false;
+    }
+
+    sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+                                    "NvdimmSyncModes", &error_abort);
+    if (sync == NVDIMM_SYNC_MODES_WRITEBACK) {
+        error_setg(errp, "NVDIMM device " NVDIMM_SYNC_DAX_PROP
+                   "=%s mode unsupported", NvdimmSyncModes_str(sync));
+        return false;
+    }
+
+    return true;
+}
+
 static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
                                Error **errp)
 {
@@ -1233,9 +1254,10 @@  static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
         return;
     }
 
-    if (is_nvdimm && !ms->nvdimms_state->is_enabled) {
-        error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'");
-        return;
+    if (is_nvdimm) {
+        if (!pc_nvdimm_validate(ms, NVDIMM(dev), errp)) {
+            return;
+        }
     }
 
     hotplug_handler_pre_plug(x86ms->acpi_dev, dev, &local_err);
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7397b67156..56b4527362 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -96,6 +96,21 @@  static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
     g_free(value);
 }
 
+/* Getter for the "sync-dax" enum property; @errp is never set. */
+static int nvdimm_get_sync_mode(Object *obj, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+    return nvdimm->sync_dax;
+}
+
+/* Setter for the "sync-dax" enum property; stores @mode, @errp is never set. */
+static void nvdimm_set_sync_mode(Object *obj, int mode, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+    nvdimm->sync_dax = mode;
+}
 
 static void nvdimm_init(Object *obj)
 {
@@ -105,6 +118,13 @@  static void nvdimm_init(Object *obj)
 
     object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
                         nvdimm_set_uuid, NULL, NULL);
+
+    /* Expose the device's sync_dax field as the "sync-dax" enum property. */
+    object_property_add_enum(obj, NVDIMM_SYNC_DAX_PROP, "NvdimmSyncModes",
+                             &NvdimmSyncModes_lookup, nvdimm_get_sync_mode,
+                             nvdimm_set_sync_mode);
+    object_property_set_description(obj, NVDIMM_SYNC_DAX_PROP,
+                                    "Set the Synchronous DAX mode");
 }
 
 static void nvdimm_finalize(Object *obj)
@@ -119,6 +139,9 @@  static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
     PCDIMMDevice *dimm = PC_DIMM(nvdimm);
     uint64_t align, pmem_size, size;
     MemoryRegion *mr;
+    HostMemoryBackend *hostmem;
+    bool is_file_backed;
+    bool __attribute__((unused)) is_pmem = false;
 
     g_assert(!nvdimm->nvdimm_mr);
 
@@ -135,9 +158,8 @@  static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
     nvdimm->label_data = memory_region_get_ram_ptr(mr) + pmem_size;
     pmem_size = QEMU_ALIGN_DOWN(pmem_size, align);
 
+    hostmem = dimm->hostmem;
     if (size <= nvdimm->label_size || !pmem_size) {
-        HostMemoryBackend *hostmem = dimm->hostmem;
-
         error_setg(errp, "the size of memdev %s (0x%" PRIx64 ") is too "
                    "small to contain nvdimm label (0x%" PRIx64 ") and "
                    "aligned PMEM (0x%" PRIx64 ")",
@@ -147,14 +169,41 @@  static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
     }
 
     if (!nvdimm->unarmed && memory_region_is_rom(mr)) {
-        HostMemoryBackend *hostmem = dimm->hostmem;
-
         error_setg(errp, "'unarmed' property must be off since memdev %s "
                    "is read-only",
                    object_get_canonical_path_component(OBJECT(hostmem)));
         return;
     }
 
+    /* A descriptor of 0 is valid; only a negative value means "no fd". */
+    is_file_backed = (memory_region_get_fd(mr) >= 0);
+    if (nvdimm->sync_dax == NVDIMM_SYNC_MODES_WRITEBACK && !is_file_backed) {
+        error_setg(errp, NVDIMM_SYNC_DAX_PROP"='%s' mode requires the "
+                   "memdev %s to be file backed",
+                   NvdimmSyncModes_str(nvdimm->sync_dax),
+                   object_get_canonical_path_component(OBJECT(hostmem)));
+        return;
+    }
+
+#ifdef CONFIG_LIBPMEM
+    if (is_file_backed) {
+        /*
+         * An fd-backed memdev need not provide a "pmem" property, so do not
+         * abort on lookup failure; a failed lookup leaves is_pmem false.
+         */
+        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
+    }
+
+    if (nvdimm->sync_dax == NVDIMM_SYNC_MODES_DIRECT && !is_pmem) {
+        error_setg(errp, "NVDIMM device "NVDIMM_SYNC_DAX_PROP"=%s mode requires"
+                   " the memory backend device to be synchronous DAX capable. "
+                   "Indicate it so with pmem=yes for the corresponding "
+                   "memory-backend-file.",
+                   NvdimmSyncModes_str(nvdimm->sync_dax));
+        return;
+    }
+#endif
+
     nvdimm->nvdimm_mr = g_new(MemoryRegion, 1);
     memory_region_init_alias(nvdimm->nvdimm_mr, OBJECT(dimm),
                              "nvdimm-memory", mr, 0, pmem_size);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 80957f9188..d0058bc13b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -4616,6 +4616,11 @@  static void spapr_machine_latest_class_options(MachineClass *mc)
 static void spapr_machine_6_0_class_options(MachineClass *mc)
 {
     /* Defaults for the latest behaviour inherited from the base class */
+    /* Adds sync-dax=writeback for "nvdimm" to this machine's compat props. */
+    static GlobalProperty compat[] = {
+        { "nvdimm", "sync-dax", "writeback" },
+    };
+
+    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
 }
 
 DEFINE_SPAPR_MACHINE(6_0, "6.0", true);
@@ -4625,8 +4630,13 @@  DEFINE_SPAPR_MACHINE(6_0, "6.0", true);
  */
 static void spapr_machine_5_2_class_options(MachineClass *mc)
 {
+    /*
+     * sync-dax=unsafe for "nvdimm"; appended below, after the 6.0 options
+     * and hw_compat_5_2 have been added.
+     */
+    static GlobalProperty compat[] = {
+        { "nvdimm", "sync-dax", "unsafe" },
+    };
+
     spapr_machine_6_0_class_options(mc);
     compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
+    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
 }
 
 DEFINE_SPAPR_MACHINE(5_2, "5.2", false);
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 77eb7e1293..615439391c 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -50,6 +50,10 @@  bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
 {
     const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
     const MachineState *ms = MACHINE(hotplug_dev);
+    PCDIMMDevice __attribute__((unused)) *dimm = PC_DIMM(nvdimm);
+    MemoryRegion __attribute__((unused)) *mr;
+    bool __attribute__((unused)) is_pmem = false;
+    NvdimmSyncModes __attribute__((unused)) sync;
     g_autofree char *uuidstr = NULL;
     QemuUUID uuid;
     int ret;
@@ -77,6 +81,24 @@  bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
         return false;
     }
 
+#ifdef CONFIG_LIBPMEM
+    sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+                                    "NvdimmSyncModes", &error_abort);
+
+    mr = host_memory_backend_get_memory(dimm->hostmem);
+    if (memory_region_get_fd(mr) >= 0) { /* fd-backed memdev; fd 0 is valid */
+        HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem);
+        /* A backend without a "pmem" property is treated as not pmem. */
+        is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
+    }
+
+    if (sync == NVDIMM_SYNC_MODES_WRITEBACK && is_pmem) {
+        warn_report("The NVDIMM backing device being Synchronous DAX capable, "
+                    NVDIMM_SYNC_DAX_PROP"='%s' is unnecessary as the backend "
+                    "ensures the safety already.", NvdimmSyncModes_str(sync));
+    }
+#endif
+
     uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP,
                                       &error_abort);
     ret = qemu_uuid_parse(uuidstr, &uuid);
@@ -124,6 +146,9 @@  static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
     uint64_t lsize = nvdimm->label_size;
     uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                             NULL);
+    NvdimmSyncModes sync_dax = object_property_get_enum(OBJECT(nvdimm),
+                                         NVDIMM_SYNC_DAX_PROP,
+                                         "NvdimmSyncModes", &error_abort);
 
     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
     g_assert(drc);
@@ -158,6 +183,11 @@  static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
                              "operating-system")));
     _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+    if (sync_dax == NVDIMM_SYNC_MODES_WRITEBACK) {
+        _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+                         NULL, 0));
+    }
+
     return child_offset;
 }
 
@@ -566,6 +596,8 @@  static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
     uint64_t continue_token = args[1];
     SpaprDrc *drc = spapr_drc_by_index(drc_index);
     PCDIMMDevice *dimm;
+    NVDIMMDevice *nvdimm;
+    NvdimmSyncModes sync_dax;
     HostMemoryBackend *backend = NULL;
     SpaprNVDIMMDeviceFlushState *state;
     ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
@@ -575,6 +607,13 @@  static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
         return H_PARAMETER;
     }
 
+    nvdimm = NVDIMM(drc->dev);
+    sync_dax = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+                                    "NvdimmSyncModes", &error_abort);
+    if (sync_dax != NVDIMM_SYNC_MODES_WRITEBACK) {
+        return H_UNSUPPORTED;
+    }
+
     if (continue_token != 0) {
         goto get_status;
     }
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index bcf62f825c..ef30bdeca4 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -28,6 +28,7 @@ 
 #include "qemu/uuid.h"
 #include "hw/acpi/aml-build.h"
 #include "qom/object.h"
+#include "qapi/qapi-types-machine.h"
 
 #define NVDIMM_DEBUG 0
 #define nvdimm_debug(fmt, ...)                                \
@@ -51,6 +52,7 @@  OBJECT_DECLARE_TYPE(NVDIMMDevice, NVDIMMClass, NVDIMM)
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
 #define NVDIMM_UUID_PROP       "uuid"
 #define NVDIMM_UNARMED_PROP    "unarmed"
+#define NVDIMM_SYNC_DAX_PROP   "sync-dax"
 
 struct NVDIMMDevice {
     /* private */
@@ -85,6 +87,15 @@  struct NVDIMMDevice {
      */
     bool unarmed;
 
+    /*
+     * 'writeback' tells the guest to make explicit flush requests to the
+     * hypervisor. With 'direct', the device is assumed to be synchronous
+     * DAX capable and no explicit flush is required. 'unsafe' means flush
+     * semantics are unimplemented and data persistence is not guaranteed
+     * in power failure scenarios.
+     */
+    NvdimmSyncModes sync_dax;
+
     /*
      * The PPC64 - spapr requires each nvdimm device have a uuid.
      */
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 478c031396..ddde87e2b6 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -332,6 +332,7 @@  struct SpaprMachineState {
 #define H_P7              -60
 #define H_P8              -61
 #define H_P9              -62
+#define H_UNSUPPORTED     -67
 #define H_OVERLAP         -68
 #define H_UNSUPPORTED_FLAG -256
 #define H_MULTI_THREADS_ACTIVE -9005
diff --git a/qapi/common.json b/qapi/common.json
index 7c976296f0..bec1b45b09 100644
--- a/qapi/common.json
+++ b/qapi/common.json
@@ -197,3 +197,23 @@ 
 { 'enum': 'GrabToggleKeys',
   'data': [ 'ctrl-ctrl', 'alt-alt', 'shift-shift','meta-meta', 'scrolllock',
             'ctrl-scrolllock' ] }
+
+##
+# @NvdimmSyncModes:
+#
+# Indicates the mode of flush to be used to ensure persistence in case
+# of power failures.
+#
+# @unsafe: This indicates that the data on the backend device might not be
+#          consistent in power failure scenarios.
+# @direct: This indicates the backend device supports synchronous DAX, and
+#          no explicit flush requests from the guest are required.
+# @writeback: To be used when the backend device doesn't support synchronous
+#             DAX. The hypervisor issues flushes to the disk when requested
+#             by the guest.
+#
+# Since: 6.1
+##
+{ 'enum': 'NvdimmSyncModes',
+  'data': [ 'unsafe', 'writeback',
+            { 'name': 'direct', 'if': 'defined(CONFIG_LIBPMEM)' } ] }